Coverage for src/archive_md_urls/scan_md.py: 100%

30 statements  

coverage.py v6.4.2, created at 2022-07-28 16:50 +0200

1"""Extract dates and URLs from Markdown files.""" 

2 

3from pathlib import Path 

4from typing import Optional 

5 

6import dateutil.parser 

7import markdown 

8from bs4 import BeautifulSoup 

9 

10# List of URLs considered stable and thus ignored 

11STABLE_URLS: tuple[str, ...] = ( 

12 # archive.org snapshots 

13 "web.archive.org/web/", 

14 # Pelican intra-site links 

15 "{filename}", 

16 "{static}", 

17 # Jekyll intra-site links 

18 "{% post_url", 

19 # Hugo intra-site links 

20 "{{< ref", 

21 "{{< relref", 

22 # Persistent identifier list from ORCID (https://pub.orcid.org/v2.0/identifiers) 

23 "https://arxiv.org/abs/", 

24 "http://www.amazon.com/dp/", 

25 "https://www.authenticus.pt/", 

26 "http://adsabs.harvard.edu/abs/", 

27 "https://ciencia.iscte-iul.pt/id/", 

28 "https://d-nb.info/", 

29 "https://doi.org/", 

30 "http://ethos.bl.uk/OrderDetails.do?uin=", 

31 "https://hal.archives-ouvertes.fr/view/resolver/", 

32 "http://hdl.handle.net/", 

33 "https://www.worldcat.org/isbn/", 

34 "https://portal.issn.org/resource/ISSN/", 

35 "http://zbmath.org/?format=complete&q=an%3A", 

36 "http://www.jstor.org/stable/", 

37 "https://koreamed.org/article/", 

38 "http://lccn.loc.gov/", 

39 "https://www.lens.org/", 

40 "http://www.ams.org/mathscinet-getitem?mr=", 

41 "http://www.worldcat.org/oclc/", 

42 "http://openlibrary.org/b/", 

43 "https://www.osti.gov/biblio/", 

44 "http://identifiers.org/pdb/", 

45 "https://europepmc.org/articles/", 

46 "https://www.ncbi.nlm.nih.gov/pubmed/", 

47 "https://europepmc.org/article/PPR/", 

48 "https://tools.ietf.org/html/", 

49 "https://identifiers.org/rrid/", 

50 "http://papers.ssrn.com/abstract_id=", 

51 "http://zbmath.org/?format=complete&q=" 

52) 


def scan_md(md_source: str, md_file: Path) -> tuple[Optional[str], list[str]]:
    """Extract date and URLs from the specified Markdown file.

    To get the date, first try to extract it from the Markdown meta information. If no
    date is found, try to extract it from the file name, following the Jekyll naming
    convention where blog post files start with YYYY-MM-DD. Finally, format the date
    for the Wayback Machine API as YYYYMMDDhhmm.

    Args:
        md_source (str): Contents of the Markdown file
        md_file (Path): Markdown file path

    Returns:
        tuple[Optional[str], list[str]]: Formatted date (if found) and list of URLs
    """
    html, date = convert_markdown(md_source)
    if not date:
        date = md_file.name[:10]
    return format_date(date), filter_urls(get_urls(html))
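
# A minimal usage sketch of scan_md (illustrative, not part of the covered
# module); the file name and Markdown contents are hypothetical:
#
#     >>> from pathlib import Path
#     >>> source = "[a link](https://example.com/page)"
#     >>> scan_md(source, Path("2022-07-28-my-post.md"))
#     ('202207280000', ['https://example.com/page'])
#
# With no "date" key in the Markdown metadata, the date falls back to the first
# ten characters of the file name (the Jekyll YYYY-MM-DD prefix).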


def convert_markdown(md_source: str) -> tuple[str, Optional[str]]:
    """Convert Markdown file to HTML and extract date from metadata.

    Args:
        md_source (str): Contents of the Markdown file

    Returns:
        tuple[str, Optional[str]]: HTML version of the Markdown file and date from
            Markdown metadata
    """
    md: markdown.core.Markdown = markdown.Markdown(extensions=['meta'])
    html: str = md.convert(md_source)
    try:
        date: Optional[str] = md.Meta['date'][0]
    except KeyError:
        date = None
    return html, date
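
# A minimal sketch of convert_markdown with a hypothetical meta block; the
# "meta" extension lowercases keys and stores each value as a list of lines:
#
#     >>> convert_markdown("Date: 2022-07-28\n\nHello")
#     ('<p>Hello</p>', '2022-07-28')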


def format_date(date: str) -> Optional[str]:
    """Format date according to the Wayback Machine API format.

    Use dateutil.parser to recognize dates and return them as YYYYMMDDhhmm. If hour
    and minute aren't provided, they are set to 0. If the format isn't recognized,
    return None.

    Args:
        date (str): Date extracted from Markdown metadata or file name

    Returns:
        Optional[str]: Date formatted as YYYYMMDDhhmm
    """
    try:
        return dateutil.parser.parse(date).strftime("%Y%m%d%H%M")
    # Malformed date or no date at the beginning of the file name
    except dateutil.parser.ParserError:
        return None
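
# A minimal sketch of format_date; the input strings are hypothetical:
#
#     >>> format_date("2022-07-28")
#     '202207280000'
#     >>> format_date("2022-07-28 16:50")
#     '202207281650'
#     >>> format_date("not-a-date") is None
#     True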


def filter_urls(md_urls: list[str]) -> list[str]:
    """Filter the list of URLs before making API calls.

    Remove duplicates and drop URLs that are considered stable:

    - URLs that already point to archive.org
    - Intra-site links (Pelican, Jekyll and Hugo intra-site link formats are
      recognized)
    - URLs containing stable identifiers

    Args:
        md_urls (list[str]): List of URLs extracted from Markdown file

    Returns:
        list[str]: Filtered list of URLs
    """
    # Remove duplicates
    urls: list[str] = list(set(md_urls))
    # Filter out stable URLs
    return [url for url in urls
            if not any(stable_url in url for stable_url in STABLE_URLS)]
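
# A minimal sketch of filter_urls; the URLs are hypothetical. Deduplication
# goes through set(), so the output order is arbitrary in general:
#
#     >>> filter_urls(["https://example.com/a", "https://example.com/a",
#     ...              "https://doi.org/10.1000/182"])
#     ['https://example.com/a']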


def get_urls(html: str) -> list[str]:
    """Extract links from converted Markdown HTML.

    Args:
        html (str): HTML version of Markdown file

    Returns:
        list[str]: URLs found in HTML
    """
    soup = BeautifulSoup(html, "html.parser")
    return [a.get('href') for a in soup.find_all('a', href=True)]
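
# A minimal sketch of get_urls with hypothetical HTML; anchors without an href
# attribute are skipped by find_all('a', href=True):
#
#     >>> get_urls('<p><a href="https://example.com/">x</a><a name="top">y</a></p>')
#     ['https://example.com/']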