Coverage for src/archive_md_urls/scan_md.py: 100% (30 statements)
1"""Extract dates and URLs from Markdown files."""
3from pathlib import Path
4from typing import Optional
6import dateutil.parser
7import markdown
8from bs4 import BeautifulSoup

# List of URLs considered stable and thus ignored
STABLE_URLS: tuple[str, ...] = (
    # archive.org snapshots
    "web.archive.org/web/",
    # Pelican intra-site links
    "{filename}",
    "{static}",
    # Jekyll intra-site links
    "{% post_url",
    # Hugo intra-site links
    "{{< ref",
    "{{< relref",
    # Persistent identifier list from ORCID (https://pub.orcid.org/v2.0/identifiers)
    "https://arxiv.org/abs/",
    "http://www.amazon.com/dp/",
    "https://www.authenticus.pt/",
    "http://adsabs.harvard.edu/abs/",
    "https://ciencia.iscte-iul.pt/id/",
    "https://d-nb.info/",
    "https://doi.org/",
    "http://ethos.bl.uk/OrderDetails.do?uin=",
    "https://hal.archives-ouvertes.fr/view/resolver/",
    "http://hdl.handle.net/",
    "https://www.worldcat.org/isbn/",
    "https://portal.issn.org/resource/ISSN/",
    "http://zbmath.org/?format=complete&q=an%3A",
    "http://www.jstor.org/stable/",
    "https://koreamed.org/article/",
    "http://lccn.loc.gov/",
    "https://www.lens.org/",
    "http://www.ams.org/mathscinet-getitem?mr=",
    "http://www.worldcat.org/oclc/",
    "http://openlibrary.org/b/",
    "https://www.osti.gov/biblio/",
    "http://identifiers.org/pdb/",
    "https://europepmc.org/articles/",
    "https://www.ncbi.nlm.nih.gov/pubmed/",
    "https://europepmc.org/article/PPR/",
    "https://tools.ietf.org/html/",
    "https://identifiers.org/rrid/",
    "http://papers.ssrn.com/abstract_id=",
    "http://zbmath.org/?format=complete&q="
)
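
# Note: filter_urls() below matches these entries as substrings, so any URL
# that merely contains one of them is treated as stable, e.g.
# "https://doi.org/10.1000/182" matches "https://doi.org/".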


def scan_md(md_source: str, md_file: Path) -> tuple[Optional[str], list[str]]:
    """Extract date and URLs from the specified Markdown file.

    To get the date, first try to extract it from the Markdown meta information. If
    no date is found, try to extract it from the file name, following the Jekyll
    naming convention where files for blog posts start with YYYY-MM-DD. Finally,
    format the date for the Wayback Machine API as YYYYMMDDhhmm.

    Args:
        md_source (str): Contents of the Markdown file
        md_file (Path): Markdown file path

    Returns:
        tuple[Optional[str], list[str]]: Formatted date (if found) and list of URLs
    """
    html, date = convert_markdown(md_source)
    if not date:
        date = md_file.name[:10]
    return format_date(date), filter_urls(get_urls(html))
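
# Illustrative example (assuming a post file named after the Jekyll convention):
#
#     >>> scan_md("[x](https://example.com)", Path("2022-07-28-post.md"))
#     ('202207280000', ['https://example.com'])
#
# No date is set in the Markdown metadata here, so the YYYY-MM-DD prefix of the
# file name is used instead.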


def convert_markdown(md_source: str) -> tuple[str, Optional[str]]:
    """Convert Markdown file to HTML and extract date from metadata.

    Args:
        md_source (str): Contents of the Markdown file

    Returns:
        tuple[str, Optional[str]]: HTML version of the Markdown file and date from
            Markdown metadata (None if no date is set)
    """
    md: markdown.core.Markdown = markdown.Markdown(extensions=['meta'])
    html: str = md.convert(md_source)
    try:
        date: Optional[str] = md.Meta['date'][0]
    except KeyError:
        date = None
    return html, date
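
# Illustrative example: the 'meta' extension only picks up metadata at the very
# top of the source, with keys lowercased and values collected into lists.
#
#     >>> convert_markdown("date: 2022-07-28\n\nHello")
#     ('<p>Hello</p>', '2022-07-28')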


def format_date(date: str) -> Optional[str]:
    """Format date according to the Wayback Machine API format.

    Use dateutil.parser to recognize dates and return them as YYYYMMDDhhmm. If hour
    and minute aren't provided, they are set to 0. If the format isn't recognized,
    return None.

    Args:
        date (str): Date extracted from Markdown metadata or file name

    Returns:
        Optional[str]: Date formatted as YYYYMMDDhhmm
    """
    try:
        return dateutil.parser.parse(date).strftime("%Y%m%d%H%M")
    # Malformed date or no date at the beginning of the file name
    except dateutil.parser.ParserError:
        return None
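
# Illustrative example: a bare date gets hour and minute padded with zeros,
# while an unparseable string yields None.
#
#     >>> format_date("2022-07-28")
#     '202207280000'
#     >>> format_date("not-a-date") is None
#     True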


def filter_urls(md_urls: list[str]) -> list[str]:
    """Filter the list of URLs before they are passed to API calls.

    Filter out duplicates and remove URLs that are considered stable:

    - URLs that already point to archive.org
    - Intra-site links (recognizes Pelican, Jekyll and Hugo intra-site link formats)
    - URLs containing stable identifiers

    Args:
        md_urls (list[str]): List of URLs extracted from Markdown file

    Returns:
        list[str]: Filtered list of URLs
    """
    # Remove duplicates
    urls: list[str] = list(set(md_urls))
    # Filter out stable URLs (substring match against STABLE_URLS)
    return [url for url in urls
            if not any(stable_url in url for stable_url in STABLE_URLS)]
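
# Illustrative example: the duplicate is collapsed and the DOI link is dropped
# because it contains the stable fragment "https://doi.org/". Note that the
# set() round-trip means the output order is not guaranteed.
#
#     >>> filter_urls(["https://example.com", "https://example.com",
#     ...              "https://doi.org/10.1000/182"])
#     ['https://example.com']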


def get_urls(html: str) -> list[str]:
    """Extract links from converted Markdown HTML.

    Args:
        html (str): HTML version of Markdown file

    Returns:
        list[str]: URLs found in HTML
    """
    soup = BeautifulSoup(html, "html.parser")
    return [a.get('href') for a in soup.find_all('a', href=True)]
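
# Illustrative example: only <a> tags that carry an href attribute are picked up.
#
#     >>> get_urls('<p><a href="https://example.com">x</a></p>')
#     ['https://example.com']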