Coverage for src/archive_md_urls/update_files.py: 55%

20 statements  

« prev     ^ index     » next       coverage.py v6.4.2, created at 2022-07-28 16:50 +0200

1"""Turn URLs in Markdown files to Wayback snapshots.""" 

2 

3import re 

4from pathlib import Path 

5from typing import Optional 

6 

7from archive_md_urls.gather_snapshots import gather_snapshots 

8from archive_md_urls.scan_md import scan_md 

9 

10 

async def update_files(files: list[Path]) -> None:
    """Rewrite the URLs of each given Markdown file to Wayback snapshots.

    Every file is read as UTF-8, scanned with ``scan_md``, and written back
    in-place after its URLs have been swapped for the snapshots returned by
    ``gather_snapshots``. A one-line summary is printed once all files have
    been processed.

    Args:
        files (list[Path]): List of Markdown files to scan and update
    """
    # Running total of URLs for which a snapshot was returned; only used for
    # the summary printed at the end.
    changed_urls: int = 0
    for md_file in files:
        original_text: str = md_file.read_text(encoding="utf-8")
        date, urls = scan_md(original_text, md_file)
        # Query the API for a snapshot of every URL found in the file
        wayback_urls: dict[str, Optional[str]] = await gather_snapshots(urls, date)
        # Write the rewritten source back to the same file
        md_file.write_text(
            update_md_source(original_text, wayback_urls), encoding="utf-8"
        )
        # Entries without a snapshot are falsy and therefore not counted
        changed_urls += sum(1 for snapshot in wayback_urls.values() if snapshot)
    file_count: int = len(files)
    url_noun: str = "URL" if changed_urls == 1 else "URLs"
    file_noun: str = "file" if file_count == 1 else "files"
    print(f"Changed {changed_urls} {url_noun} in {file_count} {file_noun}.")

32 

33 

def update_md_source(md_source: str, wayback_urls: dict[str, Optional[str]]) -> str:
    """Replace URLs in Markdown file with Wayback Snapshots.

    A URL is only replaced where it appears directly between an opening and a
    closing parenthesis, i.e. ``(url)``. Occurrences elsewhere in the text are
    left untouched.

    Args:
        md_source (str): Content of Markdown file that should be updated
        wayback_urls (dict[str, Optional[str]]): URL-Snapshot pairs. Pairs
            whose snapshot is ``None`` (or empty) are skipped.

    Returns:
        str: Content of Markdown file with updated URLs
    """
    for url, snapshot in wayback_urls.items():
        # Skip cases where no Wayback Snapshot was found
        if not snapshot:
            continue
        # URLs routinely contain regex metacharacters ("?", ".", "+", ...), so
        # the URL must be escaped to be matched literally. The lookbehind and
        # lookahead restrict matches to URLs enclosed in parentheses.
        pattern = rf"(?<=\(){re.escape(url)}(?=\))"
        # A callable replacement inserts the snapshot verbatim; a plain string
        # would have its backslashes interpreted as regex escapes. The default
        # argument binds the current snapshot to the lambda.
        md_source = re.sub(pattern, lambda _match, repl=snapshot: repl, md_source)
    return md_source