Coverage for src/archive_md_urls/gather_snapshots.py: 53%

32 statements  

« prev     ^ index     » next       coverage.py v6.4.2, created at 2022-07-28 16:50 +0200

1"""Take URL and return URL of archive.org snapshot. 

2 

3Given a URL and (optionally) a timestamp, return the URL of the archive.org 

4snapshot closest to the provided timestamp. If no timestamp is provided or no 

5snapshot for the provided timestamp can be found, return the latest 

6snapshot. If no snapshot available, return None. 

7""" 

8 

9import asyncio 

10import sys 

11from typing import Any, Optional 

12 

13import httpx 

14import tenacity 

15 

16 

@tenacity.retry(wait=tenacity.wait_fixed(2), stop=tenacity.stop_after_attempt(5))
async def call_api(client: httpx.AsyncClient, api_call: str) -> dict[str, Any]:
    """Send one request to the Wayback Machine API and return the decoded JSON.

    The call is wrapped in a tenacity retry: any exception (including the
    HTTP error raised for a non-success status) triggers another attempt,
    with a fixed two-second wait between attempts and at most five attempts
    in total.

    Responses to expect from the API:

    - URL with a correctly formatted timestamp (YYYYMMDDhhmmss):
        JSON with the nearest snapshot, or the latest snapshot if none
        matches the timestamp
    - URL without a timestamp:
        JSON with the latest snapshot available
    - Any URL with a badly formatted timestamp (e.g. 'May2000'):
        Empty JSON
    - URL that is not available in archive.org:
        Empty JSON

    Args:
        client (httpx.AsyncClient): HTTPX AsyncClient used to issue the request
        api_call (str): Valid call to the archive.org API

    Returns:
        dict[str, Any]: JSON API response

    Raises:
        tenacity.RetryError: If every attempt failed
    """
    reply = await client.get(api_call)
    # Turn 4xx/5xx answers into exceptions so the retry decorator sees them.
    reply.raise_for_status()
    return reply.json()

44 

45 

def build_api_call(url: str, timestamp: Optional[str] = None) -> str:
    """Return a valid archive.org availability API call for the given URL.

    The URL is percent-encoded before being placed in the query string so
    that characters such as '&', '?', '=', '#' or '%' inside it are not
    mistaken for parts of the API call itself. ':' and '/' are left as they
    are, so plain URLs produce the same call as an unencoded interpolation.

    Args:
        url (str): URL to be searched in the Wayback Machine
        timestamp (Optional[str], optional): Timestamp for the desired
            snapshot. Omitted from the call when None or empty.

    Returns:
        str: Valid archive.org API call
    """
    # Imported locally so the block does not depend on a module-level import.
    from urllib.parse import quote

    api_call: str = (
        f"https://archive.org/wayback/available?url={quote(url, safe=':/')}"
    )
    if timestamp:
        api_call += f"&timestamp={quote(timestamp, safe='')}"
    return api_call

60 

61 

def get_closest(api_response: dict[str, Any]) -> Optional[str]:
    """Get URL of closest snapshot from API response if available.

    Returns None if the API response is empty, i.e. if it has no
    'archived_snapshots' entry, if that entry is empty, or if it holds no
    'closest' snapshot with a 'url'.

    Args:
        api_response (dict[str, Any]): API response as JSON

    Returns:
        Optional[str]: URL of Wayback Machine snapshot, if any was found
    """
    # Use tolerant lookups: a completely empty response ({}) must yield None
    # rather than a KeyError.
    snapshots = api_response.get("archived_snapshots")
    if not snapshots:
        return None
    closest = snapshots.get("closest")
    if not closest:
        return None
    return closest.get("url")

76 

77 

async def gather_snapshots(
    urls: list[str], timestamp: Optional[str] = None
) -> dict[str, Optional[str]]:
    """Create HTTPX session for API calls and return gathered snapshots.

    One task calling call_api is created per URL and all tasks are awaited
    together with asyncio.gather(). The responses are then paired with the
    URLs they were requested for to build a dict of url-snapshot pairs.

    Args:
        urls (list[str]): Urls to send to the Wayback Machine API
        timestamp (Optional[str]): Timestamp to send to the Wayback Machine API

    Returns:
        dict[str, Optional[str]]: API call results with original URL as keys
            and Wayback snapshot URLs (or None) as values

    Raises:
        SystemExit: If an API call still fails after all retries
    """
    # timeout=None switches off HTTPX's timeouts for these requests.
    async with httpx.AsyncClient(timeout=None) as client:
        # One task per URL, each task being a single API call
        tasks: list[asyncio.Task[Any]] = [
            asyncio.create_task(call_api(client, build_api_call(url, timestamp)))
            for url in urls
        ]
        # Execute tasks and gather results. If a task used up its retries,
        # exit the program.
        try:
            api_responses: list[dict[str, Any]] = await asyncio.gather(*tasks)
        except tenacity.RetryError:
            sys.exit("API appears unresponsive, please try again later.")
    # asyncio.gather() returns results in the order of the awaitables passed
    # in, so each response lines up with the URL it was requested for. Keying
    # on the input URL keeps the promised "original URL" keys and does not
    # rely on the response echoing a 'url' field.
    return {
        url: get_closest(api_response)
        for url, api_response in zip(urls, api_responses)
    }

110 return wayback_urls