Coverage for src/archive_md_urls/gather_snapshots.py: 53%
32 statements
« prev ^ index » next coverage.py v6.4.2, created at 2022-07-28 16:50 +0200
« prev ^ index » next coverage.py v6.4.2, created at 2022-07-28 16:50 +0200
1"""Take URL and return URL of archive.org snapshot.
3Given a URL and (optionally) a timestamp, return the URL of the archive.org
4snapshot closest to the provided timestamp. If no timestamp is provided or no
5snapshot for the provided timestamp can be found, return the latest
6snapshot. If no snapshot is available, return None.
7"""
9import asyncio
10import sys
11from typing import Any, Optional
13import httpx
14import tenacity
@tenacity.retry(stop=tenacity.stop_after_attempt(5), wait=tenacity.wait_fixed(2))
async def call_api(client: httpx.AsyncClient, api_call: str) -> dict[str, Any]:
    """Query the Wayback Machine API and hand back the decoded JSON body.

    The tenacity decorator re-runs this coroutine whenever it raises: at most
    five attempts are made, with a fixed two-second wait between attempts.

    Responses the API is expected to give:

    - URL with a correctly formatted timestamp (YYYYMMDDhhmmss):
        JSON with the nearest snapshot, or the latest snapshot if the timestamp
        was not found
    - URL without a timestamp:
        JSON with the latest snapshot available
    - Any URL with a badly formatted timestamp (e.g. 'May2000'):
        Empty JSON
    - URL that is not available in archive.org:
        Empty JSON

    Args:
        client (httpx.AsyncClient): HTTPX AsyncClient used to send the request
        api_call (str): Valid call to the archive.org API

    Returns:
        dict[str, Any]: JSON API response

    Raises:
        tenacity.RetryError: If every one of the five attempts raised
    """
    reply: httpx.Response = await client.get(api_call)
    # An HTTP error status is turned into an exception so the decorator retries.
    reply.raise_for_status()
    payload = reply.json()
    return payload
def build_api_call(url: str, timestamp: Optional[str] = None) -> str:
    """Return a valid archive.org availability API call.

    The call always carries a ``url`` query parameter. A ``timestamp`` query
    parameter is added only when a non-empty timestamp is given. Both values
    are inserted verbatim; no percent-encoding is applied here.

    Args:
        url (str): URL to be searched in the Wayback Machine
        timestamp (Optional[str], optional): Timestamp for the desired snapshot.
            ``None`` and the empty string both mean "no timestamp".

    Returns:
        str: Valid archive.org API call
    """
    query_parts: list[str] = [f"url={url}"]
    if timestamp:
        query_parts.append(f"timestamp={timestamp}")
    # Joining on "&" keeps the separator apart from the parameter names, so the
    # second parameter is always a well-formed, separate "timestamp=..." pair.
    return "https://archive.org/wayback/available?" + "&".join(query_parts)
def get_closest(api_response: dict[str, Any]) -> Optional[str]:
    """Get URL of closest snapshot from API response if available.

    Returns None when the response carries no usable snapshot: the response is
    an empty dict, ``'archived_snapshots'`` is missing or empty, or it has no
    ``'closest'`` entry (or that entry has no ``'url'``).

    Args:
        api_response (dict[str, Any]): API response as JSON

    Returns:
        Optional[str]: URL of Wayback Machine snapshot, if any was found
    """
    # Tolerant lookups: a completely empty JSON response must yield None rather
    # than a KeyError.
    snapshots = api_response.get("archived_snapshots")
    if not snapshots:
        return None
    closest = snapshots.get("closest")
    if not closest:
        return None
    return closest.get("url")
async def gather_snapshots(
    urls: list[str], timestamp: Optional[str] = None
) -> dict[str, Optional[str]]:
    """Create HTTPX session for API calls and return gathered snapshots.

    One task per URL is created for the call_api function and all of them are
    awaited together with asyncio.gather(). The results are then paired with
    the URLs they were requested for to build a dict of url-snapshot pairs.

    Args:
        urls (list[str]): Urls to send to the Wayback Machine API
        timestamp (Optional[str]): Timestamp to send to the Wayback Machine API

    Returns:
        dict[str, Optional[str]]: API call results with original URL as keys and
            Wayback snapshot URLs as values

    Raises:
        SystemExit: If a task raised tenacity.RetryError, i.e. call_api ran out
            of retry attempts
    """
    # timeout=None switches off the HTTP client's timeouts for these requests.
    async with httpx.AsyncClient(timeout=None) as client:
        # Create task list (with each task being an API call)
        tasks: list[asyncio.Task[Any]] = [
            asyncio.create_task(call_api(client, build_api_call(url, timestamp)))
            for url in urls
        ]
        # Execute tasks and gather results. If a task exhausted its retries,
        # exit the program.
        try:
            api_responses: list[dict[str, Any]] = await asyncio.gather(*tasks)
        except tenacity.RetryError:
            sys.exit("API appears unresponsive, please try again later.")
    # asyncio.gather() returns results in the order the tasks were passed in, so
    # each response is paired with the URL it was requested for. The "url" field
    # echoed in the response is not used as the key, because it may be missing
    # from an empty response.
    return {
        url: get_closest(api_response)
        for url, api_response in zip(urls, api_responses)
    }