Enabling the downloading of all ARC artifacts at once.

79cbbac6 · Bryan Tarpley · c6150cff · 79cbbac6
Commit 79cbbac6 authored 1 year ago by Bryan Tarpley
Hide whitespace changes
Inline Side-by-side

Showing 1 changed file with 104 additions and 41 deletions
+104 -41
--- a/download.py
+++ b/download.py
@@ -2,12 +2,56 @@ import os
 import sys
 import json
 import requests
+import traceback
 from math import ceil, floor

 # ------------------------------------ #
 # utilities							   #
 # ------------------------------------ #

+
+# this function downloads the data for a single ARC archive in batches. it takes a dictionary describing the archive,
+# the number of artifacts from that archive to download, the path to the download directory, whether to get
+# full text (saved as .json) rather than LINCS data (saved as .ttl), the corpora host, and the ARC corpus ID.
+def download_archive_data(archive, num_artifacts, download_directory, get_full_text, corpora_host, corpus_id):
+	print(
+		f"\nDownloading {desired_data} to the folder '{download_directory}/{archive['handle']}.' Depending on the number of artifacts, this may take some time.\n")
+
+	download_directory = f"{download_directory}/{archive['handle']}"
+	os.makedirs(download_directory, exist_ok=True)
+
+	batches_downloaded = 0
+	batches_needed = ceil(num_artifacts / batch_size)
+	show_progress(batches_downloaded, batches_needed)
+
+	while batches_downloaded < batches_needed:
+		skip = batches_downloaded * batch_size
+		limit = batch_size
+
+		if skip + limit > num_artifacts:
+			limit = num_artifacts - skip
+
+		if get_full_text:
+			ft_req = requests.get(
+				f"{corpora_host}/api/corpus/{corpus_id}/ArcArtifact/?f_archive.id={archive['id']}&only=external_uri,full_text_contents&page={batches_downloaded + 1}&page-size={batch_size}")
+			ft = ft_req.json()
+
+			if 'records' in ft:
+				with open(f"{download_directory}/{batches_downloaded + 1}.json", 'w', encoding='utf-8') as ft_out:
+					json.dump(ft['records'], ft_out, indent=4)
+
+		else:
+			ttl_req = requests.get(
+				f"{corpora_host}/corpus/{corpus_id}/ArcArchive/{archive['id']}/LINCS/?skip={skip}&limit={limit}")
+			ttl = ttl_req.text
+
+			with open(f"{download_directory}/{batches_downloaded + 1}.ttl", 'w', encoding='utf-8') as ttl_out:
+				ttl_out.write(ttl)
+
+		batches_downloaded += 1
+		show_progress(batches_downloaded, batches_needed)
+
+
 # takes a list of dictionaries, each with a 'label' key,
 # and prints out a multi-columned menu of options to select from.
 # it returns the selected dictionary.
@@ -44,13 +88,16 @@ def get_selection(options, prompt):
 		print(" "*indent + (" "*pad).join(f"[{option['num']}] {option['label']}".ljust(col_width) for option in row))

 	choice_found = False
-
 	while not choice_found:
 		choice = input(f"\n{prompt} ")
-		for option in options:
-			if choice == str(option['num']):
-				choice_found = True
-				return option
+		if choice == 'ALL':
+			choice_found = True
+			return options
+		else:
+			for option in options:
+				if choice == str(option['num']):
+					choice_found = True
+					return option

 		print('Invalid selection! Try again...')

@@ -139,6 +186,7 @@ archive_options = []
 req = requests.get(aggregation_url)
 archs_meta = req.json()
 if 'meta' in archs_meta and 'aggregations' in archs_meta['meta'] and 'archs' in archs_meta['meta']['aggregations']:
+	total_artifacts = archs_meta['meta']['total']
 	archs_meta = archs_meta['meta']['aggregations']['archs']

 	req = requests.get(f'{corpora_host}/api/corpus/{corpus_id}/ArcArchive/?page-size=1000&s_name=asc')
@@ -155,7 +203,15 @@ if 'meta' in archs_meta and 'aggregations' in archs_meta['meta'] and 'archs' in
 					'count': archs_meta[arch['handle']]
 				})

-	archive = get_selection(archive_options, "Choose an ARC archive:")
+	archive = get_selection(archive_options, "Choose an ARC archive, or enter ALL:")
+	total_possible_artifacts = 0
+	num_to_skip = 0
+	if type(archive) == list:
+		total_possible_artifacts = total_artifacts
+		num_to_skip = -1
+	else:
+		total_possible_artifacts = archive['count']
+		archive = [archive]
 	
 	num_artifacts = 0
 	while num_artifacts == 0:
@@ -163,49 +219,56 @@ if 'meta' in archs_meta and 'aggregations' in archs_meta['meta'] and 'archs' in
 		if get_full_text:
 			desired_data = "full text"

-		choice = input(f"For how many ARC artifacts do you want {desired_data} [{archive['count']}]? ")
+		choice = input(f"For how many ARC artifacts do you want {desired_data} [{total_possible_artifacts}]? ")
 		if choice == '':
-			num_artifacts = archive['count']
+			num_artifacts = total_possible_artifacts
 		elif choice.isdigit():
 			num_artifacts = int(choice)
-			if num_artifacts > archive['count']:
-				num_artifacts = archive['count']
+			if num_artifacts > total_possible_artifacts:
+				num_artifacts = total_possible_artifacts
 		else:
 			print('Not a valid number of artifacts. Try again...')

-	print(f"\nDownloading {desired_data} to the folder '{archive['handle']}.' Depending on the number of artifacts, this may take some time.\n")
-	
-	download_directory = f"{download_directory}/{archive['handle']}"
-	os.makedirs(download_directory, exist_ok=True)
-
-	batches_downloaded = 0
-	batches_needed = ceil(num_artifacts / batch_size)
-	show_progress(batches_downloaded, batches_needed)
-
-	while batches_downloaded < batches_needed:
-		skip = batches_downloaded * batch_size
-		limit = batch_size
-
-		if skip + limit > num_artifacts:
-			limit = num_artifacts - skip
-
-		if get_full_text:
-			ft_req = requests.get(f"{corpora_host}/api/corpus/{corpus_id}/ArcArtifact/?f_archive.id={archive['id']}&only=external_uri,full_text_contents&page={batches_downloaded + 1}&page-size={batch_size}")
-			ft = ft_req.json()
-
-			if 'records' in ft:
-				with open(f"{download_directory}/{batches_downloaded + 1}.json", 'w', encoding='utf-8') as ft_out:
-					json.dump(ft['records'], ft_out, indent=4)
-
+	while num_to_skip == -1:
+		choice = input(f"How many artifacts would you like to skip (useful for resuming a long-running job) [0]? ")
+		if choice == '':
+			num_to_skip = 0
+		elif choice.isdigit():
+			num_to_skip = int(choice)
 		else:
-			ttl_req = requests.get(f"{corpora_host}/corpus/{corpus_id}/ArcArchive/{archive['id']}/LINCS/?skip={skip}&limit={limit}")
-			ttl = ttl_req.text
-
-			with open(f"{download_directory}/{batches_downloaded + 1}.ttl", 'w', encoding='utf-8') as ttl_out:
-				ttl_out.write(ttl)
+			print('Not a valid number of artifacts. Try again...')

-		batches_downloaded += 1
-		show_progress(batches_downloaded, batches_needed)
+	artifacts_remaining = num_artifacts
+	num_skipped = 0
+	for arch in archive:
+		if artifacts_remaining:
+			arch_limit = arch['count']
+			if artifacts_remaining < arch_limit:
+				arch_limit = artifacts_remaining
+
+			skip_this_archive = False
+			if num_to_skip > 0:
+				if num_skipped + arch_limit < num_to_skip:
+					skip_this_archive = True
+					num_skipped += arch_limit
+
+			if not skip_this_archive:
+				try:
+					download_archive_data(
+						arch,
+						arch_limit,
+						download_directory,
+						get_full_text,
+						corpora_host,
+						corpus_id
+					)
+				except:
+					print("Oh, no! An error occurred:")
+					print(traceback.format_exc())
+					print(f"\n\nWhen resuming this job, you might skip the first {num_artifacts - artifacts_remaining} artifacts.")
+
+			artifacts_remaining -= arch_limit
+			print(f"\n{num_artifacts - artifacts_remaining} total artifacts downloaded so far.")

 	print("\nDownload completed!")
 	exit()