I'm currently working on a project, and so far I've made it work by downloading the whole data folder I need from GitHub to my local machine and doing the post-processing from there. But I'd like to post-process the files directly from GitHub so I always have up-to-date information. Is there a way to pull that folder into my script without manually downloading it?

Here's the repo; I need the entire data folder, which is composed of YAML files: https://github.com/openstates/people/tree/main/data
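Something like the sketch below is what I was imagining, if it's even feasible: list the repo tree with what I believe is GitHub's git trees API, then read each YAML file over HTTP from raw.githubusercontent.com instead of from disk. The endpoint, the recursive=1 parameter, and the list_data_files / fetch_official helpers are just my guesses using the requests library; I haven't checked rate limits or whether this is the right approach.

    import requests
    import yaml

    # Assumption on my part: the git trees API can list the whole repo in one
    # call (I haven't checked rate limits or whether the response is truncated).
    TREE_URL = "https://api.github.com/repos/openstates/people/git/trees/main?recursive=1"
    RAW_BASE = "https://raw.githubusercontent.com/openstates/people/main/"


    def list_data_files():
        # every .yml path under data/, minus the folders I already skip locally
        tree = requests.get(TREE_URL).json()["tree"]
        return [
            entry["path"]
            for entry in tree
            if entry["path"].startswith("data/")
            and entry["path"].endswith(".yml")
            and "retired" not in entry["path"]
            and "committees" not in entry["path"]
            and "municipalities" not in entry["path"]
        ]


    def fetch_official(path):
        # hypothetical helper: download one YAML file and parse it in memory
        resp = requests.get(RAW_BASE + path)
        resp.raise_for_status()
        return yaml.safe_load(resp.text)

I'd then map fetch_official over list_data_files() the same way I currently map over my local file paths.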
Here's how I'm currently extracting the files from my local copy:
    import os
    import yaml
    from multiprocessing.pool import ThreadPool


    def extractingOfficials(file):
        # parse one YAML file into a Python dict
        with open(file, 'r') as f:
            try:
                official = yaml.safe_load(f)
                return official
            except yaml.YAMLError as exc:
                print(exc)


    def mainExtractingFunction(rootdir):
        all_yml_files = []
        for subdir, dirs, files in os.walk(rootdir):
            for file in files:
                # skipping those who are retired and committees offices
                if "retired" in subdir or "committees" in subdir:
                    continue
                elif file.endswith(".yml") and "municipalities" not in file:
                    all_yml_files.append(os.path.join(subdir, file))
        # parse the files in parallel; the with-block cleans up the pool on exit
        with ThreadPool(28) as pool:
            allofficials = pool.map(extractingOfficials, all_yml_files)
        return allofficials
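For reference, this is roughly how I call it today against my local copy (the path is just a placeholder for wherever I downloaded the data folder):

    # placeholder path for my local download of the repo's data folder
    allofficials = mainExtractingFunction("people/data")
    print(len(allofficials), "officials loaded")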