Check list if file has downloaded and skip if it has?

Question

I am new to Python and sure the below can be optimised however I have ran in to an issue with my last step in my script.

The aim is not to download a file if it has been previously downloaded. At this time I log the download in a file called download_history.log

I need to therefore implement a check here to kind of do the following check the log - if it exists in log do nothing and move to next file if it does not exists download the file and log it in to the file.

Any help would be appreciated.

#!/usr/bin/env python3

import boto
import sys, os
import zipfile
import shutil
import glob
import re
from boto.s3.key import Key
from boto.exception import S3ResponseError


#Make the download files
DOWNLOAD_LOCATION_PATH = os.path.expanduser("~") + "/AWSSplunk/Downloads/"
if not os.path.exists(DOWNLOAD_LOCATION_PATH):
    print ("Making download directory")
    os.mkdir(DOWNLOAD_LOCATION_PATH)

#Delete Output Folder if it exsists
OUTPUT_FOLDER = os.path.expanduser("~") + "/AWSSplunk/Output/"
shutil.rmtree(OUTPUT_FOLDER)

#Define the AWS Bucket
def backup_s3_folder():
    BUCKET_NAME = "my-bucket-name"
    AWS_ACCESS_KEY_ID= os.getenv("##################")
    AWS_ACCESS_SECRET_KEY = os.getenv("#########################")
    conn  = boto.connect_s3(AWS_ACCESS_KEY_ID, AWS_ACCESS_SECRET_KEY)
    bucket = conn.get_bucket(BUCKET_NAME)

    #goto through the list of files
    bucket_list = bucket.list()    

    for l in bucket_list:
        key_string = str(l.key)
        s3_path = DOWNLOAD_LOCATION_PATH + key_string
        try:

            # Add files to the log file
            print ("Downloading file ", key_string)
            file_object = open('download_history.log', 'a')
            file_object.write(key_string)
            file_object.write("\n")

            # Working code
            file_object.close()
            l.get_contents_to_filename(s3_path)
        except (OSError,S3ResponseError) as e:
            pass
            # check if the file has been downloaded locally  
            if not os.path.exists(s3_path):
                try:
                    os.makedirs(s3_path)
                except OSError as exc:
                    # let guard againts race conditions
                    import errno
                    if exc.errno != errno.EEXIST:
                        raise

if __name__ == '__main__':
    backup_s3_folder()

# Start the unzipping process

print("Unzipping Starting")
dir_path = os.path.expanduser("~") + "/AWSSplunk/Downloads/"
for path, dir_list, file_list in os.walk(dir_path):
    for file_name in file_list:
        if file_name.endswith(".zip"):
            abs_file_path = os.path.join(path, file_name)

            parent_path = os.path.split(abs_file_path)[0]
            output_folder_name = os.path.splitext(abs_file_path)[0]
            output_path = os.path.join(parent_path, output_folder_name)

            zip_obj = zipfile.ZipFile(abs_file_path, 'r')
            zip_obj.extractall(output_path)
            zip_obj.close()
print("Unzipping Completed")

# Start moving files to output
print("Moving Files")

FILE_LOCATION_PATH = os.path.expanduser("~") + "/AWSSplunk/Output/"

if not os.path.exists(FILE_LOCATION_PATH):
    print ("Making download directory")
    os.mkdir(FILE_LOCATION_PATH)

# .log files move
for root, dirs, files in os.walk(dir_path):
    for file in files:
        if file.endswith('.log'): 
            count = 1
            destination_file = os.path.join(FILE_LOCATION_PATH, file)
            while os.path.exists(destination_file):
                destination_file = os.path.join(FILE_LOCATION_PATH, f"{file}_{count}")
                count += 1
            shutil.move(os.path.join(root, file), destination_file)

# .txt files move
for root, dirs, files in os.walk(dir_path):
    for file in files:
        if file.endswith('.txt'):
            count = 1
            destination_file = os.path.join(FILE_LOCATION_PATH, file)
            while os.path.exists(destination_file):
                destination_file = os.path.join(FILE_LOCATION_PATH, f"{file}_{count}")
                count += 1
            shutil.move(os.path.join(root, file), destination_file)

# .json files move
for root, dirs, files in os.walk(dir_path):
    for file in files:
        if file.endswith('.json'):
            count = 1
            destination_file = os.path.join(FILE_LOCATION_PATH, file)
            while os.path.exists(destination_file):
                destination_file = os.path.join(FILE_LOCATION_PATH, f"{file}_{count}")
                count += 1
            shutil.move(os.path.join(root, file), destination_file)


print("Files Move Complete")
# Delete Directory
print("Cleaning up Downloads Directory")
shutil.rmtree(DOWNLOAD_LOCATION_PATH)

# Remove EFR Audit Logs stratinbg with 2020
print("Remove the encrypted Audit Logs")
pattern = "^(2020)"
FILE_LOCATION_PATH = os.path.expanduser("~") + "/AWSSplunk/Output/"
for root, dirs, files in os.walk(FILE_LOCATION_PATH):
    for file in filter(lambda x: re.match(pattern, x), files):
        os.remove(os.path.join(root, file))

# Remove EFR Audit Logs stratinbg with EFR
pattern = "^(EFR)"
FILE_LOCATION_PATH = os.path.expanduser("~") + "/AWSSplunk/Output/"
for root, dirs, files in os.walk(FILE_LOCATION_PATH):
    for file in filter(lambda x: re.match(pattern, x), files):
        os.remove(os.path.join(root, file))

# Remove EFR Audit Logs stratinbg with 2019
pattern = "^(2019)"
FILE_LOCATION_PATH = os.path.expanduser("~") + "/AWSSplunk/Output/"
for root, dirs, files in os.walk(FILE_LOCATION_PATH):
    for file in filter(lambda x: re.match(pattern, x), files):
        os.remove(os.path.join(root, file))

# Script clean up        
print("Script Complete")

#with open("download_history.log", "a") as myfile:
#    myfile.write('New Line\n')

If you want to search for the name of the file to download in the download_history.log, you can do something like this answer suggests: https://stackoverflow.com/a/4940068/8446061 — Nanna, Mar 02 '20 at 11:05

score 0 · Answer 1 · answered Mar 02 '20 at 10:54

0

With os you can check whether a file exist or not:

if not os.isfile(PATH_TO_EXPECTED_DOWNLOADED_FILE):
    #do download

For your own security please seperate your steps into functions and build a pipeline of these.

answered Mar 02 '20 at 10:54

dl.meteo

1,658
15
25

where should I be adding this in to my script. – ThisIsLegend1016 Mar 02 '20 at 11:14
This is very complicated to say because your code is not following any recommended guidelines. Actually I can't find the place where you download the data? And keep in mind that StackOverFlow is not there to adapt your code it is there to answer your question or suggest best practice solutions by the community. – dl.meteo Mar 02 '20 at 15:17
Agreed -last time I posted a question I was asked to enter all code so thats what I have done so will do that next time. The issue with your answer is it checks if the file exsists - however I unzip the files then delete the zips. I have however since resolved this issue. – ThisIsLegend1016 Mar 03 '20 at 08:21

Check list if file has downloaded and skip if it has?

1 Answers1