3

I want to download files from a particular S3 bucket based on the files' last-modified date.

I have researched how to connect with boto3, and there is plenty of code and documentation available for downloading files without any conditions. I wrote some pseudocode:


import boto3


def download_file_s3(bucket_name, modified_date):
    # connect to the S3 resource
    s3 = boto3.resource('s3', aws_access_key_id='demo', aws_secret_access_key='demo')

    # connect to the desired bucket
    my_bucket = s3.Bucket(bucket_name)

    # get the files
    for file in my_bucket.objects.all():
        ...  # filter by modified_date here



I want to complete this function: given a modified date, the function should return (and download) the files in the S3 bucket that were last modified on that particular date.

Danish Xavier

3 Answers

7

I have a better solution: a function that does this automatically. Just pass in the bucket name and the download path.

from boto3.session import Session
from datetime import date, timedelta
import boto3
import re


def download_pdf_specific_date_subfolder(bucket_name, download_path):
    ACCESS_KEY = 'XYZ'
    SECRET_KEY = 'ABC'

    # create a session
    session = Session(aws_access_key_id=ACCESS_KEY,
                      aws_secret_access_key=SECRET_KEY)
    s3 = session.resource('s3')
    bucket = s3.Bucket(bucket_name)

    # get yesterday's date as a 'YYYY-MM-DD' string
    yesterday = date.today() - timedelta(days=1)
    x = yesterday.strftime('%Y-%m-%d')
    print(x)

    # collect the keys that need to be downloaded
    files_to_download = []
    # walk over all the objects in the bucket
    for file_object in bucket.objects.all():
        file_name = str(file_object.key)
        last_modified = str(file_object.last_modified).split()
        if last_modified[0] == x:
            # replace "Airports" in the regex with your own prefix to filter a particular subfolder
            if re.findall(r"Airports/[a-zA-Z]+", file_name):
                files_to_download.append(file_name)

    # download into the specified folder
    for file_name in files_to_download:
        print(file_name)
        d_path = download_path + file_name
        print(d_path)
        bucket.download_file(file_name, d_path)


download_pdf_specific_date_subfolder(bucket_name, download_path)

Ultimately, the function places the matching files in the specified download folder.
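A possible refinement (not part of the original answer): if the files you want live under a known prefix such as Airports/, you can let S3 filter server-side with objects.filter(Prefix=...) instead of scanning the whole bucket and matching keys with a regex. A minimal sketch, assuming credentials come from your environment and that the bucket name, prefix, and local path are placeholders:

from datetime import date, timedelta

import boto3

s3 = boto3.resource('s3')                # credentials from environment/CLI config
bucket = s3.Bucket('my-bucket')          # placeholder bucket name

yesterday = (date.today() - timedelta(days=1)).isoformat()   # 'YYYY-MM-DD'

# S3 only returns objects whose key starts with the given prefix
for obj in bucket.objects.filter(Prefix='Airports/'):
    if str(obj.last_modified).split()[0] == yesterday:
        bucket.download_file(obj.key, '/tmp/' + obj.key.replace('/', '_'))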

Rahul Goyal
2

Here is my test code; it prints the last_modified datetime of the objects whose last-modified time is after the one I set.

import boto3
from datetime import datetime
from datetime import timezone

s3 = boto3.resource('s3')
response = s3.Bucket('<bucket name>').objects.all()

for item in response:
    obj = s3.Object(item.bucket_name, item.key)
    if obj.last_modified > datetime(2019, 8, 1, 0, 0, 0, tzinfo=timezone.utc):
        print(obj.last_modified)

If you have a specific date, then

import boto3
from datetime import datetime, timezone

s3 = boto3.resource('s3')
response = s3.Bucket('<bucket name>').objects.all()

date = '20190827' # input('Insert Date as a form YYYYmmdd')

for item in response:
    obj = s3.Object(item.bucket_name, item.key)
    if obj.last_modified.strftime('%Y%m%d') == date:
        print(obj.last_modified)

This will give results as follows.

2019-08-27 07:13:04+00:00
2019-08-27 07:13:36+00:00
2019-08-27 07:13:39+00:00
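Since the question asks for downloads rather than prints, the same loop can call download_file on each match. A minimal sketch, assuming a placeholder local directory and flattening the keys so nested prefixes need no local subfolders:

import os

import boto3

s3 = boto3.resource('s3')
bucket = s3.Bucket('<bucket name>')

date = '20190827'
local_dir = '/tmp/s3-downloads'      # placeholder target directory
os.makedirs(local_dir, exist_ok=True)

for item in bucket.objects.all():
    if item.last_modified.strftime('%Y%m%d') == date:
        # replace '/' so the object key becomes a flat local file name
        bucket.download_file(item.key, os.path.join(local_dir, item.key.replace('/', '_')))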
Lamanus
  • So if I want to check a specific date from the _last_modified_ attribute, I can check like this: `obj.last_modified == date`, right? – Danish Xavier Sep 11 '19 at 06:15
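For reference: obj.last_modified is a timezone-aware datetime, so a plain equality check against a date string or a datetime.date will not match; comparing only the date part is one way to do it. A minimal sketch, with a placeholder bucket name:

from datetime import date, timezone

import boto3

s3 = boto3.resource('s3')
target = date(2019, 8, 27)           # the specific date to match

for obj in s3.Bucket('<bucket name>').objects.all():
    # last_modified is an aware datetime, e.g. 2019-08-27 07:13:04+00:00
    if obj.last_modified.astimezone(timezone.utc).date() == target:
        print(obj.key, obj.last_modified)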
0

I edited this answer to download all files modified after a certain timestamp and then write the current time to a file for use in the next iteration. You can easily adapt this to only download files of a specific date, month, year, yesterday, etc.

import os
import boto3
import datetime
import pandas as pd

### Load AWS Key, Secret and Region 
# ....
###

# Open file to read last download time and update file with current time
latesttime_file = "latest request.txt"
with open(latesttime_file, 'r') as f:
    latest_download = pd.to_datetime(f.read(), utc=True)

with open(latesttime_file, 'w') as f:
    f.write(str(datetime.datetime.utcnow()))

# Initialize S3-client
s3_client = boto3.client('s3',
                         region_name=AWS_REGION,
                         aws_access_key_id=AWS_KEY_ID,
                         aws_secret_access_key=AWS_SECRET)


def download_dir(prefix, local, bucket, timestamp, client=s3_client):
    """
    params:
    - prefix: pattern to match in s3
    - local: local path to folder in which to place files
    - bucket: s3 bucket with target contents
    - client: initialized s3 client object
    """
    keys = []
    dirs = []
    next_token = ''
    base_kwargs = {
        'Bucket':bucket,
        'Prefix':prefix,
    }
    while next_token is not None:
        kwargs = base_kwargs.copy()
        if next_token != '':
            kwargs.update({'ContinuationToken': next_token})
        results = client.list_objects_v2(**kwargs)
        contents = results.get('Contents', [])
        for i in contents:
            k = i.get('Key')
            t = i.get('LastModified')
            if k[-1] != '/':
                if t > timestamp:
                    keys.append(k)
            else:
                dirs.append(k)
        next_token = results.get('NextContinuationToken')
    for d in dirs:
        dest_pathname = os.path.join(local, d)
        if not os.path.exists(os.path.dirname(dest_pathname)):
            os.makedirs(os.path.dirname(dest_pathname))
    for k in keys:
        dest_pathname = os.path.join(local, k)
        if not os.path.exists(os.path.dirname(dest_pathname)):
            os.makedirs(os.path.dirname(dest_pathname))
        client.download_file(bucket, k, dest_pathname)

download_dir(<prefix or ''>, <local folder to download to>, <bucketname>, latest_download)
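As noted above, the timestamp cutoff can just as easily be a fixed point in time instead of the value read from the file; for example, passing the start of yesterday (UTC) downloads everything modified since then. A rough sketch, with the bucket name and local folder as placeholders:

import datetime

# start of yesterday, UTC (LastModified values from S3 are timezone-aware)
yesterday = datetime.datetime.now(datetime.timezone.utc) - datetime.timedelta(days=1)
start_of_yesterday = yesterday.replace(hour=0, minute=0, second=0, microsecond=0)

download_dir('', '/tmp/s3-downloads/', 'my-bucket', start_of_yesterday)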
Niels Henkens