
I am trying to read all the files of the same format from an S3 bucket. Here is my code; it fails with the error: "Could not establish source connection [Errno 2] No such file or directory: '/user_code/s3:/"

import os
import boto3
import pandas as pd

def s3_file_read(self, source):
    bucket_name = 'xxx'
    region = 'xxx'
    object_name = 's3-folder-name/'
    ACCESS_KEY_ID = 'xxx'
    ACCESS_SECRET_KEY = 'xxx'
    s3_client = boto3.client('s3', aws_access_key_id=ACCESS_KEY_ID, aws_secret_access_key=ACCESS_SECRET_KEY, region_name=region)
    file_path = "s3://your-bucket-name/folder-name/"
    # the next two lines treat the S3 URI as a local path and raise the error above
    prefix = os.path.abspath(file_path)
    file_list = [os.path.join(prefix, f) for f in os.listdir(prefix) if f.endswith('.csv')]
    print('##################################Reading the file#############################')
    file_type = source['fileType'].lower()
    if file_type == 'csv':
        try:
            obj = s3_client.get_object(Bucket=bucket_name, Key=object_name)
            file_df = pd.read_csv(obj['Body'])
            print("CSV File read success")
        except Exception as e:
            print("Could not read the file {}".format(e))
    else:
        print("Only the CSV file format is supported")


JKAK
  • I might be reading it wrong, but do you mean you want to get all the files from S3? It looks from your code like you're getting the files locally, though? – fixatd Nov 20 '20 at 07:27
  • Does this answer your question? [Boto3 to download all files from a S3 Bucket](https://stackoverflow.com/questions/31918960/boto3-to-download-all-files-from-a-s3-bucket) – fixatd Nov 20 '20 at 07:34
  • @fixatd Yes I want to read all the files from s3 – JKAK Nov 20 '20 at 07:50
  • @fixatd I just need to read all the files from the specific folder in the s3 bucket which contains .csv format files. – JKAK Nov 20 '20 at 09:06

1 Answer


I've made some assumptions about what you'd like to do here, but this code will read the keys in a bucket and build a list of the .csv objects only. You can then loop over that list and test whether a dataframe can be created from each file. If you want to read all those files into one larger dataframe, the end of your function needs to be rewritten (one way to do that is sketched after the code below).

import boto3

s3sr = boto3.resource('s3')

# there are other examples of collecting objects, this is just what I use
def get_keys_from_prefix(self, bucket, prefix):
    '''gets list of keys for given bucket and prefix'''
    keys_list = []
    paginator = s3sr.meta.client.get_paginator('list_objects_v2')
    # use Delimiter to limit search to that level of hierarchy
    for page in paginator.paginate(Bucket=bucket, Prefix=prefix, Delimiter='/'):
        # 'Contents' is absent from a page with no matching objects
        keys = [content['Key'] for content in page.get('Contents', [])]
        print('keys in page: ', len(keys))
        keys_list.extend(keys)
    return keys_list

def s3_file_read(self, source):
    bucket_name = 'xxx'
    region = 'xxx'
    prefix = 's3-folder-name/'  # if no prefix, pass ''

    ACCESS_KEY_ID = 'xxx'
    ACCESS_SECRET_KEY = 'xxx'
    s3_client = boto3.client('s3', aws_access_key_id=ACCESS_KEY_ID, aws_secret_access_key=ACCESS_SECRET_KEY, region_name=region)

    keys_list = self.get_keys_from_prefix(bucket_name, prefix)
    csv_list = [f for f in keys_list if f.endswith('.csv')]

    for csvfile in csv_list:
        try:
            obj = s3_client.get_object(Bucket=bucket_name, Key=csvfile)
            file_df = pd.read_csv(obj['Body'])
            print("CSV File read success")
        except Exception as e:
            print("Could not read the file {}".format(e))
Jonathan Leon