20

With the v1 version of the listObjects API call, you would have done something like this (adapted from this SO answer):

var allKeys = [];
function listAllKeys(marker, cb) {
  s3.listObjects({Bucket: s3bucket, Marker: marker}, function(err, data) {
    if (err) return cb(err);
    allKeys.push(data.Contents);

    if (data.IsTruncated)
      // NextMarker is only set when a Delimiter is given; otherwise the
      // last returned Key should be used as the next marker.
      listAllKeys(data.NextMarker, cb);
    else
      cb();
  });
}

What would be the equivalent of the listObjectsV2 function?

Nicholas Sizer
  • Update for '20: instead of requiring the entire SDK (like most of these answers suggest), require only the modules you need to reduce your bundle size: `let aws = require('aws-sdk/global');` and `let S3 = require('aws-sdk/clients/s3');` – Michael Jul 17 '20 at 15:53
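Michael's tip as a self-contained sketch (the submodule paths are the v2 SDK's documented modular imports; region is a placeholder):

// Pull in only what is needed instead of the whole aws-sdk bundle:
const AWS = require('aws-sdk/global');
const S3 = require('aws-sdk/clients/s3');

AWS.config.update({ region: 'us-east-1' }); // shared config still works
const s3 = new S3();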

10 Answers

48

This is, in my opinion, the best way to do it:

const AWS = require('aws-sdk');
const s3 = new AWS.S3();

const listAllKeys = (params, out = []) => new Promise((resolve, reject) => {
  s3.listObjectsV2(params).promise()
    .then(({Contents, IsTruncated, NextContinuationToken}) => {
      out.push(...Contents);
      // Recurse with the continuation token until the listing is complete.
      !IsTruncated
        ? resolve(out)
        : resolve(listAllKeys(Object.assign(params, {ContinuationToken: NextContinuationToken}), out));
    })
    .catch(reject);
});

listAllKeys({Bucket: 'bucket-name'})
  .then(console.log)
  .catch(console.log);
Giovanni Bruno
  • Why is this best? (I ask because I don't know.) – Chris76786777 May 10 '18 at 04:28
  • Mainly because it's concise and all needed variables are self-contained. It basically keeps the code clean. – Giovanni Bruno May 10 '18 at 15:37
  • Wrong answer: you should not use promises as recursion, they are not the same. Promises will stack on top of each other until the last one resolves. The provided solution will do the job for simple tasks, but if you have a big number of records in your bucket the script will first slow down and eventually run out of memory (in my case it was crashing after retrieving the first 5 million). I managed to get around the issue by placing the async/await promise call in a do/while loop. – Den Jun 17 '19 at 10:05
  • You are right @Den, this solution is suitable for simple tasks, and I also use an async/await loop for bigger tasks. – Giovanni Bruno Dec 22 '19 at 09:45
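For reference, a minimal sketch of the do/while approach Den describes, assuming the v2 SDK's .promise() API:

const AWS = require('aws-sdk');
const s3 = new AWS.S3();

// Iterative pagination: no promise chain builds up across pages,
// which avoids the stacking Den describes.
async function listAllKeysIteratively(params) {
  const out = [];
  let ContinuationToken;
  do {
    const data = await s3
      .listObjectsV2({ ...params, ContinuationToken })
      .promise();
    out.push(...data.Contents);
    ContinuationToken = data.NextContinuationToken;
  } while (ContinuationToken);
  return out;
}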
30

Here is the code to get the list of keys from a bucket.

var params = {
    Bucket: 'bucket-name'    
};

var allKeys = [];
listAllKeys();
function listAllKeys() {
    s3.listObjectsV2(params, function (err, data) {
        if (err) {
            console.log(err, err.stack); // an error occurred
        } else {
            var contents = data.Contents;
            contents.forEach(function (content) {
                allKeys.push(content.Key);
            });

            if (data.IsTruncated) {
                params.ContinuationToken = data.NextContinuationToken;
                console.log("get further list...");
                listAllKeys();
            }
            // else: allKeys now holds every key in the bucket
        }
    });
}
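The snippet above never signals completion; a small variation with a (hypothetical) done callback, reusing the same params and allKeys, could look like this:

function listAllKeysWithCallback(done) {
    s3.listObjectsV2(params, function (err, data) {
        if (err) return done(err);
        data.Contents.forEach(function (content) {
            allKeys.push(content.Key);
        });
        if (data.IsTruncated) {
            params.ContinuationToken = data.NextContinuationToken;
            listAllKeysWithCallback(done); // fetch the next page
        } else {
            done(null, allKeys); // every key has been collected
        }
    });
}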
notionquest
17

https://stackoverflow.com/a/57540786/8784402

Using AWS-SDK v3 and TypeScript

import {
  paginateListObjectsV2,
  S3Client,
  S3ClientConfig,
  ListObjectsV2CommandInput,
  _Object,
} from '@aws-sdk/client-s3';

/* // For Deno
import {
  paginateListObjectsV2,
  S3Client,
  S3ClientConfig,
} from "https://deno.land/x/aws_sdk@v3.32.0-1/client-s3/mod.ts"; */

const s3Config: S3ClientConfig = {
  credentials: {
    accessKeyId: 'accessKeyId',
    secretAccessKey: 'secretAccessKey',
  },
  region: 'us-east-1',
};

const getAllS3Files = async (client: S3Client, s3Opts: ListObjectsV2CommandInput) => {
  const totalFiles: _Object[] = [];
  for await (const data of paginateListObjectsV2({ client }, s3Opts)) {
    totalFiles.push(...(data.Contents ?? []));
  }
  return totalFiles;
};

const main = async () => {
  const client = new S3Client(s3Config);
  const s3Opts = { Bucket: 'bucket-xyz' };
  console.log(await getAllS3Files(client, s3Opts));
};

main();
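If you'd rather not use the paginator helper, here is a sketch of the same loop with ListObjectsV2Command directly (bucket name is a placeholder):

const { S3Client, ListObjectsV2Command } = require('@aws-sdk/client-s3');

// Manual pagination with the v3 command API.
async function listAllKeys(client, params) {
  const keys = [];
  let ContinuationToken;
  do {
    const data = await client.send(
      new ListObjectsV2Command({ ...params, ContinuationToken })
    );
    keys.push(...(data.Contents ?? []).map((obj) => obj.Key));
    ContinuationToken = data.NextContinuationToken;
  } while (ContinuationToken);
  return keys;
}

listAllKeys(new S3Client({}), { Bucket: 'bucket-xyz' }).then(console.log);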

For AWS-SDK v2 Using Async Generator

Import S3

const { S3 } = require('aws-sdk');
const s3 = new S3();

Create a generator function to retrieve the full list of files:

async function* listAllKeys(opts) {
  opts = { ...opts };
  do {
    const data = await s3.listObjectsV2(opts).promise();
    opts.ContinuationToken = data.NextContinuationToken;
    yield data;
  } while (opts.ContinuationToken);
}

Prepare the AWS parameters, based on the API docs:

const opts = {
  Bucket: 'bucket-xyz' /* required */,
  // ContinuationToken: 'STRING_VALUE',
  // Delimiter: 'STRING_VALUE',
  // EncodingType: url,
  // FetchOwner: true || false,
  // MaxKeys: 'NUMBER_VALUE',
  // Prefix: 'STRING_VALUE',
  // RequestPayer: requester,
  // StartAfter: 'STRING_VALUE'
};

Use the generator:

async function main() {
  // using for of await loop
  for await (const data of listAllKeys(opts)) {
    console.log(data.Contents);
  }
}
main();

That's it.

Or Lazy Load

async function main() {
  const keys = listAllKeys(opts);
  console.log(await keys.next());
  // {value: {…}, done: false}
  console.log(await keys.next());
  // {value: {…}, done: false}
  console.log(await keys.next());
  // {value: undefined, done: true}
}
main();

Or use the generator to make an observable function:

const lister = (opts) => (o$) => {
  let needMore = true;
  const process = async () => {
    for await (const data of listAllKeys(opts)) {
      o$.next(data);
      if (!needMore) break;
    }
    o$.complete();
  };
  process();
  return () => (needMore = false);
};

Use this observable function with RxJS:

// Using Rxjs

const { Observable } = require('rxjs');
const { flatMap } = require('rxjs/operators');

function listAll() {
  return Observable.create(lister(opts))
    .pipe(flatMap((v) => v.Contents))
    .subscribe(console.log);
}

listAll();

Or use this observable function with a Node.js EventEmitter:

const EventEmitter = require('events');

const _eve = new EventEmitter();

async function onData(data) {
  // will be called for each set of data
  console.log(data);
}
async function onError(error) {
  // will be called if any error
  console.log(error);
}
async function onComplete() {
  // will be called when data completely received
}
_eve.on('next', onData);
_eve.on('error', onError);
_eve.on('complete', onComplete);

const stop = lister(opts)({
  next: (v) => _eve.emit('next', v),
  error: (e) => _eve.emit('error', e),
  complete: (v) => _eve.emit('complete', v),
});
nkitku
16

Building on previous answers, here is an approach that takes advantage of the Prefix parameter to make multiple calls to s3.listObjectsV2() in parallel.

This has given me a 2-15x speedup, depending on how evenly the keys are distributed and whether the code is running locally or on AWS.

You should make sure that the prefixes cover the full range of possible prefixes for your bucket. The code below covers all "safe" characters but S3 supports a wider range of UTF-8 characters.

Note that this example uses async/await so ES2017/Node 8 is required. The example is a Node 8.10 Lambda function.

const AWS = require('aws-sdk');
const s3 = new AWS.S3();

exports.handler = async (event) => {
  // Prefixes are used to fetch data in parallel.
  const numbers = '0123456789'.split('');
  const letters = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'.split('');
  const special = "!-_'.*()".split(''); // "Safe" S3 special chars
  const prefixes = [...numbers, ...letters, ...special];

  // array of params used to call listObjectsV2 in parallel for each prefix above
  const arrayOfParams = prefixes.map((prefix) => {
    return { Bucket: 'YOUR-BUCKET-NAME', Prefix: prefix }
  });

  const allKeys = [];
  await Promise.all(arrayOfParams.map(params => getAllKeys(params, allKeys)));
  return allKeys.length;
};

async function getAllKeys(params,  allKeys = []){
  const response = await s3.listObjectsV2(params).promise();
  response.Contents.forEach(obj => allKeys.push(obj.Key));

  if (response.NextContinuationToken) {
    params.ContinuationToken = response.NextContinuationToken;
    await getAllKeys(params, allKeys); // RECURSIVE CALL
  }
  return allKeys;
}

Also, for completeness, here is a simpler, non-prefixed async/await version:

const AWS = require('aws-sdk');
const s3 = new AWS.S3();

exports.handler = async (event) => {
  const allKeys = await getAllKeys({ Bucket: 'YOUR-BUCKET-NAME' });
  return allKeys.length;
};

async function getAllKeys(params,  allKeys = []){
  const response = await s3.listObjectsV2(params).promise();
  response.Contents.forEach(obj => allKeys.push(obj.Key));

  if (response.NextContinuationToken) {
    params.ContinuationToken = response.NextContinuationToken;
    await getAllKeys(params, allKeys); // RECURSIVE CALL
  }
  return allKeys;
}
talawahtech
  • Thank you, could you please post a non-async/await version? Thanks – loretoparisi Jul 26 '18 at 08:52
  • @loretoparisi I happened to need the same thing today and made one that does not use either of the keywords you listed; check the answer below in case you still need it :) – Quezler Aug 05 '18 at 17:38
11

I know this has been answered quite a few times, but I thought I'd venture my version. It's based on this answer, but with a few changes that seem worthwhile:

  1. Takes s3 as a parameter, instead of pulling it from the global context.

  2. It isn't necessary to return a new Promise. s3.listObjectsV2().promise() already returns a promise, so we can just piggyback on it.

  3. Concats the return values instead of passing them up the call stack as a parameter.

  4. Checks that NextContinuationToken actually has a value. If for some reason IsTruncated is true, but there is no NextContinuationToken, unless you check for that value, the function will recurse forever. This situation can happen if MaxKeys is set to a value less than the total number of objects.

const listAllObjects = (s3, params) => {
    return s3.listObjectsV2(params).promise()
        .then(({ Contents, IsTruncated, NextContinuationToken }) => {
            return IsTruncated && NextContinuationToken
                ? listAllObjects(s3, Object.assign({}, params, { ContinuationToken: NextContinuationToken }))
                    .then(x => Contents.concat(x))
                : Contents
        })
}

And here's a Jest test of it:

test('Returns all results on multiple continuations', () => {
    expect.assertions(1)

    let numCalls = 0

    // mock
    const s3 = {
        listObjectsV2: params => {
            numCalls++

            return {
                promise: () => {
                    return new Promise((resolve, reject) => {
                        setTimeout(() => {
                            if(numCalls === 3) {
                                resolve({
                                    Contents: [numCalls],
                                    IsTruncated: false,
                                })
                            }
                            else {
                                resolve({
                                    Contents: [numCalls],
                                    IsTruncated: true,
                                    NextContinuationToken: 'blah'
                                })
                            }
                        }, 200)
                    })
                }
            }
        }
    }

    return listAllObjects(s3, {})
        .then(xs => {
            expect(xs).toEqual([1, 2, 3])
        })
})
Cully
8

Here's a simple TypeScript 3 implementation using do/while for those who don't like recursive promises :)

export async function listKeys(
  s3client: AWS.S3,
  bucket: string,
  prefix: string,
): Promise<AWS.S3.ObjectList> {
  let token: string | undefined = undefined;
  let objectList: AWS.S3.ObjectList = [];
  do {
    const res = await s3client
      .listObjectsV2({
        Prefix: prefix,
        Bucket: bucket,
        ContinuationToken: token,
      })
      .promise();
    token = res.NextContinuationToken;
    objectList = objectList.concat(res.Contents);
  } while (token !== undefined);
  return objectList;
}
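Usage might look like this (a sketch; bucket and prefix are placeholders, and credentials are assumed to come from the environment):

const AWS = require('aws-sdk');
const s3client = new AWS.S3();

// Print every key under the given prefix:
listKeys(s3client, 'my-bucket', 'some/prefix/').then((objects) => {
  console.log(objects.map((obj) => obj.Key));
});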
Rick B
2

I used one of the answers here and adapted it so the result is returned from inside the function, instead of via an outside array passed as an argument. I'll leave this here in case anyone finds it useful:

const bucket = {Bucket: '<bucket name here>'};

...

s3files(bucket).then(array => {
        console.log(_.map(array, entry => { // _ is lodash
            return entry.Key;
        }));
    });

...

let s3files = function (config) {

    const tmp = Object.assign({}, config);

    return new Promise((resolve, reject) => {

        s3.listObjectsV2(tmp).promise().then(response => {

            if (response.IsTruncated) {

                tmp.ContinuationToken = response.NextContinuationToken;

                // recurse for the next page and prepend this page's contents
                s3files(tmp).then(array => {
                    resolve(response.Contents.concat(array));
                }).catch(reject);

            } else {
                resolve(response.Contents);
            }
        }).catch(reject); // propagate listObjectsV2 errors

    });
};
Quezler
0

I have slightly modified the recursive solutions above to produce this version, which adds sorting (not a listObjectsV2 feature), a start-after option (StartAfter in the Node.js SDK), and a hard stop once MaxKeys results are collected.

function formatSizeUnits(bytes) {
    if (bytes >= 1099511627776) { bytes = (bytes / 1099511627776).toFixed(4) + ' TB'; }
    else if (bytes >= 1073741824) { bytes = (bytes / 1073741824).toFixed(4) + ' GB'; }
    else if (bytes >= 1048576) { bytes = (bytes / 1048576).toFixed(4) + ' MB'; }
    else if (bytes >= 1024) { bytes = (bytes / 1024).toFixed(4) + ' KB'; }
    else if (bytes > 1) { bytes = bytes + ' bytes'; }
    else if (bytes == 1) { bytes = bytes + ' byte'; }
    else { bytes = '0 byte'; }
    return bytes;
}//formatSizeUnits
var listFiles = function (params, callback) {
        var self = this;

        var options = {
            Bucket: self._options.s3.Bucket,
            Prefix: self._options.s3.Key,
            MaxKeys: 100,
            StartAfter: '',
            Desc: true
        };
        for (var attrname in params) { options[attrname] = params[attrname]; }
        var desc = options.Desc;
        delete (options.Desc);
        function listAllKeys(token, results, callback) {
            if (token) options.ContinuationToken = token;
            s3.listObjectsV2(options, (error, data) => {
                if (error) {
                    return callback(error);
                } else {
                    for (var index in data.Contents) {
                        var bucket = data.Contents[index];
                        if (bucket.Size > 0) {
                            var components = bucket.Key.split('/');
                            var name = components[components.length - 1];
                            results.push({
                                name: name,
                                path: bucket.Key,
                                mtime: bucket.LastModified,
                                size: bucket.Size,
                                sizehr: formatSizeUnits(bucket.Size)
                            });
                        }
                    }
                    if (results.length >= options.MaxKeys) { // exit max results
                        results = results.slice(0, options.MaxKeys);
                        return callback(null, results);
                    } else if (data.IsTruncated) { // truncated page
                        return listAllKeys(data.NextContinuationToken, results, callback);
                    } else { // all pages
                        return callback(null, results);
                    }
                }
            });
        }//listAllKeys
        var results = [];
        listAllKeys('', results, (error, results) => {
            if(!Util.empty(results)) { // Util.empty: external is-empty helper, not shown here
                if (desc) results.sort((a, b) => (b.mtime).getTime() - (a.mtime).getTime());
                else results.sort((a, b) => (a.mtime).getTime() - (b.mtime).getTime());
            }
            return callback(error, results);
        });
    }//listFiles
loretoparisi
0
This version lists the objects under a prefix and downloads each one with a concurrency limit, using @supercharge/promise-pool:

import { S3 } from 'aws-sdk';
import { PromisePool } from '@supercharge/promise-pool';

export const getFilesFromBucket = async ({ region, bucket, folder, withConcurrency }: { region?: string, bucket: string, folder: string, withConcurrency: number }) => {
    const s3Client = new S3({ region: region || 'us-east-1' });

    // NOTE: a single listObjectsV2 call returns at most 1,000 keys; combine
    // with one of the pagination approaches above for larger buckets.
    const response = await s3Client.listObjectsV2({ Bucket: bucket, Prefix: folder }).promise();

    return PromisePool.for(response.Contents as S3.ObjectList)
        .withConcurrency(withConcurrency)
        .process(async (fileContent) => {
            // Contents[].Key already includes the folder prefix, so use it as-is.
            const file = await s3Client.getObject({ Bucket: bucket, Key: fileContent.Key! }).promise();

            return file;
        });
}
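A usage sketch (bucket and folder names are placeholders; in @supercharge/promise-pool, process() resolves with { results, errors }):

// Download every file on the first page of the listing, five at a time:
getFilesFromBucket({ bucket: 'my-bucket', folder: 'uploads', withConcurrency: 5 })
    .then(({ results, errors }) => {
        console.log(`fetched ${results.length} files, ${errors.length} errors`);
    });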
vimuth
0

We can use the eachPage method to do the paginated queries for us:

const AWS = require("aws-sdk");
const s3 = new AWS.S3();

const listAllObjects = (params) =>
  new Promise((resolve, reject) => {
    let objectList = [];
    s3.listObjectsV2(params).eachPage((err, data) => {
      if (err) return reject(err);
      if (!data) return resolve(objectList); // a null page signals the end
      objectList = objectList.concat(data.Contents);
      return true; // keep fetching pages
    });
  });

listAllObjects({Bucket: "bucket-name"})
  .then(console.log)
  .catch(console.log);
Munim Munna