0

Whats the best way to list nested folders in Google-Cloud-Storage bucket. I have a bucket in which first the year folder then months and days folder respectively and days folder have millions of files, but months folder can miss some days (folder). When i run the code it loops folders as well as files, which is taking hours. Here is my code which I'am using

 public static IEnumerable<string> ListFolders(this StorageClient client, string bucket, string folder = "")
        {
            if (client == null) { throw new ArgumentNullException("this"); }
            if (string.IsNullOrWhiteSpace(bucket)) { throw new ArgumentOutOfRangeException("bucket must be non-empty"); }
            if (!string.IsNullOrEmpty(folder) && !folder.EndsWith(Delimiter.ToString())) { throw new ArgumentException("folder must end in " + Delimiter); }
            if (!string.IsNullOrEmpty(folder) && folder == Delimiter.ToString()) { throw new ArgumentException("root folder is \"\", not " + Delimiter); }

            var prefix = folder ?? "";
            return client
                .ListObjects(bucket, prefix)
                .Select(o => o.Name.Substring(prefix.Length))
                .Where(n => n.Contains(Delimiter))
                .Select(n => n.Split(Delimiter).First())
                .Distinct()
                .Select(n => prefix + n + Delimiter);
        }


private static void ListLiveFolders(string yearFolder)
        {
            var storage = StorageClient.Create(StorageHelper.Credentials);
            var listGcpMonthFolders = StorageHelper.ListFolders(storage, settings.Bucket, $"{settings.BucketFolder}/{yearFolder}/").ToList();

            try
            {
                foreach (var monthFolder in listGcpMonthFolders)
                {
                    Console.WriteLine(monthFolder);
                    var listGcpDaysFolders = StorageHelper.ListFolders(storage, settings.Bucket, monthFolder).ToList();
                    foreach (var daysFolder in listGcpDaysFolders)
                    {
                        Console.WriteLine(daysFolder);
                    }
                }
            }
            catch (Exception exception)
            {
                Console.WriteLine(exception.Message);
            }
        }
skhurams
  • 2,133
  • 7
  • 45
  • 82
  • Just having a look at [this question](https://stackoverflow.com/questions/37074977/how-to-get-list-of-folders-in-a-given-bucket-using-google-cloud-api) it looks like you're having the same problem just different language. Check out the first comment under the accepted answer and see if that helps any. "answered if we pass prefix=abc/xyz with delimiter=/ we get all objects whose name start with abc/xyz as well as prefixes which can be logically considered as subfolder. " – DCCoder Jan 18 '22 at 01:29
  • Can you check if the following [documentation](https://cloud.google.com/storage/docs/samples/storage-list-files#storage_list_files-csharp) helps you? Probably using client library and listing object may give you faster results. If now, then you can try async. – Rajeev Tirumalasetty Jan 18 '22 at 10:57
  • No it will search all objects and then you have to give condition to each object, no help found. – skhurams Jan 28 '22 at 12:51
  • I guess you have to use internal application logic to filter your files. If you want it as a new feature from Google. You can go and open a Feature request. Use this [issue tracker](https://cloud.google.com/support/docs/issue-trackers) link to open a FR. – Rajeev Tirumalasetty Feb 10 '22 at 10:44
  • It seems your code request for objects in every time it loops, that caused it very slow due to http request to google. What if you get everything first then do the logic locally? (sorry for english) – sambath999 May 01 '22 at 15:05

2 Answers2

0

To achieve your goal you have to use internal application logic to filter your files. However, if the solution found in a similar question linked in the comments and official documentation isn't helpful you may open a Feature Request from Google using this Issue Tracker.

mdobrucki
  • 462
  • 1
  • 7
0

Here are my workgrounds using in my .net5 project

  1. Get everything first

    public async Task<List<FolderTree>> GetObjectListAsync(string dirName)
    {
        var __request = _storageService.Objects.List(_bucketName);
        __request.Prefix = dirName;
        __request.Projection = ProjectionEnum.Full;
        var __res = await __request.ExecuteAsync();
        return GCloudHelper.TreeGenerator(__res?.Items);//call this method from GCloudHelper.cs
    }
    
  2. Working with resource locally GCloudHelper.cs

using Newtonsoft.Json;
using System;
using System.Collections.Generic;
using System.Linq;
using System.Web;
using Object = Google.Apis.Storage.v1.Data.Object;
namespace MyGCloudHelper
{
     public class GCloudHelper
     {
        public static List<FolderTree> TreeGenerator(IEnumerable<Object> objects, string matchedPath = " ")
        {
            var __tree = new List<FolderTree>();
            var __items = objects.ToList();
            if (!__items.Any()) return new();

            var __matchedPath = matchedPath;
            var __toDoItems = matchedPath == " " ? __items.Where(x => !x.Name.Contains(__matchedPath)) : __items;
            foreach(var item in __toDoItems)
            {
                var __newName = item.Name.Replace($"{matchedPath}/", "");
                var __arr = __newName.Split('/');
                for (var i = 0; i < __arr.Length; i++)
                {
                    var __part = __arr[i];
                    var __pathArr = __arr[..(i+1)];

                    var __prefix = matchedPath == " " ? "" : $"{matchedPath}/";
                    var __currentPath = __prefix + string.Join('/', __pathArr);

                    //working with directory
                    if (!IsFile(__part))
                    {
                        __tree.Add(new()
                        {
                            level = i,
                            name = __part,
                            path = __currentPath
                        });
                        //check to find matched level from other items
                        var __children = __items.Where(cx => cx.Name != item.Name && cx.Name.Contains(__currentPath)).ToList();
                        if (__children.Any())
                        {
                            __matchedPath = __currentPath;
                            var __subs = GetSubs(__children, __currentPath);
                            __tree[i].subs = __subs;
                        }
                    }
                }

            }
            //get files for each dir
            __tree = GetFiles(__tree, __items);
            return SizeSetter(__tree);
        }

        private static List<FolderTree> SizeSetter(List<FolderTree> tree)
        {
            tree.ForEach(item => {
                var __size = item.files.Sum(f => (decimal)f.size);
                if (item.subs.Any()) item.subs = SizeSetter(item.subs);//recurse
                item.size = (ulong)(__size + item.subs.Sum(sx => (decimal)sx.size));
            });

            return tree;
        }

        private static List<FolderTree> GetFiles(List<FolderTree> dirs, List<Object> items)
        {
            foreach(var dir in dirs)
            {
                var __getFiles = items.Where(x => x.Name.Contains(dir.path));
                foreach (var f in __getFiles)
                {
                    var __parts = f.Name.Split(dir.path + "/");
                    var __fpart = __parts.LastOrDefault()?.Split('/').FirstOrDefault();
                    if (IsFile(__fpart))
                    {
                        dir.files.Add(new()
                        {
                            name = __fpart,
                            link = HttpUtility.UrlDecode(f.MediaLink),
                            size = f.Size,
                            type = f.ContentType,
                            path = $"{dir.path}/{__fpart}",
                            created =  f.TimeCreated,
                            modified = f.Updated
                        });
                    }
                }

                if (dir.subs.Any()) 
                    dir.subs = GetFiles(dir.subs, items);//recurse
            }

            return dirs;
        }

        private static List<FolderTree> GetSubs(List<Object> children, string parantDir)
        {
            var __tree = new List<FolderTree>();
            var __json = JsonConvert.SerializeObject(children);
            var __newList = JsonConvert.DeserializeObject<List<Object>>(__json);
            __newList.ForEach(x => x.Name = x.Name.Replace($"{parantDir}/", ""));

            var __tmpStore = new List<TmpStore>();
            for(var i = 0; i < __newList.Count(); i++)
            {
                var __name = __newList[i].Name.Split('/').FirstOrDefault();
                if (!IsFile(__name))
                    __tmpStore.Add(new() { Name = __name, item = children.FirstOrDefault(fx => fx.Id == __newList[i].Id) });
            }

            var __group = __tmpStore.GroupBy(x => x.Name);
            foreach (var group in __group)
            {
                var __tmp = group.FirstOrDefault();
                var __currentPath = $"{parantDir}/{__tmp.Name}";

                //find children
                var __children = children.Where(cx => cx.Name != __tmp.item.Name && cx.Name.Contains(__currentPath)).ToList();
                var __subs = new List<FolderTree>();
                if (__children.Any())
                    __subs = GetSubs(__children, __currentPath);//recurse

                __tree.Add(new()
                {
                    name = __tmp.Name,
                    path = __currentPath,
                    subs = __subs,
                });
            }

            return __tree;
        }


        private static bool IsFile(string part)
            => part.Split('.').Count() > 1;

    }//class

    internal class TmpStore
    {
        public string Name { get; set; }
        public Object item { get; set; }
    }

    public class FolderTree
    {
        public string name { get; set; }
        public int level { get; set; }
        public string path { get; set; }
        public ulong size { get; set; }
        public List<FolderTree> subs { get; set; } = new();
        public List<FileProp> files { get; set; } = new();
    }

    public class FileProp
    {
        public string name { get; set; }
        public string link { get; set; }
        public ulong? size { get; set; }
        public string type { get; set; }
        public string path { get; set; }
        public DateTime? created { get; set; }
        public DateTime? modified { get; set; }
    }

}

Result
This will produce nested directory tree with files and size
which subs is list of sub directories and files is list of files
=> Result might be not as what as your concept, but it might be helpful

[
    {
        "name": "myFolder",
        "level": 0,
        "path": "myFolder",
        "size": 304340,
        "subs": [
            {
                "name": "Resource",
                "level": 0,
                "path": "myFolder/Resource",
                "size": 304301,
                "subs": [
                    {
                        "name": "Image",
                        "level": 0,
                        "path": "myFolder/Resource/Image",
                        "size": 304301,
                        "subs": [
                            {
                                "name": "Logo",
                                "level": 0,
                                "path": "myFolder/Resource/Image/Logo",
                                "size": 304301,
                                "subs": [],
                                "files": [
                                    {
                                        "name": "fileImg01.png",
                                        "link": "https://storage.googleapis.com/download/storage/v1/b/myBucket/o/myFolder/Resource/Image/Logo/fileImg01.png",
                                        "size": 64436,
                                        "type": "image/jpeg",
                                        "path": "myFolder/Resource/Image/Logo/fileImg01.png",
                                        "created": "2022-05-01T11:13:27.727+07:00",
                                        "modified": "2022-05-01T11:13:27.727+07:00"
                                    },
                                    {
                                        "name": "fileImg02.png",
                                        "link": "https://storage.googleapis.com/download/storage/v1/b/myBucket/o/myFolder/Resource/Image/Logo/fileImg02.png",
                                        "size": 175429,
                                        "type": "image/jpeg",
                                        "path": "myFolder/Resource/Image/Logo/fileImg02.png",
                                        "created": "2022-05-01T11:06:35.58+07:00",
                                        "modified": "2022-05-01T11:06:35.58+07:00"
                                    },
                                    {
                                        "name": "fileImg03.png",
                                        "link": "https://storage.googleapis.com/download/storage/v1/b/myBucket/o/myFolder/Resource/Image/Logo/fileImg03.png",
                                        "size": 64436,
                                        "type": "image/jpeg",
                                        "path": "myFolder/Resource/Image/Logo/fileImg03.png",
                                        "created": "2022-05-01T11:18:42.365+07:00",
                                        "modified": "2022-05-01T11:18:42.365+07:00"
                                    }
                                ]
                            }
                        ],
                        "files": []
                    }
                ],
                "files": []
            }
        ],
        "files": [
            {
                "name": "README.MD",
                "link": "https://storage.googleapis.com/download/storage/v1/b/myBucket/o/myFolder/README.MD",
                "size": 39,
                "type": null,
                "path": "myFolder/README.MD",
                "created": "2022-05-01T11:04:57.565+07:00",
                "modified": "2022-05-01T11:04:57.565+07:00"
            }
        ]
    }
]

Note using C# .net5

sambath999
  • 153
  • 1
  • 7