I'm trying to summarize how much data has been written to a folder in my Data Lake. What is the best way to do this? Should I use a U-SQL job? HDInsight?
3 Answers
There are two ways to do this:
- If it is a one-time operation, you can use Azure Storage Explorer (https://azure.microsoft.com/en-us/features/storage-explorer/), navigate to the Data Lake Store folder and get the size for it.
- If you want a programmatic way to do this, Data Lake Store provides a WebHDFS-compliant API whose GETCONTENTSUMMARY operation returns folder statistics. You can see more details here: https://learn.microsoft.com/en-us/rest/api/datalakestore/webhdfs-filesystem-apis.
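If you want to script option 2, a minimal sketch of the GETCONTENTSUMMARY call against a Gen1 account could look like this; the account name, folder path, and bearer token below are placeholders to replace, and the response shape follows the standard WebHDFS ContentSummary contract:

import requests

# Placeholder values -- replace with your own Gen1 account, folder and AAD token.
account = "myadlsaccount"          # ADLS Gen1 account name (example only)
folder = "/raw/sales/2020"         # folder to summarize (example only)
token = "<AAD-bearer-token>"       # OAuth 2.0 access token for the account

# GETCONTENTSUMMARY is a standard WebHDFS operation, exposed by ADLS Gen1
# under the webhdfs/v1 path of the account endpoint.
url = (f"https://{account}.azuredatalakestore.net"
       f"/webhdfs/v1{folder}?op=GETCONTENTSUMMARY")
resp = requests.get(url, headers={"Authorization": f"Bearer {token}"})
resp.raise_for_status()

summary = resp.json()["ContentSummary"]
print("Bytes:", summary["length"])
print("Files:", summary["fileCount"], "Directories:", summary["directoryCount"])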
Hope this helps
José

- Hi Jose, would you know how to do this with ADLS Gen2? – 2713 Jul 03 '20 at 02:31
- For option 2, note that it works only with Gen1 and not Gen2, as per this: https://stackoverflow.com/questions/58316970/does-azure-data-lake-gen2-provides-webhdfs-rest-apis – Jatin Feb 04 '22 at 03:58
For ADLS Gen2, you can use Python code to loop through the files and sum their sizes. Refer here: https://cloudarchitected.com/2019/05/computing-total-storage-size-of-a-folder-in-azure-data-lake-storage-gen2/
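As a rough sketch of that approach (assuming the azure-storage-file-datalake and azure-identity packages; the account URL, file system, and folder names below are placeholders), you could sum content_length over every non-directory path returned by get_paths:

from azure.identity import DefaultAzureCredential
from azure.storage.filedatalake import DataLakeServiceClient

# Placeholder names -- replace with your own account, file system and folder.
account_url = "https://mystorageaccount.dfs.core.windows.net"
file_system = "my-filesystem"
folder = "raw/sales/2020"

service = DataLakeServiceClient(account_url, credential=DefaultAzureCredential())
fs_client = service.get_file_system_client(file_system)

# get_paths(recursive=True) lists every path under the folder; summing
# content_length over the non-directory entries gives the total size in bytes.
total_bytes = sum(
    p.content_length or 0
    for p in fs_client.get_paths(path=folder, recursive=True)
    if not p.is_directory
)
print(f"{folder}: {total_bytes} bytes ({total_bytes / 1024 ** 3:.2f} GiB)")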
In case you would like to quickly cross-check this:
- Download Azure Storage Explorer for Windows: https://azure.microsoft.com/en-in/features/storage-explorer/
- Open the folder whose size details you would like to view.
- On the top menu bar, choose More -> Folder Statistics to get the details of the directory, including its size in bytes (see the attached screenshot of the Azure Storage Explorer menu).

Below is a script that collects folder/file size statistics (it uses the Gen1 azure.datalake.store SDK). Please verify all variables and values against your environment before running it.
import csv, datetime, configparser
from azure.datalake.store import core, lib

# Returns the size of each subdirectory, formatted as rows for the CSV report
def getUsage(adls_client, data, level):
    temp = []
    # Split each path by '/' and store the components in a list
    for i in data:
        temp.append(i.split('/'))
    # Prepare pathList by removing the filenames
    pathList = []
    for i in temp:
        # Ensure the depth of the path does not exceed level
        path = []
        if len(i) - 1 >= level:
            maxDepth = level
        else:
            maxDepth = len(i) - 1
        for j in range(maxDepth):
            # Skip duplicate components and '_SUCCESS' markers
            if i[j] not in path and i[j] != '_SUCCESS':
                path.append(i[j])
        pathList.append(path)
    # Remove duplicates
    uniquePaths = set(tuple(x) for x in pathList)
    pathsPreparedDU = list("/".join(x) for x in uniquePaths)
    # Get usage for the directories from the prepared paths
    answers = []
    for i in pathsPreparedDU:
        temp = []
        temp2 = ""
        blankLevelCnt = 0
        usage = adls_client.du(i, deep=True, total=True)
        temp.append(i.split('/'))
        for item in temp:
            if len(item) < level + 1:
                blankLevelCnt = (level + 1) - len(item)
        # Build a 'level1/level2/.../bytes' string, padding shallow paths with '/'
        temp2 = temp2 + i
        for j in range(blankLevelCnt):
            temp2 = temp2 + "/"
        temp2 = temp2 + str(usage)
        answers.append([temp2])
    # Add the CSV header row
    csvList = []
    temp = []
    temp.append("Date/Time")
    for i in range(level):
        temp.append("Level " + str(i + 1))
    temp.append("Size (Bytes)")
    temp.append("Size (KB)")
    temp.append("Size (MB)")
    temp.append("Size (GB)")
    temp.append("Size (TB)")
    csvList.append(temp)
    now = datetime.datetime.now()
    for i in answers:
        usageBytes = int(i[0].split('/')[-1])
        usageKBytes = round(usageBytes / 1024, 2)
        usageMBytes = round(usageKBytes / 1024, 2)
        usageGBytes = round(usageMBytes / 1024, 2)
        usageTBytes = round(usageGBytes / 1024, 2)
        csvList.append((str(now)[:16] + "/" + i[0] + "/" + str(usageKBytes) + "/" + str(usageMBytes) + "/" + str(usageGBytes) + "/" + str(usageTBytes)).split('/'))
    return csvList

# Returns the adls_client object
def connectADLS(tenant_id, app_id, app_key, adls_name):
    adls_credentials = lib.auth(tenant_id=tenant_id, client_secret=app_key, client_id=app_id)
    return core.AzureDLFileSystem(adls_credentials, store_name=adls_name)

# Returns all file paths under the root directory (recursive walk)
def getSubdirectories(adls_client, root_dir):
    return adls_client.walk(root_dir)

# Write the report to CSV
def writeCSV(root_dir, csvList):
    fileprefixes = root_dir.split('/')
    prefix = "root-"
    while '' in fileprefixes:
        fileprefixes.remove('')
    if len(fileprefixes) > 0:
        prefix = ""
        for i in fileprefixes:
            prefix = prefix + i + "-"
    x = datetime.datetime.today().strftime('%Y-%m-%d')
    filename = prefix + x + ".csv"
    with open(filename, "w+") as csvFile:
        writer = csv.writer(csvFile, lineterminator='\n')
        writer.writerows(csvList)
    print("File generated")
    # Publish the filename as an Azure DevOps pipeline variable
    print('##vso[task.setvariable variable=filename;]%s' % (filename))

if __name__ == "__main__":
    # 1. Parse the config file and get service principal details
    config = configparser.ConfigParser()
    config.read('config.ini')
    tenant_id = config['SERVICE_PRINCIPAL']['tenant_id']
    app_id = config['SERVICE_PRINCIPAL']['app_id']
    app_key = config['SERVICE_PRINCIPAL']['app_key']
    adls_name = config['ADLS_ACCT']['adls_name']
    root_dir = config['ADLS_ACCT']['root_dir']
    level = config['ADLS_ACCT']['level']
    # 2. Connect to ADLS
    adls_client = connectADLS(tenant_id, app_id, app_key, adls_name)
    # 3. Recursively list all files under the root directory
    data = getSubdirectories(adls_client, root_dir)
    # 4. Get usage for the directories
    csvList = getUsage(adls_client, data, int(level))
    # 5. Write the report to CSV
    writeCSV(root_dir, csvList)
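For reference, the script reads a config.ini next to it with the sections and keys shown below; the values here are placeholders for your environment:

[SERVICE_PRINCIPAL]
tenant_id = <your-tenant-guid>
app_id = <service-principal-application-id>
app_key = <service-principal-secret>

[ADLS_ACCT]
adls_name = <data-lake-store-account-name>
root_dir = /raw/sales
level = 3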
