6

Is it possible to create a Google apps script which would select automatically duplicate files inside a specific (current) folder (then I just had to delete them with one click) ?

https://developers.google.com/apps-script/reference/drive/

I am asking this because Google drive pc client create random duplicates files when updating them ! They want to solve this bug but I have to manually select them in each folders and then delete them. It is very annoying and time consuming. https://productforums.google.com/forum/#!category-topic/drive/report-a-problem/_ApOopI-Xkw

Edit: Alternatively, as windows cannot create the same filename inside the same folder, find and put in the trash all duplicate filenames inside a specific main folder (and its each of its related subfolders).

Kara
  • 6,115
  • 16
  • 50
  • 57
miodf
  • 524
  • 3
  • 9
  • 21
  • I see your edit, could you tell us what are the file types you want to detect? Did you also need a hint on the dup detection? – Serge insas Mar 30 '14 at 09:02
  • Sure: filetypes : for now ".jpg", ".mp4" and ".mov" files. We are writing at close the same time! Please see also my comment in your answer.;) – miodf Mar 30 '14 at 09:06

2 Answers2

7

For better readability I add the next part of this answer in a second answer.

Here is the code to handle the duplicates : it works in 2 steps :

  1. detect the duplicates and mark them in the sheet to check if everything is fine (markDuplicates())
  2. remove the red background on the files you want to preserve (they will become yellow after step 3 is done)
  3. delete the selected (marked) files, actually move them to trash, you'll have to empty the trash manually from the drive Ui. (trashDuplicates()) The deleted files will be colored in deep red to confirm deletion.

LAST EDIT :

Placed the full code here and made some functions private to avoid errors due to bad script calls.

function startProcess(){
  PropertiesService.getScriptProperties().deleteAllProperties();
  try{
    ScriptApp.deleteTrigger(ScriptApp.getProjectTriggers()[0]);
  }catch(e){}
  var sh = SpreadsheetApp.getActiveSheet();
  sh.getDataRange().clear();
  sh.getRange(1,1,1,4).setValues([['fileName (logged @'+Utilities.formatDate(new Date(),Session.getScriptTimeZone(),'MMM-dd-yyyy HH:mm')+')','fileSize','parent folders tree','fileID']]);
  var trig = ScriptApp.newTrigger('getDriveFilesList_').timeBased().everyMinutes(5).create();
  Logger.log(trig.getUniqueId()+'  '+trig.getHandlerFunction());
  getDriveFilesList_();
}

function getDriveFilesList_(){
  var content = [];
  var startTime = new Date().getTime();
  var sh = SpreadsheetApp.getActiveSheet();
  if( ! PropertiesService.getScriptProperties().getProperty('numberOfFiles')){
    PropertiesService.getScriptProperties().setProperty('numberOfFiles',0);
  }

  var numberOfFiles = Number(PropertiesService.getScriptProperties().getProperty('numberOfFiles'));
  Logger.log(numberOfFiles);
  var max = numberOfFiles+10000;
  if( ! PropertiesService.getScriptProperties().getProperty('continuationToken')){
    var files = DriveApp.getFiles();
   // var files = DriveApp.getFolderById('0B3qSFd_____MTFZMDQ').getFiles();// use this line and comment the above if you want to process a single folder
   // use your chozen folder ID of course (available from the browser url , the part after "https://drive.google.com/?authuser=0#folders/")
  }else{
    var files = DriveApp.continueFileIterator(PropertiesService.getScriptProperties().getProperty('continuationToken'))
    }
  while(files.hasNext() && numberOfFiles<(max)){
    var file = files.next()
    if(file.getSize()>0){
      numberOfFiles++;
      var folder = '(shared)';
      if(file.getParents().hasNext()){folder = getTree_(file)}
      content.push([file.getName(),file.getSize(),folder,file.getId()])
    }    
    if(new Date().getTime()-startTime > 250000){break};
  }
  sh.getRange(sh.getLastRow()+1,1,content.length,content[0].length).setValues(content);
  if(!files.hasNext()){ScriptApp.deleteTrigger(ScriptApp.getProjectTriggers()[0]);Logger.log('done !'); sh.getRange(sh.getLastRow()+1,1).setValue('All files processed ('+numberOfFiles+' found)')};
  var continuationToken = files.getContinuationToken()
  PropertiesService.getScriptProperties().setProperty('numberOfFiles',numberOfFiles);
  PropertiesService.getScriptProperties().setProperty('continuationToken',continuationToken);
}

function markDuplicates(){
  handleDuplicates_(false)
}

function trashDuplicates(){
  handleDuplicates_(true)
}

function handleDuplicates_(trash){  
  var sh = SpreadsheetApp.getActiveSheet();
  sh.setFrozenRows(1);
  sh.sort(1);
  var data = sh.getDataRange().getValues()
  var headers = data.shift()
  var lastComment = data.pop();
  var toDelete = [];
  var item = data[0];
  for(var n=1 ; n<data.length; n++){
    if(data[n][0]==item[0] && data[n][1]==item[1]){
      toDelete.push('delete '+n);
    }
    item=data[n];
  }
  var marker =   sh.getRange(2,1,data.length,1).getBackgrounds();
  for(var n in data){
    if(!trash){marker.push(['#FFF'])};
    if(toDelete.indexOf('delete '+n)>-1 && !trash){
      marker[n][0] = '#F99';
    }
    if(toDelete.indexOf('delete '+n)>-1 && trash){
      if(marker[n][0]=='#ff9999'){
        try{
        DriveApp.getFileById(data[n][3]).setTrashed(trash);
        marker[n][0] = '#F33';
        }catch(err){Logger.log(err)}
      }else{
        marker[n][0] = '#FF9';
      }
    }
  }
  sh.getRange(2,1,marker.length,1).setBackgrounds(marker);
}

function getTree_(file){
  var tree = [];
  var folderP = file.getParents()
  while (folderP.hasNext()){
    var folder = folderP.next();
    folderP = folder.getParents();
    tree.push(folder.getName());
  }
  return tree.reverse().join('/');
}
Serge insas
  • 45,904
  • 7
  • 105
  • 131
  • Thanks. Yesterday, I had the error "Service using too much computer time for one day" for "Summary of failures for Google Apps Script: find_duplicates". I have changed the notifications settings in the current project trigger to immediate. – miodf Mar 31 '14 at 06:38
  • Your code is working. For handleduplicate,I have about 15,000 rows in the spreadsheet. It is highlighting in red duplicate filenames that have the same size. ;) Would it be possible to add not only "parent folder" but also "parent folders" of "parent folder" ? Example: not just"2010" but "PHOTOS/2010". Thanks in advance ;) – miodf Mar 31 '14 at 06:42
  • Another example: with html files. http://hpics.li/66439ae Row 1 = row 3 (what I find surprising is that even the file id is the same!). Same thing for row 2 = row 4. – miodf Mar 31 '14 at 07:22
  • if you see the same id then it is not a duplicate but the very same file in 2 or more different folders... . In that case, you should not delete it. I guess the script should be tuned to allow selective delete depending on color in the sheet... what do you think ? As for the folder, it is of course doable to show the whole folder tree , there has been a few SO post on that subject, feel free to append some necessary code. – Serge insas Mar 31 '14 at 08:00
  • Alas it is really a duplicate ! If I try a search with the filename, you can see that there are 2 files in the same folder here: http://hpics.li/6f5103f . Then, if I go to the folder, there is only one file shown (see top right http://hpics.li/dbe8a05 ) ! I did not run your function trashDuplicates just handleDuplicates so it is not in the trash either : http://hpics.li/8dd4546 note: I have only one folder with the same name : http://hpics.li/2ff3db9 – miodf Mar 31 '14 at 08:47
  • I think I may know why I have only 1 of the 2 files shown in the folder : I have removed this folder from my pc yesterday ! And Google drive client for pc has not already removed them online. – miodf Mar 31 '14 at 09:40
  • Here is another better test with only one file I have created with "Fastone Capture" in my pc inside the folder "screenshots_me" located inside the Google Drive local PC folder. Google drive pc client created alone a duplicate online and locally on my pc! In my PC: it has appended it "(1).jpg" to the filename. And online it has kept the same filename ! Your duplicate spreadsheet shows 4 same filenames and size located in the same folder but with only 2 that have the same fileID http://hpics.li/0d0de03 If I do a search in my drive I have : http://hpics.li/7e64a6c (...) – miodf Mar 31 '14 at 09:41
  • (...) If I look inside the online folder (see top right) : http://hpics.li/cb58df1 My local pc folder view http://hpics.li/7fb2e52 (I manually created only one file 2014-03-29_095332.jpg) – miodf Mar 31 '14 at 09:42
  • Imho, I should choose at the beginning a specific folder like "PHOTOS". As it has a far smaller number of files (and contains also the biggest files sizes) than my other folders in drive, I won't have again, like yesterday and today, the error _"Service using too much computer time for one day"_ (i.e.: one hour of computer time in my case). Per your suggestion, I have changed `var files = DriveApp.getFiles();` with `var files = getFoldersByName('PHOTOS').getFiles();` alas it didn't work as I expected ! Thanks in advance ;) – miodf Mar 31 '14 at 10:30
  • Should be var files = DriveApp.getFoldersByName('PHOTOS').getFiles(); – Serge insas Mar 31 '14 at 10:46
  • Thanks. ;) But it didn't work either on my part ! I also tried unsucessfully `var folderToSearch = "PHOTOS";` with `var files = DriveApp.getFoldersByName(folderToSearch).getFiles();` Both list files inside other folders ! My PHOTO folder is located closer to the root "My Drive". Thanks in advance ;) – miodf Mar 31 '14 at 12:28
  • last version : I added a folder tree display and the ability to select which file to delete or not. – Serge insas Mar 31 '14 at 16:04
  • Many thanks. When I run the function getTree I have this error : TypeError: Cannot call method "getParents" of undefined. (line 90, file "Code"). Thanks in advance ;) – miodf Mar 31 '14 at 16:46
  • You didn't read my instructions : only 2 functions to run in this part : markDuplicate and trashDuplicates. The other are called from within the script, don't use them directly since they need parameters.In the first part use only startProcess. That said, I begin to think that you could accept this answer... don't you ? – Serge insas Mar 31 '14 at 17:32
  • Sorry I haven't seen that you had updated the text above the code ! My bad. ;( I would love to accept your answers because you are so much helpful. ;) But into one hour max of running triggers, the code doesn't go inside all my files. And for some reasons (time may be too late for me or I am just stupid !) but I didn't find a way to directly from the start select which folder (PHOTOS) I want to find duplicates in. Many thanks for your understanding. ;) – miodf Mar 31 '14 at 17:52
  • I guess your issues come from something wrong in execution on your side... the tests I made were positive on thousands of files. You should check that you don't have more the one trigger active and run the steps as described. – Serge insas Mar 31 '14 at 18:08
  • Since you seem really hard to convince, I edited the code with the ability to work in a single folder, you'll have to get its ID, take it from the browser url. – Serge insas Mar 31 '14 at 19:13
  • Wow! This is so great ! ;) Many thanks. ;) By following yours instructions, I was able to find and trash duplicates inside a folder pretty easily. ;) As you recommended, I changed `var files = DriveApp.getFiles(); // var files = DriveApp.getFolderById('0B3qSFd_____MTFZMDQ').getFiles();` with `//var files = DriveApp.getFiles(); var files = DriveApp.getFolderById('0B3qSFd_____MTFZMDQ').getFiles();`and changing the folder Id to the correct one. ;) – miodf Apr 01 '14 at 06:29
  • I have tried to find duplicates not only of all files located inside a folder but also all files located inside all its subfolders. Alas my test is not working : `var files = folder var folder = DriveApp.getFolderById('0Y-wxxxxEU'); while (folder.hasNext()) { var subfolder = folders.next(); subfolder.getFiles(); }` Thanks in advance ;) – miodf Apr 01 '14 at 09:45
  • edit : The error displayed is : _TypeError: Cannot find function hasNext in object (my folder real name here)_ – miodf Apr 01 '14 at 09:58
  • Use the global version and sort the sheet on folder name/tree, now that you use it correctly it should work without time issue. – Serge insas Apr 01 '14 at 10:00
  • Thanks. I have created another new spreadsheet and added your above code. Then run start process. After 5 minutes it has created about 500 rows in my spreadsheet. Then, nothing happens (no more yellow at the top of the script editor). If I go into current project triggers, I have this error : the selected function cannot be found http://hpics.li/b96b117 It is maybe because I had earlier today and yesterday the email titled _"Summary of failures for Google Apps Script:"_ : _"getDriveFilesList_ Service using too much computer time for one day"_ – miodf Apr 01 '14 at 10:16
  • Note: If I add an immediate notification in the trigger, I receive the same email(s) error. – miodf Apr 01 '14 at 10:18
  • Sorry, there is probably a typo in your code... I forgot to add the underscore in function name when creating the trigger in the first version... please update with the code in this post, or add the underscore in scriptApp.new trigger(createFilesList_) – Serge insas Apr 01 '14 at 15:45
  • In fact I have used your correct code above with the underscore see http://hpics.li/817c3eb and here is an example of email received : http://hpics.li/ca2dddc Thanks ;) I am also receiving these emails for other non related GAS triggers that I use. I think I'll wait tomorrow ;) – miodf Apr 01 '14 at 16:22
  • Hello Serge, I have tried again yesterday and got 50 times the same email errors (http://hpics.li/ca2dddc). Today, I have tried with the ss you kindly shared in google drive forum : https://docs.google.com/spreadsheets/d/1AnKTmPYxBb_lXPBUz7RgGhu5uugs6pptJJgDxqBqHbo/edit?usp=sharing I have about 10000 rows found then I still get those error emails. By luck could you provide a way to look for all files inside one main folder and all its subfolders ? Many thanks in advance ;) – miodf Apr 03 '14 at 13:57
  • this post is getting far too long, pls start a new one on this specific folder iteration. thx – Serge insas Apr 03 '14 at 14:17
  • Sorry to bring this back up. I have an issue with duplicate files all within a single folder. I set it to scan my folder and everything works. When I run the duplicates method, I see that the file, and it's duplicate both have the same size, name, creation date, modified date, AND same fileID. Can 2 files in the same folder have the same ID? I search for that file, I get results showing 2 files, but the modified dates are NOT the same (one is the same as in the sheet, the other is not). Looks like the script is showing the 'first' file's details twice and not the duplicates details. – Albert Feb 08 '17 at 06:37
0

This is not going to be simple... DriveApp is not particularly fast and one have usually many files in Drive so a script won't be able to iterate all the files in one single shot.

You will have to process your files in batch and set a time trigger to continue automatically until all the files are processed.

Here is a script that shows how to iterate in small 100 items batches (in real life you can process more than 100, I limit to a small number for demo purpose, just change the value to a more realistic value when you have a realistic idea of how long it takes to execute, see comments in code).

I left aside the duplicate detection because I wasn't sure if all the files had to be examined (Google docs have size=0 so it made me think you wanted to work only on other format files but I wasn't sure) but this part shouldn't be too hard to deal with.

Here is the iteration code, it shows the filenames and size in the logger, along with the global counter :

function searchDupInDrive(){
  if( ! PropertiesService.getScriptProperties().getProperty('numberOfFiles')){PropertiesService.getScriptProperties().setProperty('numberOfFiles',0)};
  var numberOfFiles = Number(PropertiesService.getScriptProperties().getProperty('numberOfFiles'));
  Logger.log(numberOfFiles);
  var max = numberOfFiles+100;// use an appropriate value here so that execution time remains < 5 minutes
  if(numberOfFiles >2000){Logger.log('done !');return}; // this is to limit the demo to a short total execution time
  if( ! PropertiesService.getScriptProperties().getProperty('continuationToken')){
    var files = DriveApp.getFiles();
  }else{
    var files = DriveApp.continueFileIterator(PropertiesService.getScriptProperties().getProperty('continuationToken'))
    }
  while(files.hasNext() && numberOfFiles<(max)){
    var file = files.next()
    Logger.log((numberOfFiles)+' : fileName = '+file.getName()+'  size = '+file.getSize())
    numberOfFiles++;

    // set conditions, store file Names and sizes to be able to compare

  }
  var continuationToken = files.getContinuationToken()
  PropertiesService.getScriptProperties().setProperty('numberOfFiles',numberOfFiles);
  PropertiesService.getScriptProperties().setProperty('continuationToken',continuationToken);
}

Note : to achieve duplication detection you will probably have to store the criteria, file names and sizes somewhere between 2 script runs, I guess script properties would be useable for that too.

Please update here when you have a final result.

Note 2 : I read the thread about this issue : quite horrifying ! I'm glad I'm on Mac OS, no problem like that (for now :-)


EDIT

Here is a second version. In the comments you told me that you added the script to a spreadsheet so it gave me the idea to store the file list in a spreadsheet so that, in a second step, we can sort the files and see /select the duplicates more easily.

By the way, I noticed that the driveApp was faster then I thought, not even sure we have to process in batch... nevertheless I limited the execution time to less than 5 minutes and implemented the script trigger creation / suppression just in case you have a lot of files...

Here is the new code, I still left the duplicate detection aside, you'll find very easily a lot of code examples to select the duplicates in a spreadsheet... this first step of getting all the files was the most non obvious.

Now there are 2 functions, one to start the process (clear the sheet, create the trigger, set a title to columns) and the actual drive reading one (It takes only the files that are NOT google docs, ie files with size>0):

function startProcess(){
  PropertiesService.getScriptProperties().deleteAllProperties();
  var sh = SpreadsheetApp.getActiveSheet();
  sh.getDataRange().clear();
  sh.getRange(1,1,1,4).setValues([['fileName (logged @'+Utilities.formatDate(new Date(),Session.getScriptTimeZone(),'MMM-dd-yyyy HH:mm')+')','fileSize','parent folder','fileID']]);
  var trig = ScriptApp.newTrigger('getDriveFilesList').timeBased().everyMinutes(5).create();
  Logger.log(trig.getUniqueId()+'  '+trig.getHandlerFunction());
  getDriveFilesList();
}

function getDriveFilesList(){
  var content = [];
  var startTime = new Date().getTime();
  var sh = SpreadsheetApp.getActiveSheet();
  if( ! PropertiesService.getScriptProperties().getProperty('numberOfFiles')){
    PropertiesService.getScriptProperties().setProperty('numberOfFiles',0);
  }

  var numberOfFiles = Number(PropertiesService.getScriptProperties().getProperty('numberOfFiles'));
  Logger.log(numberOfFiles);
  var max = numberOfFiles+10000;
  if( ! PropertiesService.getScriptProperties().getProperty('continuationToken')){
    var files = DriveApp.getFiles();
  }else{
    var files = DriveApp.continueFileIterator(PropertiesService.getScriptProperties().getProperty('continuationToken'))
    }
  while(files.hasNext() && numberOfFiles<(max)){
    var file = files.next()
    if(file.getSize()>0){
      numberOfFiles++;
      var folder = '(shared)';
      if(file.getParents().hasNext()){folder = file.getParents().next().getName()}
      content.push([file.getName(),file.getSize(),folder,file.getId()])
    }    
    if(new Date().getTime()-startTime > 250000){break};
  }
  sh.getRange(sh.getLastRow()+1,1,content.length,content[0].length).setValues(content);
  if(!files.hasNext()){ScriptApp.deleteTrigger(ScriptApp.getProjectTriggers()[0]);Logger.log('done !'); sh.getRange(sh.getLastRow()+1,1).setValue('All files processed ('+numberOfFiles+' found)')};
  var continuationToken = files.getContinuationToken()
  PropertiesService.getScriptProperties().setProperty('numberOfFiles',numberOfFiles);
  PropertiesService.getScriptProperties().setProperty('continuationToken',continuationToken);
}
Serge insas
  • 45,904
  • 7
  • 105
  • 131
  • Thanks. For : "I left aside the duplicate detection(...)" in my case I am uploading some folders of photos and videos (per year), so there are no google docs inside those folder. ;) Re: you "note", I am afraid I am just a too small beginner in GAS for doing that already. Anyway I will try your script right away.Thanks. ;) – miodf Mar 30 '14 at 09:03
  • Thanks. No problem. ;) Here my test results: I create a new google spreadsheet. Go to Tools/Script Editor. Copy/paste your code. Save it. Run script. Authorize permission. Then wait a few seconds and go to View/logs to see the results: it displays: 100 filenames with their size. ;) I was not able to detect for now if files are found in a particular order or folder. – miodf Mar 30 '14 at 09:40
  • Thanks for your second version: I run it and at the end of the Execution transcript I have : Execution failed: Cannot find method deleteTrigger((class)). (line 39, file "Code") [250.835 seconds total runtime]. It displays about 2500 rows in the spreadsheet (new version). Thanks in advance ;) – miodf Mar 30 '14 at 18:37
  • Thanks for the edit : I have tried to add a folder only : `var folder = 'PHOTOS';` instead of `var folder = 'root';` but it didn't work. – miodf Mar 30 '14 at 18:48
  • if you want to get only one folder I'd suggest to sort the files in the spreadsheet, that would be the easiest way... else you'd have to change var files = DriveApp.getFiles(); (line 23 in code) and replace driveApp by a folder object. (the var=(shared) is for display only) – Serge insas Mar 30 '14 at 19:34
  • The error you get is because you didnt call the right function : you have to use startProcess and in case of failure you have to clear the trigger and scriptProperties manually. – Serge insas Mar 30 '14 at 19:41
  • see next episode in answer #2 ;) – Serge insas Mar 30 '14 at 20:32