0

I have the following files in a folder:

"ABC"
"ABC 10"
"ABC 22"
"ABC 30"
"ABC L1"
"ABC L2"
"ABC 10 L1"
"ABC 10 L2"
"ABC 22 L1"
"ABC 22 L2"
"ABC 30 L1"
"ABC 30 L2"
"PQR"
"PQR 10"
"PQR 22"
"PQR 30"
"PQR X3"
"PQR X4"
"PQR 10 X3"
"PQR 10 X4"
"PQR 22 X3"
"PQR 22 X4"
"PQR 30 X3"
"PQR 30 X4"

Now I need the list of unique file names in this folder with certain indexes removed (in this example 10, 22 and 30). That is to say, my output list should be:

['ABC', 'ABC L1', 'ABC L2', 'PQR', 'PQR X3', 'PQR X4' ]

A MWE is given below:

import os
import random
import errno
import itertools
from itertools import repeat
import re

#--------------------------------------
# Create random folders and files

# tzot's forced directory create hack https://stackoverflow.com/a/600612/4576447
def mkdir_p(path):
    """Create directory *path* (including parents), like ``mkdir -p``.

    An OSError is re-raised unless it reports EEXIST and *path* is an
    existing directory.
    """
    try:
        os.makedirs(path)
    except OSError as err:
        already_a_directory = err.errno == errno.EEXIST and os.path.isdir(path)
        if not already_a_directory:
            raise

# Make sure the top-level folder exists, then add four randomly numbered
# sub-folders (a repeated random number simply reuses the same folder).
mkdir_p('./input_folder')
for _ in range(4):
    mkdir_p('./input_folder/folder_ABC_' + str(random.randint(100, 399)))

# Name templates for the empty sample files; '%d' receives a random digit 0-3.
file_templates = (
    'ABC 10 L%d.dat',
    'ABC 22 L%d.dat',
    'ABC 30 L%d.dat',
    'PQR 10 X%d.dat',
    'PQR 22 X%d.dat',  # the original wrote ' .dat' here (stray space)
    'PQR 30 X%d.dat',
    'ABC %d.dat',
    'PQR %d.dat',
)

for root, dirs, _files in os.walk('./input_folder'):
    for folder in dirs:  # 'folder' rather than 'dir', which shadows the builtin
        for _ in range(4):
            for template in file_templates:
                file_name = template % random.randint(0, 3)
                # Only the (empty) file has to exist; the with-block closes
                # the handle immediately instead of leaking it.
                with open(os.path.join(root, folder, file_name), 'w'):
                    pass
#--------------------------------------
# Main rename code


remove = [10, 22, 30]

# Visit every sub-folder below ./input_folder, print its name and collect
# the names of the '.dat' files it contains.
for parent, subfolders, _ in os.walk('./input_folder'):
    for subfolder in subfolders:
        print(subfolder)
        folder_path = os.path.join(parent, subfolder)
        output_files = [name for name in os.listdir(folder_path) if name.endswith('.dat')]

How can I find the unique files after removing the files that have values in a particular list ('remove' in this example)?

Tom Kurushingal
  • 6,086
  • 20
  • 54
  • 86
  • So is the condition to always remove the middle number? If that's the case, you don't need regex, just split the filename by space and check if the count is 3, if it is, remove the 2nd element from the list and join it again to create a new filename – Boy Jan 24 '19 at 06:23

3 Answers

0

This is one approach using re and a list comprehension.

Ex:

import re

output_files = ['ABC', 'ABC 10', 'ABC 22', 'ABC 30', 'ABC L1', 'ABC L2', 'ABC 10 L1', 'ABC 10 L2', 'ABC 22 L1', 'ABC 22 L2', 'ABC 30 L1', 'ABC 30 L2', 'PQR', 'PQR 10', 'PQR 22', 'PQR 30', 'PQR X3', 'PQR X4', 'PQR 10 X3', 'PQR 10 X4', 'PQR 22 X3', 'PQR 22 X4', 'PQR 30 X3', 'PQR 30 X4']
remove = ["10", "22", "30"]

# Match the indexes only as whole words, so that names such as 'ABC 100' or
# 'ABC L10' are not dropped by accident; re.escape keeps any regex
# metacharacters in an index literal.
pat = re.compile(r"\b(?:" + "|".join(map(re.escape, remove)) + r")\b")

# Keep only the names that contain none of the indexes.
print([name for name in output_files if not pat.search(name)])

Output:

['ABC', 'ABC L1', 'ABC L2', 'PQR', 'PQR X3', 'PQR X4']
Rakesh
  • 81,458
  • 17
  • 76
  • 113
0

You could use a regex approach like so

\s+(?:[13]0|22)

Replace each match with an empty string (''), then de-duplicate the resulting names; see a demo on regex101.com.

Jan
  • 42,290
  • 8
  • 54
  • 79
0
import re

# Group 1: three upper-case letters (the prefix).
# Group 2 (optional): an upper-case letter followed by a digit (the suffix),
# reached after whitespace and an optional run of digits that is not captured.
regex = re.compile(r'([A-Z]{3})(?:\s+(?:\d+\s+)?([A-Z]\d))?')

files = ['ABC', 'ABC 10', 'ABC 22', 'ABC 30', 'ABC L1', 'ABC L2', 'ABC 10 L1', 'ABC 10 L2', 'ABC 22 L1', 'ABC 22 L2', 'ABC 30 L1', 'ABC 30 L2', 'PQR', 'PQR 10', 'PQR 22', 'PQR 30', 'PQR X3', 'PQR X4', 'PQR 10 X3', 'PQR 10 X4', 'PQR 22 X3', 'PQR 22 X4', 'PQR 30 X3', 'PQR 30 X4']


def strip_index(name):
    """Return the prefix and, if present, the suffix of *name* joined by a space.

    Names that the pattern does not match are returned unchanged instead of
    raising IndexError.
    """
    match = regex.search(name)
    if match is None:
        return name
    # Drop the groups that did not take part in the match.
    return ' '.join(group for group in match.groups() if group)


result = [strip_index(item) for item in files]

print(result)

# output:
# ['ABC', 'ABC', 'ABC', 'ABC', 'ABC L1', 'ABC L2', 'ABC L1', 'ABC L2', 'ABC L1', 'ABC L2', 'ABC L1', 'ABC L2', 'PQR', 'PQR', 'PQR', 'PQR', 'PQR X3', 'PQR X4', 'PQR X3', 'PQR X4', 'PQR X3', 'PQR X4', 'PQR X3', 'PQR X4']

# dedupe:
result = sorted(set(result))
print(result)
# output:
# ['ABC', 'ABC L1', 'ABC L2', 'PQR', 'PQR X3', 'PQR X4']
dopstar
  • 1,478
  • 10
  • 20