0

Tried to delete duplicates but not working. the code imports json and convert to python. it reads fields and values from config file

delete duplicates based on the sort mentioned in the config file.

function to delete duplicates based on specified fields:

import json
import sys
import csv
from datetime import datetime, timedelta

# function to convert datetime format
def convert_datetime(dt, dt_type):
    if dt_type == 'd':
        return datetime.strptime(dt[:19], '%Y-%m-%dT%H:%M:%S').strftime('%Y-%m-%d')
    elif dt_type == 'fd':
        return datetime.strptime(dt[:23], '%Y-%m-%dT%H:%M:%S.%f').strftime('%Y-%m-%d-%H.%M.%S.%f')

# function to delete duplicates based on specified fields
def delete_duplicate(data, config):
    # sort data based on dupvalue field in descending order
    data.sort(key=lambda x: x[config['dupvalue']], reverse=True)
    
    unique_data = []
    unique_keys = set()
    for d in data:
        key = tuple(d[field] for field in config['fields'])
        if key not in unique_keys:
            unique_data.append(d)
            unique_keys.add(key)
    return unique_data

# read config file
with open(sys.argv[2]) as f:
    config = {}
    for line in f:
        field, value, dt = line.strip().split('|')
        config[field] = value
        config[field + '_dt'] = dt

# read data from json file
with open(sys.argv[1]) as f:
    data = json.load(f)

# extract fields from data
fields = config['fields'] = data[0].keys()

# convert datetime fields
for row in data:
    for field in fields:
        if field + '_dt' in config:
            row[field] = convert_datetime(row[field], config[field + '_dt'])

# delete duplicates
data = delete_duplicate(data, config)

# write data to pipe separated file
with open(sys.argv[3], 'w') as f:
    writer = csv.writer(f, delimiter='|')
    writer.writerow(fields)
    for row in data:
        writer.writerow([row.get(field, 'null') for field in fields])

# add header and tail record with count of data to file
with open(sys.argv[3], 'r') as f:
    lines = f.readlines()

with open(sys.argv[3], 'w') as f:
    f.write('HEADER|{}|{}|{}\n'.format(len(fields), len(data), datetime.now().strftime('%Y-%m-%d %H:%M:%S')))
    f.writelines(lines)
    f.write('TAIL|{}|{}|{}\n'.format(len(fields), len(data), datetime.now().strftime('%Y-%m-%d %H:%M:%S')))
James Z
  • 12,209
  • 10
  • 24
  • 44
  • Hi, welcome to StackOverflow. Please take the [tour](https://stackoverflow.com/tour) and learn [How to Ask](https://stackoverflow.com/help/how-to-ask). In order to get help, you will need to provide a [minimal reproducible example](https://stackoverflow.com/help/minimal-reproducible-example). If your question include a pandas dataframe, please provide a [reproducible pandas example](https://stackoverflow.com/questions/20109391/how-to-make-good-reproducible-pandas-examples) – alec_djinn Apr 20 '23 at 09:46
  • Maybe print content of `unique_keys` to see if you are really getting the keys in it? – Andrew Allaire Apr 20 '23 at 16:18

0 Answers0