Using the csv module and collections.Counter
Code
import csv
from collections import Counter
# Read two space-delimited "name count" files, add the counts of names that
# appear in both, and print the totals ordered by count (descending) and
# then by name (ascending).
with open('f1.txt', 'r') as f1, open('f2.txt', 'r') as f2:
    # The shown input has multiple spaces between fields, so let the reader
    # swallow the extra spaces that follow each delimiter.
    reader1 = csv.reader(f1, delimiter=' ', skipinitialspace=True)
    reader2 = csv.reader(f2, delimiter=' ', skipinitialspace=True)

    # Use dictionary comprehensions to build {name: count}, converting the
    # second value in each row to int.  csv.reader yields an empty list for
    # a blank line; skip those so row[0] cannot raise IndexError.
    d1 = {row[0]: int(row[1]) for row in reader1 if row}
    d2 = {row[0]: int(row[1]) for row in reader2 if row}

# Use Counter addition to sum the values of common keys.
# NOTE: Counter "+" keeps only entries whose resulting total is positive.
cnts = Counter(d1) + Counter(d2)

# Sort by value descending and alphabetical ascending
result = dict(sorted(cnts.items(), key=lambda kv: (-kv[1], kv[0])))

for k, v in result.items():
    print(k, v)
Test
File 1 (f1.txt)
eggs 25
beans 10
peas 30
oranges 15
File 2 (f2.txt)
eggs 15
pineapples 45
beans 35
peas 25
Output
peas 55
beans 45
pineapples 45
eggs 40
oranges 15
Update
Code updated based upon actual data example
Issues
- Posted code assumes multi-space-delimited input
- Actual data is tab delimited
- Many rows in the real data (from comment) are not properly formatted as two-column fields
- Created a function to go line by line through the data to retrieve only valid fields
- Used data link in comment for files 1 & 2
Code Update
from collections import Counter
def _is_int(text):
    '''Return True when int(text) succeeds, False when it raises ValueError.'''
    try:
        int(text)
    except ValueError:
        return False
    return True


def get_data(filenm):
    '''
    Read a two-column, tab-delimited UTF-8 text file line by line.

    A line is accepted when, after stripping trailing whitespace, it splits
    on tabs into exactly two fields and the second field can be converted
    with int().  Every other line is rejected.

    filenm  - path of the file to read

    Returns a tuple (valid, invalid):
    valid   - list of [first_field, second_field] string pairs
    invalid - list of (line_index, raw_line) tuples; line_index is
              0-based (it comes from enumerate) and raw_line keeps its
              trailing newline
    '''
    valid, invalid = [], []
    with open(filenm, 'r', encoding="utf8") as f:
        for i, line in enumerate(f):
            row = line.rstrip().split('\t')
            # Check the count here so callers can apply int() to every
            # valid row without hitting ValueError.
            if len(row) == 2 and _is_int(row[1]):
                valid.append(row)  # Valid row of data
            else:
                invalid.append((i, line))
    return valid, invalid
# Load both files, keeping the rejected lines for the report at the end.
valid1, invalid1 = get_data('agg1.txt')
valid2, invalid2 = get_data('agg2.txt')

# Convert valid rows to {key: count} dictionaries
d1 = {x[0]: int(x[1]) for x in valid1}
d2 = {x[0]: int(x[1]) for x in valid2}

# NOTE: Counter "+" keeps only entries whose resulting total is positive.
cnts = Counter(d1) + Counter(d2)

# Sort by value descending and alphabetical ascending
ranked = sorted(cnts.items(), key=lambda kv: (-kv[1], kv[0]))
result = dict(ranked)

# Show first 10 lines of results.  Slicing the sorted list prints exactly
# ten rows; the earlier "if i > 10: break" check let twelve through.
print('First 10 lines of results')
for k, v in ranked[:10]:
    print(k, v)

# Show invalid data (line number and line)
print()
print('Invalid file 1')
print(*invalid1, sep='\n')
print('Invalid file 2')
print(*invalid2, sep='\n')
Output Update
First 10 lines of results
。 6397586
を 4450628
《 2948712
》 2948688
「 2295146
」 2294570
… 1843528
だ 1530958
いる 841602
こと 761052
? 545826
する 458792
Invalid file 1
(5828, '\t\t\t946\n')
(24158, '133\n')
(24293, '132\n')
(30648, '87\n')
(37889, '58\n')
(46807, '37\n')
(51404, '\t\t\t30\n')
(53151, '27\n')
(54272, '26\n')
(54677, '25\n')
(55962, '24\n')
(57129, '23\n')
(70327, '13\n')
(71287, '12\n')
(73405, '11\n')
(76059, '10\n')
(76214, '10\n')
(82563, '8\n')
(83460, '8\n')
(85801, '7\n')
(88476, '6\n')
(88494, '6\n')
(94354, '5\n')
(94703, '5\n')
(97635, '4\n')
(110152, '3\n')
(110153, '3\n')
(110560, '3\n')
(111046, '3\n')
(117778, '2\n')
(117791, '2\n')
(117795, '\t\uf8f3\t2\n')
(117806, '2\n')
(118312, '2\n')
(119811, '2\n')
(119848, '2\n')
(134106, '1\n')
(134485, '1\n')
(134505, '1\n')
(136092, '1\n')
(136144, '1\n')
(136147, '1\n')
(139521, '1\n')
(139626, '1\n')
(139629, '1\n')
(139645, '1\n')
(139665, '1\n')
(139724, '1\n')
(139877, '1\n')
(139885, '1\n')
(139887, '1\n')
(139897, '1\n')
(139914, '1\n')
(139935, '1\n')
(139936, '1\n')
(139963, '1\n')
(139975, '1\n')
Invalid file 2
(5828, '\t\t\t946\n')
(24158, '133\n')
(24293, '132\n')
(30648, '87\n')
(37889, '58\n')
(46807, '37\n')
(51404, '\t\t\t30\n')
(53151, '27\n')
(54272, '26\n')
(54677, '25\n')
(55962, '24\n')
(57129, '23\n')
(70327, '13\n')
(71287, '12\n')
(73405, '11\n')
(76059, '10\n')
(76214, '10\n')
(82563, '8\n')
(83460, '8\n')
(85801, '7\n')
(88476, '6\n')
(88494, '6\n')
(94354, '5\n')
(94703, '5\n')
(97635, '4\n')
(110152, '3\n')
(110153, '3\n')
(110560, '3\n')
(111046, '3\n')
(117778, '2\n')
(117791, '2\n')
(117795, '\t\uf8f3\t2\n')
(117806, '2\n')
(118312, '2\n')
(119811, '2\n')
(119848, '2\n')
(134106, '1\n')
(134485, '1\n')
(134505, '1\n')
(136092, '1\n')
(136144, '1\n')
(136147, '1\n')
(139521, '1\n')
(139626, '1\n')
(139629, '1\n')
(139645, '1\n')
(139665, '1\n')
(139724, '1\n')
(139877, '1\n')
(139885, '1\n')
(139887, '1\n')
(139897, '1\n')
(139914, '1\n')
(139935, '1\n')
(139936, '1\n')
(139963, '1\n')
(139975, '1\n')