def searchSimilarDocumentsByPhrases(corpus, Ids, contractIds,count,phrases=None):
tfidf = TfidfVectorizer(vocabulary = phrases, ngram_range=(1, 6))
tfs = tfidf.fit_transform(corpus)
feature_names = tfidf.get_feature_names_out()
rows, cols = tfs.nonzero()
phrase_counts = defaultdict(list)
for row, col in zip(rows, cols):
phraseCount = corpus[row].count(feature_names[col])
phrase_counts[feature_names[col]].append({Ids[row]:{contractIds[row]: phraseCount}})
counter=count
phraselist=[]
for phrase, counts in phrase_counts.items():
counts.sort(key=lambda x: list(x.items()), reverse=True)
counts=counts[:counter]
phraselist.append(phrase)
phraselist.append(counts)
return phraselist
Input provided by me is
{ "phrases":
[
"test and evaluation"
],
"count":"16"
}
I am getting Output as
[ "test and evaluation", [ { "1180": { "LMLB_C-157": 4 } }, { "1179": { "LMLB_C-156": 1 } }, { "1156": { "LMLB_C-135": 1 } }, { "1146": { "LMLB_C-125": 2 } }, { "1103": { "LMLB_C-82": 3 } }, { "1099": { "LMLB_C-78": 1 } }, { "1089": { "LMLB_C-68": 2 } }, { "1088": { "LMLB_C-67": 1 } }, { "1087": { "LMLB_C-66": 1 } }, { "1084": { "LMLB_C-63": 1 } }, { "1080": { "LMLB_C-59": 21 } }, { "1078": { "LMLB_C-57": 1 } }, { "1076": { "LMLB_C-55": 2 } }, { "1071": { "LMLB_C-50": 2 } }, { "64": { "LMLB_C-41": 2 } }, { "24": { "LMLB_C-2": 1 } } ],
While I expect the Output to be like below Descending order of the values
[
"test and evaluation",
[
{
"1080": {
"LMLB_C-59": 21
}
},
{
"1180": {
"LMLB_C-157": 4
}
},
{
"1103": {
"LMLB_C-82": 3
}
},
{
"1076": {
"LMLB_C-55": 2
}
},
{
"1071": {
"LMLB_C-50": 2
}
},
{
"1089": {
"LMLB_C-68": 2
}
},
{
"1146": {
"LMLB_C-125": 2
}
},
{
"64": {
"LMLB_C-41": 2
}
},
{
"24": {
"LMLB_C-2": 1
}
},
{
"1088": {
"LMLB_C-67": 1
}
},
{
"1078": {
"LMLB_C-57": 1
}
},
{
"1156": {
"LMLB_C-135": 1
}
},
{
"1099": {
"LMLB_C-78": 1
}
},
{
"1087": {
"LMLB_C-66": 1
}
},
{
"1084": {
"LMLB_C-63": 1
}
},
{
"1179": {
"LMLB_C-156": 1
}
},
]
P.S. I have edited the Input and Output