I have to create a script in Python, which allows me to replace strings in a json file. This file contains patent information, for example:
{
"US-8163793-B2": {
"publication_date": "20120424",
"priority_date": "20090420",
"family_id": "42261969",
"country_code": "US",
"ipc_code": "C07D417/14",
"cpc_code": "C07D471/04",
"assignee_name": "Hoffman-La Roche Inc.",
"title": "Proline derivatives",
"abstract": "The invention relates to a compound of formula (I) wherein A, R 1 -R 6 are as defined in the description and in the claims. The compound of formula (I) can be used as a medicament."
}
However, there are about 15,000 entries. To normalize this document before performing word embedding, I use software that adds tags to the terms it finds. The output looks like this:
"Row_1" : {
"COMPANY": [
{
"hitCount": 1,
"sourceTitle": "",
"sourceID": "",
"docTitle": "",
"docID": "Row_1",
"hitID": "COMP642",
"name": "Roche",
"frag_vector_array": [
"16#Hoffman-La {!Roche!} Inc."
],
"totnosyns": 1,
"goodSynCount": 1,
"nonambigsyns": 1,
"score": 1,
"hit_loc_vector": [
16
],
"word_pos_array": [
2
],
"exact_string": "16#90-95",
"exact_array": [
{
"fls": [
16,
90,
95
]
}
],
"entityType": "COMPANY",
"realSynList": [
"Roche"
],
"dictSynList": [
"roche"
],
"kvp": {
"entityType": "COMPANY"
},
"rejected": false,
"entityMeta": {
"_ext_name": "Wikipedia",
"_ext_uri": "http://en.wikipedia.org/wiki/Roche",
"_termite_id": "TCP000392"
},
"section_vector": [
8
],
"dependencyMet": true,
"fuzzyMatches": 0,
"sectionMeta": {
"8": "assignee_name|"
}
}
]
}
This output is also a json file and would be used as a dictionary.
What I need is to replace the terms "name"
, for example "Roche"
, with the "hitID"
, like "COMP642"
, every time that this term appears in the Patents file.
I am very, very new to Python, so any help or reading recommendations would be greatly appreciated.
Thank you!
EDIT
What I tried so far:
# Body of a parsing routine (the enclosing `def` is not shown here; the bare
# `return` at the end confirms this is a function body). It reads a JSON file of
# papers/patents, collects TERMITE entity annotations per document, normalizes the
# abstract text, and appends one summary dict per document to `termite_dict_list`.
# NOTE(review): indentation was lost in this paste; the nesting described in the
# comments below is inferred from the code's logic — confirm against the original.
# NOTE(review): `file`, `json`, `re`, the various `mesh_*` lookup dicts,
# `entity_type_encoder`, `normalize_abstract`, `abstract_to_words`, and
# `termite_dict_list` are all defined outside this fragment.
with open(file, "rb") as datafile:
# Decodes the raw bytes as UTF-8, then parses; `json.load(datafile)` on a text-mode
# handle would be the usual shortcut, but this is equivalent.
json_data = json.loads(datafile.read().decode("utf-8")) # type: object
# NOTE(review): if the top-level JSON is a dict keyed by patent ID (as in the
# question's sample), iterating yields string keys and `paper["docID"]` below
# will fail — this loop assumes a list of document dicts; confirm the input shape.
for paper in json_data:
termite_dict = dict()
# Accumulates one dict per replacement position found in this document's abstract.
termite_dict_all_per_pmid = list()
pmid = int(paper["docID"])
abstract = paper["abstract"]
# Per-document collectors, one per TERMITE entity type.
gene_list = list()
indication_mesh_list = list()
drug_list = list()
mirna_list = list()
company_list = list()
bioproc_list = list()
protype_list = list()
if "termiteTags" in paper:
for termite_tag in paper["termiteTags"]:
type_entry = termite_tag["entityType"]
termite_dict = dict()
# Canonical (normalized) name of the tagged entity, e.g. "Roche".
name = termite_tag["name"]
# "exact_string" holds comma-separated "<sentence>#<from>-<to>" position
# entries, one per hit (e.g. "16#90-95").
exact_tag_locations = termite_tag["exact_string"].split(",")
relevant_tag_locations = list()
words_to_replace = list()
# process and store termite annotations
if type_entry == "GENE":
gene_list.append({"Gene": termite_tag["hitID"]})
elif type_entry == "INDICATION":
info = termite_tag["entityMeta"]
# Prefer the MeSH tree numbers when present; silently skip entries
# missing from the lookup dicts.
if "mesh_tree" in info:
for e in list(filter(None, termite_tag["entityMeta"]["mesh_tree"].split(";"))):
try:
mesh_id = mesh_tree_nr_to_id_dict[e]
mesh_name = mesh_id_to_name_dict[mesh_id]
indication_mesh_list.append({"name": mesh_name, "id": mesh_id, "key": e})
except KeyError:
continue
# Fall back to extracting the MeSH id from the external URI
# ("...term=<id>").
elif "_ext_uri" in info:
url = termite_tag["entityMeta"]["_ext_uri"]
try:
mesh_id = url.split("term=")[1]
mesh_name = mesh_id_to_name_dict[mesh_id]
mesh_tree_nr = name_to_mesh_id_dict[mesh_name]
indication_mesh_list.append({"name": mesh_name, "id": mesh_id, "key": mesh_tree_nr})
except KeyError:
print("Issue with Mesh key indication")
elif type_entry == "DRUG":
drug_list.append(termite_tag["name"])
elif type_entry == "MIRNA":
mirna_list.append(termite_tag["hitID"])
elif type_entry == "COMPANY":
company_list.append(termite_tag["name"])
elif type_entry == "BIOPROC":
bioproc_list.append(termite_tag["name"])
elif type_entry == "PROTYP":
protype_list.append(termite_tag["name"])
# store info for positions with words to normalize in abstract text
for hit_number, hit in enumerate(termite_tag["frag_vector_array"]):
hit = hit.replace("\n", " ")
# The matched surface form is wrapped either in {!...!} or {*...*}
# inside the fragment string; `re.match` returns None on no match,
# so `.group(1)` raises AttributeError and we try the second form.
try:
match = re.match(r"^.*{!(.*)!}.*$", hit)
match_word = match.group(1)
except AttributeError:
try:
match = re.match(r"^.*{\*(.*)\*\}.*$", hit)
match_word = match.group(1)
except AttributeError:
# NOTE(review): when both patterns fail, only the fragment is
# printed — `match_word` is then unbound (NameError on the next
# line) or stale from a previous iteration; consider `continue`
# here instead.
print(hit)
# Only record hits whose surface form differs from the normalized name
# (those are the ones that need replacing).
if match_word.lower() != name.lower():
exact_locus = exact_tag_locations[hit_number]
# A leading "-" marks a position to ignore — presumably a rejected
# or out-of-range hit; confirm against the TERMITE docs.
if not exact_locus.startswith("-"):
# sentence 0 is paper title
if not exact_locus.startswith("0"):
relevant_tag_locations.append(exact_tag_locations[hit_number])
words_to_replace.append(match_word)
termite_dict["norm"] = name
termite_dict["replace"] = match_word
# "<sentence>#<from>-<to>" -> character offsets as ints.
fr, t = exact_locus.split("#")[1].split("-")
termite_dict["from"] = int(fr)
termite_dict["to"] = int(t)
termite_dict["len"] = int(t) - int(fr)
termite_dict["entityCode"] = entity_type_encoder[termite_tag["entityType"]]
termite_dict_all_per_pmid.append(termite_dict)
# Reset so the next hit starts from a fresh dict.
termite_dict = dict()
# abstract normalization and bag of words calculations
if len(termite_dict_all_per_pmid) > 0:
# Sort by start offset, then longest span first, then entity code, so
# overlapping replacements are applied deterministically.
sorted_termite_dict_all_per_pmid = sorted(termite_dict_all_per_pmid,
key=lambda k: (k['from'], -k["len"], k["entityCode"]))
normalized_abstract = normalize_abstract(sorted_termite_dict_all_per_pmid, abstract)
termite_dict["Norm_Abstract"] = normalized_abstract
cleaned_abstract_text = abstract_to_words(normalized_abstract)
# Deduplicate tokens; original word order is not preserved by set().
termite_dict["bag_of_words"] = list(set(cleaned_abstract_text))
termite_dict["docID"] = pmid
if "keywords" in paper:
keywords = [w.strip() for w in paper["keywords"].split(";")]
mesh_list = list()
for word in keywords:
# Title-case single lowercase words so they can match MeSH names.
if len(word.split(" ")) == 1 and len(word) > 0 and word[0].islower():
word = word.title()
if word in name_to_mesh_id_dict:
mesh_id = name_to_mesh_id_dict[word]
try:
mesh_list.append([word, mesh_id, mesh_id_to_tree_nr_dict[mesh_id]])
except KeyError:
# No tree number known for this id — keep the entry anyway.
mesh_list.append([word, mesh_id, ""])
termite_dict["MeshHeadings"] = mesh_list
# Only attach non-empty collections to keep the output dict sparse.
if len(gene_list) > 0:
termite_dict["Genes"] = gene_list
if len(indication_mesh_list) > 0:
termite_dict["Indications"] = indication_mesh_list
if len(drug_list) > 0:
termite_dict["Drug"] = drug_list
if len(mirna_list) > 0:
termite_dict["MIRNA"] = mirna_list
if len(company_list) > 0:
termite_dict["Company"] = company_list
if len(bioproc_list) > 0:
termite_dict["Bioproc"] = bioproc_list
if len(protype_list) > 0:
termite_dict["Protyp"] = protype_list
# add meta list to be able to query for gene and indication co-occurrence
meta_list = list()
if "Indications" in termite_dict:
meta_list.extend([indi["key"] for indi in termite_dict["Indications"]])
if "Genes" in termite_dict:
meta_list.extend([gene["Gene"] for gene in termite_dict["Genes"]])
if len(meta_list) > 0:
termite_dict["all_genes_indications"] = meta_list
# NOTE(review): `termite_dict_list` is never initialized in this fragment —
# presumably created before this excerpt begins; confirm.
termite_dict_list.append(termite_dict)
return termite_dict_list