I'm using jupyter notebook and trying to run this code:
import pubchempy as pcp
from rdkit import Chem
import numpy as np
import pandas as pd
import json
import time
import fuzzymatcher
start_time = time.time()
# CD = pd.read_excel(excelpathin)
data = pd.ExcelFile(r"C:\Users\USER\Downloads\MOD_REINJ_NEG_ChemSpider Results.xlsx")
data = pd.ExcelFile(excelpathin)
df = data.parse (sheet_name=0)
inKey = list()
for idx,sd in enumerate(df['Structure']):
print(idx)
F = open ("temp.sdf","w")
F.writelines (sd)
F.close()
suppl = Chem.SDMolSupplier ('temp.sdf')
mol = next (suppl)
if mol==None:
inKey.append (np.nan)
else:
inKey.append (Chem.MolToInchiKey (mol))
inKey = pd.DataFrame(inKey)
inKey.columns = ['InChIKey']
CD = pd.concat([df, inKey], axis=1, sort=False)
print("--- %s seconds --f-add 3 cols" % (time.time() - start_time))
# From here is the joindata function with modification
# Load the parse HMDB file
with open(jsonpathin, 'r') as read_file:
data = json.load(read_file)
start_time = time.time()
# Load the parse HMDB file
# with open('D:/BCDD/Documents/Tal/Projects/HMDB/DataSets/Parser_HMDB.py Output/serum_metabolites.json', 'r') as read_file:
# data = json.load(read_file)
# Create a data frame from the list of dictionaries
# df_hmdb = pd.DataFrame(data, columns=['accession', 'name', 'chemical_formula', 'inchikey', 'disease_name' ])
df_hmdb = pd.DataFrame(data)
df_hmdb.drop(['description', 'synonyms', 'kegg_id', 'meta_cyc_id', 'pathway_name'], axis=1)
df_excel = CD
# Merge by inchikey
joindata_by_inchikey = pd.merge(left=df_excel, right=df_hmdb, how='inner', left_on='InChIKey', right_on='inchikey')
print("--- %s seconds --f-merge by inchikey " % (time.time() - start_time))
start_time = time.time()
# Reduce the rows to those we DID find a match by inchkey in bothe data sets
df_hmdb_reduce_byinchik = df_hmdb.loc[~df_hmdb['inchikey'].isin(df_excel['InChIKey'])]
df_excel_reduce_byinchik = df_excel.loc[~df_excel['InChIKey'].isin(joindata_by_inchikey['InChIKey'])]
# joindata_by_name = fuzzymatcher.fuzzy_left_join(df_excel, df_hmdb, left_on="Name", right_on="name")
joindata_by_name = fuzzymatcher.fuzzy_left_join(df_excel_reduce_byinchik, df_hmdb_reduce_byinchik, left_on="Name",
right_on="name")
# Selecting threshold best_match_score>0.25 maybe adjustments needed
joindata_by_name = joindata_by_name[joindata_by_name['best_match_score'] > 0.55]
# Drop columns the
joindata_by_name.drop(['best_match_score', '__id_left', '__id_right'], axis=1, inplace=True)
print("--- %s seconds --f-merge by name" % (time.time() - start_time))
start_time = time.time()
# Reduce the rows to those we DID find a match by inchkey in and by name both data sets
df_hmdb_reduce_byname = df_hmdb_reduce_byinchik.loc[~df_hmdb_reduce_byinchik['name'].isin(joindata_by_name['name'])]
df_excel_reduce_byname = df_excel_reduce_byinchik.loc[
~df_excel_reduce_byinchik['Name'].isin(joindata_by_name['Name'])]
# Remove spaces between letters on 'Formula' ( there is a warning)
df_excel_reduce_byname.loc[:, 'Formula'] = df_excel_reduce_byname['Formula'].str.replace(' ', '')
# Merge by chemical_formula
joindata_by_CF = pd.merge(left=df_excel_reduce_byname, right=df_hmdb_reduce_byname, how='inner', left_on='Formula',
right_on='chemical_formula')
# This data inculed rows from the original EXCEL file that we did NOT find and match ( by inchikey nor name nor CF)
df_excel_reduce_byCF = df_excel_reduce_byname.loc[
~df_excel_reduce_byname['Formula'].isin(joindata_by_CF['chemical_formula'])]
# Create a list of all columns of the HMDB JSON data
colnames = joindata_by_inchikey.columns[6:]
# Add those names as empty columns to the df_excel_reduce_byCF. reducedata in all the rows from the original Excel
# that did NOT find a match and added the columns of the HMDB
reducedata = df_excel_reduce_byCF.reindex(columns=[*df_excel_reduce_byCF.columns.tolist(), *colnames])
# Append all the data sets
# out = joindata_by_inchikey.append(joindata_by_name.append(joindata_by_CF))
out = joindata_by_inchikey.append(joindata_by_name.append(joindata_by_CF.append(reducedata)))
print("--- %s seconds --f-merge by CF" % (time.time() - start_time))
(I'm not sure the code is relevant since I load files that are not shared in quesion)
This is the error I get:
OperationalError Traceback (most recent call last)
<ipython-input-8-26c4a723b687> in <module>
23 # joindata_by_name = fuzzymatcher.fuzzy_left_join(df_excel, df_hmdb, left_on="Name", right_on="name")
24 joindata_by_name = fuzzymatcher.fuzzy_left_join(df_excel_reduce_byinchik, df_hmdb_reduce_byinchik, left_on="Name",
---> 25 right_on="name")
26
27 # Selecting threshold best_match_score>0.25 maybe adjustments needed
~\Anaconda3\lib\site-packages\fuzzymatcher\__init__.py in fuzzy_left_join(df_left, df_right, left_on, right_on, left_id_col, right_id_col)
39 m = Matcher(dp, dg, s)
40 m.add_data(df_left, df_right, left_on, right_on, left_id_col, right_id_col)
---> 41 m.match_all()
42
43 return m.get_left_join_table()
~\Anaconda3\lib\site-packages\fuzzymatcher\matcher.py in match_all(self)
87 self.scorer.add_data(self)
88
---> 89 self.data_getter.add_data(self)
90
91 # Get a table that contains only the matches, scores and ids
~\Anaconda3\lib\site-packages\fuzzymatcher\data_getter_sqlite.py in add_data(self, matcher)
58 USING fts4({} TEXT, _concat_all TEXT, _concat_all_alternatives TEXT);
59 """.format(matcher.right_id_col)
---> 60 con.execute(sql)
61 con.execute("INSERT INTO fts_target SELECT * FROM df_right_processed")
62
OperationalError: no such module: fts4
The line with error of fuzzymatcher used to work using Pycharme on different computer but on jupyter notebook it does not.
I checked this for answers :
But no use and I do not understand what to do
any help idea hint thoughts are fappreciated