I am trying to use the Python package GEOparse to download microarray data. I have reached the step of merging probes with gene IDs: I want to replace each ID_REF with the corresponding ENTREZ_GENE_ID.
However, this is not working. Below is what I have so far; I am following this walkthrough: https://geoparse.readthedocs.io/en/latest/Analyse_hsa-miR-124a-3p_transfection_time-course.html
# Import tools
import GEOparse
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Fetch the three GEO series (cached in the same local directory), in order
gse1, gse2, gse3 = (
    GEOparse.get_GEO(geo=accession, destdir="C:/Users/Highf_000/PycharmProjects/TFTest")
    for accession in ("GSE99039", "GSE6613", "GSE72267")
)
# Read the GSM list for each GSE/platform pair: every line of the file becomes
# one list entry (assumed to be one GSM accession per line — these lists are
# later used to select sample columns).
with open("GSE99039_GPL570.csv") as f:
    GSE99039_GPL570 = f.read().splitlines()
with open("GSE6613_GPL96.csv") as f:
    GSE6613_GPL96 = f.read().splitlines()
with open("GSE72267_GPL571.csv") as f:
    GSE72267_GPL571 = f.read().splitlines()
# Store each series' OWN phenotype table on its `.gsm` attribute and show it.
# Looping over the series avoids the copy-paste slip of assigning
# gse1.phenotype_data to gse2 and gse3.
for series in (gse1, gse2, gse3):
    series.gsm = series.phenotype_data
    print(series.gsm)
# Look up the table of the platform (GPL) attached to each series
# (presumably the probe annotation table — verify against the GEOparse docs).
# NOTE(review): these are bare expressions; outside an interactive session
# (REPL/notebook) the value is discarded, so assign or print it if it is needed.
gse1.gpls['GPL570'].table
gse2.gpls['GPL96'].table
gse3.gpls['GPL571'].table
# gse1: pivot the 'VALUE' column across samples, then keep only the samples
# named in the GSE99039_GPL570 list
gse1_values = gse1.pivot_samples('VALUE')
pivoted_control_samples = gse1_values[GSE99039_GPL570]
print(pivoted_control_samples)
# gse1: filter out the least-expressed probes
# Row-wise median across the selected samples (one value per row of the table)
pivoted_control_samples_average = pivoted_control_samples.median(axis=1)
# Report how many probes there are before filtering
print("Number of probes before filtering: ", len(pivoted_control_samples_average))
# The cut-off is the 25th percentile of the per-probe medians
expression_threshold = pivoted_control_samples_average.quantile(0.25)
# Keep the probes whose median is at or above that cut-off
is_expressed = pivoted_control_samples_average >= expression_threshold
expressed_probes = pivoted_control_samples_average.loc[is_expressed].index.tolist()
# Report how many probes passed the cut-off
print("Number of probes above threshold: ", len(expressed_probes))
# confirm filtering worked: keep only the rows for the expressed probes.
# `.ix` was deprecated in pandas 0.20 and removed in pandas 1.0; `.loc` performs
# the same label-based row selection here.
samples = gse1.pivot_samples("VALUE").loc[expressed_probes]