I want to create a `control_df` DataFrame containing the rows of `df`
that satisfy at least one of the following conditions:
(i) "mrna_assignment" column contains substring "control //"
(ii) "probeset_type" column contains substring "control"
import pandas as pd
import numpy as np
# Use the first row of the frame as the column headers.
df.columns = df.iloc[0]
# Drop that header row from the body and use "Gene Symbol" as the row index.
df = df[1:].set_index("Gene Symbol")
# sort_index() returns a new DataFrame rather than sorting in place, so the
# result must be assigned back for the sort to take effect.
df = df.sort_index()
df
## Rows are "control" if:
## (i) "mrna_assignment" column contains substring "control //"; OR
## (ii) "category" column contains substring "control->"
## NOTE(review): the problem statement names "probeset_type" / "control" for
## condition (ii), but the code filters on "category" / "control->" -- confirm
## which column and substring are intended.
# Build one boolean mask per condition and combine them element-wise with `|`.
# Python's `or` would call bool() on a whole DataFrame and raise
# "The truth value of a DataFrame is ambiguous".
# na=False treats missing values as non-matches so the mask stays strictly
# boolean; regex=False matches the substrings literally.
is_control_mrna = df["mrna_assignment"].str.contains(
    "control //", regex=False, na=False
)
is_control_category = df["category"].str.contains(
    "control->", regex=False, na=False
)
control_df = df[is_control_mrna | is_control_category]
control_df
Traceback:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-167-516e7c765379> in <module>()
3 ## (i) "mrna_assignment" column contains substring "control //"; OR
4 ## (ii) "probeset_type" column contains substring "control"
----> 5 control_df = df[df["mrna_assignment"].str.contains("control //")] or df[df["category"].str.contains("control->")]
6 control_df
/usr/local/lib/python3.7/dist-packages/pandas/core/generic.py in __nonzero__(self)
1536 def __nonzero__(self):
1537 raise ValueError(
-> 1538 f"The truth value of a {type(self).__name__} is ambiguous. "
1539 "Use a.empty, a.bool(), a.item(), a.any() or a.all()."
1540 )
ValueError: The truth value of a DataFrame is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().
Sample data:
df.iloc[0:50,0:50].to_dict()
{'Cytoband': {'---': '---'},
'Entrez Gene': {'---': '---'},
'GO Biological Process': {'---': nan},
'GO Cellular Component': {'---': nan},
'GO Molecular Function': {'---': nan},
'GO_biological_process': {'---': '---'},
'GO_cellular_component': {'---': '---'},
'GO_molecular_function': {'---': '---'},
'Gene Title': {'---': '---'},
'Pathway': {'---': nan},
'Probe ID': {'---': '7892552'},
'Protein Domains': {'---': nan},
'Swiss-Prot': {'---': '---'},
'UniGene': {'---': nan},
'category': {'---': 'normgene->intron'},
'crosshyb_type': {'---': '---'},
'gene_assignment': {'---': '---'},
'mrna_assignment': {'---': '--- // --- // neg_control // --- // --- // --- // --- // --- // ---'},
nan: {'---': nan},
nan: {'---': 6.89},
nan: {'---': 5.64},
nan: {'---': 6.31},
nan: {'---': 6.24},
nan: {'---': 6.17},
nan: {'---': 5.8},
nan: {'---': 5.94},
nan: {'---': 5.95},
nan: {'---': 5.66},
nan: {'---': 6.38},
nan: {'---': 4.84},
nan: {'---': 5.78},
nan: {'---': 6.5},
nan: {'---': 7.21},
nan: {'---': 6.03},
nan: {'---': 6.74},
nan: {'---': 6.3},
nan: {'---': 6.05},
nan: {'---': 6.35},
nan: {'---': 6.74},
nan: {'---': 6.6},
nan: {'---': 6.84},
'pathway': {'---': '---'},
'protein_domains': {'---': '---'},
'seqname': {'---': '---'},
'start': {'---': '---'},
'stop': {'---': '---'},
'strand': {'---': '---'},
'swissprot': {'---': '---'},
'total_probes': {'---': '4'},
'unigene': {'---': '---'}}