Source code for genewalk.get_indra_stmts

"""This script creates a pickle file of INDRA Statements for a specific set of
genes, based on a pre-assembled network of interactions produced by INDRA. It
loads the INDRA interactions as a pandas DataFrame and, filters it to the genes
and familiex/complexes of interest, as well as targeted biological processes.
It downloads the relevant Statement objects for reference and dumps them into
a pickle file.
"""
import pandas
import pickle
import logging
import argparse
from indra.util import batch_iter
from indra.databases import go_client
from indra.sources import indra_db_rest
from indra.databases import hgnc_client
from indra.ontology.bio import bio_ontology


logger = logging.getLogger('genewalk.get_indra_stmts')


[docs]def load_genes(fname): """Return a list of genes IDs from a file with lines like HGNC:123.""" with open(fname, 'r') as fh: # Get the HGNC IDs from the list of genes file, assuming that # each line looks like HGNC:123 genes = [l.strip().split(':')[1] for l in fh.readlines()] logger.info('Loaded %d genes from %s' % (len(genes), fname)) return genes
[docs]def load_mouse_genes(fname): """Return a list of human genes based on a table of mouse genes.""" # assumes the csv has headers df = pandas.read_csv(fname) for c in df.columns: # assumes the first column starting with MGI is the relevant one # with MGI:IDs if c.startswith('MGI'): df = df.rename(columns={c: 'MGI'}) break mgi_ids = df['MGI'] genes = [] for mgi_id in mgi_ids: if mgi_id.startswith('MGI:'): mgi_id = mgi_id[4:] hgnc_id = hgnc_client.get_hgnc_from_mouse(mgi_id) if not hgnc_id: print('Could not find human gene corresponding to MGI %s' % mgi_id) continue genes.append(hgnc_id) return genes
[docs]def load_indra_df(fname): """Return an INDRA Statement data frame from a pickle file.""" with open(fname, 'rb') as fh: df = pickle.load(fh) logger.info('Loaded %d rows from %s' % (len(df), fname)) return df
[docs]def dump_pickle(stmts, fname): """Dump a list of Statements into a picke file.""" with open(fname, 'wb') as fh: pickle.dump(stmts, fh) logger.info('Dumped %d statements into %s' % (len(stmts), fname))
[docs]def filter_to_genes(df, genes, fplx_terms): """Filter a data frame of INDRA Statements given gene and FamPlex IDs.""" # Look for sources that are in the gene list or whose families/complexes # are in the FamPlex term list source_filter = (((df.agA_ns == 'HGNC') & (df.agA_id.isin(genes))) | ((df.agA_ns == 'FPLX') & (df.agA_id.isin(fplx_terms)))) # Look for targets that are in the gene list or whose families/complexes # are in the FamPlex term list, or which are GO terms target_filter = (((df.agB_ns == 'HGNC') & (df.agB_id.isin(genes))) | ((df.agB_ns == 'FPLX') & (df.agB_id.isin(fplx_terms))) | (df.agB_ns == 'GO')) # sources/targets df = df[source_filter & target_filter] logger.info('Filtered data frame to %d rows.' % len(df)) return df
[docs]def get_famplex_terms(genes): """Get a list of associated FamPlex IDs from a list of gene IDs.""" all_parents = set() for hgnc_id in genes: parent_ids = {p[1] for p in bio_ontology.get_parents('HGNC', hgnc_id)} all_parents |= parent_ids fplx_terms = sorted(list(all_parents)) logger.info('Found %d relevant FamPlex terms.' % (len(fplx_terms))) return fplx_terms
def get_famplex_links_from_stmts(stmts): genes_appearing = set() fplx_appearing = set() for stmt in stmts: agents = [a for a in stmt.agent_list() if a is not None] if len(agents) < 2: continue for agent in agents: if 'HGNC' in agent.db_refs: genes_appearing.add(agent.name) elif 'FPLX' in agent.db_refs: fplx_appearing.add(agent.name) return get_famplex_links_from_lists(genes_appearing, fplx_appearing) def get_famplex_links_from_lists(genes_appearing, fplx_appearing): links = [] for gene in genes_appearing: parent_ids = [p[1] for p in bio_ontology.get_parents('HGNC', gene)] parents_appearing = fplx_appearing & set(parent_ids) links += [(gene, parent) for parent in parents_appearing] for fplx_child in fplx_appearing: parent_ids = [p[1] for p in bio_ontology.get_parents('FPLX', fplx_child)] parents_appearing = fplx_appearing & set(parent_ids) links += [(fplx_child, parent) for parent in parents_appearing] return links
[docs]def download_statements(df, ev_limit=5): """Download the INDRA Statements corresponding to entries in a data frame. """ all_stmts = [] for idx, group in enumerate(batch_iter(df.hash, 500)): logger.info('Getting statement batch %d' % idx) idbp = indra_db_rest.get_statements_by_hash(list(group), ev_limit=ev_limit) all_stmts += idbp.statements return all_stmts
def remap_go_ids(stmts): for stmt in stmts: for agent in stmt.agent_list(): if agent is not None and 'GO' in agent.db_refs: prim_id = go_client.get_primary_id(agent.db_refs['GO']) if prim_id: agent.db_refs['GO'] = prim_id if __name__ == '__main__': # Handle command line arguments # TODO: make specific sets of statements for the use cases available in # the repo (or on S3) and parameterize here to be able to load them. parser = argparse.ArgumentParser( description='Choose a file with a list of genes to get a SIF for.') parser.add_argument('--df', default='data/stmt_df.pkl') parser.add_argument('--genes', default='data/JQ1_HGNCidForINDRA.csv') parser.add_argument('--mouse_genes') parser.add_argument('--stmts', default='data/JQ1_HGNCidForINDRA_stmts.pkl') args = parser.parse_args() # Load genes and get FamPlex terms if args.mouse_genes: genes = load_mouse_genes(args.mouse_genes) else: genes = load_genes(args.genes) fplx_terms = get_famplex_terms(genes) # Load INDRA Statements in a flat data frame df = load_indra_df(args.df) # Filter the data frame to relevant entities df = filter_to_genes(df, genes, fplx_terms) # Download the Statement corresponding to each row stmts = download_statements(df) # Remap any outdated GO IDs remap_go_ids(stmts) # Dump the Statements into a pickle file dump_pickle(stmts, args.stmts)