"""This script creates a pickle file of INDRA Statements for a specific set of
genes, based on a pre-assembled network of interactions produced by INDRA. It
loads the INDRA interactions as a pandas DataFrame and, filters it to the genes
and familiex/complexes of interest, as well as targeted biological processes.
It downloads the relevant Statement objects for reference and dumps them into
a pickle file.
"""
import pandas
import pickle
import logging
import argparse
from indra.util import batch_iter
from indra.databases import go_client
from indra.sources import indra_db_rest
from indra.databases import hgnc_client
from indra.ontology.bio import bio_ontology
logger = logging.getLogger('genewalk.get_indra_stmts')
[docs]def load_genes(fname):
"""Return a list of genes IDs from a file with lines like HGNC:123."""
with open(fname, 'r') as fh:
# Get the HGNC IDs from the list of genes file, assuming that
# each line looks like HGNC:123
genes = [l.strip().split(':')[1] for l in fh.readlines()]
logger.info('Loaded %d genes from %s' % (len(genes), fname))
return genes
[docs]def load_mouse_genes(fname):
"""Return a list of human genes based on a table of mouse genes."""
# assumes the csv has headers
df = pandas.read_csv(fname)
for c in df.columns:
# assumes the first column starting with MGI is the relevant one
# with MGI:IDs
if c.startswith('MGI'):
df = df.rename(columns={c: 'MGI'})
break
mgi_ids = df['MGI']
genes = []
for mgi_id in mgi_ids:
if mgi_id.startswith('MGI:'):
mgi_id = mgi_id[4:]
hgnc_id = hgnc_client.get_hgnc_from_mouse(mgi_id)
if not hgnc_id:
print('Could not find human gene corresponding to MGI %s' % mgi_id)
continue
genes.append(hgnc_id)
return genes
[docs]def load_indra_df(fname):
"""Return an INDRA Statement data frame from a pickle file."""
with open(fname, 'rb') as fh:
df = pickle.load(fh)
logger.info('Loaded %d rows from %s' % (len(df), fname))
return df
[docs]def dump_pickle(stmts, fname):
"""Dump a list of Statements into a picke file."""
with open(fname, 'wb') as fh:
pickle.dump(stmts, fh)
logger.info('Dumped %d statements into %s' % (len(stmts), fname))
[docs]def filter_to_genes(df, genes, fplx_terms):
"""Filter a data frame of INDRA Statements given gene and FamPlex IDs."""
# Look for sources that are in the gene list or whose families/complexes
# are in the FamPlex term list
source_filter = (((df.agA_ns == 'HGNC') & (df.agA_id.isin(genes))) |
((df.agA_ns == 'FPLX') & (df.agA_id.isin(fplx_terms))))
# Look for targets that are in the gene list or whose families/complexes
# are in the FamPlex term list, or which are GO terms
target_filter = (((df.agB_ns == 'HGNC') & (df.agB_id.isin(genes))) |
((df.agB_ns == 'FPLX') & (df.agB_id.isin(fplx_terms))) |
(df.agB_ns == 'GO'))
# sources/targets
df = df[source_filter & target_filter]
logger.info('Filtered data frame to %d rows.' % len(df))
return df
[docs]def get_famplex_terms(genes):
"""Get a list of associated FamPlex IDs from a list of gene IDs."""
all_parents = set()
for hgnc_id in genes:
parent_ids = {p[1] for p in
bio_ontology.get_parents('HGNC', hgnc_id)}
all_parents |= parent_ids
fplx_terms = sorted(list(all_parents))
logger.info('Found %d relevant FamPlex terms.' % (len(fplx_terms)))
return fplx_terms
[docs]def get_famplex_links(df, fname):
"""Given a list of INDRA Statements, construct FamPlex links."""
genes_appearing = (set(df[df.agA_ns == 'HGNC'].agA_name) |
set(df[df.agB_ns == 'HGNC'].agB_name))
fplx_appearing = (set(df[df.agA_ns == 'FPLX'].agA_id) |
set(df[df.agB_ns == 'FPLX'].agB_id))
links = get_famplex_links_from_lists(genes_appearing, fplx_appearing)
with open(fname, 'w') as fh:
for link in links:
fh.write('%s,%s\n' % link)
def get_famplex_links_from_stmts(stmts):
genes_appearing = set()
fplx_appearing = set()
for stmt in stmts:
agents = [a for a in stmt.agent_list() if a is not None]
if len(agents) < 2:
continue
for agent in agents:
if 'HGNC' in agent.db_refs:
genes_appearing.add(agent.name)
elif 'FPLX' in agent.db_refs:
fplx_appearing.add(agent.name)
return get_famplex_links_from_lists(genes_appearing, fplx_appearing)
def get_famplex_links_from_lists(genes_appearing, fplx_appearing):
links = []
for gene in genes_appearing:
parent_ids = [p[1] for p in bio_ontology.get_parents('HGNC', gene)]
parents_appearing = fplx_appearing & set(parent_ids)
links += [(gene, parent) for parent in parents_appearing]
for fplx_child in fplx_appearing:
parent_ids = [p[1] for p in
bio_ontology.get_parents('FPLX', fplx_child)]
parents_appearing = fplx_appearing & set(parent_ids)
links += [(fplx_child, parent) for parent in parents_appearing]
return links
[docs]def download_statements(df, ev_limit=5):
"""Download the INDRA Statements corresponding to entries in a data frame.
"""
all_stmts = []
for idx, group in enumerate(batch_iter(df.hash, 500)):
logger.info('Getting statement batch %d' % idx)
idbp = indra_db_rest.get_statements_by_hash(list(group),
ev_limit=ev_limit)
all_stmts += idbp.statements
return all_stmts
def remap_go_ids(stmts):
for stmt in stmts:
for agent in stmt.agent_list():
if agent is not None and 'GO' in agent.db_refs:
prim_id = go_client.get_primary_id(agent.db_refs['GO'])
if prim_id:
agent.db_refs['GO'] = prim_id
if __name__ == '__main__':
# Handle command line arguments
# TODO: make specific sets of statements for the use cases available in
# the repo (or on S3) and parameterize here to be able to load them.
parser = argparse.ArgumentParser(
description='Choose a file with a list of genes to get a SIF for.')
parser.add_argument('--df', default='data/stmt_df.pkl')
parser.add_argument('--genes', default='data/JQ1_HGNCidForINDRA.csv')
parser.add_argument('--mouse_genes')
parser.add_argument('--stmts', default='data/JQ1_HGNCidForINDRA_stmts.pkl')
args = parser.parse_args()
# Load genes and get FamPlex terms
if args.mouse_genes:
genes = load_mouse_genes(args.mouse_genes)
else:
genes = load_genes(args.genes)
fplx_terms = get_famplex_terms(genes)
# Load INDRA Statements in a flat data frame
df = load_indra_df(args.df)
# Filter the data frame to relevant entities
df = filter_to_genes(df, genes, fplx_terms)
# Download the Statement corresponding to each row
stmts = download_statements(df)
# Remap any outdated GO IDs
remap_go_ids(stmts)
# Dump the Statements into a pickle file
dump_pickle(stmts, args.stmts)