Source code for genewalk.nx_mg_assembler

import pickle
import logging
import itertools
import pandas as pd
import networkx as nx
from goatools.obo_parser import GODag
from genewalk.resources import ResourceManager

logger = logging.getLogger('genewalk.nx_mg_assembler')


def load_network(network_type, network_file, genes, resource_manager=None):
    """Return a network assembler of the given type based on a set of genes.

    Parameters
    ----------
    network_type : str
        The type of the network to be constructed. One of 'pc', 'indra',
        'edge_list', 'sif', 'sif_annot' or 'sif_full'.
    network_file : str
        The path to a file containing information to construct the network.
        It is read for the 'indra' type (as a pickle file) and for the
        user-provided types ('edge_list', 'sif', 'sif_annot', 'sif_full');
        it is not used for the 'pc' type.
    genes : list
        A list of gene references.
    resource_manager : Optional[:py:class:`genewalk.resources.ResourceManager`]
        A resource manager object which, if specified, is used to get the
        resource files. Otherwise, the default resource manager is used.

    Returns
    -------
    :py:class:`genewalk.nx_mg_assembler.NxMgAssembler`
        An instance of an NxMgAssembler containing the assembled networkx
        MultiGraph as its graph attribute.

    Raises
    ------
    ValueError
        If network_type is not one of the supported network types.
    """
    # The resource manager is handed to the assembler unchanged: the
    # assembler constructors fall back to a default manager on their own
    # when they receive a falsy value.
    if network_type == 'pc':
        return PcNxMgAssembler(genes, resource_manager=resource_manager)

    if network_type == 'indra':
        logger.info('Loading %s', network_file)
        # NOTE: unpickling can execute arbitrary code, so network_file must
        # only ever point to a trusted file.
        with open(network_file, 'rb') as fh:
            stmts = pickle.load(fh)
        return IndraNxMgAssembler(genes, stmts,
                                  resource_manager=resource_manager)

    if network_type in {'edge_list', 'sif', 'sif_annot', 'sif_full'}:
        logger.info('Loading user-provided GeneWalk Network from %s.',
                    network_file)
        return UserNxMgAssembler(genes, resource_manager,
                                 network_file, gwn_format=network_type)

    raise ValueError('Unknown network_type: %s' % network_type)


class NxMgAssembler(object):
    """Class which assembles a networkx MultiGraph based on a list of genes.

    Parameters
    ----------
    genes : list of dict
        A list of gene references based on which the graph is assembled.
    resource_manager : Optional[:py:class:`genewalk.resources.ResourceManager`]
        A resource manager object used to get the resource files. If not
        given (or falsy), a default ResourceManager is instantiated.

    Attributes
    ----------
    graph : networkx.MultiGraph
        The assembled graph containing links for interactions between genes,
        GO annotations for genes, and the GO ontology.
    """

    def __init__(self, genes, resource_manager=None):
        self.genes = genes
        self.graph = nx.MultiGraph()
        if not resource_manager:
            self.resource_manager = ResourceManager()
        else:
            self.resource_manager = resource_manager
        # Both resources are loaded on first use and then cached
        self.go_dag = None
        self.goa = None

    def get_go_annots(self):
        """Return the filtered GO annotation data frame, loading it once."""
        if self.goa is None:
            self.goa = self._load_goa_gaf()
        return self.goa

    def get_go_dag(self):
        """Return the GO DAG, loading it once from the GO OBO resource."""
        if self.go_dag is None:
            self.go_dag = GODag(self.resource_manager.get_go_obo())
        return self.go_dag

    def _get_go_terms_for_gene(self, gene):
        """Return the sorted, de-duplicated GO IDs annotated to a gene.

        An empty list is returned if the gene reference has no 'UP' or
        'HGNC_SYMBOL' key, or if its HGNC symbol is not a node of the graph.
        """
        if ('UP' not in gene) or ('HGNC_SYMBOL' not in gene):
            return []
        if gene['HGNC_SYMBOL'] not in self.graph:
            return []
        goa = self.get_go_annots()
        # Filter to rows with the given gene's UniProt ID
        gene_rows = goa[goa['DB_ID'] == gene['UP']]
        return sorted(set(gene_rows['GO_ID']))

    def add_go_annotations(self):
        """Add edges between gene nodes and GO nodes based on GO
        annotations."""
        logger.info('Adding GO annotations for genes in graph.')
        go_dag = self.get_go_dag()
        for gene in self.genes:
            for go_id in self._get_go_terms_for_gene(gene):
                # Skip GO IDs unknown to the DAG as well as obsolete terms
                if go_id not in go_dag:
                    continue
                go_term = go_dag[go_id]
                if go_term.is_obsolete:
                    continue
                self.graph.add_node(go_term.id,
                                    name=go_term.name,
                                    GO=go_term.id,
                                    domain=go_term.namespace)
                self.graph.add_edge(gene['HGNC_SYMBOL'], go_term.id,
                                    label='GO:annotation')

    def add_go_ontology(self):
        """Add edges between GO nodes based on the GO ontology."""
        logger.info('Adding GO ontology edges to graph.')
        go_dag = self.get_go_dag()
        for go_term in list(go_dag.values()):
            if go_term.is_obsolete:
                continue
            # The term's node (with its meta-data) only needs to be added
            # once, before the edges to its parents are created
            self.graph.add_node(go_term.id,
                                name=go_term.name,
                                GO=go_term.id,
                                domain=go_term.namespace)
            for parent_term in go_term.parents:
                if parent_term.is_obsolete:
                    continue
                self.graph.add_edge(go_term.id, parent_term.id,
                                    label='GO:is_a')

    def node2edges(self, node_key):
        """Return the edges corresponding to a node."""
        return self.graph.edges(node_key, keys=True)

    def save_graph(self, fname):
        """Save the graph into a GraphML file.

        Parameters
        ----------
        fname : str
            The name of the file to save the graph into.
        """
        nx.write_graphml(self.graph, fname)

    def _load_goa_gaf(self):
        """Load the gene/GO annotations as a pandas data frame.

        The returned data frame is sorted by DB_ID and GO_ID, contains no
        rows whose qualifier starts with "NOT", and only contains rows
        whose evidence code is in the accepted set of evidence codes.
        """
        goa_ec = {'EXP', 'IDA', 'IPI', 'IMP', 'IGI', 'IEP', 'HTP', 'HDA',
                  'HMP', 'HGI', 'HEP', 'IBA', 'IBD'}
        goa = pd.read_csv(self.resource_manager.get_goa_gaf(), sep='\t',
                          comment='!', dtype=str,
                          header=None,
                          names=['DB',
                                 'DB_ID',
                                 'DB_Symbol',
                                 'Qualifier',
                                 'GO_ID',
                                 'DB_Reference',
                                 'Evidence_Code',
                                 'With_From',
                                 'Aspect',
                                 'DB_Object_Name',
                                 'DB_Object_Synonym',
                                 'DB_Object_Type',
                                 'Taxon',
                                 'Date',
                                 'Assigned',
                                 'Annotation_Extension',
                                 'Gene_Product_Form_ID'])
        goa = goa.sort_values(by=['DB_ID', 'GO_ID'])
        # Filter out all "NOT" negative evidences. Missing qualifiers are
        # replaced by an empty string first; the column is assigned back
        # explicitly because an in-place fillna on a selected column does
        # not reliably update the data frame itself.
        goa['Qualifier'] = goa['Qualifier'].fillna('')
        goa = goa[~goa['Qualifier'].str.startswith('NOT')]
        # Filter to rows with one of the accepted evidence codes
        goa = goa[goa['Evidence_Code'].isin(goa_ec)]
        return goa


class PcNxMgAssembler(NxMgAssembler):
    """Assembler that builds a GeneWalk Network from Pathway Commons.

    Gene reactions from Pathway Commons are combined with GO annotations
    and the GO ontology into a networkx (undirected) MultiGraph including
    edge attributes.

    Parameters
    ----------
    genes : list of dict
        A list of gene references based on which the network is assembled.
    resource_manager : Optional[:py:class:`genewalk.resources.ResourceManager`]
        A resource manager object, forwarded to the NxMgAssembler
        constructor.

    Attributes
    ----------
    graph : networkx.MultiGraph
        A GeneWalk Network that is assembled by this assembler.
    """
    def __init__(self, genes, resource_manager=None):
        super().__init__(genes, resource_manager)
        # Gene nodes are added first, then GO annotations are attached to
        # them and finally the GO ontology is added
        self.add_pc_edges()
        self.add_go_annotations()
        self.add_go_ontology()

    def add_pc_edges(self):
        """Add edges between gene nodes based on PathwayCommons
        interactions.

        The Pathway Commons resource is read as a tab-separated table with
        source, relation type and target columns, restricted to the genes
        of the input gene list, and stored as the graph attribute.
        """
        logger.info('Adding gene edges from Pathway Commons to graph.')
        sif_columns = {0: 'source', 1: 'rel_type', 2: 'target'}
        pc_table = pd.read_csv(self.resource_manager.get_pc(), sep='\t',
                               dtype=str, header=None)
        pc_table = pc_table.rename(mapper=sif_columns, axis='columns')
        full_pc_graph = nx.from_pandas_edgelist(
            pc_table, source='source', target='target',
            edge_attr=True, create_using=nx.MultiGraph)

        # Collect the references of the genes in the input gene list
        symbols = [ref['HGNC_SYMBOL'] for ref in self.genes]
        hgnc_refs = [ref['HGNC'] for ref in self.genes]
        uniprot_refs = [ref['UP'] for ref in self.genes]

        # Every input gene becomes a node, even one without any interaction;
        # all other nodes are dropped by taking the subgraph
        full_pc_graph.add_nodes_from(symbols)
        gene_view = full_pc_graph.subgraph(symbols)
        nx.set_node_attributes(gene_view, dict(zip(symbols, hgnc_refs)),
                               'HGNC')
        nx.set_node_attributes(gene_view, dict(zip(symbols, uniprot_refs)),
                               'UP')

        # Make a copy to unfreeze graph
        self.graph = nx.MultiGraph(gene_view)
        node_count = nx.number_of_nodes(self.graph)
        logger.info('Number of PC originating nodes %d' % node_count)


class IndraNxMgAssembler(NxMgAssembler):
    """Assembler that builds a GeneWalk Network from INDRA Statements.

    INDRA Statements and GO ontology / annotations are assembled into a
    networkx (undirected) MultiGraph including edge attributes. This code
    is based on INDRA's SifAssembler
    http://indra.readthedocs.io/en/latest/_modules/indra/assemblers/sif_assembler.html

    Parameters
    ----------
    genes : list of dict
        A list of gene references based on which the network is assembled.
    stmts : list[indra.statements.Statement]
        A list of INDRA Statements to be added to the assembler's list
        of Statements.
    resource_manager : Optional[:py:class:`genewalk.resources.ResourceManager`]
        A resource manager object, forwarded to the NxMgAssembler
        constructor.

    Attributes
    ----------
    graph : networkx.MultiGraph
        A GeneWalk Network that is assembled by this assembler.
    """
    def __init__(self, genes, stmts, resource_manager=None):
        self.indra_nodes = set()
        self.stmts = stmts
        super().__init__(genes, resource_manager)
        self.add_indra_edges()
        self.add_fplx_edges()
        self.add_go_annotations()
        self.add_go_ontology()

    def add_indra_edges(self):
        """Add edges between gene nodes and GO nodes based on INDRA Statements.

        Afterwards every gene of the input gene list is added as a node
        and annotated with its HGNC and UP references.
        """
        logger.info('Adding nodes from INDRA statements.')
        for stmt_idx, stmt in enumerate(self.stmts):
            # Collect the statement's agents, leaving out missing ones
            present_agents = [ag for ag in stmt.agent_list()
                              if ag is not None]
            # Edges are only created for statements with at least 2 Agents;
            # this excludes (irrelevant) stmt types: Translocation,
            # ActiveForm, SelfModification
            if len(present_agents) >= 2:
                # The key is unique to the statement and its type
                stmt_type = type(stmt).__name__
                stmt_key = '%d_%s' % (stmt_idx, stmt_type)
                # One edge for each pair of agents in the statement
                for first, second in itertools.combinations(present_agents,
                                                            2):
                    first_key = self.add_agent_node(first)
                    second_key = self.add_agent_node(second)
                    self.graph.add_edge(first_key, second_key, key=stmt_key,
                                        label=stmt_type)

        symbols = [ref['HGNC_SYMBOL'] for ref in self.genes]
        hgnc_refs = [ref['HGNC'] for ref in self.genes]
        uniprot_refs = [ref['UP'] for ref in self.genes]
        self.graph.add_nodes_from(symbols)
        nx.set_node_attributes(self.graph, dict(zip(symbols, hgnc_refs)),
                               'HGNC')
        nx.set_node_attributes(self.graph, dict(zip(symbols, uniprot_refs)),
                               'UP')
        indra_node_count = len(self.indra_nodes)
        logger.info('Number of INDRA originating nodes %d.' %
                    indra_node_count)

    def add_fplx_edges(self):
        """Add edges between gene nodes and families/complexes they are part
        of."""
        # Imported locally rather than at module level
        from genewalk.get_indra_stmts import get_famplex_links_from_stmts
        for child, parent in get_famplex_links_from_stmts(self.stmts):
            self.graph.add_edge(child, parent, label='FPLX:is_a')

    def add_agent_node(self, agent):
        """Add a node corresponding to an INDRA Agent.

        An agent with a GO reference is keyed by its GO ID (prefixed with
        'GO:' if needed); any other agent is keyed by its name. The key of
        the node is returned.
        """
        raw_go_id = agent.db_refs.get('GO')
        go_dag = self.get_go_dag()
        if not raw_go_id:
            node_key = agent.name
            self.graph.add_node(node_key, name=agent.name, **agent.db_refs,
                                source='indra')
        else:
            if raw_go_id.startswith('GO:'):
                node_key = raw_go_id
            else:
                node_key = 'GO:%s' % raw_go_id
            # INDRA standardizes GO names so this is generally not
            # necessary
            try:
                node_name = go_dag[node_key].name
            except KeyError:
                node_name = agent.name
            self.graph.add_node(node_key, name=node_name,
                                source='indra', **agent.db_refs)
        self.indra_nodes.add(node_key)
        return node_key

    def node2stmts(self, node_key):
        """Return the INDRA Statements given the key of a graph node.

        A Statement matches if the name of any of its agents equals the
        name attribute of the node; each matching Statement is returned
        once, in the order of the assembler's list of Statements.
        """
        wanted_name = self.graph.nodes[node_key]['name']
        return [stmt for stmt in self.stmts
                if any(ag is not None and ag.name == wanted_name
                       for ag in stmt.agent_list())]


class UserNxMgAssembler(NxMgAssembler):
    """Loads a user-provided GeneWalk Network from a given file.

    Parameters
    ----------
    genes : list of dict
        A list of gene references. Nodes of the loaded network that are
        neither GO terms nor genes from this list are removed.
    resource_manager : :py:class:`genewalk.resources.ResourceManager`
        A resource manager object, forwarded to the NxMgAssembler
        constructor.
    filepath : str
        Path to the user-provided genewalk network file, assumed to contain
        gene symbols and GO IDs. See gwn_format for supported format details.
        The file is read as comma-separated values without column headers.
    gwn_format : Optional[str]
        One of the following (default: 'el'):

        - 'el' or 'edge_list': edge list with rows nodeA,nodeB. Any further
          columns are not added as edge attributes. GO annotations and the
          GO ontology are added by the assembler.
        - 'sif': simple interaction format with rows
          nodeA,<relationship type>,nodeB. GO annotations and the GO
          ontology are added by the assembler.
        - 'sif_annot': like 'sif', but only the GO ontology is added by the
          assembler.
        - 'sif_full': like 'sif', but neither GO annotations nor the GO
          ontology are added; GO nodes only get their meta-data set.

    Attributes
    ----------
    graph : networkx.MultiGraph
        A GeneWalk Network that is loaded by this assembler.
    """
    def __init__(self, genes, resource_manager, filepath,
                 gwn_format='el'):
        super().__init__(genes, resource_manager=resource_manager)
        self.filepath = filepath
        self.gwn_format = gwn_format
        self.add_network_edges()

    def add_network_edges(self):
        """Assemble the GeneWalk Network from the user-provided file path.

        Raises
        ------
        ValueError
            If the gwn_format attribute is not a supported network format.
        """
        # 'el' is the default of the constructor; treat it as an alias of
        # 'edge_list' so that the default is an accepted format
        gwn_format = \
            'edge_list' if self.gwn_format == 'el' else self.gwn_format
        # Validate the format before reading the file
        if gwn_format == 'edge_list':
            col_mapper = {0: 'source', 1: 'target'}
            edge_attributes = None
        elif gwn_format in {'sif', 'sif_annot', 'sif_full'}:
            col_mapper = {0: 'source', 1: 'rel_type', 2: 'target'}
            edge_attributes = True
        else:
            raise ValueError('%s is not a valid GeneWalk network format'
                             % self.gwn_format)

        gwn_df = pd.read_csv(self.filepath, dtype=str, header=None)
        gwn_df = gwn_df.rename(mapper=col_mapper, axis='columns')
        self.graph = nx.from_pandas_edgelist(gwn_df, 'source', 'target',
                                             edge_attr=edge_attributes,
                                             create_using=nx.MultiGraph)
        logger.info('The graph loaded from %s contains %d nodes'
                    ' including %d GO terms',
                    self.filepath, len(self.graph),
                    sum(1 for n in self.graph if n.startswith('GO:')))
        # Genes are identified by their 'ID' if available and by their HGNC
        # symbol otherwise
        gene_list_genes = \
            {(g['ID'] if 'ID' in g else g['HGNC_SYMBOL']) for g in self.genes}
        non_gene_list_non_go_nodes = \
            [n for n in self.graph if not n.startswith('GO:')
             and n not in gene_list_genes]
        if non_gene_list_non_go_nodes:
            logger.info('Removing %d gene nodes from input network '
                        'since they are not in the input gene list.',
                        len(non_gene_list_non_go_nodes))
        self.graph.remove_nodes_from(non_gene_list_non_go_nodes)
        # If the GO annotations are not provided as part of the SIF
        # then we add those
        if gwn_format in {'sif', 'edge_list'}:
            self.add_go_annotations()
        # If the GO DAG is not provided as part of the SIF then we add
        # it
        if gwn_format in {'sif', 'sif_annot', 'edge_list'}:
            self.add_go_ontology()
        # If the SIF contains everything then we still have to add
        # some basic node meta-data to the GO nodes for later steps
        if gwn_format == 'sif_full':
            go_dag = self.get_go_dag()
            for node in self.graph.nodes:
                if node.startswith('GO:'):
                    go_term = go_dag.get(node)
                    if go_term:
                        self.graph.nodes[node]['GO'] = go_term.id
                        self.graph.nodes[node]['name'] = go_term.name
                        self.graph.nodes[node]['domain'] = go_term.namespace