Source code for genewalk.perform_statistics

import logging
import pandas as pd
import numpy as np
import networkx as nx
from statsmodels.stats.multitest import fdrcorrection
from scipy.stats import gmean, gstd

logger = logging.getLogger('genewalk.perform_statistics')


[docs]class GeneWalk(object): """GeneWalk object that generates the final output list of significant GO terms for each gene in the input list with genes of interest from an experiment, for example differentially expressed genes or CRISPR screen hits. If an input gene is not in the output file, this could have the following reasons: 1) No corresponding HGNC gene symbol, HGNC:ID and/or UniProt:ID could be identified. All are required to map genes and assemble their GO annotations. 2) (if alpha_FDR set to < 1) no GO terms were significant at the chosen significance level alpha_FDR. 3) (in case of mouse genes) no mapped human ortholog was identified. If a gene is listed in the output file with NaN values in the columns ncon_go and ncon_gene, the gene was not included in the GeneWalk network. Parameters ---------- graph : networkx.MultiGraph GeneWalk network for which the statistics are calculated. genes : list of dict List of gene references for relevant genes. nvs : list of dict Node vectors for nodes in the graph. null_dist : np.array Similarity random (null) distribution. """ def __init__(self, graph, genes, nvs, null_dist): self.graph = graph self.genes = genes self.nvs = nvs self.srd = null_dist self.go_nodes = set(nx.get_node_attributes(self.graph, 'GO')) self.gene_nodes = set([g['HGNC_SYMBOL'] for g in self.genes])
[docs] def get_gene_attribs(self, gene): """Return an attribute dict for a given gene.""" if gene['HGNC_SYMBOL'] in self.graph: ncon_gene = len(self.graph[gene['HGNC_SYMBOL']]) else: ncon_gene = np.nan return { 'hgnc_symbol': gene['HGNC_SYMBOL'], 'hgnc_id': gene['HGNC'], 'ncon_gene': ncon_gene }
[docs] def get_go_attribs(self, gene_attribs, nv, alpha_fdr): """Return GO entries and their attributes for a given gene.""" gene_node_id = gene_attribs['hgnc_symbol'] connected = set(self.graph[gene_node_id]) & self.go_nodes go_attribs = [] pvals = [] for go_node_id in connected: go_attrib = {} sim_score = nv.similarity(gene_node_id, go_node_id) go_attrib['sim_score'] = sim_score go_attrib['go_id'] = go_node_id go_attrib['ncon_go'] = len(self.graph[go_node_id]) go_attrib['go_name'] = self.graph.nodes[go_node_id]['name'] go_attrib['go_domain'] = \ self.graph.nodes[go_node_id]['domain'].replace('_', ' ') go_attrib['pval'] = self.psim(sim_score) pvals.append(go_attrib['pval']) go_attribs.append(go_attrib) _, qvals = fdrcorrection(pvals, alpha=alpha_fdr, method='indep') for idx in range(len(go_attribs)): go_attribs[idx]['qval'] = qvals[idx] return go_attribs
def log_stats(self, vals): eps = 1e-16 nreps = len(vals) vals = np.asarray(vals)+eps g_mean = gmean(vals)-eps g_std = gstd(vals) return g_mean, g_mean*(g_std**(-1.96/np.sqrt(nreps))), \ g_mean*(g_std**(1.96/np.sqrt(nreps))) def add_empty_row(self, gene, gene_attribs, base_id_type): row = [gene_attribs['hgnc_symbol'], gene_attribs['hgnc_id'], '', '', '', gene_attribs['ncon_gene'], np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan] if base_id_type == 'mgi_id': row = [gene.get('MGI', '')] + row elif base_id_type == 'ensembl_id': row = [gene.get('ENSEMBL', '')] + row elif base_id_type == 'entrez_mouse' or \ base_id_type == 'entrez_human': row = [gene.get('EGID', '')] + row return row
[docs] def generate_output(self, alpha_fdr=1, base_id_type='hgnc_symbol'): """Main function of GeneWalk object that generates the final GeneWalk output table (in csv format). Parameters ---------- alpha_fdr : Optional[float] Significance level for FDR [0,1] (default=1, i.e. all GO terms and their statistics are output). If set to a lower value, only connected GO terms with mean padj < alpha_FDR are output. base_id_type : Optional[str] The type of gene IDs that were the basis of doing the analysis. In case of mgi_id or ensembl_id, we prepend a column to the table for MGI or ENSEMBL IDs, respectively. Default: hgnc_symbol """ rows = [] for gene in self.genes: gene_attribs = self.get_gene_attribs(gene) # gene present in GW network if not np.isnan(gene_attribs['ncon_gene']): all_go_attribs = [self.get_go_attribs(gene_attribs, nv, alpha_fdr) for nv in self.nvs] if all_go_attribs: # gene has GO connections go_attrib_dict = {} for go_attrib_list in all_go_attribs: for go_attribs in go_attrib_list: go_id = go_attribs['go_id'] if go_id in go_attrib_dict: go_attrib_dict[go_id].append(go_attribs) else: go_attrib_dict[go_id] = [go_attribs] for go_id, go_attribs in go_attrib_dict.items(): mean_padj, low_padj, upp_padj = \ self.log_stats([attr['qval'] for attr in go_attribs]) mean_pval, low_pval, upp_pval = \ self.log_stats([attr['pval'] for attr in go_attribs]) mean_sim = np.mean([attr['sim_score'] for attr in go_attribs]) sem_sim = (np.std([attr['sim_score'] for attr in go_attribs]) / np.sqrt(len(self.nvs))) if mean_padj < alpha_fdr or alpha_fdr == 1: row = [gene_attribs['hgnc_symbol'], gene_attribs['hgnc_id'], go_attribs[0]['go_name'], go_attribs[0]['go_id'], go_attribs[0]['go_domain'], gene_attribs['ncon_gene'], go_attribs[0]['ncon_go'], mean_padj, low_padj, upp_padj, mean_pval, low_pval, upp_pval, mean_sim, sem_sim] # If dealing with mouse genes, prepend the MGI ID if base_id_type == 'mgi_id': row = [gene.get('MGI', '')] + row elif base_id_type == 'ensembl_id': row = [gene.get('ENSEMBL', '')] + row elif base_id_type == 'entrez_mouse' or \ base_id_type == 'entrez_human': row = [gene.get('EGID', '')] + row rows.append(row) elif alpha_fdr == 1: # case: no GO connections row = self.add_empty_row(gene, gene_attribs, base_id_type) rows.append(row) elif alpha_fdr == 1: # case: not in graph row = self.add_empty_row(gene, gene_attribs, base_id_type) rows.append(row) header = ['hgnc_symbol', 'hgnc_id', 'go_name', 'go_id', 'go_domain', 'ncon_gene', 'ncon_go', 'mean_padj', 'cilow_padj', 'ciupp_padj', 'mean_pval', 'cilow_pval', 'ciupp_pval', 'mean_sim', 'sem_sim', ] if base_id_type in {'mgi_id', 'ensembl_id', 'entrez_human', 'entrez_mouse'}: header = [base_id_type] + header df = pd.DataFrame.from_records(rows, columns=header) df[base_id_type] = df[base_id_type].astype('category') df[base_id_type].cat.set_categories(df[base_id_type].unique(), inplace=True) df[['ncon_gene', 'ncon_go']] = \ df[['ncon_gene', 'ncon_go']].astype('str') df = df.sort_values(by=[base_id_type, 'go_domain', 'mean_padj', 'mean_sim', 'go_name'], ascending=[True, True, True, False, True]) return df
[docs] def psim(self, sim): """Determine the p-value of the experimental similarity by determining its percentile, i.e. the normalized rank, in the null distribution with random similarity values. """ rank = np.searchsorted(self.srd, sim) pct_rank = float(rank) / len(self.srd) pval = 1 - pct_rank eps = 1e-16 if pval < eps: pval = eps return pval