Source code for genewalk.gene_lists

import csv
import pickle
import logging
from io import StringIO
from os.path import join, abspath, dirname
from collections import Counter
import requests
from indra.databases import hgnc_client, uniprot_client


logger = logging.getLogger('genewalk.gene_lists')


[docs]def read_gene_list(fname, id_type, resource_manager): """Return references for genes from a file with the given ID type. Parameters ---------- fname : str The name of the file containing the list of genes. Each line of the file corresponds to a single gene. id_type : str The type of identifier contained in each line of the gene list file. Possible values are: hgnc_symbol, hgnc_id, ensembl_id, mgi_id. resource_manager : genewalk.resources.ResourceManager ResourceManager object, used to obtain entrez-mgi mappings if necessary. Returns ------- dict A dictionary of references with keys including HGNCSYMBOL, HGNC, UP, and if id_type is mgi_id, MGI, with values corresponding to the identifiers of the provided list of genes. """ with open(fname, 'r') as fh: # This is to make the list unique while preserving # the original order as much as possible unique_lines = [] for line in fh.readlines(): line = line.strip() if line not in unique_lines: unique_lines.append(line) if id_type == 'hgnc_symbol': refs = map_hgnc_symbols(unique_lines) elif id_type == 'hgnc_id': refs = map_hgnc_ids(unique_lines) elif id_type == 'ensembl_id': refs = map_ensembl_ids(unique_lines) elif id_type == 'mgi_id': refs = map_mgi_ids(unique_lines) elif id_type == 'entrez_human': refs = map_entrez_human(unique_lines) elif id_type == 'entrez_mouse': refs = map_entrez_mouse(unique_lines, resource_manager) else: raise ValueError('Unknown id_type: %s' % id_type) if not refs: raise ValueError('None of the IDs in %s could be mapped. It is ' 'likely that the file uses an ID type or format ' 'that GeneWalk cannot interpret.' % fname) return refs
[docs]def map_hgnc_symbols(hgnc_symbols): """Return references based on a list of HGNC symbols.""" refs = [] for hgnc_symbol in hgnc_symbols: ref = {'HGNC_SYMBOL': hgnc_symbol, 'HGNC': None, 'UP': None} hgnc_id = hgnc_client.get_current_hgnc_id(hgnc_symbol) if not hgnc_id: logger.warning('Could not get HGNC ID for symbol %s' % hgnc_symbol) continue elif isinstance(hgnc_id, list): logger.warning('More than one current HGNC ID for outdated ' 'symbol %s' % hgnc_symbol) continue ref['HGNC'] = hgnc_id uniprot_id = hgnc_client.get_uniprot_id(hgnc_id) if not uniprot_id: logger.warning('Could not get UniProt ID for symbol %s' % hgnc_symbol) continue ref['UP'] = uniprot_id refs.append(ref) return refs
[docs]def map_hgnc_ids(hgnc_ids): """Return references based on a list of HGNC IDs.""" refs = [] for hgnc_id in hgnc_ids: if hgnc_id.startswith('HGNC:'): hgnc_id = hgnc_id[5:] hgnc_ref = _refs_from_hgnc_id(hgnc_id) if hgnc_ref is None: continue refs.append(hgnc_ref) return refs
def _refs_from_hgnc_id(hgnc_id): ref = {'HGNC_SYMBOL': None, 'HGNC': hgnc_id, 'UP': None} hgnc_name = hgnc_client.get_hgnc_name(hgnc_id) if not hgnc_name: logger.warning('Could not get HGNC name for ID %s' % hgnc_id) return None ref['HGNC_SYMBOL'] = hgnc_name uniprot_id = hgnc_client.get_uniprot_id(hgnc_id) if not uniprot_id: logger.warning('Could not get UniProt ID for HGNC ID %s' % hgnc_id) return None ref['UP'] = uniprot_id return ref
[docs]def map_mgi_ids(mgi_ids): """Return references based on a list of MGI IDs.""" refs = [] for mgi_id in mgi_ids: if mgi_id.startswith('MGI:'): mgi_id = mgi_id[4:] mgi_ref = _refs_from_mgi_id(mgi_id) if mgi_ref is None: continue refs.append(mgi_ref) return refs
def _refs_from_mgi_id(mgi_id): ref = {'MGI': mgi_id} hgnc_id = hgnc_client.get_hgnc_from_mouse(mgi_id) if hgnc_id is None: logger.warning('Could not get HGNC ID for MGI ID %s' % mgi_id) return None hgnc_ref = _refs_from_hgnc_id(hgnc_id) if hgnc_ref is None: return None ref.update(hgnc_ref) return ref
[docs]def map_ensembl_ids(ensembl_ids): """Return references based on a list of Ensembl IDs.""" refs = [] for ensembl_id in ensembl_ids: ref = {'ENSEMBL': ensembl_id} ensembl_id = ensembl_id.split('.', maxsplit=1)[0] hgnc_id = hgnc_client.get_hgnc_from_ensembl(ensembl_id) if not hgnc_id: logger.warning('Could not get HGNC ID for ENSEMBL ID %s' % ensembl_id) continue hgnc_ref = _refs_from_hgnc_id(hgnc_id) if hgnc_ref is None: continue ref.update(hgnc_ref) refs.append(ref) return refs
[docs]def map_entrez_human(entrez_ids): """Return references based on human Entrez gene IDs.""" refs = [] for entrez_id in entrez_ids: ref = {'EGID': entrez_id} hgnc_id = hgnc_client.get_hgnc_from_entrez(entrez_id) if hgnc_id is None: logger.warning("Could not find HGNC ID for Entrez ID %s" % entrez_id) continue hgnc_ref = _refs_from_hgnc_id(hgnc_id) if hgnc_ref is None: continue ref.update(hgnc_ref) refs.append(ref) return refs
[docs]def map_entrez_mouse(entrez_ids, rm): """Return references based on mouse Entrez gene IDs.""" # Get the entrez file path from the resource manager mgi_entrez_file = rm.get_mgi_entrez() # Process the MGI-Entrez mapping file entrez_to_mgi = {} with open(mgi_entrez_file, 'r') as f: csvreader = csv.reader(f, delimiter='\t') for row in csvreader: # Remove "MGI:" prefix mgi = row[0][4:] entrez = row[8] entrez_to_mgi[entrez] = mgi refs = [] for entrez_id in entrez_ids: mgi_id = entrez_to_mgi.get(entrez_id) if not mgi_id: logger.warning("Could not find an MGI mapping for Entrez ID %s" % entrez_id) continue ref = {'EGID': entrez_id, 'MGI': mgi_id} mgi_refs = _refs_from_mgi_id(mgi_id) if mgi_refs is None: continue ref.update(mgi_refs) refs.append(ref) return refs