Source code for indra.assemblers.cx_assembler

from __future__ import absolute_import, print_function, unicode_literals
from builtins import dict, str
import io
import re
import json
import logging
import itertools
from collections import OrderedDict
from indra.statements import *
from indra.databases import context_client, ndex_client, get_identifiers_url

# Python 2/3 compatibility: ``basestring`` only exists on Python 2. On
# Python 3 looking it up raises NameError, in which case it is aliased to
# ``str`` so that ``isinstance(x, basestring)`` works on both versions.
try:
    basestring
except NameError:
    # Catch only the expected NameError; a bare ``except`` would also
    # swallow KeyboardInterrupt/SystemExit raised during import.
    basestring = str

# Module-level logger shared by the assembler and its helper functions.
logger = logging.getLogger('cx_assembler')


class CxAssembler(object):
    """This class assembles a CX network from a set of INDRA Statements.

    The CX format is an aspect oriented data model for networks.
    The format is defined at http://www.home.ndexbio.org/data-model/.
    The CX format is the standard for NDEx and is compatible with
    CytoScape via the CyNDEx plugin.

    Parameters
    ----------
    stmts : Optional[list[indra.statements.Statement]]
        A list of INDRA Statements to be assembled.
    network_name : Optional[str]
        The name of the network to be assembled. Default: indra_assembled

    Attributes
    ----------
    statements : list[indra.statements.Statement]
        A list of INDRA Statements to be assembled.
    network_name : str
        The name of the network to be assembled.
    cx : dict
        The structure of the CX network that is assembled.
    add_indra_json : bool
        Whether the serialized INDRA Statement JSON string is attached to
        each edge. Overwritten by every call to :py:meth:`make_model`.
    """

    # Display names used for selected db_refs namespaces when they are
    # written as node attributes; any other namespace keeps its own key.
    _DB_NAME_MAP = {
        'UP': 'UniProt',
        'PUBCHEM': 'PubChem',
        'IP': 'InterPro',
        'NXPFA': 'NextProtFamily',
        'PF': 'Pfam',
        'CHEBI': 'ChEBI',
    }

    def __init__(self, stmts=None, network_name=None):
        self.statements = [] if stmts is None else stmts
        if network_name is None:
            self.network_name = 'indra_assembled'
        else:
            self.network_name = network_name
        # One list per CX aspect; print_cx serializes them in this order.
        self.cx = {'nodes': [], 'edges': [],
                   'nodeAttributes': [], 'edgeAttributes': [],
                   'citations': [], 'edgeCitations': [],
                   'supports': [], 'edgeSupports': [],
                   'networkAttributes': []}
        # Caches mapping a node name / an edge key to the CX id already
        # assigned to it.
        self._existing_nodes = {}
        self._existing_edges = {}
        # Single counter shared by nodes, edges, citations and supports.
        self._id_counter = 0
        # Same default as make_model(); set here so that the private
        # _add_* helpers also work before make_model() has been called.
        self.add_indra_json = True

    def add_statements(self, stmts):
        """Add INDRA Statements to the assembler's list of statements.

        Parameters
        ----------
        stmts : list[indra.statements.Statement]
            A list of :py:class:`indra.statements.Statement`
            to be added to the statement list of the assembler.
        """
        self.statements.extend(stmts)

    def make_model(self, add_indra_json=True):
        """Assemble the CX network from the collected INDRA Statements.

        This method assembles a CX network from the set of INDRA Statements.
        The assembled network is set as the assembler's cx attribute.

        Parameters
        ----------
        add_indra_json : Optional[bool]
            If True, the INDRA Statement JSON annotation is added to each
            edge in the network. Default: True

        Returns
        -------
        cx_str : str
            The json serialized CX model.
        """
        self.add_indra_json = add_indra_json
        # Statement types not listed below are skipped and produce no
        # edge in the network.
        for stmt in self.statements:
            if isinstance(stmt, Modification):
                self._add_modification(stmt)
            elif isinstance(stmt, SelfModification):
                self._add_self_modification(stmt)
            elif isinstance(stmt, (RegulateActivity, RegulateAmount)):
                self._add_regulation(stmt)
            elif isinstance(stmt, Complex):
                self._add_complex(stmt)
            elif isinstance(stmt, Gef):
                self._add_gef(stmt)
            elif isinstance(stmt, Gap):
                self._add_gap(stmt)
        network_description = ''
        self.cx['networkAttributes'].append({'n': 'name',
                                             'v': self.network_name})
        self.cx['networkAttributes'].append({'n': 'description',
                                             'v': network_description})
        cx_str = self.print_cx()
        return cx_str

    def print_cx(self, pretty=True):
        """Return the assembled CX network as a json string.

        Parameters
        ----------
        pretty : bool
            If True, the CX string is formatted with indentation (for human
            viewing) otherwise no indentation is used.

        Returns
        -------
        json_str : str
            A json formatted string representation of the CX network.
        """
        def _get_aspect_metadata(aspect):
            # Aspects without any element get no metaData entry.
            elements = self.cx.get(aspect)
            count = len(elements) if elements else 0
            if not count:
                return None
            return {'name': aspect,
                    'idCounter': self._id_counter,
                    'consistencyGroup': 1,
                    'elementCount': count}

        full_cx = OrderedDict()
        full_cx['numberVerification'] = [{'longNumber': 281474976710655}]
        aspects = ['nodes', 'edges', 'supports', 'citations',
                   'edgeAttributes', 'edgeCitations', 'edgeSupports',
                   'networkAttributes', 'nodeAttributes']
        full_cx['metaData'] = []
        for aspect in aspects:
            metadata = _get_aspect_metadata(aspect)
            if metadata:
                full_cx['metaData'].append(metadata)
        for aspect_name, elements in self.cx.items():
            full_cx[aspect_name] = elements
        full_cx['status'] = [{'error': '', 'success': True}]
        # The serialized document is a list of single-key objects, one
        # per aspect, in the order collected above.
        full_cx = [{k: v} for k, v in full_cx.items()]
        if pretty:
            json_str = json.dumps(full_cx, indent=2)
        else:
            json_str = json.dumps(full_cx)
        return json_str

    def save_model(self, file_name='model.cx'):
        """Save the assembled CX network in a file.

        Parameters
        ----------
        file_name : Optional[str]
            The name of the file to save the CX network to. Default: model.cx
        """
        # Serialize before opening so that a serialization error does not
        # leave behind a truncated, empty file.
        cx_str = self.print_cx()
        with open(file_name, 'wt') as fh:
            fh.write(cx_str)

    def upload_model(self, ndex_cred):
        """Creates a new NDEx network of the assembled CX model.

        To upload the assembled CX model to NDEx, you need to have
        a registered account on NDEx (http://ndexbio.org/) and have
        the `ndex` python package installed. The uploaded network
        is private by default.

        Parameters
        ----------
        ndex_cred : dict
            A dictionary with the following entries:
            'user': NDEx user name
            'password': NDEx password

        Returns
        -------
        network_id : str
            The UUID of the NDEx network that was created by uploading
            the assembled CX model.
        """
        cx_str = self.print_cx()
        network_id = ndex_client.create_network(cx_str, ndex_cred)
        return network_id

    def set_context(self, cell_type):
        """Set protein expression data and mutational status as node attribute

        This method uses :py:mod:`indra.databases.context_client` to get
        protein expression levels and mutational status for a given cell type
        and set a node attribute for proteins accordingly.

        Parameters
        ----------
        cell_type : str
            Cell type name for which expression levels are queried.
            The cell type name follows the CCLE database conventions.
            Example: LOXIMVI_SKIN, BT20_BREAST
        """
        node_names = [node['n'] for node in self.cx['nodes']]
        res_expr = context_client.get_protein_expression(node_names,
                                                         [cell_type])
        res_mut = context_client.get_mutations(node_names,
                                               [cell_type])
        res_expr = res_expr.get(cell_type)
        res_mut = res_mut.get(cell_type)
        if not res_expr:
            msg = 'Could not get protein expression for %s cell type.' % \
                  cell_type
            logger.warning(msg)

        if not res_mut:
            msg = 'Could not get mutational status for %s cell type.' % \
                  cell_type
            logger.warning(msg)

        if not res_expr and not res_mut:
            return

        # Only one of the two results may be available; substitute an
        # empty mapping for the missing one so the per-node lookups below
        # do not fail on None.
        res_expr = res_expr or {}
        res_mut = res_mut or {}

        self.cx['networkAttributes'].append({'n': 'cellular_context',
                                             'v': cell_type})
        counter = 0
        for node in self.cx['nodes']:
            amount = res_expr.get(node['n'])
            mut = res_mut.get(node['n'])
            if amount is not None:
                node_attribute = {'po': node['@id'],
                                  'n': 'expression_amount',
                                  'v': int(amount)}
                self.cx['nodeAttributes'].append(node_attribute)
            if mut is not None:
                is_mutated = 1 if mut else 0
                node_attribute = {'po': node['@id'],
                                  'n': 'is_mutated',
                                  'v': is_mutated}
                self.cx['nodeAttributes'].append(node_attribute)
            if mut is not None or amount is not None:
                counter += 1
        logger.info('Set context for %d nodes.', counter)

    def _get_new_id(self):
        """Return the next unused CX element id and advance the counter."""
        ret = self._id_counter
        self._id_counter += 1
        return ret

    def _add_modification(self, stmt):
        """Add an enzyme -> substrate edge; skipped if there is no enzyme."""
        if stmt.enz is None:
            return
        enz_id = self._add_node(stmt.enz)
        sub_id = self._add_node(stmt.sub)
        stmt_type = stmt.__class__.__name__
        self._add_edge(enz_id, sub_id, stmt_type, stmt)

    def _add_self_modification(self, stmt):
        """Add a self-loop edge on the enzyme of a self-modification."""
        enz_id = self._add_node(stmt.enz)
        stmt_type = stmt.__class__.__name__
        self._add_edge(enz_id, enz_id, stmt_type, stmt)

    def _add_complex(self, stmt):
        """Add a 'Complex' edge between every pair of complex members."""
        for m1, m2 in itertools.combinations(stmt.members, 2):
            m1_id = self._add_node(m1)
            m2_id = self._add_node(m2)
            self._add_edge(m1_id, m2_id, 'Complex', stmt)

    def _add_regulation(self, stmt):
        """Add a subject -> object edge; skipped if there is no subject."""
        if stmt.subj is None:
            return
        subj_id = self._add_node(stmt.subj)
        obj_id = self._add_node(stmt.obj)
        stmt_type = stmt.__class__.__name__
        self._add_edge(subj_id, obj_id, stmt_type, stmt)

    def _add_gef(self, stmt):
        """Add an edge from the ``gef`` agent to the ``ras`` agent."""
        gef_id = self._add_node(stmt.gef)
        ras_id = self._add_node(stmt.ras)
        stmt_type = stmt.__class__.__name__
        self._add_edge(gef_id, ras_id, stmt_type, stmt)

    def _add_gap(self, stmt):
        """Add an edge from the ``gap`` agent to the ``ras`` agent."""
        gap_id = self._add_node(stmt.gap)
        ras_id = self._add_node(stmt.ras)
        stmt_type = stmt.__class__.__name__
        self._add_edge(gap_id, ras_id, stmt_type, stmt)

    def _add_node(self, agent):
        """Return the node id for an agent, creating the node if needed.

        Nodes are keyed by ``agent.name``, so agents sharing a name map to
        the same node.
        """
        node_key = agent.name
        node_id = self._existing_nodes.get(node_key)
        if node_id is not None:
            return node_id
        node_id = self._get_new_id()
        self._existing_nodes[node_key] = node_id
        node = {'@id': node_id,
                'n': agent.name}
        self.cx['nodes'].append(node)
        self._add_node_metadata(node_id, agent)
        return node_id

    def _add_node_metadata(self, node_id, agent):
        """Add the type and the database reference URLs of a node."""
        agent_type = _get_agent_type(agent)
        node_attribute = {'po': node_id,
                          'n': 'type',
                          'v': agent_type}
        self.cx['nodeAttributes'].append(node_attribute)
        for db_name, db_ids in agent.db_refs.items():
            if not db_ids:
                logger.warning('Missing db_id for %s' % agent)
                continue
            elif isinstance(db_ids, int):
                db_id = str(db_ids)
            elif isinstance(db_ids, basestring):
                db_id = db_ids
            else:
                # Any other value is indexed as a sequence and only its
                # first entry is used.
                db_id = db_ids[0]
            url = get_identifiers_url(db_name, db_id)
            if not url:
                continue
            name = self._DB_NAME_MAP.get(db_name, db_name)
            node_attribute = {'po': node_id,
                              'n': name,
                              'v': url}
            self.cx['nodeAttributes'].append(node_attribute)

    def _add_edge(self, source, target, interaction, stmt):
        """Return the edge id for an interaction, creating it if needed.

        Edges are keyed by ``(source, target, interaction)``. When an edge
        with the same key already exists its id is returned and no new
        edge or metadata is added, so the edge keeps the metadata of the
        first Statement that produced it.
        """
        edge_key = (source, target, interaction)
        edge_id = self._existing_edges.get(edge_key)
        if edge_id is not None:
            return edge_id
        edge_id = self._get_new_id()
        # Record the edge in the edge cache. Storing it in the node cache
        # would leave this cache empty and disable the lookup above.
        self._existing_edges[edge_key] = edge_id
        edge = {'@id': edge_id,
                's': source,
                't': target,
                'i': interaction}
        self.cx['edges'].append(edge)
        self._add_edge_metadata(edge_id, stmt)
        return edge_id

    def _add_edge_metadata(self, edge_id, stmt):
        """Add attributes, citations and supports for a new edge."""
        # Add the string of the statement itself
        indra_stmt_str = '%s' % stmt
        edge_attribute = {'po': edge_id,
                          'n': 'INDRA statement',
                          'v': indra_stmt_str}
        self.cx['edgeAttributes'].append(edge_attribute)

        # Add INDRA JSON
        if self.add_indra_json:
            indra_stmt_json = json.dumps(stmt.to_json())
            edge_attribute = {'po': edge_id,
                              'n': 'INDRA json',
                              'v': indra_stmt_json}
            self.cx['edgeAttributes'].append(edge_attribute)

        # Add the type of statement as the edge type
        stmt_type, stmt_polarity = _get_stmt_type(stmt)
        edge_attribute = {'po': edge_id,
                          'n': 'type',
                          'v': stmt_type}
        self.cx['edgeAttributes'].append(edge_attribute)
        edge_attribute = {'po': edge_id,
                          'n': 'polarity',
                          'v': stmt_polarity}
        self.cx['edgeAttributes'].append(edge_attribute)

        # Add the citations for the edge
        pmids = [e.pmid for e in stmt.evidence if e.pmid]
        edge_citations = []
        pmids_added = set()
        for pmid in pmids:
            # Only identifiers starting with digits are cited; anything
            # else is skipped rather than emitted without an identifier.
            if not re.match(r'[0-9]+', pmid):
                continue
            pmid_txt = 'pmid:' + pmid
            if pmid_txt in pmids_added:
                continue
            citation_id = self._get_new_id()
            citation = {'@id': citation_id,
                        'dc:identifier': pmid_txt}
            self.cx['citations'].append(citation)
            edge_citations.append(citation_id)
            pmids_added.add(pmid_txt)
        if edge_citations:
            edge_citation = {'citations': edge_citations,
                             'po': [edge_id]}
            self.cx['edgeCitations'].append(edge_citation)

        # Add the textual supports for the edge
        texts = [e.text for e in stmt.evidence if e.text]
        edge_supports = []
        for text in texts:
            text = text.replace('XREF_BIBR', '')
            support_id = self._get_new_id()
            support = {'@id': support_id,
                       'text': text}
            self.cx['supports'].append(support)
            edge_supports.append(support_id)
        if edge_supports:
            edge_support = {'supports': edge_supports,
                            'po': [edge_id]}
            self.cx['edgeSupports'].append(edge_support)

        belief_str = '%.2f' % stmt.belief
        edge_attribute = {'po': edge_id,
                          'n': 'Belief score',
                          'v': belief_str}
        self.cx['edgeAttributes'].append(edge_attribute)

        # NOTE: supports and edgeSupports are currently
        # not shown on NDEx therefore we add text evidence as a generic
        # edgeAttribute
        if texts:
            text = texts[0]
            edge_attribute = {'po': edge_id,
                              'n': 'Text',
                              'v': text}
            self.cx['edgeAttributes'].append(edge_attribute)

        # Add the serialized JSON INDRA Statement
        stmt_dict = stmt.to_json()
        edge_attribute = {'po': edge_id, 'n': 'indra', 'v': stmt_dict}
        self.cx['edgeAttributes'].append(edge_attribute)

        # Add support type
        support_type = _get_support_type(stmt)
        edge_attribute = {'po': edge_id, 'n': 'supportType',
                          'v': support_type}
        self.cx['edgeAttributes'].append(edge_attribute)


def _get_support_type(stmt):
    """Classify the evidence of a Statement by the kind of its sources.

    Returns 'database', 'literature' or 'database and literature', or
    None when no evidence comes from one of the listed sources.
    """
    dbs = ('bel', 'biopax', 'phosphosite', 'biogrid')
    readers = ('reach', 'trips', 'sparser', 'r3')
    has_db = any(ev.source_api in dbs for ev in stmt.evidence)
    has_reading = any(ev.source_api in readers for ev in stmt.evidence)
    if has_db and has_reading:
        return 'database and literature'
    elif has_db:
        return 'database'
    elif has_reading:
        return 'literature'
    return None


def _get_stmt_type(stmt):
    """Return the (edge type, edge polarity) pair for a Statement.

    Statement types without a dedicated entry fall back to their class
    name with polarity 'none'.
    """
    if isinstance(stmt, AddModification):
        edge_type = 'Modification'
        edge_polarity = 'positive'
    elif isinstance(stmt, RemoveModification):
        edge_type = 'Modification'
        edge_polarity = 'negative'
    elif isinstance(stmt, SelfModification):
        edge_type = 'SelfModification'
        edge_polarity = 'positive'
    elif isinstance(stmt, Complex):
        edge_type = 'Complex'
        edge_polarity = 'none'
    elif isinstance(stmt, Activation):
        edge_type = 'Activation'
        edge_polarity = 'positive'
    elif isinstance(stmt, Inhibition):
        edge_type = 'Inhibition'
        edge_polarity = 'negative'
    elif isinstance(stmt, DecreaseAmount):
        edge_type = 'DecreaseAmount'
        edge_polarity = 'negative'
    elif isinstance(stmt, IncreaseAmount):
        edge_type = 'IncreaseAmount'
        edge_polarity = 'positive'
    elif isinstance(stmt, Gef):
        edge_type = 'Gef'
        edge_polarity = 'positive'
    elif isinstance(stmt, Gap):
        edge_type = 'Gap'
        edge_polarity = 'negative'
    else:
        # Use the class name; calling __str__ on the class itself without
        # an instance raises TypeError.
        edge_type = stmt.__class__.__name__
        edge_polarity = 'none'
    return edge_type, edge_polarity


def _get_agent_type(agent):
    """Return a coarse entity type for an agent based on its db_refs.

    The first matching rule wins: HGNC/UP -> 'protein', PF/FA/FPLX ->
    'proteinfamily', CHEBI/PUBCHEM -> 'chemical', GO -> 'bioprocess',
    anything else -> 'other'.
    """
    refs = agent.db_refs
    if refs.get('HGNC') or refs.get('UP'):
        agent_type = 'protein'
    elif refs.get('PF') or refs.get('FA') or refs.get('FPLX'):
        agent_type = 'proteinfamily'
    elif refs.get('CHEBI') or refs.get('PUBCHEM'):
        agent_type = 'chemical'
    elif refs.get('GO'):
        agent_type = 'bioprocess'
    else:
        agent_type = 'other'
    return agent_type