import re
import logging
from copy import copy
import pybel.constants as pc
from pybel.struct import has_protein_modification
from pybel.canonicalize import edge_to_bel
from indra.statements import *
from indra.sources.bel.belrdf_processor import bel_to_indra, chebi_name_id
from indra.databases import hgnc_client, uniprot_client
from indra.assemblers.pybel_assembler import _pybel_indra_act_map
# Module-level logger used by the processor and all helper functions below.
logger = logging.getLogger('pybel_processor')
# Maps protein modification names from the BEL default namespace (keys) to
# INDRA modification type strings (values). It is consulted by
# _get_all_pmods; names missing from this map are logged and skipped there.
_pybel_indra_pmod_map = {
'Ph': 'phosphorylation',
'Hy': 'hydroxylation',
'Sumo': 'sumoylation',
'Ac': 'acetylation',
'Glyco': 'glycosylation',
'ADPRib': 'ribosylation',
'Ub': 'ubiquitination',
'Farn': 'farnesylation',
'Gerger': 'geranylgeranylation',
'Palm': 'palmitoylation',
'Myr': 'myristoylation',
'Me': 'methylation',
}
#: A mapping from the BEL text location annotation values to the INDRA
#: section types listed at :py:data:`indra.reach.processor._section_list`;
#: see https://arty.scai.fraunhofer.de/artifactory/bel/annotation/text-location/text-location-1.0.0.belanno
#: The "Review" location is mapped to None. The lookup is done in
#: _get_evidence to fill the ``section_type`` epistemics entry.
_pybel_text_location_map = {
"Abstract": 'abstract',
"Results": 'results',
"Legend": 'figure',
"Review": None,
'Introduction': 'introduction',
'Methods': 'methods',
'Discussion': 'discussion',
'Conclusion': 'conclusion'
}
class PybelProcessor(object):
    """Extract INDRA Statements from a PyBEL Graph.

    Currently does not handle non-causal relationships (positiveCorrelation,
    negativeCorrelation, hasVariant, etc.)

    Parameters
    ----------
    graph : pybel.BELGraph
        PyBEL graph containing the BEL content.

    Attributes
    ----------
    statements : list[indra.statements.Statement]
        A list of extracted INDRA Statements representing BEL Statements.
    unhandled : list[tuple]
        ``(u_data, v_data, edge_data)`` triples for edges that could not be
        turned into a Statement.
    """
    def __init__(self, graph):
        self.graph = graph
        self.statements = []
        self.unhandled = []

    # FIXME: Handle reactions
    def get_statements(self):
        """Iterate over the graph's edges and extract INDRA Statements.

        Results are appended to ``self.statements``; edges that cannot be
        interpreted are appended to ``self.unhandled``. Nothing is returned.
        """
        for u, v, d in self.graph.edges_iter(data=True):
            u_data = self.graph.node[u]
            v_data = self.graph.node[v]
            # We only interpret causal relations, not correlations
            if d[pc.RELATION] not in pc.CAUSAL_RELATIONS:
                self.unhandled.append((u_data, v_data, d))
                continue
            # If the left or right-hand sides involve complex abundances,
            # add them as statements
            for node_ix, node_data in enumerate((u_data, v_data)):
                if node_data[pc.FUNCTION] == pc.COMPLEX:
                    self._get_complex(u_data, v_data, d, node_ix)
            subj_activity = _get_activity_condition(d.get(pc.SUBJECT))
            obj_activity = _get_activity_condition(d.get(pc.OBJECT))
            obj_to_loc = _get_translocation_target(d.get(pc.OBJECT))
            # If the object is a translocation, this represents a controlled
            # translocation, which we currently do not represent
            if obj_to_loc:
                self.unhandled.append((u_data, v_data, d))
                logger.info("Controlled translocations are currently not "
                            "handled: %s)" % edge_to_bel(u_data, v_data, d))
                continue
            # Modification, e.g.
            #   x(Foo) -> p(Bar, pmod(Ph))
            #   act(x(Foo)) -> p(Bar, pmod(Ph))
            if v_data[pc.FUNCTION] == pc.PROTEIN and \
               has_protein_modification(self.graph, v):
                if obj_activity:
                    logger.info("Ignoring object activity modifier in "
                                "modification statement: %s, %s, %s" %
                                (u_data, v_data, d))
                else:
                    self._get_modification(u_data, v_data, d)
            elif obj_activity:
                # If the agents on the left and right hand sides are the same,
                # then get an active form:
                # ActiveForm
                #   p(Foo, {variants}) ->/-| act(p(Foo))
                # Also Composite active forms:
                #   compositeAbundance(p(Foo, pmod('Ph', 'T')),
                #                      p(Foo, pmod('Ph', 'Y'))) ->/-|
                #                            act(p(Foo))
                if not subj_activity and _proteins_match(u_data, v_data):
                    self._get_active_form(u_data, v_data, d)
                # Gef
                #   act(p(Foo)) => gtp(p(Foo))
                # Gap
                #   act(p(Foo)) =| gtp(p(Foo))
                elif subj_activity and _rel_is_direct(d) and \
                        obj_activity.activity_type == 'gtpbound':
                    self._get_gef_gap(u_data, v_data, d)
                # Activation/Inhibition
                #   x(Foo) -> act(x(Foo))
                #   act(x(Foo)) -> act(x(Foo))
                # GtpActivation
                #   gtp(p(Foo)) => act(p(Foo))
                else:
                    self._get_regulate_activity(u_data, v_data, d)
            # Activations involving biological processes or pathologies
            #   x(Foo) -> bp(Bar)
            elif v_data[pc.FUNCTION] in (pc.BIOPROCESS, pc.PATHOLOGY):
                self._get_regulate_activity(u_data, v_data, d)
            # Regulate amount
            #   x(Foo) -> p(Bar)
            #   x(Foo) -> r(Bar)
            #   act(x(Foo)) -> p(Bar):
            #   x(Foo) -> deg(p(Bar))
            #   act(x(Foo)) ->/-| deg(p(Bar))
            elif v_data[pc.FUNCTION] in (pc.PROTEIN, pc.RNA, pc.ABUNDANCE,
                                         pc.COMPLEX, pc.MIRNA) and \
                    not obj_activity:
                self._get_regulate_amount(u_data, v_data, d)
            # Controlled conversions
            #   x(Foo) -> rxn(reactants(r1,...,rn), products(p1,...pn))
            #   act(x(Foo)) -> rxn(reactants(r1,...,rn), products(p1,...pn))
            # Note that we can't really handle statements where the relation
            # is decreases, as inhibition of a reaction does not match the
            # semantics of a controlled conversion
            elif v_data[pc.FUNCTION] == pc.REACTION and \
                    d[pc.RELATION] in pc.CAUSAL_INCREASE_RELATIONS:
                self._get_conversion(u_data, v_data, d)
            # UNHANDLED
            # rxn(reactants(r1,...,rn), products(p1,...pn))
            # Complex(a,b)
            # p(A, pmod('ph')) -> Complex(A, B)
            # Complex(A-Ph, B)
            # Complexes
            #   complex(x(Foo), x(Bar), ...)
            else:
                self.unhandled.append((u_data, v_data, d))

    def _get_complex(self, u_data, v_data, edge_data, node_ix):
        """Add a Complex statement for the complex node on one side of an edge.

        ``node_ix`` selects the subject (0) or object (1) node of the edge.
        Nothing is added if no agent can be built for the complex.
        """
        # Get an agent with bound conditions from the Complex
        assert node_ix in (0, 1)
        node_data = [u_data, v_data][node_ix]
        cplx_agent = _get_agent(node_data, None)
        if cplx_agent is None:
            return
        # Flatten the main agent and its bound partners into a member list
        agents = [bc.agent for bc in cplx_agent.bound_conditions]
        cplx_agent.bound_conditions = []
        agents.append(cplx_agent)
        ev = _get_evidence(u_data, v_data, edge_data)
        stmt = Complex(agents, evidence=[ev])
        self.statements.append(stmt)

    def _get_regulate_amount(self, u_data, v_data, edge_data):
        """Add an IncreaseAmount or DecreaseAmount statement for an edge."""
        subj_agent = _get_agent(u_data, edge_data.get(pc.SUBJECT))
        obj_agent = _get_agent(v_data, edge_data.get(pc.OBJECT))
        if subj_agent is None or obj_agent is None:
            self.unhandled.append((u_data, v_data, edge_data))
            return
        obj_mod = edge_data.get(pc.OBJECT)
        deg_polarity = (-1 if obj_mod and
                        obj_mod[pc.MODIFIER] == pc.DEGRADATION else 1)
        rel_polarity = (1 if edge_data[pc.RELATION] in
                        pc.CAUSAL_INCREASE_RELATIONS else -1)
        # Set polarity accordingly based on the relation type and whether
        # the object is a degradation node
        if deg_polarity * rel_polarity > 0:
            stmt_class = IncreaseAmount
        else:
            stmt_class = DecreaseAmount
        ev = _get_evidence(u_data, v_data, edge_data)
        stmt = stmt_class(subj_agent, obj_agent, evidence=[ev])
        self.statements.append(stmt)

    def _get_modification(self, u_data, v_data, edge_data):
        """Add one modification statement per handled pmod on the object."""
        subj_agent = _get_agent(u_data, edge_data.get(pc.SUBJECT))
        # Only the modification conditions are needed here; mutations on the
        # object are picked up by _get_agent below.
        mods, _ = _get_all_pmods(v_data)
        # The pmods describe the effect of the statement, so they are removed
        # from the node before building the object agent
        v_data_no_mods = _remove_pmods(v_data)
        obj_agent = _get_agent(v_data_no_mods, edge_data.get(pc.OBJECT))
        if subj_agent is None or obj_agent is None:
            self.unhandled.append((u_data, v_data, edge_data))
            return
        for mod in mods:
            modclass = modtype_to_modclass[mod.mod_type]
            ev = _get_evidence(u_data, v_data, edge_data)
            stmt = modclass(subj_agent, obj_agent, mod.residue, mod.position,
                            evidence=[ev])
            self.statements.append(stmt)

    def _get_regulate_activity(self, u_data, v_data, edge_data):
        """Add an Activation, Inhibition or GtpActivation statement."""
        # Subject info
        subj_agent = _get_agent(u_data, edge_data.get(pc.SUBJECT))
        subj_activity = _get_activity_condition(edge_data.get(pc.SUBJECT))
        subj_function = u_data.get(pc.FUNCTION)
        # Object info
        # Note: Don't pass the object modifier data because we don't want to
        # put an activity on the agent
        obj_agent = _get_agent(v_data, None)
        obj_function = v_data.get(pc.FUNCTION)
        # If it's a bioprocess object, we won't have an activity in the edge
        if obj_function in (pc.BIOPROCESS, pc.PATHOLOGY):
            activity_type = 'activity'
        else:
            obj_activity_condition = \
                _get_activity_condition(edge_data.get(pc.OBJECT))
            activity_type = obj_activity_condition.activity_type
            assert obj_activity_condition.is_active is True
        # Check for valid subject/object
        if subj_agent is None or obj_agent is None:
            self.unhandled.append((u_data, v_data, edge_data))
            return
        # Check which kind of statement we need to make
        # GtpActivation
        if subj_activity and subj_activity.activity_type == 'gtpbound' and \
           subj_function == pc.PROTEIN and obj_function == pc.PROTEIN and \
           edge_data[pc.RELATION] == pc.DIRECTLY_INCREASES:
            stmt_class = GtpActivation
        elif edge_data[pc.RELATION] in pc.CAUSAL_INCREASE_RELATIONS:
            stmt_class = Activation
        else:
            stmt_class = Inhibition
        ev = _get_evidence(u_data, v_data, edge_data)
        stmt = stmt_class(subj_agent, obj_agent, activity_type, evidence=[ev])
        self.statements.append(stmt)

    def _get_active_form(self, u_data, v_data, edge_data):
        """Add an ActiveForm statement for an agent state affecting itself."""
        subj_agent = _get_agent(u_data, edge_data.get(pc.SUBJECT))
        # Don't pass the object modifier info because we don't want an activity
        # condition applied to the agent
        obj_agent = _get_agent(v_data)
        if subj_agent is None or obj_agent is None:
            self.unhandled.append((u_data, v_data, edge_data))
            return
        obj_activity_condition = \
            _get_activity_condition(edge_data.get(pc.OBJECT))
        activity_type = obj_activity_condition.activity_type
        # If the relation is DECREASES, this means that this agent state
        # is inactivating
        is_active = edge_data[pc.RELATION] in pc.CAUSAL_INCREASE_RELATIONS
        ev = _get_evidence(u_data, v_data, edge_data)
        stmt = ActiveForm(subj_agent, activity_type, is_active, evidence=[ev])
        self.statements.append(stmt)

    def _get_gef_gap(self, u_data, v_data, edge_data):
        """Add a Gef (increasing relation) or Gap (otherwise) statement."""
        subj_agent = _get_agent(u_data, edge_data.get(pc.SUBJECT))
        obj_agent = _get_agent(v_data)
        if subj_agent is None or obj_agent is None:
            self.unhandled.append((u_data, v_data, edge_data))
            return
        ev = _get_evidence(u_data, v_data, edge_data)
        if edge_data[pc.RELATION] in pc.CAUSAL_INCREASE_RELATIONS:
            stmt_class = Gef
        else:
            stmt_class = Gap
        stmt = stmt_class(subj_agent, obj_agent, evidence=[ev])
        self.statements.append(stmt)

    def _get_conversion(self, u_data, v_data, edge_data):
        """Add a Conversion statement for a controlled reaction edge."""
        subj_agent = _get_agent(u_data, edge_data.get(pc.SUBJECT))
        # Get the nodes for the reactants and products
        reactant_agents = [_get_agent(r) for r in v_data[pc.REACTANTS]]
        product_agents = [_get_agent(p) for p in v_data[pc.PRODUCTS]]
        if subj_agent is None or \
           any(r is None for r in reactant_agents) or \
           any(p is None for p in product_agents):
            self.unhandled.append((u_data, v_data, edge_data))
            return
        ev = _get_evidence(u_data, v_data, edge_data)
        # Pass the evidence as a list, like every other statement built here
        stmt = Conversion(subj_agent, obj_from=reactant_agents,
                          obj_to=product_agents, evidence=[ev])
        self.statements.append(stmt)
def _get_agent(node_data, node_modifier_data=None):
    """Build an INDRA Agent from PyBEL node data.

    Parameters
    ----------
    node_data : dict
        PyBEL node data dictionary.
    node_modifier_data : Optional[dict]
        Edge-level modifier data for this node (the subject or object entry
        of the edge data). Used for the activity condition, the location and
        the unhandled-modifier check.

    Returns
    -------
    indra.statements.Agent or None
        The Agent, or None if the node type, namespace or modifier is not
        handled or no identifier information could be obtained.

    Raises
    ------
    ValueError
        If the node carries an identifier in the MGI or RGD namespaces.
    """
    # FIXME: Handle translocations on the agent for ActiveForms, turn into
    # location conditions
    # Check the node type/function
    node_func = node_data[pc.FUNCTION]
    if node_func not in (pc.PROTEIN, pc.RNA, pc.BIOPROCESS, pc.COMPLEX,
                         pc.PATHOLOGY, pc.ABUNDANCE, pc.MIRNA):
        mod_data = ('No node data' if not node_modifier_data
                    else node_modifier_data.get(pc.CNAME))
        logger.info("Nodes of type %s not handled: %s" %
                    (node_func, mod_data))
        return None
    # Skip gene/protein fusions
    if pc.FUSION in node_data:
        logger.info("Gene and protein fusions not handled: %s" % str(node_data))
        return None
    # COMPLEXES ------------
    # First, handle complexes, which will consist recursively of other agents
    if node_func == pc.COMPLEX:
        # First, check for members: if there are no members, we assume this
        # is a named complex
        members = node_data.get(pc.MEMBERS)
        if members is None:
            return None
        # Otherwise, get the "main" agent, to which the other members will be
        # attached as bound conditions
        main_agent = _get_agent(members[0])
        # If we can't get the main agent, return None
        if main_agent is None:
            return None
        bound_conditions = [BoundCondition(_get_agent(m), True)
                            for m in members[1:]]
        # Check the bound_conditions for any None agents
        if any([bc.agent is None for bc in bound_conditions]):
            return None
        main_agent.bound_conditions = bound_conditions
        # Get activity of main agent
        ac = _get_activity_condition(node_modifier_data)
        main_agent.activity = ac
        return main_agent
    # OTHER NODE TYPES -----
    # Get node identifier information
    name = node_data.get(pc.NAME)
    ns = node_data[pc.NAMESPACE]
    ident = node_data.get(pc.IDENTIFIER)
    # No ID present, get identifier using the name, namespace
    db_refs = None
    if not ident:
        assert name, "Node must have a name if lacking an identifier."
        if ns == 'HGNC':
            hgnc_id = hgnc_client.get_hgnc_id(name)
            if not hgnc_id:
                logger.info("Invalid HGNC name: %s (%s)" % (name, node_data))
                return None
            db_refs = {'HGNC': hgnc_id}
            up_id = _get_up_id(hgnc_id)
            if up_id:
                db_refs['UP'] = up_id
        # FIXME: Look up go ID in ontology lookup service
        # FIXME: Look up MESH IDs from name
        # FIXME: For now, just use node name
        elif ns in ('GOBP', 'MESHPP', 'MESHD'):
            db_refs = {}
        # For now, handle MGI/RGD but putting the name into the db_refs so
        # it's clear what namespace the name belongs to
        # FIXME: Full implementation would look up MGI/RGD identifiers from
        # the names, and obtain corresponding Uniprot IDs
        elif ns in ('MGI', 'RGD'):
            db_refs = {ns: name}
        # Map Selventa families to FamPlexes
        elif ns == 'SFAM':
            db_refs = {'SFAM': name}
            indra_name = bel_to_indra.get(name)
            if indra_name is None:
                logger.info('Could not find mapping for BEL/SFAM family: '
                            '%s (%s)' % (name, node_data))
            else:
                db_refs['FPLX'] = indra_name
                name = indra_name
        # Map Entrez genes to HGNC/UP
        elif ns == 'EGID':
            hgnc_id = hgnc_client.get_hgnc_from_entrez(name)
            db_refs = {'EGID': name}
            if hgnc_id is not None:
                db_refs['HGNC'] = hgnc_id
                name = hgnc_client.get_hgnc_name(hgnc_id)
                up_id = hgnc_client.get_uniprot_id(hgnc_id)
                if up_id:
                    db_refs['UP'] = up_id
                else:
                    logger.info('HGNC entity %s with HGNC ID %s has no '
                                'corresponding Uniprot ID.' %
                                (name, hgnc_id))
            else:
                logger.info('Could not map EGID%s to HGNC.' % name)
                name = 'E%s' % name
        # CHEBI
        elif ns == 'CHEBI':
            chebi_id = chebi_name_id.get(name)
            if chebi_id:
                db_refs = {'CHEBI': chebi_id}
            else:
                logger.info('CHEBI name %s not found in map.' % name)
        # SDIS, SCHEM: Include the name as the ID for the namespace
        elif ns in ('SDIS', 'SCHEM'):
            db_refs = {ns: name}
        else:
            # Reported through the module logger, like every other skipped
            # node; db_refs stays None so the node is rejected below
            logger.info("Unhandled namespace: %s: %s (%s)" %
                        (ns, name, node_data))
    # We've already got an identifier, look up other identifiers if necessary
    else:
        # Get the name, overwriting existing name if necessary
        if ns == 'HGNC':
            name = hgnc_client.get_hgnc_name(ident)
            db_refs = {'HGNC': ident}
            up_id = _get_up_id(ident)
            if up_id:
                db_refs['UP'] = up_id
        elif ns == 'UP':
            db_refs = {'UP': ident}
            name = uniprot_client.get_gene_name(ident)
            assert name
            if uniprot_client.is_human(ident):
                hgnc_id = hgnc_client.get_hgnc_id(name)
                if not hgnc_id:
                    logger.info('Uniprot ID linked to invalid human gene '
                                'name %s' % name)
                else:
                    db_refs['HGNC'] = hgnc_id
        elif ns in ('MGI', 'RGD'):
            raise ValueError('Identifiers for MGI and RGD databases are not '
                             'currently handled: %s' % node_data)
        else:
            logger.info("Unhandled namespace with identifier: %s: %s (%s)" %
                        (ns, name, node_data))
    if db_refs is None:
        logger.info('Unable to get identifier information for node: %s'
                    % node_data)
        return None
    # Get modification conditions
    mods, muts = _get_all_pmods(node_data)
    # Get activity condition
    ac = _get_activity_condition(node_modifier_data)
    to_loc = _get_translocation_target(node_modifier_data)
    # Check for unhandled node modifiers, skip if so
    if _has_unhandled_modifiers(node_modifier_data):
        return None
    # Make the agent
    ag = Agent(name, db_refs=db_refs, mods=mods, mutations=muts, activity=ac,
               location=to_loc)
    return ag
def _get_evidence(u_data, v_data, edge_data):
    """Build an INDRA Evidence object from a PyBEL edge.

    Parameters
    ----------
    u_data : dict
        Node data of the edge's source node.
    v_data : dict
        Node data of the edge's target node.
    edge_data : dict
        PyBEL edge data; it is read but not modified.

    Returns
    -------
    indra.statements.Evidence
        Evidence with source_api 'bel', carrying the edge's evidence text,
        citation, annotations (plus a 'bel' entry holding the edge as BEL)
        and epistemics ('direct' and, when available, 'section_type').
    """
    ev_text = edge_data.get(pc.EVIDENCE)
    ev_citation = edge_data.get(pc.CITATION)
    ev_pmid = None
    if ev_citation:
        cit_type = ev_citation[pc.CITATION_TYPE]
        cit_ref = ev_citation[pc.CITATION_REFERENCE]
        if cit_type == pc.CITATION_TYPE_PUBMED:
            ev_pmid = cit_ref
        else:
            # Non-PubMed citations are kept, prefixed with their type
            ev_pmid = '%s: %s' % (cit_type, cit_ref)
    epistemics = {'direct': _rel_is_direct(edge_data)}
    # Work on a copy: the annotations dict belongs to the graph edge, and the
    # same edge can yield several Evidence objects (e.g. a Complex statement
    # plus the main statement). Adding 'bel' and popping 'TextLocation' in
    # place would alter the graph and drop section_type from later Evidence.
    annotations = dict(edge_data.get(pc.ANNOTATIONS) or {})
    annotations['bel'] = edge_to_bel(u_data, v_data, edge_data)
    text_location = annotations.pop('TextLocation', None)
    if text_location:
        # Handle dictionary text_location like {'Abstract': True}
        if isinstance(text_location, dict):
            # FIXME: INDRA's section_type entry is meant to contain
            # a single section string like "abstract" but in principle
            # pybel could have a list of entries in the TextLocation dict.
            # Here we just take the first one.
            text_location = list(text_location.keys())[0]
        epistemics['section_type'] = \
            _pybel_text_location_map.get(text_location)
    ev = Evidence(text=ev_text, pmid=ev_pmid, source_api='bel',
                  source_id=edge_data.get(pc.HASH), epistemics=epistemics,
                  annotations=annotations)
    return ev
def _rel_is_direct(d):
    """Return True if the edge relation is directlyIncreases/Decreases."""
    relation = d[pc.RELATION]
    direct_relations = (pc.DIRECTLY_INCREASES, pc.DIRECTLY_DECREASES)
    return relation in direct_relations
def _get_up_id(hgnc_id):
    """Look up the Uniprot ID for an HGNC ID, logging when there is none.

    The (possibly falsy) lookup result is returned unchanged.
    """
    uniprot_id = hgnc_client.get_uniprot_id(hgnc_id)
    if uniprot_id:
        return uniprot_id
    logger.info("No Uniprot ID for HGNC ID %s" % hgnc_id)
    return uniprot_id
def _remove_pmods(node_data):
    """Return a shallow copy of the node data without its pmod variants.

    The input dictionary is left untouched; variants of other kinds are kept
    in their original order.
    """
    stripped = copy(node_data)
    all_variants = node_data.get(pc.VARIANTS)
    if not all_variants:
        # Nothing to filter: hand back the plain copy
        return stripped
    kept = []
    for variant in all_variants:
        if variant[pc.KIND] != pc.PMOD:
            kept.append(variant)
    stripped[pc.VARIANTS] = kept
    return stripped
def _get_all_pmods(node_data, remove_pmods=False):
    """Collect modification and mutation conditions from a node's variants.

    Parameters
    ----------
    node_data : dict
        PyBEL node data dictionary.
    remove_pmods : bool
        Not used by this function.

    Returns
    -------
    tuple
        ``(mods, muts)``: a list of ModCondition objects built from pmod
        variants in the BEL default namespace, and a list of MutCondition
        objects built from parseable HGVS variants.
    """
    mods, muts = [], []
    for var in node_data.get(pc.VARIANTS) or ():
        kind = var[pc.KIND]
        if kind == pc.HGVS:
            hgvs_str = var[pc.IDENTIFIER]
            parsed = _parse_mutation(hgvs_str)
            position, res_from, res_to = parsed
            if all(part is None for part in (position, res_from, res_to)):
                logger.info("Could not parse HGVS string %s" % hgvs_str)
                continue
            muts.append(MutCondition(position, res_from, res_to))
        elif kind == pc.PMOD:
            pmod_ident = var[pc.IDENTIFIER]
            # Only modifications from the BEL default namespace are mapped
            if pmod_ident[pc.NAMESPACE] != pc.BEL_DEFAULT_NAMESPACE:
                continue
            pmod_name = pmod_ident[pc.NAME]
            mod_type = _pybel_indra_pmod_map.get(pmod_name)
            if mod_type is None:
                logger.info("Unhandled modification type %s (%s)" %
                            (pmod_name, node_data))
                continue
            mods.append(ModCondition(mod_type, var.get(pc.PMOD_CODE),
                                     var.get(pc.PMOD_POSITION)))
        # FIXME These unhandled mod types should result in throwing out
        # the node (raise, or return None)
        elif kind == pc.GMOD:
            logger.debug('Unhandled node variant GMOD: %s' % node_data)
        elif kind == pc.FRAGMENT:
            logger.debug('Unhandled node variant FRAG: %s' % node_data)
        else:
            logger.debug('Unknown node variant type: %s' % node_data)
    return (mods, muts)
def _get_activity_condition(node_modifier_data):
    """Return an ActivityCondition for an activity modifier, else None.

    None is returned when there is no modifier data or the modifier is not
    an activity. Otherwise the condition is always active; its type is the
    INDRA activity mapped from the BEL default-namespace effect name, or the
    generic 'activity' when no effect is given, the namespace is not the BEL
    default one, or the name has no INDRA counterpart.
    """
    if node_modifier_data is None or node_modifier_data == {}:
        return None
    if node_modifier_data[pc.MODIFIER] != pc.ACTIVITY:
        return None
    # Start from the generic activity and refine it only if a mapping exists
    activity_type = 'activity'
    effect = node_modifier_data.get(pc.EFFECT)
    if effect and effect[pc.NAMESPACE] == pc.BEL_DEFAULT_NAMESPACE:
        mapped_type = _pybel_indra_act_map.get(effect[pc.NAME])
        if mapped_type is not None:
            activity_type = mapped_type
    return ActivityCondition(activity_type, True)
def _get_translocation_target(node_modifier_data):
    """Return the validated target location of a translocation modifier.

    None is returned when there is no modifier data, the modifier is not a
    translocation, no target location is given, the target is not a named
    GOCC location, or the name is rejected by get_valid_location.
    """
    # A translocation modifier has to be present in the first place
    if node_modifier_data is None or node_modifier_data == {}:
        return None
    if node_modifier_data[pc.MODIFIER] != pc.TRANSLOCATION:
        return None
    # The effect must describe where the entity is translocated to
    target = node_modifier_data[pc.EFFECT].get(pc.TO_LOC)
    if not target:
        return None
    target_ns = target.get(pc.NAMESPACE)
    target_name = target.get(pc.NAME)
    # Only use GO Cellular Component location names
    if target_ns == 'GOCC' and target_name:
        try:
            return get_valid_location(target_name)
        except InvalidLocationError:
            return None
    return None
def _has_unhandled_modifiers(node_modifier_data):
    """Return True if the node modifier is one that is not handled.

    Cell secretion and cell surface expression modifiers are reported as
    unhandled (and logged); missing modifier data or any other modifier
    yields False.
    """
    # No modifier data means there is nothing that could be unhandled
    if node_modifier_data is None or node_modifier_data == {}:
        return False
    mod = node_modifier_data.get(pc.MODIFIER)
    if mod is None:
        return False
    if mod in (pc.CELL_SECRETION, pc.CELL_SURFACE_EXPRESSION):
        logger.info("Unhandled node modifier data: %s" % node_modifier_data)
        return True
    # Always return a bool rather than falling off the end with None
    return False
def _proteins_match(u_data, v_data):
    """Return True if both nodes are proteins with equal namespace and name.

    Nodes lacking a namespace or a name never match.
    """
    if u_data[pc.FUNCTION] != pc.PROTEIN:
        return False
    if v_data[pc.FUNCTION] != pc.PROTEIN:
        return False
    identity_keys = (pc.NAMESPACE, pc.NAME)
    # Both nodes must carry every identifying key before values are compared
    if not all(key in u_data and key in v_data for key in identity_keys):
        return False
    return all(u_data[key] == v_data[key] for key in identity_keys)
_hgvs_protein_mutation = re.compile('^p.([A-Z][a-z]{2})(\d+)([A-Z][a-z]{2})')
def _parse_mutation(s):
m = _hgvs_protein_mutation.match(s)
if not m:
return (None, None, None)
from_aa, position, to_aa = m.groups()
return position, from_aa, to_aa