from __future__ import absolute_import, print_function, unicode_literals
from builtins import dict, str
import os
import csv
import sys
import json
import pickle
import logging
from copy import deepcopy
from collections import Counter
from itertools import groupby, chain
from indra.statements import Agent
from indra.databases import uniprot_client, hgnc_client
from indra.util import read_unicode_csv, write_unicode_csv
logger = logging.getLogger('grounding_mapper')
class GroundingMapper(object):
"""Maps grounding of INDRA Agents based on a given grounding map.
Attributes
----------
gm : dict
The grounding map, a dictionary mapping strings (entity names) to
a dictionary of database identifiers.
agent_map : Optional[dict]
A dictionary mapping strings to grounded INDRA Agents with given state.
"""
def __init__(self, gm, agent_map=None):
self.gm = gm
self.agent_map = agent_map if agent_map is not None else {}
def map_agents(self, stmts, do_rename=True):
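        """Return a new list of statements with agent groundings mapped.

        Parameters
        ----------
        stmts : list of indra.statements.Statement
            The statements whose agents should be grounding-mapped.
        do_rename : Optional[bool]
            If True (the default), agents are renamed based on their
            mapped groundings.

        Returns
        -------
        mapped_stmts : list of indra.statements.Statement
            Deep copies of the input statements with db_refs updated.
            Statements whose agent text maps to None in the grounding
            map are filtered out.
        """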
# Make a copy of the stmts
mapped_stmts = []
num_skipped = 0
# Iterate over the statements
for stmt in stmts:
mapped_stmt = deepcopy(stmt)
# Iterate over the agents
skip_stmt = False
mapped_agent_list = mapped_stmt.agent_list()
for idx, agent in enumerate(mapped_agent_list):
                if agent is None:
                    continue
                agent_text = agent.db_refs.get('TEXT')
                if agent_text is None:
                    continue
mapped_to_agent_json = self.agent_map.get(agent_text)
if mapped_to_agent_json:
mapped_to_agent = \
Agent._from_json(mapped_to_agent_json['agent'])
mapped_agent_list[idx] = mapped_to_agent
mapped_stmt.set_agent_list(mapped_agent_list)
# Look this string up in the grounding map
# If not in the map, leave agent alone and continue
try:
map_db_refs = self.gm[agent_text]
except KeyError:
continue
# If it's in the map but it maps to None, then filter out
# this statement by skipping it
if map_db_refs is None:
# Increase counter if this statement has not already
# been skipped via another agent
if not skip_stmt:
num_skipped += 1
logger.debug("Skipping %s" % agent_text)
skip_stmt = True
                # Otherwise, update the agent's db_refs field based on
                # the grounding map entry
                else:
                    gene_name = None
                    map_db_refs = deepcopy(map_db_refs)
up_id = map_db_refs.get('UP')
hgnc_sym = map_db_refs.get('HGNC')
if up_id and not hgnc_sym:
                        gene_name = uniprot_client.get_gene_name(
                            up_id, web_fallback=False)
if gene_name:
hgnc_id = hgnc_client.get_hgnc_id(gene_name)
if hgnc_id:
map_db_refs['HGNC'] = hgnc_id
elif hgnc_sym and not up_id:
# Override the HGNC symbol entry from the grounding
# map with an HGNC ID
hgnc_id = hgnc_client.get_hgnc_id(hgnc_sym)
if hgnc_id:
map_db_refs['HGNC'] = hgnc_id
# Now get the Uniprot ID for the gene
up_id = hgnc_client.get_uniprot_id(hgnc_id)
if up_id:
map_db_refs['UP'] = up_id
# If there's no HGNC ID for this symbol, raise an
# Exception
else:
raise ValueError('No HGNC ID corresponding to gene '
'symbol %s in grounding map.' %
hgnc_sym)
                    # If we have both, check the HGNC symbol against the
                    # gene name obtained from Uniprot
elif up_id and hgnc_sym:
# Get HGNC Symbol from Uniprot
gene_name = uniprot_client.get_gene_name(up_id)
if not gene_name:
raise ValueError('No gene name found for Uniprot '
'ID %s (expected %s)' %
(up_id, hgnc_sym))
# We got gene name, compare it to the HGNC name
else:
if gene_name != hgnc_sym:
raise ValueError('Gene name %s for Uniprot ID '
'%s does not match HGNC '
'symbol %s given in grounding '
'map.' %
(gene_name, up_id, hgnc_sym))
else:
hgnc_id = hgnc_client.get_hgnc_id(hgnc_sym)
if not hgnc_id:
raise ValueError('No HGNC ID '
'corresponding to gene '
'symbol %s in grounding '
'map.' % hgnc_sym)
map_db_refs['HGNC'] = hgnc_id
# Assign the DB refs from the grounding map to the agent
agent.db_refs = map_db_refs
                    # If requested, rename the agent based on its new
                    # grounding
if do_rename:
# If there's a FamPlex ID, prefer that for the name
if agent.db_refs.get('FPLX'):
agent.name = agent.db_refs.get('FPLX')
# Get the HGNC symbol or gene name (retrieved above)
elif hgnc_sym is not None:
agent.name = hgnc_sym
elif gene_name is not None:
agent.name = gene_name
# Check if we should skip the statement
if not skip_stmt:
mapped_stmts.append(mapped_stmt)
        logger.info('%d statements filtered out' % num_skipped)
return mapped_stmts
def rename_agents(self, stmts):
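        """Return a new list of statements with agents renamed.

        The FamPlex ID is preferred as the name; otherwise, the gene
        name from Uniprot is used if available.

        Parameters
        ----------
        stmts : list of indra.statements.Statement
            The statements whose agents should be renamed.

        Returns
        -------
        mapped_stmts : list of indra.statements.Statement
            Deep copies of the input statements with agent names updated.
        """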
# Make a copy of the stmts
mapped_stmts = deepcopy(stmts)
# Iterate over the statements
for stmt_ix, stmt in enumerate(mapped_stmts):
# Iterate over the agents
for agent in stmt.agent_list():
if agent is None:
continue
# If there's a FamPlex ID, prefer that for the name
if agent.db_refs.get('FPLX'):
agent.name = agent.db_refs.get('FPLX')
                # Otherwise, take the gene name from Uniprot next
elif agent.db_refs.get('UP'):
# Try for the gene name
gene_name = uniprot_client.get_gene_name(
agent.db_refs.get('UP'),
web_fallback=False)
if gene_name:
agent.name = gene_name
hgnc_id = hgnc_client.get_hgnc_id(gene_name)
if hgnc_id:
agent.db_refs['HGNC'] = hgnc_id
                # If neither a FamPlex ID nor a Uniprot gene name is
                # available, the agent keeps its existing name
return mapped_stmts
# TODO: handle the cases when there is more than one entry for the same
# key (e.g., ROS, ER)
def load_grounding_map(grounding_map_path, ignore_path=None):
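    """Load a grounding map from a CSV file into a dictionary.

    In each row, the first column is an agent text, followed by
    alternating database name/identifier columns. Texts with no
    grounding entries (including those in the ignore file) are mapped
    to None so that they can be filtered out.

    Parameters
    ----------
    grounding_map_path : str
        Path to the grounding map CSV file.
    ignore_path : Optional[str]
        Path to a CSV file of agent texts that should be ignored.

    Returns
    -------
    g_map : dict
        A dictionary mapping agent texts to db_refs dictionaries, or to
        None for texts to be filtered out.
    """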
g_map = {}
map_rows = read_unicode_csv(grounding_map_path, delimiter=',',
quotechar='"',
quoting=csv.QUOTE_MINIMAL,
lineterminator='\r\n')
if ignore_path and os.path.exists(ignore_path):
ignore_rows = read_unicode_csv(ignore_path, delimiter=',',
quotechar='"',
quoting=csv.QUOTE_MINIMAL,
lineterminator='\r\n')
else:
ignore_rows = []
csv_rows = chain(map_rows, ignore_rows)
for row in csv_rows:
key = row[0]
db_refs = {'TEXT': key}
keys = [entry for entry in row[1::2] if entry != '']
values = [entry for entry in row[2::2] if entry != '']
        if len(keys) != len(values):
            logger.error('Mismatched keys and values in row %s' % str(row))
            continue
        db_refs.update(dict(zip(keys, values)))
        if len(db_refs) > 1:
            g_map[key] = db_refs
        else:
            g_map[key] = None
return g_map
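# A minimal usage sketch for the mapper above (illustrative only; assumes
# `stmts` is a list of INDRA Statements with TEXT entries in their
# agents' db_refs):
#
#     gmap = load_grounding_map(default_grounding_map_path)
#     mapper = GroundingMapper(gmap, agent_map=default_agent_map)
#     mapped_stmts = mapper.map_agents(stmts)
#     renamed_stmts = mapper.rename_agents(mapped_stmts)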
# Some useful functions for analyzing the grounding of sets of statements
# Put together all agent texts along with their grounding
def all_agents(stmts):
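    """Return all Agents from the statements that have a TEXT entry."""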
agents = []
for stmt in stmts:
for agent in stmt.agent_list():
# Agents don't always have a TEXT db_refs entry (for instance
# in the case of Statements from databases) so we check for this.
if agent is not None and agent.db_refs.get('TEXT') is not None:
agents.append(agent)
return agents
def agent_texts(agents):
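    """Return the TEXT db_refs entries of the given agents."""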
return [ag.db_refs.get('TEXT') for ag in agents]
def get_sentences_for_agent(text, stmts, max_sentences=None):
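    """Return (pmid, sentence) pairs for agents matching the given text.

    If max_sentences is given, at most that many pairs are returned.
    """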
sentences = []
for stmt in stmts:
for agent in stmt.agent_list():
if agent is not None and agent.db_refs.get('TEXT') == text:
sentences.append((stmt.evidence[0].pmid,
stmt.evidence[0].text))
if max_sentences is not None and \
len(sentences) >= max_sentences:
return sentences
return sentences
def agent_texts_with_grounding(stmts):
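    """Return agent texts with their groundings, grouped and counted.

    Returns a list of tuples of the form
    (text, ((db, db_id, count), ...), total_count), with the groundings
    of each text sorted by count and the overall list sorted by
    total_count, both in descending order.
    """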
allag = all_agents(stmts)
# Convert PFAM-DEF lists into tuples so that they are hashable and can
# be tabulated with a Counter
for ag in allag:
pfam_def = ag.db_refs.get('PFAM-DEF')
if pfam_def is not None:
ag.db_refs['PFAM-DEF'] = tuple(pfam_def)
refs = [tuple(ag.db_refs.items()) for ag in allag]
refs_counter = Counter(refs)
refs_counter_dict = [(dict(entry[0]), entry[1])
for entry in refs_counter.items()]
# First, sort by text so that we can do a groupby
refs_counter_dict.sort(key=lambda x: x[0].get('TEXT'))
# Then group by text
grouped_by_text = []
for k, g in groupby(refs_counter_dict, key=lambda x: x[0].get('TEXT')):
# Total occurrences of this agent text
total = 0
entry = [k]
db_ref_list = []
for db_refs, count in g:
# Check if TEXT is our only key, indicating no grounding
if list(db_refs.keys()) == ['TEXT']:
db_ref_list.append((None, None, count))
            # Add any other db_refs (not TEXT)
            for db, db_id in db_refs.items():
                if db == 'TEXT':
                    continue
                db_ref_list.append((db, db_id, count))
total += count
# Sort the db_ref_list by the occurrences of each grounding
entry.append(tuple(sorted(db_ref_list, key=lambda x: x[2],
reverse=True)))
# Now add the total frequency to the entry
entry.append(total)
# And add the entry to the overall list
grouped_by_text.append(tuple(entry))
# Sort the list by the total number of occurrences of each unique key
grouped_by_text.sort(key=lambda x: x[2], reverse=True)
return grouped_by_text
# List of all ungrounded entities by number of mentions
def ungrounded_texts(stmts):
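    """Return ungrounded agent texts with their mention counts, sorted
    by count in descending order."""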
ungrounded = [ag.db_refs['TEXT']
for s in stmts
for ag in s.agent_list()
                  if ag is not None and
                      list(ag.db_refs.keys()) == ['TEXT']]
ungroundc = Counter(ungrounded)
ungroundc = ungroundc.items()
ungroundc = sorted(ungroundc, key=lambda x: x[1], reverse=True)
return ungroundc
def get_agents_with_name(name, stmts):
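    """Return all Agents from the statements with the given name."""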
return [ag for stmt in stmts for ag in stmt.agent_list()
if ag is not None and ag.name == name]
def save_base_map(filename, grouped_by_text):
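    """Write the grouped agent texts and groundings to a CSV file."""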
rows = []
for group in grouped_by_text:
text_string = group[0]
        for db, db_id, count in group[1]:
            if db == 'UP':
                name = uniprot_client.get_mnemonic(db_id)
            else:
                name = ''
            row = [text_string, db, db_id, count, name]
rows.append(row)
write_unicode_csv(filename, rows, delimiter=',', quotechar='"',
quoting=csv.QUOTE_MINIMAL, lineterminator='\r\n')
def protein_map_from_twg(twg):
"""Build map of entity texts to validated protein grounding.
Looks at the grounding of the entity texts extracted from the statements
and finds proteins where there is grounding to a human protein that maps to
an HGNC name that is an exact match to the entity text. Returns a dict that
can be used to update/expand the grounding map.
"""
protein_map = {}
unmatched = 0
matched = 0
logger.info('Building grounding map for human proteins')
for agent_text, grounding_list, total_count in twg:
# If 'UP' (Uniprot) not one of the grounding entries for this text,
# then we skip it.
        if 'UP' not in [entry[0] for entry in grounding_list]:
continue
# Otherwise, collect all the Uniprot IDs for this protein.
uniprot_ids = [entry[1] for entry in grounding_list
if entry[0] == 'UP']
# For each Uniprot ID, look up the species
for uniprot_id in uniprot_ids:
# If it's not a human protein, skip it
mnemonic = uniprot_client.get_mnemonic(uniprot_id)
if mnemonic is None or not mnemonic.endswith('_HUMAN'):
continue
# Otherwise, look up the gene name in HGNC and match against the
# agent text
gene_name = uniprot_client.get_gene_name(uniprot_id)
if gene_name is None:
unmatched += 1
continue
if agent_text.upper() == gene_name.upper():
matched += 1
protein_map[agent_text] = {'TEXT': agent_text, 'UP': uniprot_id}
else:
unmatched += 1
logger.info('Exact matches for %d proteins' % matched)
logger.info('No match (or no gene name) for %d proteins' % unmatched)
return protein_map
def save_sentences(twg, stmts, filename, agent_limit=300):
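    """Write evidence sentences for the top unmapped agent texts to CSV.

    For each of the first agent_limit texts in twg, all available
    (text, pmid, sentence) rows are written to the given file.
    """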
sentences = []
unmapped_texts = [t[0] for t in twg]
    logger.info('Getting sentences for top %d unmapped agent texts.' %
                agent_limit)
    for text in unmapped_texts[:agent_limit]:
        agent_sentences = get_sentences_for_agent(text, stmts)
        sentences += [(text,) + tup for tup in agent_sentences]
# Write sentences to CSV file
write_unicode_csv(filename, sentences, delimiter=',', quotechar='"',
quoting=csv.QUOTE_MINIMAL, lineterminator='\r\n')
default_grounding_map_path = \
os.path.join(os.path.dirname(__file__),
'../resources/famplex/grounding_map.csv')
default_ignore_path = \
os.path.join(os.path.dirname(__file__),
'../resources/famplex/ignore.csv')
default_agent_grounding_path = \
os.path.join(os.path.dirname(__file__),
'../resources/grounding_agents.json')
default_grounding_map = \
load_grounding_map(default_grounding_map_path, default_ignore_path)
gm = default_grounding_map
with open(default_agent_grounding_path, 'r') as fh:
default_agent_map = json.load(fh)
if __name__ == '__main__':
if len(sys.argv) != 2:
print("Usage: %s stmt_file" % sys.argv[0])
sys.exit()
statement_file = sys.argv[1]
logger.info("Opening statement file %s" % statement_file)
with open(statement_file, 'rb') as f:
st = pickle.load(f)
stmts = []
for stmt_list in st.values():
stmts += stmt_list
twg = agent_texts_with_grounding(stmts)
save_base_map('%s_twg.csv' % statement_file, twg)
    # Filter out the entries that are already in the grounding map
filtered_twg = [entry for entry in twg
if entry[0] not in default_grounding_map.keys()]
# For proteins that aren't explicitly grounded in the grounding map,
# check for trivial corrections by building the protein map
prot_map = protein_map_from_twg(twg)
filtered_twg = [entry for entry in filtered_twg
if entry[0] not in prot_map.keys()]
save_base_map('%s_unmapped_twg.csv' % statement_file, filtered_twg)
# For each unmapped string, get sentences and write to file
save_sentences(filtered_twg, stmts,
'%s_unmapped_sentences.csv' % statement_file)