from __future__ import absolute_import, print_function, unicode_literals
from builtins import dict, str
__all__ = ['get_defaults', 'get_primary_db', 'insert_agents',
'insert_db_stmts', 'get_abstracts_by_pmids', 'get_auth_xml_pmcids',
'get_statements_by_gene_role_type', 'get_statements',
'make_stmts_from_db_list']
import os
import json
import logging
from os import path
from indra.databases import hgnc_client
from indra.util.get_version import get_version
from indra.util import unzip_string
from indra.statements import Complex, SelfModification, ActiveForm,\
stmts_from_json
from .database_manager import DatabaseManager, IndraDatabaseError, texttypes
# Module-level logger, registered under the name 'db_util'.
logger = logging.getLogger('db_util')
# Path of the 'defaults.txt' file in the same directory as this module.
DEFAULTS_FILE = path.join(path.dirname(path.abspath(__file__)), 'defaults.txt')
# Cached primary DatabaseManager; stays None until get_primary_db creates one.
__PRIMARY_DB = None
def get_defaults():
    """Get the default database hosts.

    If the file `DEFAULTS_FILE` exists, the hosts are read from it, one
    ``key = value`` entry per line. Otherwise the hosts are taken from the
    environment variables INDRADBPRIMARY and INDRADBTEST, which supply the
    'primary' and 'test' entries respectively.

    Returns
    -------
    defaults_dict : dict
        A dict mapping each available key (e.g. 'primary', 'test') to the
        value given for it.

    Raises
    ------
    IndraDatabaseError
        If the defaults file does not exist and neither of the environment
        variables is set.
    ValueError
        If a non-blank line of the defaults file contains no '='.
    """
    env_key_dict = {'primary': 'INDRADBPRIMARY', 'test': 'INDRADBTEST'}

    # The defaults file takes precedence over the environment.
    if path.exists(DEFAULTS_FILE):
        defaults_dict = {}
        with open(DEFAULTS_FILE, 'r') as f:
            for default_line in f.read().splitlines():
                # Blank lines carry no setting.
                if not default_line.strip():
                    continue
                # Split on the first '=' only, because the value itself may
                # contain '=' (for example in the query part of a URL).
                key, value = default_line.split('=', 1)
                defaults_dict[key.strip()] = value.strip()
        return defaults_dict

    env = os.environ
    defaults_dict = {purpose: env[env_key]
                     for purpose, env_key in env_key_dict.items()
                     if env_key in env}
    if not defaults_dict:
        raise IndraDatabaseError(
            "Cannot find default file or environment vars."
            )
    return defaults_dict
def get_primary_db(force_new=False):
    """Get a DatabaseManager instance for the primary database host.

    The primary database host is the 'primary' entry of the defaults returned
    by `get_defaults`, i.e. it is defined in the defaults.txt file or,
    alternatively, by the INDRADBPRIMARY environment variable. If neither is
    specified, this function will raise an exception.

    Note: by default, calling this function twice will return the same
    `DatabaseManager` instance. In other words:

    >>> db1 = get_primary_db()
    >>> db2 = get_primary_db()
    >>> db1 is db2
    True

    This means also that, for example `db1.select_one(db2.TextRef)` will work,
    in the above context.

    It is still recommended that when creating a script or function, or other
    general application, you should not rely on this feature to get your access
    to the database, as it can make substituting a different database host both
    complicated and messy. Rather, a database instance should be explicitly
    passed between different users, as is done in the call that
    `get_statements_by_gene_role_type` makes to `get_statements`.

    Parameters
    ----------
    force_new : bool
        If true, a new instance will be created and returned, regardless of
        whether there is an existing instance or not. Default is False, so that
        if this function has been called before within the global scope, the
        instance that was first created will be returned.

    Returns
    -------
    primary_db : DatabaseManager instance
        An instance of the database manager that is attached to the primary
        database.

    Raises
    ------
    IndraDatabaseError
        If a new instance is needed and no primary host is available.
    """
    global __PRIMARY_DB

    # A cached instance is returned as-is, so there is no reason to look up
    # (and possibly fail on) the defaults in that case.
    if __PRIMARY_DB is not None and not force_new:
        return __PRIMARY_DB

    defaults = get_defaults()
    if 'primary' not in defaults:
        raise IndraDatabaseError("No primary host available in defaults file.")

    # Only cache the manager once its session was grabbed successfully, so a
    # failure here does not leave a half-initialized instance behind.
    primary_db = DatabaseManager(defaults['primary'], label='primary')
    primary_db.grab_session()
    __PRIMARY_DB = primary_db
    return __PRIMARY_DB
def insert_agents(db, stmts, *other_clauses):
    """Insert the agents associated with the list of statements.

    Parameters
    ----------
    db : DatabaseManager instance
        The manager of the database into which the agents are copied.
    stmts : list of Statements
        The statements whose agents are inserted. Each statement must
        already have a record in the 'statements' table with a matching uuid.
    *other_clauses : sqlalchemy clauses
        Further clauses used, in addition to the uuid match, when selecting
        the statement records.

    Raises
    ------
    IndraDatabaseError
        If an agent with db_refs is found beyond the second position of a
        statement that is not a Complex, SelfModification, or ActiveForm.
    KeyError
        If no statement record was selected for the uuid of a statement.
    """
    # Build a dict mapping stmt UUIDs to statement IDs
    uuid_list = [s.uuid for s in stmts]
    stmt_rec_list = db.select_all('statements',
                                  db.Statements.uuid.in_(uuid_list),
                                  *other_clauses)
    stmt_uuid_dict = dict(db.get_values(stmt_rec_list, ['uuid', 'id']))

    # Now assemble agent records
    positional_roles = ('SUBJECT', 'OBJECT')
    agent_data = []
    for stmt in stmts:
        stmt_id = stmt_uuid_dict[stmt.uuid]
        # For these statement types the position of an agent does not
        # indicate a role; this depends on the statement only, not the agent.
        role_is_other = isinstance(stmt,
                                   (Complex, SelfModification, ActiveForm))
        for ag_ix, ag in enumerate(stmt.agent_list()):
            # If no agent, or no db_refs for the agent, skip the insert
            # that follows.
            if ag is None or ag.db_refs is None:
                continue
            if role_is_other:
                role = 'OTHER'
            elif ag_ix < len(positional_roles):
                role = positional_roles[ag_ix]
            else:
                raise IndraDatabaseError("Unhandled agent role.")
            # One record per (namespace, id) reference of the agent.
            for ns, ag_id in ag.db_refs.items():
                agent_data.append((stmt_id, ns, ag_id, role))

    cols = ('stmt_id', 'db_name', 'db_id', 'role')
    db.copy('agents', agent_data, cols)
def insert_db_stmts(db, stmts, db_ref_id):
    """Insert statements, their database, and any affiliated agents.

    Note that this method is for uploading statements that came from a
    database to our database, not for inserting any statements to the database.

    Parameters
    ----------
    db : DatabaseManager instance
        The manager of the database into which the statements are copied.
    stmts : list of Statements
        The statements to insert.
    db_ref_id : int
        The value stored in the 'db_ref' column of every inserted statement
        record. (Presumably the id of the source database's record; confirm
        against the schema.)
    """
    # The version is the same for every record, so look it up only once.
    indra_version = get_version()

    # Preparing the statements for copying
    cols = ('uuid', 'db_ref', 'type', 'json', 'indra_version')
    stmt_data = [
        (stmt.uuid,
         db_ref_id,
         stmt.__class__.__name__,
         json.dumps(stmt.to_json()).encode('utf8'),
         indra_version)
        for stmt in stmts
        ]
    db.copy('statements', stmt_data, cols)

    # Use this module's insert_agents, restricted to the statements that
    # belong to the given db_ref.
    insert_agents(db, stmts, db.Statements.db_ref == db_ref_id)
def get_abstracts_by_pmids(db, pmid_list, unzip=True):
    """Get abstracts using the pmids in pmid_list.

    Parameters
    ----------
    db : DatabaseManager instance
        The manager of the database to query.
    pmid_list : list
        The pmids of the text refs whose abstracts are looked up.
    unzip : bool
        If True (default), the stored content is converted with `tobytes`
        and passed through `unzip_string`. If False, the content is
        returned as it was stored.

    Returns
    -------
    list of tuples
        One `(pmid, content)` tuple for each abstract found.
    """
    query = db.filter_query(
        [db.TextRef, db.TextContent],
        db.TextContent.text_ref_id == db.TextRef.id,
        db.TextContent.text_type == 'abstract',
        db.TextRef.pmid.in_(pmid_list)
        )
    results = []
    for text_ref, text_content in query.all():
        pmid = text_ref.pmid
        content = text_content.content
        if unzip:
            content = unzip_string(content.tobytes())
        results.append((pmid, content))
    return results
def get_auth_xml_pmcids(db):
    """Get the pmcids of text refs that have fulltext from 'pmc_auth'.

    Parameters
    ----------
    db : DatabaseManager instance
        The manager of the database to query.

    Returns
    -------
    list
        The `pmcid` of the text ref of every text content entry whose text
        type is `texttypes.FULLTEXT` and whose source is 'pmc_auth'.
    """
    tref_list = db.filter_query(
        [db.TextRef, db.TextContent],
        db.TextRef.id == db.TextContent.text_ref_id,
        db.TextContent.text_type == texttypes.FULLTEXT,
        db.TextContent.source == 'pmc_auth'
        )
    # Two tables are queried, so each result is a (TextRef, TextContent)
    # pair (as unpacked in get_abstracts_by_pmids), not a bare TextRef.
    return [tref.pmcid for tref, _ in tref_list]
def get_statements_by_gene_role_type(agent_id=None, agent_ns='HGNC', role=None,
                                     stmt_type=None, count=1000,
                                     do_stmt_count=True, db=None):
    """Get statements from the DB by stmt type, agent, and/or agent role.

    Parameters
    ----------
    agent_id : str
        String representing the identifier of the agent from the given
        namespace. Note: if the agent namespace argument, `agent_ns`, is set
        to 'HGNC', this function will treat `agent_id` as an HGNC gene
        symbol and perform an internal lookup of the corresponding HGNC ID.
    agent_ns : str
        Namespace for the identifier given in `agent_id`.
    role : str
        String corresponding to the role of the agent in the statement.
        Options are 'SUBJECT', 'OBJECT', or 'OTHER' (in the case of `Complex`,
        `SelfModification`, and `ActiveForm` Statements).
    stmt_type : str
        Name of the Statement class.
    count : int
        Number of statements to retrieve in each batch (passed to
        :py:func:`get_statements`).
    do_stmt_count : bool
        Whether or not to perform an initial statement counting step to give
        more meaningful progress messages.
    db : indra.db.DatabaseManager object.
        Optionally specify a database manager that attaches to something
        besides the primary database, for example a local database instance.

    Returns
    -------
    list of Statements from the database corresponding to the query. The list
    is empty if `agent_ns` is 'HGNC' and no HGNC ID is found for `agent_id`.

    Raises
    ------
    ValueError
        If none of `agent_id`, `role`, and `stmt_type` is specified.
    """
    # Check the arguments before anything else, so that a bad call fails
    # without a database having to be available.
    if not (agent_id or role or stmt_type):
        raise ValueError('At least one of agent_id, role, or stmt_type '
                         'must be specified.')

    # Resolve the identifier stored in the database for the agent.
    agent_db_id = agent_id
    if agent_id and agent_ns == 'HGNC':
        agent_db_id = hgnc_client.get_hgnc_id(agent_id)
        if not agent_db_id:
            logger.warning('Invalid gene name: %s', agent_id)
            return []

    if db is None:
        db = get_primary_db()

    clauses = []
    if agent_id:
        clauses.extend([db.Agents.db_name == agent_ns,
                        db.Agents.db_id == agent_db_id])
    if role:
        clauses.append(db.Agents.role == role)
    if agent_id or role:
        # Tie the agent conditions to the statements being selected.
        clauses.append(db.Agents.stmt_id == db.Statements.id)
    if stmt_type:
        clauses.append(db.Statements.type == stmt_type)
    return get_statements(clauses, count=count, do_stmt_count=do_stmt_count,
                          db=db)
def get_statements(clauses, count=1000, do_stmt_count=True, db=None):
    """Select statements according to a given set of clauses.

    The matching records are read from the 'statements' table and converted
    to Statements in batches of `count`, with a progress message logged each
    time `count` more records have been read.

    Parameters
    ----------
    clauses : list
        list of sqlalchemy WHERE clauses to pass to the filter query.
    count : int
        Number of statements to retrieve and process in each batch.
    do_stmt_count : bool
        Whether or not to perform an initial statement counting step to give
        more meaningful progress messages.
    db : indra.db.DatabaseManager object.
        Optionally specify a database manager that attaches to something
        besides the primary database, for example a local database instance.

    Returns
    -------
    list of Statements from the database corresponding to the query.
    """
    if db is None:
        db = get_primary_db()

    query = db.filter_query('statements', *clauses)
    if do_stmt_count:
        logger.info("Counting statements...")
        num_stmts = query.count()
        logger.info("Total of %d statements" % num_stmts)

    collected = []
    batch = []
    for n_seen, db_stmt in enumerate(query.yield_per(count), start=1):
        batch.append(db_stmt)
        # Convert a batch as soon as it is full.
        if len(batch) == count:
            collected.extend(make_stmts_from_db_list(batch))
            batch = []
        if n_seen % count != 0:
            continue
        # Progress report, once per `count` records.
        if do_stmt_count:
            logger.info("%d of %d statements" % (n_seen, num_stmts))
        else:
            logger.info("%d statements" % n_seen)

    # Convert whatever is left over from the last, incomplete batch.
    collected.extend(make_stmts_from_db_list(batch))
    return collected
def make_stmts_from_db_list(db_stmt_objs):
    """Convert statement records from the database into Statements.

    Parameters
    ----------
    db_stmt_objs : iterable
        Statement records, each with a `json` attribute holding the utf8
        encoded JSON of one statement.

    Returns
    -------
    The result of `stmts_from_json` applied to the list of decoded JSON
    objects, given in the order of the records.
    """
    return stmts_from_json([json.loads(record.json.decode('utf8'))
                            for record in db_stmt_objs])