"""
Contains possible interactions with the Galaxy Datasets
"""
from bioblend.galaxy.client import Client
import requests
import os
import shlex
import time
import logging
import urlparse
import urllib2
# Module-level logger named after this module; used to report progress while waiting on datasets.
log = logging.getLogger(__name__)
class DatasetClient(Client):
    """
    Client for the ``datasets`` module of a Galaxy instance: showing dataset
    details, downloading dataset content and reading a dataset's
    stdout/stderr.
    """

    def __init__(self, galaxy_instance):
        """
        :param galaxy_instance: object handed to the ``Client`` base class;
            this class later reads ``base_url`` from it (via ``self.gi``).
        """
        # Must be set before the base initializer runs.
        # NOTE(review): presumably ``Client.__init__`` derives ``self.url``
        # from ``self.module`` - confirm against ``bioblend.galaxy.client``.
        self.module = 'datasets'
        super(DatasetClient, self).__init__(galaxy_instance)

    def show_dataset(self, dataset_id, deleted=False, hda_ldda='hda'):
        """
        Display information about and/or content of a dataset. This can be a
        history or a library dataset.

        :type dataset_id: string
        :param dataset_id: Encoded Dataset ID

        :type deleted: boolean
        :param deleted: Forwarded unchanged to ``Client._get``.

        :type hda_ldda: string
        :param hda_ldda: Whether to show a history dataset ('hda' - the default) or library
                         dataset ('ldda').

        :return: Whatever ``Client._get`` returns for this dataset;
                 ``download_dataset`` indexes it like a dict
                 (``'state'``, ``'name'``, ...).
        """
        params = dict(
            hda_ldda=hda_ldda,
        )
        return Client._get(self, id=dataset_id, deleted=deleted, params=params)

    def download_dataset(self, dataset_id, file_path=None, use_default_filename=True,
                         wait_for_completion=False, maxwait=12000):
        """
        Downloads the dataset identified by 'id'.

        :type dataset_id: string
        :param dataset_id: Encoded Dataset ID

        :type file_path: string
        :param file_path: If the file_path argument is provided, the dataset content is written
                          to disk at that path (it is joined with the filename when
                          use_default_filename=True, so it should then be a directory).
                          If the file_path argument is not provided, the dataset content is
                          returned by the method. In both cases the entire content is held in
                          memory, so memory consumption may be heavy.

        :type use_default_filename: boolean
        :param use_default_filename: If True, the exported file will be saved as
                                 file_path/%s, where %s is the filename announced in the
                                 response's content-disposition header or, failing that,
                                 ``<dataset name>.<file extension>``.
                                 If False, file_path is assumed to contain the full file
                                 path including filename.

        :type wait_for_completion: boolean
        :param wait_for_completion: If wait_for_completion is True, this call will block until the dataset is ready.
                                    If the dataset state is not 'ok' afterwards, a DatasetStateException will be thrown.

        :type maxwait: float
        :param maxwait: Time (in seconds) to wait for dataset to complete.
                        If the dataset state is not complete within this time, a DatasetTimeoutException will be thrown.

        :return: If a file_path argument is not provided, returns the raw content of the
                 download response. Otherwise returns nothing.

        :raises DatasetStateException: if the dataset state is not 'ok'.
        :raises DatasetTimeoutException: if wait_for_completion is True and the dataset does
                                         not reach a final state within maxwait seconds.
        """
        if wait_for_completion:
            self._block_until_dataset_ready(dataset_id, maxwait=maxwait)

        dataset = self.show_dataset(dataset_id)
        if dataset['state'] != 'ok':
            raise DatasetStateException("Dataset not ready. Dataset id: %s, current state: %s" % (dataset_id, dataset['state']))

        # Only consult 'data_type' when 'file_ext' is really absent, so a
        # dataset description without 'data_type' does not fail needlessly.
        try:
            file_ext = dataset['file_ext']
        except KeyError:
            file_ext = dataset['data_type']

        # Currently the Datasets REST API does not provide the download URL, so we construct it
        download_url = 'datasets/' + dataset_id + '/display?to_ext=' + file_ext
        url = urlparse.urljoin(self.gi.base_url, download_url)

        # Don't use self.gi.make_get_request as currently the download API does not require a key
        r = requests.get(url)

        if file_path is None:
            return r.content

        if use_default_filename:
            filename = self._filename_from_headers(r.headers)
            if not filename:
                # If the filename was not in the header, build a useable filename ourselves.
                filename = dataset['name'] + '.' + file_ext
            file_local_path = os.path.join(file_path, filename)
        else:
            file_local_path = file_path

        with open(file_local_path, 'wb') as fp:
            fp.write(r.content)

    @staticmethod
    def _filename_from_headers(headers):
        """
        Extract the bare filename from a ``content-disposition`` header.

        :param headers: mapping of response headers.
        :return: the basename of the announced filename, or None when the
                 header is missing, malformed or yields an empty name.
        """
        try:
            # We expect tokens 'filename' '=' to be followed by the quoted filename
            tokens = list(shlex.shlex(headers['content-disposition'], posix=True))
            header_filepath = tokens[tokens.index('filename') + 2]
        except (KeyError, ValueError, IndexError):
            # KeyError: no such header; ValueError: no 'filename' token or
            # unbalanced quoting; IndexError: nothing follows 'filename ='.
            return None
        # Keep only the final path component so the server-supplied name
        # cannot point outside the target directory.
        return os.path.basename(header_filepath) or None

    def _is_dataset_complete(self, dataset_id):
        """
        Return True when the dataset has reached a final state ('ok' or 'error').
        """
        return self.show_dataset(dataset_id)['state'] in ('ok', 'error')

    def _block_until_dataset_ready(self, dataset_id, maxwait=12000, interval=30, raise_on_timeout=True):
        """
        Wait until the dataset state changes to ok or error.
        Based on: https://github.com/salimfadhley/jenkinsapi/blob/master/jenkinsapi/api.py

        :param dataset_id: Encoded Dataset ID
        :param maxwait: Total time (in seconds) to keep polling; must exceed interval.
        :param interval: Time (in seconds) to sleep between two polls; must be positive.
        :param raise_on_timeout: If True, raise DatasetTimeoutException when the dataset is
                                 still not complete after maxwait; if False, return silently.
        """
        assert maxwait > 0
        assert maxwait > interval
        assert interval > 0

        for time_left in xrange(maxwait, 0, -interval):
            if self._is_dataset_complete(dataset_id):
                return
            log.warning("Waiting for dataset %s to complete. Will wait another %is", dataset_id, time_left)
            time.sleep(interval)

        # The dataset may have completed during the final sleep; check once
        # more before declaring a timeout.
        if self._is_dataset_complete(dataset_id):
            return
        if raise_on_timeout:
            raise DatasetTimeoutException("Waited too long for dataset to complete: %s" % dataset_id)

    def _read_dataset_stream(self, dataset_id, stream_name):
        """
        Fetch ``<base>/datasets/<dataset_id>/<stream_name>`` and return its body.

        ``<base>`` is ``self.url`` with its last ``len("/api/datasets")``
        characters removed.
        NOTE(review): this assumes ``self.url`` ends with "/api/datasets" -
        confirm against ``Client``.
        """
        base_url = self.url[:-len("/api/datasets")]
        res = urllib2.urlopen(base_url + "/datasets/" + dataset_id + "/" + stream_name)
        try:
            return res.read()
        finally:
            # Release the connection even if reading fails.
            res.close()

    def show_stderr(self, dataset_id):
        """
        Display stderr output of a dataset.

        :type dataset_id: string
        :param dataset_id: Encoded Dataset ID
        :return: the body of the dataset's ``stderr`` page.
        """
        return self._read_dataset_stream(dataset_id, "stderr")

    def show_stdout(self, dataset_id):
        """
        Display stdout output of a dataset.

        :type dataset_id: string
        :param dataset_id: Encoded Dataset ID
        :return: the body of the dataset's ``stdout`` page.
        """
        return self._read_dataset_stream(dataset_id, "stdout")
class DatasetStateException(Exception):
    """
    Raised by ``DatasetClient.download_dataset`` when the dataset to
    download is not in the 'ok' state.

    :param value: the error description; kept on the ``value`` attribute.
    """

    def __init__(self, value):
        # Hand the value to Exception as well, so ``args`` is populated.
        super(DatasetStateException, self).__init__(value)
        self.value = value

    def __str__(self):
        return repr(self.value)
class DatasetTimeoutException(Exception):
    """
    Raised by ``DatasetClient._block_until_dataset_ready`` when the dataset
    has not reached a final state within the allowed waiting time.

    :param value: the error description; kept on the ``value`` attribute.
    """

    def __init__(self, value):
        # Hand the value to Exception as well, so ``args`` is populated.
        super(DatasetTimeoutException, self).__init__(value)
        self.value = value

    def __str__(self):
        return repr(self.value)