Source code for dxpy.bindings.dxfile

# Copyright (C) 2013-2016 DNAnexus, Inc.
#
# This file is part of dx-toolkit (DNAnexus platform client libraries).
#
#   Licensed under the Apache License, Version 2.0 (the "License"); you may not
#   use this file except in compliance with the License. You may obtain a copy
#   of the License at
#
#       http://www.apache.org/licenses/LICENSE-2.0
#
#   Unless required by applicable law or agreed to in writing, software
#   distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
#   WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
#   License for the specific language governing permissions and limitations
#   under the License.

'''
DXFile Handler
**************

This remote file handler is a Python file-like object.
'''
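
# Example usage (editor's illustrative sketch, not part of the original
# module). DXFile behaves like a Python file object, and is normally obtained
# through the high-level helpers; the file ID below is a placeholder.
#
#   import dxpy
#
#   # Read an existing, closed remote file
#   with dxpy.open_dxfile("file-xxxx", mode="rb") as f:
#       first_kb = f.read(1024)
#
#   # Create a new remote file and write to it
#   with dxpy.new_dxfile(mode="wb") as f:
#       f.write(b"hello, platform\n")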

from __future__ import print_function, unicode_literals, division, absolute_import

import os, sys, logging, traceback, hashlib, copy, time
import math
import mmap
from threading import Lock
from multiprocessing import cpu_count

import dxpy
from . import DXDataObject
from ..exceptions import DXFileError, DXIncompleteReadsError
from ..utils import warn
from ..utils.resolver import object_exists_in_project
from ..compat import BytesIO, basestring, USING_PYTHON2, md5_hasher


DXFILE_HTTP_THREADS = min(cpu_count(), 8)
MIN_BUFFER_SIZE = 1024*1024
DEFAULT_BUFFER_SIZE = 1024*1024*16
if dxpy.JOB_ID:
    # Increase HTTP request buffer size when we are running within the
    # platform.
    DEFAULT_BUFFER_SIZE = 1024*1024*96

MD5_READ_CHUNK_SIZE = 1024*1024*4
FILE_REQUEST_TIMEOUT = 60


def _validate_headers(headers):
    for key, value in headers.items():
        if not isinstance(key, basestring):
            raise ValueError("Expected key %r of headers to be a string" % (key,))
        if not isinstance(value, basestring):
            raise ValueError("Expected value %r of headers (associated with key %r) to be a string"
                             % (value, key))
    return headers


def _readable_part_size(num_bytes):
    "Returns the file size in readable form."
    B = num_bytes
    KB = float(1024)
    MB = float(KB * 1024)
    GB = float(MB * 1024)
    TB = float(GB * 1024)

    if B < KB:
        return '{0} {1}'.format(B, 'bytes' if B != 1 else 'byte')
    elif KB <= B < MB:
        return '{0:.2f} KiB'.format(B/KB)
    elif MB <= B < GB:
        return '{0:.2f} MiB'.format(B/MB)
    elif GB <= B < TB:
        return '{0:.2f} GiB'.format(B/GB)
    elif TB <= B:
        return '{0:.2f} TiB'.format(B/TB)
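
# For instance (editor's illustration, not part of the original module):
#   _readable_part_size(1)          -> '1 byte'
#   _readable_part_size(1536)       -> '1.50 KiB'
#   _readable_part_size(5*1024**3)  -> '5.00 GiB'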


def _get_write_buf_size(buffer_size_hint, file_upload_params, expected_file_size, file_is_mmapd=False):
    max_num_parts = file_upload_params['maximumNumParts']
    min_part_size = file_upload_params['minimumPartSize']
    max_part_size = file_upload_params['maximumPartSize']
    max_file_size = file_upload_params['maximumFileSize']

    if expected_file_size is not None and expected_file_size > max_file_size:
        raise DXFileError("Size of file exceeds maximum of {}".format(_readable_part_size(max_file_size)))

    min_buffer_size = min_part_size
    if expected_file_size is not None:
        # Raise the minimum buffer size for large files (those exceeding
        # the buffer size * the maximum allowed number of parts) so that
        # the upload does not exceed the configured parts limit.
        min_buffer_size = max(min_buffer_size, int(math.ceil(float(expected_file_size) / max_num_parts)))
    max_buffer_size = max_part_size

    assert min_buffer_size <= max_buffer_size

    if file_is_mmapd:
        # If file is mmapd, force the eventual result to be a
        # multiple of the allocation granularity by rounding all of
        # buffer_size, min_buffer_size, and max_buffer_size to a
        # nearby multiple of the allocation granularity (below, the
        # final buffer size will be one of these).
        if min_buffer_size % mmap.ALLOCATIONGRANULARITY != 0:
            min_buffer_size += mmap.ALLOCATIONGRANULARITY - min_buffer_size % mmap.ALLOCATIONGRANULARITY
        if max_buffer_size % mmap.ALLOCATIONGRANULARITY != 0:
            max_buffer_size -= max_buffer_size % mmap.ALLOCATIONGRANULARITY
        buffer_size_hint = buffer_size_hint - buffer_size_hint % mmap.ALLOCATIONGRANULARITY
    else:
        buffer_size_hint = buffer_size_hint

    # Use the user-specified hint if it is a permissible size
    # (satisfies API and large enough to upload file of advertised
    # size). Otherwise, select the closest size that is permissible.
    buffer_size = buffer_size_hint
    buffer_size = max(buffer_size, min_buffer_size)
    buffer_size = min(buffer_size, max_buffer_size)

    if expected_file_size is not None and (buffer_size * max_num_parts < expected_file_size):
        raise AssertionError("part size would be too small to upload the requested number of bytes")

    if file_is_mmapd and buffer_size % mmap.ALLOCATIONGRANULARITY != 0:
        raise AssertionError('part size will not be accepted by mmap')

    return buffer_size
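
# A minimal sketch of how _get_write_buf_size behaves (editor's illustration,
# not part of the original module; the region parameters below are
# hypothetical, not the actual limits of any region):
#
#   _hypothetical_upload_params = {
#       'maximumNumParts': 10000,
#       'minimumPartSize': 5 * 1024 * 1024,
#       'maximumPartSize': 5 * 1024 * 1024 * 1024,
#       'maximumFileSize': 5 * 1024 * 1024 * 1024 * 1024,
#   }
#   # For a 200 GiB upload, the default 16 MiB hint is raised to
#   # ceil(200 GiB / 10000) ~= 20.5 MiB so the file fits in 10000 parts.
#   bufsize = _get_write_buf_size(DEFAULT_BUFFER_SIZE,
#                                 _hypothetical_upload_params,
#                                 expected_file_size=200 * 1024 ** 3)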


class DXFile(DXDataObject):
    '''Remote file object handler.

    :param dxid: Object ID
    :type dxid: string
    :param project: Project ID
    :type project: string
    :param mode: One of "r", "w", or "a" for read, write, and append
        modes, respectively. Use "b" for binary mode. For example, "rb"
        means open a file for reading in binary mode.
    :type mode: string

    .. note:: The attribute values below are current as of the last time
              :meth:`~dxpy.bindings.DXDataObject.describe` was run.
              (Access to any of the below attributes causes
              :meth:`~dxpy.bindings.DXDataObject.describe` to be called
              if it has never been called before.)

    .. py:attribute:: media

       String containing the Internet Media Type (also known as MIME type
       or Content-type) of the file.

    .. automethod:: _new
    '''

    _class = "file"

    _describe = staticmethod(dxpy.api.file_describe)
    _add_types = staticmethod(dxpy.api.file_add_types)
    _remove_types = staticmethod(dxpy.api.file_remove_types)
    _get_details = staticmethod(dxpy.api.file_get_details)
    _set_details = staticmethod(dxpy.api.file_set_details)
    _set_visibility = staticmethod(dxpy.api.file_set_visibility)
    _rename = staticmethod(dxpy.api.file_rename)
    _set_properties = staticmethod(dxpy.api.file_set_properties)
    _add_tags = staticmethod(dxpy.api.file_add_tags)
    _remove_tags = staticmethod(dxpy.api.file_remove_tags)
    _close = staticmethod(dxpy.api.file_close)
    _list_projects = staticmethod(dxpy.api.file_list_projects)

    _http_threadpool_size = DXFILE_HTTP_THREADS
    _http_threadpool = dxpy.utils.get_futures_threadpool(max_workers=_http_threadpool_size)

    NO_PROJECT_HINT = 'NO_PROJECT_HINT'

    @classmethod
    def set_http_threadpool_size(cls, num_threads):
        '''
        .. deprecated:: 0.191.0
        '''
        print('set_http_threadpool_size is deprecated')

    def __init__(self, dxid=None, project=None, mode=None, read_buffer_size=DEFAULT_BUFFER_SIZE,
                 write_buffer_size=DEFAULT_BUFFER_SIZE, expected_file_size=None, file_is_mmapd=False):
        """
        :param dxid: Object ID
        :type dxid: string
        :param project: Project ID
        :type project: string
        :param mode: One of "r", "w", or "a" for read, write, and append
            modes, respectively. Add "b" for binary mode.
        :type mode: string
        :param read_buffer_size: size of read buffer in bytes
        :type read_buffer_size: int
        :param write_buffer_size: hint for size of write buffer in bytes. A
            lower or higher value may be used depending on region-specific
            parameters and on the expected file size.
        :type write_buffer_size: int
        :param expected_file_size: size of data that will be written, if known
        :type expected_file_size: int
        :param file_is_mmapd: True if input file is mmap'd (if so, the write
            buffer size will be constrained to be a multiple of the
            allocation granularity)
        :type file_is_mmapd: bool
        """
        DXDataObject.__init__(self, dxid=dxid, project=project)

        # By default, a file is created in text mode. This makes a difference
        # in python 3.
        self._binary_mode = False
        if mode is None:
            self._close_on_exit = True
        else:
            if 'b' in mode:
                self._binary_mode = True
                mode = mode.replace("b", "")
            if mode not in ['r', 'w', 'a']:
                raise ValueError("mode must be one of 'r', 'w', or 'a'. Character 'b' may be used in combination (e.g. 'wb').")
            self._close_on_exit = (mode == 'w')
        self._read_buf = BytesIO()
        self._write_buf = BytesIO()

        self._read_bufsize = read_buffer_size

        # Computed lazily later since this depends on the project, and
        # we want to allow the project to be set as late as possible.
        # Call _ensure_write_bufsize to ensure that this is set before
        # trying to read it.
        self._write_bufsize = None

        self._write_buffer_size_hint = write_buffer_size
        self._expected_file_size = expected_file_size
        self._file_is_mmapd = file_is_mmapd

        # These are cached once for all download threads. This saves calls to the apiserver.
        self._download_url, self._download_url_headers, self._download_url_expires = None, None, None

        # This lock protects accesses to the above three variables, ensuring that they would
        # be checked and changed atomically. This protects against thread race conditions.
        self._url_download_mutex = Lock()

        self._request_iterator, self._response_iterator = None, None
        self._http_threadpool_futures = set()

        # Initialize state
        self._pos = 0
        self._file_length = None
        self._cur_part = 1
        self._num_uploaded_parts = 0

    def _new(self, dx_hash, media_type=None, **kwargs):
        """
        :param dx_hash: Standard hash populated in :func:`dxpy.bindings.DXDataObject.new()`
            containing attributes common to all data object classes.
        :type dx_hash: dict
        :param media_type: Internet Media Type
        :type media_type: string

        Creates a new remote file with media type *media_type*, if given.
        """
        if media_type is not None:
            dx_hash["media"] = media_type

        resp = dxpy.api.file_new(dx_hash, **kwargs)
        self.set_ids(resp["id"], dx_hash["project"])

    def __enter__(self):
        return self

    def __exit__(self, type, value, traceback):
        self.flush()
        if self._close_on_exit and self._get_state() == "open":
            self.close()

    def __del__(self):
        '''
        Exceptions raised here in the destructor are IGNORED by Python! We
        will try and flush data here just as a safety measure, but you should
        not rely on this to flush your data! We will be really grumpy and
        complain if we detect unflushed data here.

        Use a context manager or flush the object explicitly to avoid this.

        In addition, when this is triggered by interpreter shutdown, the
        thread pool is not available, and we will wait for the request queue
        forever. In this case, we must revert to synchronous, in-thread
        flushing. We don't know how to detect this condition, so we'll use
        that for all destructor events.

        Neither this nor context managers are compatible with kwargs
        pass-through (so e.g. no custom auth).
        '''
        if not hasattr(self, '_write_buf'):
            # This occurs when there is an exception initializing the
            # DXFile object
            return

        if self._write_buf.tell() > 0 or len(self._http_threadpool_futures) > 0:
            warn("=== WARNING! ===")
            warn("There is still unflushed data in the destructor of a DXFile object!")
            warn("We will attempt to flush it now, but if an error were to occur, we could not report it back to you.")
            warn("Your program could fail to flush the data but appear to succeed.")
            warn("Instead, please call flush() or close(), or use the context managed version (e.g., with open_dxfile(ID, mode='w') as f:)")
        try:
            self.flush(multithread=False)
        except Exception as e:
            warn("=== Exception occurred while flushing accumulated file data for %r" % (self._dxid,))
            traceback.print_exception(*sys.exc_info())
            raise

    def __iter__(self):
        _buffer = self.read(self._read_bufsize)
        done = False
        if USING_PYTHON2:
            while not done:
                if b"\n" in _buffer:
                    lines = _buffer.splitlines()
                    for i in range(len(lines) - 1):
                        yield lines[i]
                    _buffer = lines[len(lines) - 1]
                else:
                    more = self.read(self._read_bufsize)
                    if more == b"":
                        done = True
                    else:
                        _buffer = _buffer + more
        else:
            if self._binary_mode:
                raise DXFileError("Cannot read lines when file opened in binary mode")
            # python3 is much stricter about distinguishing
            # 'bytes' from 'str'.
            while not done:
                if "\n" in _buffer:
                    lines = _buffer.splitlines()
                    for i in range(len(lines) - 1):
                        yield lines[i]
                    _buffer = lines[len(lines) - 1]
                else:
                    more = self.read(self._read_bufsize)
                    if more == "":
                        done = True
                    else:
                        _buffer = _buffer + more
        if _buffer:
            yield _buffer

    next = next
    __next__ = next

    def set_ids(self, dxid, project=None):
        '''
        :param dxid: Object ID
        :type dxid: string
        :param project: Project ID
        :type project: string

        Discards the currently stored ID and associates the handler with
        *dxid*. As a side effect, it also flushes the buffer for the
        previous file object if the buffer is nonempty.
        '''
        if self._dxid is not None:
            self.flush()

        DXDataObject.set_ids(self, dxid, project)

        # Reset state
        self._pos = 0
        self._file_length = None
        self._cur_part = 1
        self._num_uploaded_parts = 0

    def seek(self, offset, from_what=os.SEEK_SET):
        '''
        :param offset: Position in the file to seek to
        :type offset: integer

        Seeks to *offset* bytes from the beginning of the file. This is a
        no-op if the file is open for writing.

        The position is computed from adding *offset* to a reference point;
        the reference point is selected by the *from_what* argument. A
        *from_what* value of 0 measures from the beginning of the file, 1
        uses the current file position, and 2 uses the end of the file as
        the reference point.

        *from_what* can be omitted and defaults to 0, using the beginning
        of the file as the reference point.
        '''
        if from_what == os.SEEK_SET:
            reference_pos = 0
        elif from_what == os.SEEK_CUR:
            reference_pos = self._pos
        elif from_what == os.SEEK_END:
            if self._file_length == None:
                desc = self.describe()
                self._file_length = int(desc["size"])
            reference_pos = self._file_length
        else:
            raise DXFileError("Invalid value supplied for from_what")

        orig_pos = self._pos
        self._pos = reference_pos + offset

        in_buf = False
        orig_buf_pos = self._read_buf.tell()
        if offset < orig_pos:
            if orig_buf_pos > orig_pos - offset:
                # offset is less than original position but within the buffer
                in_buf = True
        else:
            buf_len = dxpy.utils.string_buffer_length(self._read_buf)
            if buf_len - orig_buf_pos > offset - orig_pos:
                # offset is greater than original position but within the buffer
                in_buf = True

        if in_buf:
            # offset is within the buffer (at least one byte following
            # the offset can be read directly out of the buffer)
            self._read_buf.seek(orig_buf_pos - orig_pos + offset)
        elif offset == orig_pos:
            # This seek is a no-op (the cursor is just past the end of
            # the read buffer and coincides with the desired seek
            # position). We don't have the data ready, but the request
            # for the data starting here is already in flight.
            #
            # Detecting this case helps to optimize for sequential read
            # access patterns.
            pass
        else:
            # offset is outside the buffer-- reset buffer and queues.
            # This is the failsafe behavior
            self._read_buf = BytesIO()
            # TODO: if the offset is within the next response(s), don't throw out the queues
            self._request_iterator, self._response_iterator = None, None
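
    # Editor's illustrative sketch (not part of the original module): reading
    # the last 100 bytes of a closed remote file at least that large; the
    # file ID is a placeholder.
    #
    #   f = dxpy.open_dxfile("file-xxxx", mode="rb")
    #   f.seek(-100, os.SEEK_END)
    #   tail = f.read()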

    def tell(self):
        '''
        Returns the current position of the file read cursor.

        Warning: Because of buffering semantics, this value will **not** be
        accurate when using the line iterator form (`for line in file`).
        '''
        return self._pos

    def flush(self, multithread=True, **kwargs):
        '''
        Flushes the internal write buffer.
        '''
        if self._write_buf.tell() > 0:
            data = self._write_buf.getvalue()
            self._write_buf = BytesIO()

            if multithread:
                self._async_upload_part_request(data, index=self._cur_part, **kwargs)
            else:
                self.upload_part(data, self._cur_part, **kwargs)

            self._cur_part += 1

        if len(self._http_threadpool_futures) > 0:
            dxpy.utils.wait_for_all_futures(self._http_threadpool_futures)
            try:
                for future in self._http_threadpool_futures:
                    if future.exception() != None:
                        raise future.exception()
            finally:
                self._http_threadpool_futures = set()

    def _async_upload_part_request(self, *args, **kwargs):
        while len(self._http_threadpool_futures) >= self._http_threadpool_size:
            future = dxpy.utils.wait_for_a_future(self._http_threadpool_futures)
            if future.exception() != None:
                raise future.exception()
            self._http_threadpool_futures.remove(future)

        future = self._http_threadpool.submit(self.upload_part, *args, **kwargs)
        self._http_threadpool_futures.add(future)

    def _ensure_write_bufsize(self, **kwargs):
        if self._write_bufsize is not None:
            return
        file_upload_params = dxpy.api.project_describe(
            self.get_proj_id(),
            {'fields': {'fileUploadParameters': True}},
            **kwargs
        )['fileUploadParameters']
        self._empty_last_part_allowed = file_upload_params['emptyLastPartAllowed']
        self._write_bufsize = _get_write_buf_size(self._write_buffer_size_hint,
                                                  file_upload_params,
                                                  self._expected_file_size,
                                                  self._file_is_mmapd)

    def _write2(self, data, multithread=True, **kwargs):
        '''
        :param data: Data to be written
        :type data: str or mmap object
        :param multithread: If True, sends multiple write requests asynchronously
        :type multithread: boolean

        Writes the data *data* to the file.

        .. note::

            Writing to remote files is append-only. Using :meth:`seek`
            does not affect where the next :meth:`write` will occur.
        '''
        if not USING_PYTHON2:
            assert(isinstance(data, bytes))
        self._ensure_write_bufsize(**kwargs)

        def write_request(data_for_write_req):
            if multithread:
                self._async_upload_part_request(data_for_write_req, index=self._cur_part, **kwargs)
            else:
                self.upload_part(data_for_write_req, self._cur_part, **kwargs)
            self._cur_part += 1

        if self._write_buf.tell() == 0 and self._write_bufsize == len(data):
            # In the special case of a write that is the same size as
            # our write buffer size, and no unflushed data in the
            # buffer, just directly dispatch the write and bypass the
            # write buffer.
            #
            # This saves a buffer copy, which is especially helpful if
            # 'data' is actually mmap'd from a file.
            #
            # TODO: an additional optimization could be made to allow
            # the last request from an mmap'd upload to take this path
            # too (in general it won't because it's not of length
            # _write_bufsize). This is probably inconsequential though.
            write_request(data)
            return

        remaining_space = self._write_bufsize - self._write_buf.tell()

        if len(data) <= remaining_space:
            self._write_buf.write(data)
        else:
            self._write_buf.write(data[:remaining_space])

            temp_data = self._write_buf.getvalue()
            self._write_buf = BytesIO()
            write_request(temp_data)

            # TODO: check if repeat string splitting is bad for
            # performance when len(data) >> _write_bufsize
            self.write(data[remaining_space:], **kwargs)

    def write(self, data, multithread=True, **kwargs):
        '''
        :param data: Data to be written
        :type data: str or mmap object
        :param multithread: If True, sends multiple write requests asynchronously
        :type multithread: boolean

        Writes the data *data* to the file.

        .. note::

            Writing to remote files is append-only. Using :meth:`seek`
            does not affect where the next :meth:`write` will occur.
        '''
        if USING_PYTHON2:
            self._write2(data, multithread=multithread, **kwargs)
        else:
            # In python3, the underlying system methods use the 'bytes' type, not 'string'
            #
            # This is, hopefully, a temporary hack. It is not a good idea for two reasons:
            # 1) Performance, we need to make a pass on the data, and need to allocate
            #    another buffer of similar size
            # 2) The types are wrong. The "bytes" type should be visible to the caller
            #    of the write method, instead of being hidden.

            # Should we throw an exception if the file is opened in binary mode,
            # and the data is unicode/text?
            if isinstance(data, str):
                bt = data.encode("utf-8")
            elif isinstance(data, bytearray):
                bt = bytes(data)
            elif isinstance(data, bytes):
                bt = data
            elif isinstance(data, mmap.mmap):
                bt = bytes(data)
            else:
                raise DXFileError("Invalid type {} for write data argument".format(type(data)))
            assert(isinstance(bt, bytes))
            self._write2(bt, multithread=multithread, **kwargs)

    def closed(self, **kwargs):
        '''
        :returns: Whether the remote file is closed
        :rtype: boolean

        Returns :const:`True` if the remote file is closed and
        :const:`False` otherwise. Note that if it is not closed, it can be
        in either the "open" or "closing" states.
        '''
        return self.describe(fields={'state'}, **kwargs)["state"] == "closed"

    def close(self, block=False, **kwargs):
        '''
        :param block: If True, this function blocks until the remote file
            has closed.
        :type block: boolean

        Attempts to close the file.

        .. note:: The remote file cannot be closed until all parts have
           been fully uploaded. An exception will be thrown if this is not
           the case.
        '''
        self.flush(**kwargs)

        # Also populates emptyLastPartAllowed
        self._ensure_write_bufsize(**kwargs)
        if self._num_uploaded_parts == 0 and self._empty_last_part_allowed:
            # We haven't uploaded any parts in this session.
            # In case no parts have been uploaded at all and region
            # settings allow last empty part upload, try to upload
            # an empty part (otherwise files with 0 parts cannot be closed).
            try:
                if USING_PYTHON2:
                    self.upload_part('', 1, **kwargs)
                else:
                    self.upload_part(b'', 1, **kwargs)
            except dxpy.exceptions.InvalidState:
                pass

        if 'report_progress_fn' in kwargs:
            del kwargs['report_progress_fn']

        dxpy.api.file_close(self._dxid, **kwargs)

        if block:
            self._wait_on_close(**kwargs)
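
    # Editor's illustrative sketch (not part of the original module): a
    # writer typically finishes a session with
    #
    #   f.flush()
    #   f.close(block=True)   # wait until the platform reports the file closed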

    def wait_on_close(self, timeout=3600*24*7, **kwargs):
        '''
        :param timeout: Maximum amount of time to wait (in seconds) until
            the file is closed.
        :type timeout: integer
        :raises: :exc:`dxpy.exceptions.DXFileError` if the timeout is
            reached before the remote file has been closed

        Waits until the remote file is closed.
        '''
        self._wait_on_close(timeout, **kwargs)

    def upload_part(self, data, index=None, display_progress=False, report_progress_fn=None, **kwargs):
        """
        :param data: Data to be uploaded in this part
        :type data: str or mmap object, bytes on python3
        :param index: Index of part to be uploaded; must be in [1, 10000]
        :type index: integer
        :param display_progress: Whether to print "." to stderr when done
        :type display_progress: boolean
        :param report_progress_fn: Optional: a function to call that takes
            in two arguments (self, # bytes transmitted)
        :type report_progress_fn: function or None
        :raises: :exc:`dxpy.exceptions.DXFileError` if *index* is given and
            is not in the correct range,
            :exc:`urllib3.exceptions.HTTPError` if upload fails

        Uploads the data in *data* as part number *index* for the
        associated file. If no value for *index* is given, *index* defaults
        to 1. This probably only makes sense if this is the only part to be
        uploaded.
        """
        if not USING_PYTHON2:
            # In python3, the underlying system methods use the 'bytes' type, not 'string'
            assert(isinstance(data, bytes))

        req_input = {}
        if index is not None:
            req_input["index"] = int(index)

        md5 = md5_hasher()
        if hasattr(data, 'seek') and hasattr(data, 'tell'):
            # data is a buffer; record initial position (so we can rewind back)
            rewind_input_buffer_offset = data.tell()
            while True:
                bytes_read = data.read(MD5_READ_CHUNK_SIZE)
                if bytes_read:
                    md5.update(bytes_read)
                else:
                    break
            # rewind the buffer to original position
            data.seek(rewind_input_buffer_offset)
        else:
            md5.update(data)

        req_input["md5"] = md5.hexdigest()
        req_input["size"] = len(data)

        def get_upload_url_and_headers():
            # This function is called from within a retry loop, so to avoid amplifying the number of retries
            # geometrically, we decrease the allowed number of retries for the nested API call every time.
            if 'max_retries' not in kwargs:
                kwargs['max_retries'] = dxpy.DEFAULT_RETRIES
            elif kwargs['max_retries'] > 0:
                kwargs['max_retries'] -= 1

            if "timeout" not in kwargs:
                kwargs["timeout"] = FILE_REQUEST_TIMEOUT

            resp = dxpy.api.file_upload(self._dxid, req_input, **kwargs)
            url = resp["url"]
            return url, _validate_headers(resp.get("headers", {}))

        # The file upload API requires us to get a pre-authenticated upload URL (and headers for it) every time we
        # attempt an upload. Because DXHTTPRequest will retry requests under retryable conditions, we give it a
        # callback to ask us for a new upload URL every time it attempts a request (instead of giving them directly).
        dxpy.DXHTTPRequest(get_upload_url_and_headers,
                           data,
                           jsonify_data=False,
                           prepend_srv=False,
                           always_retry=True,
                           timeout=FILE_REQUEST_TIMEOUT,
                           auth=None,
                           method='PUT')

        self._num_uploaded_parts += 1

        if display_progress:
            warn(".")

        if report_progress_fn is not None:
            report_progress_fn(self, len(data))

    def wait_until_parts_uploaded(self, **kwargs):
        self._wait_until_parts_uploaded(self, **kwargs)

    def get_download_url(self, duration=None, preauthenticated=False, filename=None, project=None, **kwargs):
        """
        :param duration: number of seconds for which the generated URL will
            be valid, should only be specified when preauthenticated is True
        :type duration: int
        :param preauthenticated: if True, generates a 'preauthenticated'
            download URL, which embeds authentication info in the URL and
            does not require additional headers
        :type preauthenticated: bool
        :param filename: desired filename of the downloaded file
        :type filename: str
        :param project: ID of a project containing the file (the download URL
            will be associated with this project, and this may affect which
            billing account is billed for this download). If no project is
            specified, an attempt will be made to verify if the file is in
            the project from the DXFile handler (as specified by the user or
            the current project stored in dxpy.WORKSPACE_ID). Otherwise, no
            hint is supplied. This fall back behavior does not happen inside
            a job environment. A non preauthenticated URL is only valid as
            long as the user has access to that project and the project
            contains that file.
        :type project: str
        :returns: download URL and dict containing HTTP headers to be
            supplied with the request
        :rtype: tuple (str, dict)
        :raises: :exc:`~dxpy.exceptions.ResourceNotFound` if a project
            context was given and the file was not found in that project
            context.
        :raises: :exc:`~dxpy.exceptions.ResourceNotFound` if no project
            context was given and the file was not found in any projects.

        Obtains a URL that can be used to directly download the associated
        file.
        """
        with self._url_download_mutex:
            # Only generate URL if not already cached or expired
            if self._download_url is None or self._download_url_expires < time.time():
                args = {"preauthenticated": preauthenticated}

                if duration is not None:
                    args["duration"] = duration
                if filename is not None:
                    args["filename"] = filename

                # If project=None, we fall back to the project attached to this handler
                # (if any). If this is supplied, it's treated as a hint: if it's a
                # project in which this file exists, it's passed on to the
                # apiserver. Otherwise, NO hint is supplied. In principle supplying a
                # project in the handler that doesn't contain this file ought to be an
                # error, but it's this way for backwards compatibility. We don't know
                # who might be doing downloads and creating handlers without being
                # careful that the project encoded in the handler contains the file
                # being downloaded. They may now rely on such behavior.
                if project is None and 'DX_JOB_ID' not in os.environ:
                    project_from_handler = self.get_proj_id()
                    # object_exists_in_project will call /file-xxxx/describe, which is skipped if the URL is cached
                    if project_from_handler and object_exists_in_project(self.get_id(), project_from_handler):
                        project = project_from_handler

                if project is not None and project is not DXFile.NO_PROJECT_HINT:
                    args["project"] = project

                # Test hook to write 'project' argument passed to API call to a
                # local file
                if '_DX_DUMP_BILLED_PROJECT' in os.environ:
                    with open(os.environ['_DX_DUMP_BILLED_PROJECT'], "w") as fd:
                        if project is not None and project != DXFile.NO_PROJECT_HINT:
                            fd.write(project)

                # The idea here is to cache a download URL for the entire file, that will
                # be good for a few minutes. This avoids each thread having to ask the
                # server for a URL, increasing server load.
                #
                # To avoid thread race conditions, this check/update procedure is protected
                # with a lock.

                # logging.debug("Download URL unset or expired, requesting a new one")
                if "timeout" not in kwargs:
                    kwargs["timeout"] = FILE_REQUEST_TIMEOUT
                resp = dxpy.api.file_download(self._dxid, args, **kwargs)
                self._download_url = resp["url"]
                self._download_url_headers = _validate_headers(resp.get("headers", {}))
                if preauthenticated:
                    self._download_url_expires = resp["expires"]/1000 - 60  # Try to account for drift
                else:
                    self._download_url_expires = 32503680000  # doesn't expire (year 3000)

            # Make a copy, ensuring each thread has its own mutable
            # version of the headers. Note: python strings are
            # immutable, so we can safely give a reference to the
            # download url.
            retval_download_url = self._download_url
            retval_download_url_headers = copy.copy(self._download_url_headers)

        return retval_download_url, retval_download_url_headers
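
    # Editor's illustrative sketch (not part of the original module): fetching
    # a one-hour preauthenticated link for an existing file (placeholder ID).
    #
    #   url, headers = DXFile("file-xxxx").get_download_url(duration=3600,
    #                                                       preauthenticated=True)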

    def _generate_read_requests(self, start_pos=0, end_pos=None, project=None,
                                limit_chunk_size=None, **kwargs):
        # project=None means no hint is to be supplied to the apiserver. It is
        # an error to supply a project that does not contain this file.
        if limit_chunk_size is None:
            limit_chunk_size = self._read_bufsize

        if self._file_length == None:
            desc = self.describe(**kwargs)
            self._file_length = int(desc["size"])

        if end_pos == None:
            end_pos = self._file_length
        if end_pos > self._file_length:
            raise DXFileError("Invalid end_pos")

        def chunk_ranges(start_pos, end_pos, init_chunk_size=1024*64, ramp=2, num_requests_between_ramp=4):
            cur_chunk_start = start_pos
            cur_chunk_size = min(init_chunk_size, limit_chunk_size)
            i = 0
            while cur_chunk_start < end_pos:
                cur_chunk_end = min(cur_chunk_start + cur_chunk_size - 1, end_pos)
                yield cur_chunk_start, cur_chunk_end
                cur_chunk_start += cur_chunk_size
                if cur_chunk_size < limit_chunk_size and i % num_requests_between_ramp == (num_requests_between_ramp - 1):
                    cur_chunk_size = min(cur_chunk_size * ramp, limit_chunk_size)
                i += 1

        for chunk_start_pos, chunk_end_pos in chunk_ranges(start_pos, end_pos):
            url, headers = self.get_download_url(project=project, **kwargs)
            # It is possible for chunk_end_pos to be outside of the range of the file
            yield dxpy._dxhttp_read_range, [url, headers, chunk_start_pos,
                                            min(chunk_end_pos, self._file_length - 1),
                                            FILE_REQUEST_TIMEOUT], {}

    def _next_response_content(self, get_first_chunk_sequentially=False):
        if self._response_iterator is None:
            self._response_iterator = dxpy.utils.response_iterator(
                self._request_iterator,
                self._http_threadpool,
                do_first_task_sequentially=get_first_chunk_sequentially
            )
        try:
            return next(self._response_iterator)
        except:
            # If an exception is raised, the iterator is unusable for
            # retrieving any more items. Destroy it so we'll reinitialize it
            # next time.
            self._response_iterator = None
            self._request_iterator = None
            raise

    def _read2(self, length=None, use_compression=None, project=None, **kwargs):
        '''
        :param length: Maximum number of bytes to be read
        :type length: integer
        :param project: project to use as context for this download (may
            affect which billing account is billed for this download). If
            specified, must be a project in which this file exists. If not
            specified, the project ID specified in the handler is used for
            the download, IF it contains this file. If set to
            DXFile.NO_PROJECT_HINT, no project ID is supplied for the
            download, even if the handler specifies a project ID.
        :type project: str or None
        :rtype: string
        :raises: :exc:`~dxpy.exceptions.ResourceNotFound` if *project* is
            supplied and it does not contain this file

        Returns the next *length* bytes, or all the bytes until the end of
        file (if no *length* is given or there are fewer than *length*
        bytes left in the file).

        .. note:: After the first call to read(), the project arg and
           passthrough kwargs are not respected while using the same
           response iterator (i.e. until next seek).
        '''
        if self._file_length == None:
            desc = self.describe(**kwargs)
            if desc["state"] != "closed":
                raise DXFileError("Cannot read from file until it is in the closed state")
            self._file_length = int(desc["size"])

        # If running on a worker, wait for the first file download chunk
        # to come back before issuing any more requests. This ensures
        # that all subsequent requests can take advantage of caching,
        # rather than having all of the first DXFILE_HTTP_THREADS
        # requests simultaneously hit a cold cache. Enforce a minimum
        # size for this heuristic so we don't incur the overhead for
        # tiny files (which wouldn't contribute as much to the load
        # anyway).
        get_first_chunk_sequentially = (self._file_length > 128 * 1024 and self._pos == 0 and dxpy.JOB_ID)

        if self._pos == self._file_length:
            return b""

        if length == None or length > self._file_length - self._pos:
            length = self._file_length - self._pos

        buf = self._read_buf
        buf_remaining_bytes = dxpy.utils.string_buffer_length(buf) - buf.tell()
        if length <= buf_remaining_bytes:
            self._pos += length
            return buf.read(length)
        else:
            orig_buf_pos = buf.tell()
            orig_file_pos = self._pos
            buf.seek(0, os.SEEK_END)
            self._pos += buf_remaining_bytes
            while self._pos < orig_file_pos + length:
                remaining_len = orig_file_pos + length - self._pos

                if self._response_iterator is None:
                    self._request_iterator = self._generate_read_requests(
                        start_pos=self._pos, project=project, **kwargs)

                content = self._next_response_content(get_first_chunk_sequentially=get_first_chunk_sequentially)

                if len(content) < remaining_len:
                    buf.write(content)
                    self._pos += len(content)
                else:  # response goes beyond requested length
                    buf.write(content[:remaining_len])
                    self._pos += remaining_len
                    self._read_buf = BytesIO()
                    self._read_buf.write(content[remaining_len:])
                    self._read_buf.seek(0)

            buf.seek(orig_buf_pos)
            return buf.read()

        # Debug fallback
        # import urllib2
        # req = urllib2.Request(url, headers=headers)
        # response = urllib2.urlopen(req)
        # return response.read()

    def read(self, length=None, use_compression=None, project=None, **kwargs):
        data = self._read2(length=length, use_compression=use_compression, project=project, **kwargs)
        if USING_PYTHON2:
            return data

        # In python3, the underlying system methods use the 'bytes' type, not 'string'
        if self._binary_mode is True:
            return data
        return data.decode("utf-8")

    def archive(self, all_copies=False):
        '''
        :param all_copies: Force the transition of files into the archived
            state. Requesting user must be the ADMIN of the project billTo
            org. If true, archive all the copies of files in projects with
            the same billTo org.
        :type all_copies: boolean
        :raises: :exc:`~dxpy.exceptions.InvalidState` if the file is not in
            a live state
        :raises: :exc:`~dxpy.exceptions.PermissionDenied` if the requesting
            user does not have CONTRIBUTE access or is not an ADMIN of the
            project billTo org with allCopies=True.
        '''
        dxpy.api.project_archive(self.get_proj_id(), {"files": [self.get_id()], "allCopies": all_copies})

    def unarchive(self, dry_run=False):
        '''
        :param dry_run: If true, only display the output of the API call
            without executing the unarchival
        :type dry_run: boolean
        :raises: :exc:`~dxpy.exceptions.InvalidState` if the file is not in
            a closed or archived state
        :raises: :exc:`~dxpy.exceptions.PermissionDenied` if the requesting
            user does not have CONTRIBUTE access
        '''
        dxpy.api.project_unarchive(self.get_proj_id(), {"files": [self.get_id()], "dryRun": dry_run})