| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291 |
- """Utilities for fast persistence of big data, with optional compression."""
- # Author: Gael Varoquaux <gael dot varoquaux at normalesup dot org>
- # Copyright (c) 2009 Gael Varoquaux
- # License: BSD Style, 3 clauses.
- import contextlib
- import io
- import pickle
- import sys
- import warnings
- from .compressor import _COMPRESSORS, _ZFILE_PREFIX
- try:
- import numpy as np
- except ImportError:
- np = None
# Module-level aliases for the underscore-prefixed pickler/unpickler classes
# exposed by the standard library ``pickle`` module (in CPython these are the
# pure-Python implementations).
Unpickler = pickle._Unpickler
Pickler = pickle._Pickler
# NOTE(review): presumably a leftover Python 2 compatibility alias -- confirm
# whether anything still references ``xrange`` before removing it.
xrange = range
- try:
- # The python standard library can be built without bz2 so we make bz2
- # usage optional.
- # see https://github.com/scikit-learn/scikit-learn/issues/7526 for more
- # details.
- import bz2
- except ImportError:
- bz2 = None
# Buffer size, in bytes (1024**2, i.e. 1 MiB), used in io.BufferedReader and
# io.BufferedWriter
_IO_BUFFER_SIZE = 1024**2
def _is_raw_file(fileobj):
    """Tell whether fileobj is backed by a raw file, e.g. created with open.

    Buffered wrappers expose the object they wrap through a ``raw``
    attribute; when it is present the check is applied to that object,
    otherwise to ``fileobj`` itself.

    Parameters
    ----------
    fileobj: file object

    Returns
    -------
    bool
        True if the (unwrapped) object is an ``io.FileIO`` instance.
    """
    try:
        underlying = fileobj.raw
    except AttributeError:
        # No buffering layer to peel off: inspect the object directly.
        underlying = fileobj
    return isinstance(underlying, io.FileIO)
def _get_prefixes_max_len():
    """Return the length of the longest magic prefix that may start a file.

    Both the prefixes of the registered compressors and the legacy
    ``_ZFILE_PREFIX`` are taken into account.

    Returns
    -------
    int
    """
    # Seeding with the legacy prefix guarantees a non-empty collection even
    # when no compressor is registered.
    prefix_lengths = {len(_ZFILE_PREFIX)}
    prefix_lengths.update(
        len(registered.prefix) for registered in _COMPRESSORS.values()
    )
    return max(prefix_lengths)
def _is_numpy_array_byte_order_mismatch(array):
    """Check if a numpy array's byte order differs from the host's.

    An array is considered mismatched when its dtype is explicitly stored in
    the byte order opposite to ``sys.byteorder``, or when it is a structured
    dtype (dtype byte order ``"|"`` with fields) whose fields are *all*
    stored in the opposite byte order.

    Parameters
    ----------
    array: numpy array
        Only ``array.dtype`` is inspected.

    Returns
    -------
    bool
        Always a real ``bool`` (never ``None`` or a fields mapping), so the
        result can safely be compared with ``is True`` / ``is False``.
    """
    # Byte-order character that denotes the *non-native* order on this host.
    foreign = {"big": "<", "little": ">"}.get(sys.byteorder)
    if foreign is None:
        return False

    dtype = array.dtype
    if dtype.byteorder == foreign:
        return True

    # "|" means "not applicable": either a single-byte type (no fields) or a
    # structured dtype, whose byte order is carried by its fields.
    if dtype.byteorder != "|" or not dtype.fields:
        return False

    # Each fields value is a tuple whose first item is the field's dtype.
    return all(
        field[0].byteorder == foreign for field in dtype.fields.values()
    )
def _ensure_native_byte_order(array):
    """Return the array expressed in the host byte order, values preserved.

    The input is returned untouched when it already uses the system byte
    order.

    Parameters
    ----------
    array: numpy array

    Returns
    -------
    numpy array
    """
    if not _is_numpy_array_byte_order_mismatch(array):
        return array

    # Swap the raw bytes, then reinterpret them with a native-order dtype so
    # that the numerical values are the same as in the input.
    swapped = array.byteswap()
    native_dtype = array.dtype.newbyteorder("=")
    return swapped.view(native_dtype)
- ###############################################################################
- # Cache file utilities
def _detect_compressor(fileobj):
    """Identify which compressor produced the content of fileobj.

    The decision is based on the magic number found in the first bytes of
    the file.

    Parameters
    ----------
    fileobj: file object

    Returns
    -------
    str in {'zlib', 'gzip', 'bz2', 'lzma', 'xz', 'compat', 'not-compressed'}
    """
    probe_len = _get_prefixes_max_len()

    if not hasattr(fileobj, "peek"):
        # Not peekable: consume the leading bytes, then rewind to the start
        # of the file.
        header = fileobj.read(probe_len)
        fileobj.seek(0)
    else:
        # Peeking exposes the leading bytes without moving the file cursor.
        header = fileobj.peek(probe_len)

    # The legacy prefix takes precedence over the registered compressors.
    if header.startswith(_ZFILE_PREFIX):
        return "compat"

    matching_names = (
        name
        for name, registered in _COMPRESSORS.items()
        if header.startswith(registered.prefix)
    )
    return next(matching_names, "not-compressed")
def _buffered_read_file(fobj):
    """Wrap a read file object in an ``io.BufferedReader``.

    The buffer size is ``_IO_BUFFER_SIZE``.
    """
    reader = io.BufferedReader(fobj, buffer_size=_IO_BUFFER_SIZE)
    return reader
def _buffered_write_file(fobj):
    """Wrap a write file object in an ``io.BufferedWriter``.

    The buffer size is ``_IO_BUFFER_SIZE``.
    """
    writer = io.BufferedWriter(fobj, buffer_size=_IO_BUFFER_SIZE)
    return writer
@contextlib.contextmanager
def _validate_fileobject_and_memmap(fileobj, filename, mmap_mode=None):
    """Context manager selecting the right file object for a pickle file.

    The magic number at the start of the file decides which kind of file
    object is handed to the caller:

    * regular file object (default)
    * zlib file object
    * gzip file object
    * bz2 file object
    * lzma file object (for xz and lzma compressor)

    Parameters
    ----------
    fileobj: file object
    filename: str
        filename path corresponding to the fileobj parameter.
    mmap_mode: str
        memory map mode requested to open the pickle file. It is checked
        against the kind of file: memory mapping cannot be combined with
        compressed files, in-memory buffers or non-raw files, in which case
        a warning is emitted and the mode is dropped. Default: None.

    Yields
    ------
    a tuple with a file like object (or ``filename`` itself for files in the
    old 'compat' format), and the validated mmap_mode.
    """
    # Detect if the fileobj contains compressed data.
    compressor = _detect_compressor(fileobj)

    if compressor == "compat":
        # Old pickle format: hand the filename back "as-is" and let the
        # caller invoke the compatibility loader.
        warnings.warn(
            "The file '%s' has been generated with a joblib "
            "version less than 0.10. "
            "Please regenerate this pickle file." % filename,
            DeprecationWarning,
            stacklevel=2,
        )
        yield filename, mmap_mode
        return

    if compressor in _COMPRESSORS:
        # Open the decompressor matching the detected format and wrap it in
        # a buffer.
        decompressed = _COMPRESSORS[compressor].decompressor_file(fileobj)
        fileobj = _buffered_read_file(decompressed)

    # mmap_mode cannot be honoured for in-memory buffers such as io.BytesIO,
    # for compressed files, or for file objects that are not raw files. The
    # first applicable reason is the one reported.
    ignored_reason = None
    if mmap_mode is not None:
        if isinstance(fileobj, io.BytesIO):
            ignored_reason = (
                "In memory persistence is not compatible with "
                'mmap_mode "%(mmap_mode)s" flag passed. '
                "mmap_mode option will be ignored." % {"mmap_mode": mmap_mode}
            )
        elif compressor != "not-compressed":
            ignored_reason = (
                'mmap_mode "%(mmap_mode)s" is not compatible '
                "with compressed file %(filename)s. "
                '"%(mmap_mode)s" flag will be ignored.'
                % {"mmap_mode": mmap_mode, "filename": filename}
            )
        elif not _is_raw_file(fileobj):
            ignored_reason = (
                '"%(fileobj)r" is not a raw file, mmap_mode '
                '"%(mmap_mode)s" flag will be ignored.'
                % {"fileobj": fileobj, "mmap_mode": mmap_mode}
            )

    if ignored_reason is None:
        yield fileobj, mmap_mode
    else:
        warnings.warn(ignored_reason, stacklevel=2)
        yield fileobj, None
def _write_fileobject(filename, compress=("zlib", 3)):
    """Open filename for writing through the requested compressor.

    Parameters
    ----------
    filename: str
        Path of the file to write to.
    compress: tuple
        ``compress[0]`` is the compressor name and ``compress[1]`` the
        compression level. A name that is not a registered compressor falls
        back to 'zlib'.

    Returns
    -------
    a buffered file object in write mode wrapping the compressor file.
    """
    requested_method = compress[0]
    level = compress[1]

    # Unknown methods silently fall back to zlib.
    method = requested_method if requested_method in _COMPRESSORS else "zlib"
    compressed_file = _COMPRESSORS[method].compressor_file(
        filename, compresslevel=level
    )
    return _buffered_write_file(compressed_file)
# Utility functions/variables from numpy required for writing arrays.
# We need at least the functions introduced in version 1.9 of numpy. Here,
# we use the ones from numpy 1.10.2.
BUFFER_SIZE = 2**18  # size of buffer for reading npz files in bytes (256 KiB)
def _read_bytes(fp, size, error_template="ran out of data"):
    """Read from file-like object until size bytes are read.

    TODO python2_drop: is it still needed? The docstring mentions python 2.6
    and it looks like this can be at least simplified ...

    Raises ValueError if EOF is encountered before size bytes are read.
    Non-blocking objects only supported if they derive from io objects.

    Required as e.g. ZipExtFile in python 2.6 can return less data than
    requested.

    This function was taken from numpy/lib/format.py in version 1.10.2.

    Parameters
    ----------
    fp: file-like object
    size: int
    error_template: str
        Description of what was being read, inserted in the error message.

    Returns
    -------
    a bytes object
        The data read in bytes.

    Raises
    ------
    ValueError
        If the stream ends before ``size`` bytes could be read.
    """
    # Collect the pieces and join once at the end instead of repeatedly
    # concatenating bytes objects.
    chunks = []
    received = 0
    while True:
        # io files (default in python3) return None or raise on
        # would-block; regular files can't be non-blocking.
        try:
            chunk = fp.read(size - received)
        except io.BlockingIOError:
            continue
        if chunk is None:
            # Non-blocking raw stream with no data available yet: retry,
            # exactly like the BlockingIOError case above.
            continue
        chunks.append(chunk)
        received += len(chunk)
        # An empty read means EOF; otherwise stop once everything arrived.
        if len(chunk) == 0 or received == size:
            break

    data = b"".join(chunks)
    if len(data) != size:
        msg = "EOF: reading %s, expected %d bytes got %d"
        raise ValueError(msg % (error_template, size, len(data)))
    return data
def _reconstruct(*args, **kwargs):
    """Call numpy's private ``multiarray._reconstruct`` on any numpy version.

    Wrapper for numpy._core.multiarray._reconstruct with backward compat
    for numpy 1.X. All positional and keyword arguments are forwarded
    unchanged and the result of the numpy function is returned.

    The function is located by trying the ``numpy._core`` location first and
    falling back to the legacy ``numpy.core`` one, rather than by parsing
    ``np.__version__``: a version string starting with neither "1." nor "2."
    would otherwise leave the function unbound.

    XXX: Remove this function when numpy 1.X is not supported anymore
    """
    try:
        from numpy._core.multiarray import _reconstruct as np_reconstruct
    except ImportError:
        # numpy 1.X releases without the ``numpy._core`` package.
        from numpy.core.multiarray import _reconstruct as np_reconstruct
    return np_reconstruct(*args, **kwargs)
|