| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459 |
- import numpy as np
- def _validate_1d(a, name, allow_inf=False):
- if np.ndim(a) != 1:
- raise ValueError(f'`{name}` must be a one-dimensional sequence.')
- if np.isnan(a).any():
- raise ValueError(f'`{name}` must not contain nan.')
- if not allow_inf and np.isinf(a).any():
- raise ValueError(f'`{name}` must contain only finite values.')
- def _validate_interval(interval):
- interval = np.asarray(interval)
- if interval.shape == (0,):
- # The input was a sequence with length 0.
- interval = interval.reshape((0, 2))
- if interval.ndim != 2 or interval.shape[-1] != 2:
- raise ValueError('`interval` must be a two-dimensional array with '
- 'shape (m, 2), where m is the number of '
- 'interval-censored values, but got shape '
- f'{interval.shape}')
- if np.isnan(interval).any():
- raise ValueError('`interval` must not contain nan.')
- if np.isinf(interval).all(axis=1).any():
- raise ValueError('In each row in `interval`, both values must not'
- ' be infinite.')
- if (interval[:, 0] > interval[:, 1]).any():
- raise ValueError('In each row of `interval`, the left value must not'
- ' exceed the right value.')
- uncensored_mask = interval[:, 0] == interval[:, 1]
- left_mask = np.isinf(interval[:, 0])
- right_mask = np.isinf(interval[:, 1])
- interval_mask = np.isfinite(interval).all(axis=1) & ~uncensored_mask
- uncensored2 = interval[uncensored_mask, 0]
- left2 = interval[left_mask, 1]
- right2 = interval[right_mask, 0]
- interval2 = interval[interval_mask]
- return uncensored2, left2, right2, interval2
- def _validate_x_censored(x, censored):
- x = np.asarray(x)
- if x.ndim != 1:
- raise ValueError('`x` must be one-dimensional.')
- censored = np.asarray(censored)
- if censored.ndim != 1:
- raise ValueError('`censored` must be one-dimensional.')
- if (~np.isfinite(x)).any():
- raise ValueError('`x` must not contain nan or inf.')
- if censored.size != x.size:
- raise ValueError('`x` and `censored` must have the same length.')
- return x, censored.astype(bool)
- class CensoredData:
- """
- Instances of this class represent censored data.
- Instances may be passed to the ``fit`` method of continuous
- univariate SciPy distributions for maximum likelihood estimation.
- The *only* method of the univariate continuous distributions that
- understands `CensoredData` is the ``fit`` method. An instance of
- `CensoredData` can not be passed to methods such as ``pdf`` and
- ``cdf``.
- An observation is said to be *censored* when the precise value is unknown,
- but it has a known upper and/or lower bound. The conventional terminology
- is:
- * left-censored: an observation is below a certain value but it is
- unknown by how much.
- * right-censored: an observation is above a certain value but it is
- unknown by how much.
- * interval-censored: an observation lies somewhere on an interval between
- two values.
- Left-, right-, and interval-censored data can be represented by
- `CensoredData`.
- For convenience, the class methods ``left_censored`` and
- ``right_censored`` are provided to create a `CensoredData`
- instance from a single one-dimensional array of measurements
- and a corresponding boolean array to indicate which measurements
- are censored. The class method ``interval_censored`` accepts two
- one-dimensional arrays that hold the lower and upper bounds of the
- intervals.
- Parameters
- ----------
- uncensored : array_like, 1D
- Uncensored observations.
- left : array_like, 1D
- Left-censored observations.
- right : array_like, 1D
- Right-censored observations.
- interval : array_like, 2D, with shape (m, 2)
- Interval-censored observations. Each row ``interval[k, :]``
- represents the interval for the kth interval-censored observation.
- Notes
- -----
- In the input array `interval`, the lower bound of the interval may
- be ``-inf``, and the upper bound may be ``inf``, but at least one must be
- finite. When the lower bound is ``-inf``, the row represents a left-
- censored observation, and when the upper bound is ``inf``, the row
- represents a right-censored observation. If the length of an interval
- is 0 (i.e. ``interval[k, 0] == interval[k, 1]``, the observation is
- treated as uncensored. So one can represent all the types of censored
- and uncensored data in ``interval``, but it is generally more convenient
- to use `uncensored`, `left` and `right` for uncensored, left-censored and
- right-censored observations, respectively.
- Examples
- --------
- In the most general case, a censored data set may contain values that
- are left-censored, right-censored, interval-censored, and uncensored.
- For example, here we create a data set with five observations. Two
- are uncensored (values 1 and 1.5), one is a left-censored observation
- of 0, one is a right-censored observation of 10 and one is
- interval-censored in the interval [2, 3].
- >>> import numpy as np
- >>> from scipy.stats import CensoredData
- >>> data = CensoredData(uncensored=[1, 1.5], left=[0], right=[10],
- ... interval=[[2, 3]])
- >>> print(data)
- CensoredData(5 values: 2 not censored, 1 left-censored,
- 1 right-censored, 1 interval-censored)
- Equivalently,
- >>> data = CensoredData(interval=[[1, 1],
- ... [1.5, 1.5],
- ... [-np.inf, 0],
- ... [10, np.inf],
- ... [2, 3]])
- >>> print(data)
- CensoredData(5 values: 2 not censored, 1 left-censored,
- 1 right-censored, 1 interval-censored)
- A common case is to have a mix of uncensored observations and censored
- observations that are all right-censored (or all left-censored). For
- example, consider an experiment in which six devices are started at
- various times and left running until they fail. Assume that time is
- measured in hours, and the experiment is stopped after 30 hours, even
- if all the devices have not failed by that time. We might end up with
- data such as this::
- Device Start-time Fail-time Time-to-failure
- 1 0 13 13
- 2 2 24 22
- 3 5 22 17
- 4 8 23 15
- 5 10 *** >20
- 6 12 *** >18
- Two of the devices had not failed when the experiment was stopped;
- the observations of the time-to-failure for these two devices are
- right-censored. We can represent this data with
- >>> data = CensoredData(uncensored=[13, 22, 17, 15], right=[20, 18])
- >>> print(data)
- CensoredData(6 values: 4 not censored, 2 right-censored)
- Alternatively, we can use the method `CensoredData.right_censored` to
- create a representation of this data. The time-to-failure observations
- are put the list ``ttf``. The ``censored`` list indicates which values
- in ``ttf`` are censored.
- >>> ttf = [13, 22, 17, 15, 20, 18]
- >>> censored = [False, False, False, False, True, True]
- Pass these lists to `CensoredData.right_censored` to create an
- instance of `CensoredData`.
- >>> data = CensoredData.right_censored(ttf, censored)
- >>> print(data)
- CensoredData(6 values: 4 not censored, 2 right-censored)
- If the input data is interval censored and already stored in two
- arrays, one holding the low end of the intervals and another
- holding the high ends, the class method ``interval_censored`` can
- be used to create the `CensoredData` instance.
- This example creates an instance with four interval-censored values.
- The intervals are [10, 11], [0.5, 1], [2, 3], and [12.5, 13.5].
- >>> a = [10, 0.5, 2, 12.5] # Low ends of the intervals
- >>> b = [11, 1.0, 3, 13.5] # High ends of the intervals
- >>> data = CensoredData.interval_censored(low=a, high=b)
- >>> print(data)
- CensoredData(4 values: 0 not censored, 4 interval-censored)
- Finally, we create and censor some data from the `weibull_min`
- distribution, and then fit `weibull_min` to that data. We'll assume
- that the location parameter is known to be 0.
- >>> from scipy.stats import weibull_min
- >>> rng = np.random.default_rng()
- Create the random data set.
- >>> x = weibull_min.rvs(2.5, loc=0, scale=30, size=250, random_state=rng)
- >>> x[x > 40] = 40 # Right-censor values greater or equal to 40.
- Create the `CensoredData` instance with the `right_censored` method.
- The censored values are those where the value is 40.
- >>> data = CensoredData.right_censored(x, x == 40)
- >>> print(data)
- CensoredData(250 values: 215 not censored, 35 right-censored)
- 35 values have been right-censored.
- Fit `weibull_min` to the censored data. We expect to shape and scale
- to be approximately 2.5 and 30, respectively.
- >>> weibull_min.fit(data, floc=0)
- (2.3575922823897315, 0, 30.40650074451254)
- """
- def __init__(self, uncensored=None, *, left=None, right=None,
- interval=None):
- if uncensored is None:
- uncensored = []
- if left is None:
- left = []
- if right is None:
- right = []
- if interval is None:
- interval = np.empty((0, 2))
- _validate_1d(uncensored, 'uncensored')
- _validate_1d(left, 'left')
- _validate_1d(right, 'right')
- uncensored2, left2, right2, interval2 = _validate_interval(interval)
- self._uncensored = np.concatenate((uncensored, uncensored2))
- self._left = np.concatenate((left, left2))
- self._right = np.concatenate((right, right2))
- # Note that by construction, the private attribute _interval
- # will be a 2D array that contains only finite values representing
- # intervals with nonzero but finite length.
- self._interval = interval2
- def __repr__(self):
- uncensored_str = " ".join(np.array_repr(self._uncensored).split())
- left_str = " ".join(np.array_repr(self._left).split())
- right_str = " ".join(np.array_repr(self._right).split())
- interval_str = " ".join(np.array_repr(self._interval).split())
- return (f"CensoredData(uncensored={uncensored_str}, left={left_str}, "
- f"right={right_str}, interval={interval_str})")
- def __str__(self):
- num_nc = len(self._uncensored)
- num_lc = len(self._left)
- num_rc = len(self._right)
- num_ic = len(self._interval)
- n = num_nc + num_lc + num_rc + num_ic
- parts = [f'{num_nc} not censored']
- if num_lc > 0:
- parts.append(f'{num_lc} left-censored')
- if num_rc > 0:
- parts.append(f'{num_rc} right-censored')
- if num_ic > 0:
- parts.append(f'{num_ic} interval-censored')
- return f'CensoredData({n} values: ' + ', '.join(parts) + ')'
- # This is not a complete implementation of the arithmetic operators.
- # All we need is subtracting a scalar and dividing by a scalar.
- def __sub__(self, other):
- return CensoredData(uncensored=self._uncensored - other,
- left=self._left - other,
- right=self._right - other,
- interval=self._interval - other)
- def __truediv__(self, other):
- return CensoredData(uncensored=self._uncensored / other,
- left=self._left / other,
- right=self._right / other,
- interval=self._interval / other)
- def __len__(self):
- """
- The number of values (censored and not censored).
- """
- return (len(self._uncensored) + len(self._left) + len(self._right)
- + len(self._interval))
- def num_censored(self):
- """
- Number of censored values.
- """
- return len(self._left) + len(self._right) + len(self._interval)
- @classmethod
- def right_censored(cls, x, censored):
- """
- Create a `CensoredData` instance of right-censored data.
- Parameters
- ----------
- x : array_like
- `x` is the array of observed data or measurements.
- `x` must be a one-dimensional sequence of finite numbers.
- censored : array_like of bool
- `censored` must be a one-dimensional sequence of boolean
- values. If ``censored[k]`` is True, the corresponding value
- in `x` is right-censored. That is, the value ``x[k]``
- is the lower bound of the true (but unknown) value.
- Returns
- -------
- data : `CensoredData`
- An instance of `CensoredData` that represents the
- collection of uncensored and right-censored values.
- Examples
- --------
- >>> from scipy.stats import CensoredData
- Two uncensored values (4 and 10) and two right-censored values
- (24 and 25).
- >>> data = CensoredData.right_censored([4, 10, 24, 25],
- ... [False, False, True, True])
- >>> data
- CensoredData(uncensored=array([ 4., 10.]),
- left=array([], dtype=float64), right=array([24., 25.]),
- interval=array([], shape=(0, 2), dtype=float64))
- >>> print(data)
- CensoredData(4 values: 2 not censored, 2 right-censored)
- """
- x, censored = _validate_x_censored(x, censored)
- return cls(uncensored=x[~censored], right=x[censored])
- @classmethod
- def left_censored(cls, x, censored):
- """
- Create a `CensoredData` instance of left-censored data.
- Parameters
- ----------
- x : array_like
- `x` is the array of observed data or measurements.
- `x` must be a one-dimensional sequence of finite numbers.
- censored : array_like of bool
- `censored` must be a one-dimensional sequence of boolean
- values. If ``censored[k]`` is True, the corresponding value
- in `x` is left-censored. That is, the value ``x[k]``
- is the upper bound of the true (but unknown) value.
- Returns
- -------
- data : `CensoredData`
- An instance of `CensoredData` that represents the
- collection of uncensored and left-censored values.
- Examples
- --------
- >>> from scipy.stats import CensoredData
- Two uncensored values (0.12 and 0.033) and two left-censored values
- (both 1e-3).
- >>> data = CensoredData.left_censored([0.12, 0.033, 1e-3, 1e-3],
- ... [False, False, True, True])
- >>> data
- CensoredData(uncensored=array([0.12 , 0.033]),
- left=array([0.001, 0.001]), right=array([], dtype=float64),
- interval=array([], shape=(0, 2), dtype=float64))
- >>> print(data)
- CensoredData(4 values: 2 not censored, 2 left-censored)
- """
- x, censored = _validate_x_censored(x, censored)
- return cls(uncensored=x[~censored], left=x[censored])
- @classmethod
- def interval_censored(cls, low, high):
- """
- Create a `CensoredData` instance of interval-censored data.
- This method is useful when all the data is interval-censored, and
- the low and high ends of the intervals are already stored in
- separate one-dimensional arrays.
- Parameters
- ----------
- low : array_like
- The one-dimensional array containing the low ends of the
- intervals.
- high : array_like
- The one-dimensional array containing the high ends of the
- intervals.
- Returns
- -------
- data : `CensoredData`
- An instance of `CensoredData` that represents the
- collection of censored values.
- Examples
- --------
- >>> import numpy as np
- >>> from scipy.stats import CensoredData
- ``a`` and ``b`` are the low and high ends of a collection of
- interval-censored values.
- >>> a = [0.5, 2.0, 3.0, 5.5]
- >>> b = [1.0, 2.5, 3.5, 7.0]
- >>> data = CensoredData.interval_censored(low=a, high=b)
- >>> print(data)
- CensoredData(4 values: 0 not censored, 4 interval-censored)
- """
- _validate_1d(low, 'low', allow_inf=True)
- _validate_1d(high, 'high', allow_inf=True)
- if len(low) != len(high):
- raise ValueError('`low` and `high` must have the same length.')
- interval = np.column_stack((low, high))
- uncensored, left, right, interval = _validate_interval(interval)
- return cls(uncensored=uncensored, left=left, right=right,
- interval=interval)
- def _uncensor(self):
- """
- This function is used when a non-censored version of the data
- is needed to create a rough estimate of the parameters of a
- distribution via the method of moments or some similar method.
- The data is "uncensored" by taking the given endpoints as the
- data for the left- or right-censored data, and the mean for the
- interval-censored data.
- """
- data = np.concatenate((self._uncensored, self._left, self._right,
- self._interval.mean(axis=1)))
- return data
- def _supported(self, a, b):
- """
- Return a subset of self containing the values that are in
- (or overlap with) the interval (a, b).
- """
- uncensored = self._uncensored
- uncensored = uncensored[(a < uncensored) & (uncensored < b)]
- left = self._left
- left = left[a < left]
- right = self._right
- right = right[right < b]
- interval = self._interval
- interval = interval[(a < interval[:, 1]) & (interval[:, 0] < b)]
- return CensoredData(uncensored, left=left, right=right,
- interval=interval)
|