_censored_data.py 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459
  1. import numpy as np
  2. def _validate_1d(a, name, allow_inf=False):
  3. if np.ndim(a) != 1:
  4. raise ValueError(f'`{name}` must be a one-dimensional sequence.')
  5. if np.isnan(a).any():
  6. raise ValueError(f'`{name}` must not contain nan.')
  7. if not allow_inf and np.isinf(a).any():
  8. raise ValueError(f'`{name}` must contain only finite values.')
  9. def _validate_interval(interval):
  10. interval = np.asarray(interval)
  11. if interval.shape == (0,):
  12. # The input was a sequence with length 0.
  13. interval = interval.reshape((0, 2))
  14. if interval.ndim != 2 or interval.shape[-1] != 2:
  15. raise ValueError('`interval` must be a two-dimensional array with '
  16. 'shape (m, 2), where m is the number of '
  17. 'interval-censored values, but got shape '
  18. f'{interval.shape}')
  19. if np.isnan(interval).any():
  20. raise ValueError('`interval` must not contain nan.')
  21. if np.isinf(interval).all(axis=1).any():
  22. raise ValueError('In each row in `interval`, both values must not'
  23. ' be infinite.')
  24. if (interval[:, 0] > interval[:, 1]).any():
  25. raise ValueError('In each row of `interval`, the left value must not'
  26. ' exceed the right value.')
  27. uncensored_mask = interval[:, 0] == interval[:, 1]
  28. left_mask = np.isinf(interval[:, 0])
  29. right_mask = np.isinf(interval[:, 1])
  30. interval_mask = np.isfinite(interval).all(axis=1) & ~uncensored_mask
  31. uncensored2 = interval[uncensored_mask, 0]
  32. left2 = interval[left_mask, 1]
  33. right2 = interval[right_mask, 0]
  34. interval2 = interval[interval_mask]
  35. return uncensored2, left2, right2, interval2
  36. def _validate_x_censored(x, censored):
  37. x = np.asarray(x)
  38. if x.ndim != 1:
  39. raise ValueError('`x` must be one-dimensional.')
  40. censored = np.asarray(censored)
  41. if censored.ndim != 1:
  42. raise ValueError('`censored` must be one-dimensional.')
  43. if (~np.isfinite(x)).any():
  44. raise ValueError('`x` must not contain nan or inf.')
  45. if censored.size != x.size:
  46. raise ValueError('`x` and `censored` must have the same length.')
  47. return x, censored.astype(bool)
  48. class CensoredData:
  49. """
  50. Instances of this class represent censored data.
  51. Instances may be passed to the ``fit`` method of continuous
  52. univariate SciPy distributions for maximum likelihood estimation.
  53. The *only* method of the univariate continuous distributions that
  54. understands `CensoredData` is the ``fit`` method. An instance of
  55. `CensoredData` can not be passed to methods such as ``pdf`` and
  56. ``cdf``.
  57. An observation is said to be *censored* when the precise value is unknown,
  58. but it has a known upper and/or lower bound. The conventional terminology
  59. is:
  60. * left-censored: an observation is below a certain value but it is
  61. unknown by how much.
  62. * right-censored: an observation is above a certain value but it is
  63. unknown by how much.
  64. * interval-censored: an observation lies somewhere on an interval between
  65. two values.
  66. Left-, right-, and interval-censored data can be represented by
  67. `CensoredData`.
  68. For convenience, the class methods ``left_censored`` and
  69. ``right_censored`` are provided to create a `CensoredData`
  70. instance from a single one-dimensional array of measurements
  71. and a corresponding boolean array to indicate which measurements
  72. are censored. The class method ``interval_censored`` accepts two
  73. one-dimensional arrays that hold the lower and upper bounds of the
  74. intervals.
  75. Parameters
  76. ----------
  77. uncensored : array_like, 1D
  78. Uncensored observations.
  79. left : array_like, 1D
  80. Left-censored observations.
  81. right : array_like, 1D
  82. Right-censored observations.
  83. interval : array_like, 2D, with shape (m, 2)
  84. Interval-censored observations. Each row ``interval[k, :]``
  85. represents the interval for the kth interval-censored observation.
  86. Notes
  87. -----
  88. In the input array `interval`, the lower bound of the interval may
  89. be ``-inf``, and the upper bound may be ``inf``, but at least one must be
  90. finite. When the lower bound is ``-inf``, the row represents a left-
  91. censored observation, and when the upper bound is ``inf``, the row
  92. represents a right-censored observation. If the length of an interval
  93. is 0 (i.e. ``interval[k, 0] == interval[k, 1]``, the observation is
  94. treated as uncensored. So one can represent all the types of censored
  95. and uncensored data in ``interval``, but it is generally more convenient
  96. to use `uncensored`, `left` and `right` for uncensored, left-censored and
  97. right-censored observations, respectively.
  98. Examples
  99. --------
  100. In the most general case, a censored data set may contain values that
  101. are left-censored, right-censored, interval-censored, and uncensored.
  102. For example, here we create a data set with five observations. Two
  103. are uncensored (values 1 and 1.5), one is a left-censored observation
  104. of 0, one is a right-censored observation of 10 and one is
  105. interval-censored in the interval [2, 3].
  106. >>> import numpy as np
  107. >>> from scipy.stats import CensoredData
  108. >>> data = CensoredData(uncensored=[1, 1.5], left=[0], right=[10],
  109. ... interval=[[2, 3]])
  110. >>> print(data)
  111. CensoredData(5 values: 2 not censored, 1 left-censored,
  112. 1 right-censored, 1 interval-censored)
  113. Equivalently,
  114. >>> data = CensoredData(interval=[[1, 1],
  115. ... [1.5, 1.5],
  116. ... [-np.inf, 0],
  117. ... [10, np.inf],
  118. ... [2, 3]])
  119. >>> print(data)
  120. CensoredData(5 values: 2 not censored, 1 left-censored,
  121. 1 right-censored, 1 interval-censored)
  122. A common case is to have a mix of uncensored observations and censored
  123. observations that are all right-censored (or all left-censored). For
  124. example, consider an experiment in which six devices are started at
  125. various times and left running until they fail. Assume that time is
  126. measured in hours, and the experiment is stopped after 30 hours, even
  127. if all the devices have not failed by that time. We might end up with
  128. data such as this::
  129. Device Start-time Fail-time Time-to-failure
  130. 1 0 13 13
  131. 2 2 24 22
  132. 3 5 22 17
  133. 4 8 23 15
  134. 5 10 *** >20
  135. 6 12 *** >18
  136. Two of the devices had not failed when the experiment was stopped;
  137. the observations of the time-to-failure for these two devices are
  138. right-censored. We can represent this data with
  139. >>> data = CensoredData(uncensored=[13, 22, 17, 15], right=[20, 18])
  140. >>> print(data)
  141. CensoredData(6 values: 4 not censored, 2 right-censored)
  142. Alternatively, we can use the method `CensoredData.right_censored` to
  143. create a representation of this data. The time-to-failure observations
  144. are put the list ``ttf``. The ``censored`` list indicates which values
  145. in ``ttf`` are censored.
  146. >>> ttf = [13, 22, 17, 15, 20, 18]
  147. >>> censored = [False, False, False, False, True, True]
  148. Pass these lists to `CensoredData.right_censored` to create an
  149. instance of `CensoredData`.
  150. >>> data = CensoredData.right_censored(ttf, censored)
  151. >>> print(data)
  152. CensoredData(6 values: 4 not censored, 2 right-censored)
  153. If the input data is interval censored and already stored in two
  154. arrays, one holding the low end of the intervals and another
  155. holding the high ends, the class method ``interval_censored`` can
  156. be used to create the `CensoredData` instance.
  157. This example creates an instance with four interval-censored values.
  158. The intervals are [10, 11], [0.5, 1], [2, 3], and [12.5, 13.5].
  159. >>> a = [10, 0.5, 2, 12.5] # Low ends of the intervals
  160. >>> b = [11, 1.0, 3, 13.5] # High ends of the intervals
  161. >>> data = CensoredData.interval_censored(low=a, high=b)
  162. >>> print(data)
  163. CensoredData(4 values: 0 not censored, 4 interval-censored)
  164. Finally, we create and censor some data from the `weibull_min`
  165. distribution, and then fit `weibull_min` to that data. We'll assume
  166. that the location parameter is known to be 0.
  167. >>> from scipy.stats import weibull_min
  168. >>> rng = np.random.default_rng()
  169. Create the random data set.
  170. >>> x = weibull_min.rvs(2.5, loc=0, scale=30, size=250, random_state=rng)
  171. >>> x[x > 40] = 40 # Right-censor values greater or equal to 40.
  172. Create the `CensoredData` instance with the `right_censored` method.
  173. The censored values are those where the value is 40.
  174. >>> data = CensoredData.right_censored(x, x == 40)
  175. >>> print(data)
  176. CensoredData(250 values: 215 not censored, 35 right-censored)
  177. 35 values have been right-censored.
  178. Fit `weibull_min` to the censored data. We expect to shape and scale
  179. to be approximately 2.5 and 30, respectively.
  180. >>> weibull_min.fit(data, floc=0)
  181. (2.3575922823897315, 0, 30.40650074451254)
  182. """
  183. def __init__(self, uncensored=None, *, left=None, right=None,
  184. interval=None):
  185. if uncensored is None:
  186. uncensored = []
  187. if left is None:
  188. left = []
  189. if right is None:
  190. right = []
  191. if interval is None:
  192. interval = np.empty((0, 2))
  193. _validate_1d(uncensored, 'uncensored')
  194. _validate_1d(left, 'left')
  195. _validate_1d(right, 'right')
  196. uncensored2, left2, right2, interval2 = _validate_interval(interval)
  197. self._uncensored = np.concatenate((uncensored, uncensored2))
  198. self._left = np.concatenate((left, left2))
  199. self._right = np.concatenate((right, right2))
  200. # Note that by construction, the private attribute _interval
  201. # will be a 2D array that contains only finite values representing
  202. # intervals with nonzero but finite length.
  203. self._interval = interval2
  204. def __repr__(self):
  205. uncensored_str = " ".join(np.array_repr(self._uncensored).split())
  206. left_str = " ".join(np.array_repr(self._left).split())
  207. right_str = " ".join(np.array_repr(self._right).split())
  208. interval_str = " ".join(np.array_repr(self._interval).split())
  209. return (f"CensoredData(uncensored={uncensored_str}, left={left_str}, "
  210. f"right={right_str}, interval={interval_str})")
  211. def __str__(self):
  212. num_nc = len(self._uncensored)
  213. num_lc = len(self._left)
  214. num_rc = len(self._right)
  215. num_ic = len(self._interval)
  216. n = num_nc + num_lc + num_rc + num_ic
  217. parts = [f'{num_nc} not censored']
  218. if num_lc > 0:
  219. parts.append(f'{num_lc} left-censored')
  220. if num_rc > 0:
  221. parts.append(f'{num_rc} right-censored')
  222. if num_ic > 0:
  223. parts.append(f'{num_ic} interval-censored')
  224. return f'CensoredData({n} values: ' + ', '.join(parts) + ')'
  225. # This is not a complete implementation of the arithmetic operators.
  226. # All we need is subtracting a scalar and dividing by a scalar.
  227. def __sub__(self, other):
  228. return CensoredData(uncensored=self._uncensored - other,
  229. left=self._left - other,
  230. right=self._right - other,
  231. interval=self._interval - other)
  232. def __truediv__(self, other):
  233. return CensoredData(uncensored=self._uncensored / other,
  234. left=self._left / other,
  235. right=self._right / other,
  236. interval=self._interval / other)
  237. def __len__(self):
  238. """
  239. The number of values (censored and not censored).
  240. """
  241. return (len(self._uncensored) + len(self._left) + len(self._right)
  242. + len(self._interval))
  243. def num_censored(self):
  244. """
  245. Number of censored values.
  246. """
  247. return len(self._left) + len(self._right) + len(self._interval)
  248. @classmethod
  249. def right_censored(cls, x, censored):
  250. """
  251. Create a `CensoredData` instance of right-censored data.
  252. Parameters
  253. ----------
  254. x : array_like
  255. `x` is the array of observed data or measurements.
  256. `x` must be a one-dimensional sequence of finite numbers.
  257. censored : array_like of bool
  258. `censored` must be a one-dimensional sequence of boolean
  259. values. If ``censored[k]`` is True, the corresponding value
  260. in `x` is right-censored. That is, the value ``x[k]``
  261. is the lower bound of the true (but unknown) value.
  262. Returns
  263. -------
  264. data : `CensoredData`
  265. An instance of `CensoredData` that represents the
  266. collection of uncensored and right-censored values.
  267. Examples
  268. --------
  269. >>> from scipy.stats import CensoredData
  270. Two uncensored values (4 and 10) and two right-censored values
  271. (24 and 25).
  272. >>> data = CensoredData.right_censored([4, 10, 24, 25],
  273. ... [False, False, True, True])
  274. >>> data
  275. CensoredData(uncensored=array([ 4., 10.]),
  276. left=array([], dtype=float64), right=array([24., 25.]),
  277. interval=array([], shape=(0, 2), dtype=float64))
  278. >>> print(data)
  279. CensoredData(4 values: 2 not censored, 2 right-censored)
  280. """
  281. x, censored = _validate_x_censored(x, censored)
  282. return cls(uncensored=x[~censored], right=x[censored])
  283. @classmethod
  284. def left_censored(cls, x, censored):
  285. """
  286. Create a `CensoredData` instance of left-censored data.
  287. Parameters
  288. ----------
  289. x : array_like
  290. `x` is the array of observed data or measurements.
  291. `x` must be a one-dimensional sequence of finite numbers.
  292. censored : array_like of bool
  293. `censored` must be a one-dimensional sequence of boolean
  294. values. If ``censored[k]`` is True, the corresponding value
  295. in `x` is left-censored. That is, the value ``x[k]``
  296. is the upper bound of the true (but unknown) value.
  297. Returns
  298. -------
  299. data : `CensoredData`
  300. An instance of `CensoredData` that represents the
  301. collection of uncensored and left-censored values.
  302. Examples
  303. --------
  304. >>> from scipy.stats import CensoredData
  305. Two uncensored values (0.12 and 0.033) and two left-censored values
  306. (both 1e-3).
  307. >>> data = CensoredData.left_censored([0.12, 0.033, 1e-3, 1e-3],
  308. ... [False, False, True, True])
  309. >>> data
  310. CensoredData(uncensored=array([0.12 , 0.033]),
  311. left=array([0.001, 0.001]), right=array([], dtype=float64),
  312. interval=array([], shape=(0, 2), dtype=float64))
  313. >>> print(data)
  314. CensoredData(4 values: 2 not censored, 2 left-censored)
  315. """
  316. x, censored = _validate_x_censored(x, censored)
  317. return cls(uncensored=x[~censored], left=x[censored])
  318. @classmethod
  319. def interval_censored(cls, low, high):
  320. """
  321. Create a `CensoredData` instance of interval-censored data.
  322. This method is useful when all the data is interval-censored, and
  323. the low and high ends of the intervals are already stored in
  324. separate one-dimensional arrays.
  325. Parameters
  326. ----------
  327. low : array_like
  328. The one-dimensional array containing the low ends of the
  329. intervals.
  330. high : array_like
  331. The one-dimensional array containing the high ends of the
  332. intervals.
  333. Returns
  334. -------
  335. data : `CensoredData`
  336. An instance of `CensoredData` that represents the
  337. collection of censored values.
  338. Examples
  339. --------
  340. >>> import numpy as np
  341. >>> from scipy.stats import CensoredData
  342. ``a`` and ``b`` are the low and high ends of a collection of
  343. interval-censored values.
  344. >>> a = [0.5, 2.0, 3.0, 5.5]
  345. >>> b = [1.0, 2.5, 3.5, 7.0]
  346. >>> data = CensoredData.interval_censored(low=a, high=b)
  347. >>> print(data)
  348. CensoredData(4 values: 0 not censored, 4 interval-censored)
  349. """
  350. _validate_1d(low, 'low', allow_inf=True)
  351. _validate_1d(high, 'high', allow_inf=True)
  352. if len(low) != len(high):
  353. raise ValueError('`low` and `high` must have the same length.')
  354. interval = np.column_stack((low, high))
  355. uncensored, left, right, interval = _validate_interval(interval)
  356. return cls(uncensored=uncensored, left=left, right=right,
  357. interval=interval)
  358. def _uncensor(self):
  359. """
  360. This function is used when a non-censored version of the data
  361. is needed to create a rough estimate of the parameters of a
  362. distribution via the method of moments or some similar method.
  363. The data is "uncensored" by taking the given endpoints as the
  364. data for the left- or right-censored data, and the mean for the
  365. interval-censored data.
  366. """
  367. data = np.concatenate((self._uncensored, self._left, self._right,
  368. self._interval.mean(axis=1)))
  369. return data
  370. def _supported(self, a, b):
  371. """
  372. Return a subset of self containing the values that are in
  373. (or overlap with) the interval (a, b).
  374. """
  375. uncensored = self._uncensored
  376. uncensored = uncensored[(a < uncensored) & (uncensored < b)]
  377. left = self._left
  378. left = left[a < left]
  379. right = self._right
  380. right = right[right < b]
  381. interval = self._interval
  382. interval = interval[(a < interval[:, 1]) & (interval[:, 0] < b)]
  383. return CensoredData(uncensored, left=left, right=right,
  384. interval=interval)