_survival.py 25 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686
  1. from dataclasses import dataclass, field
  2. from typing import TYPE_CHECKING, Literal
  3. import warnings
  4. import numpy as np
  5. from scipy import special, interpolate, stats
  6. from scipy._lib._array_api import xp_capabilities
  7. from scipy.stats._censored_data import CensoredData
  8. from scipy.stats._common import ConfidenceInterval
  9. if TYPE_CHECKING:
  10. import numpy.typing as npt
  11. __all__ = ['ecdf', 'logrank']
  12. @dataclass
  13. class EmpiricalDistributionFunction:
  14. """An empirical distribution function produced by `scipy.stats.ecdf`
  15. Attributes
  16. ----------
  17. quantiles : ndarray
  18. The unique values of the sample from which the
  19. `EmpiricalDistributionFunction` was estimated.
  20. probabilities : ndarray
  21. The point estimates of the cumulative distribution function (CDF) or
  22. its complement, the survival function (SF), corresponding with
  23. `quantiles`.
  24. """
  25. quantiles: np.ndarray
  26. probabilities: np.ndarray
  27. # Exclude these from __str__
  28. _n: np.ndarray = field(repr=False) # number "at risk"
  29. _d: np.ndarray = field(repr=False) # number of "deaths"
  30. _sf: np.ndarray = field(repr=False) # survival function for var estimate
  31. _kind: str = field(repr=False) # type of function: "cdf" or "sf"
  32. def __init__(self, q, p, n, d, kind):
  33. self.probabilities = p
  34. self.quantiles = q
  35. self._n = n
  36. self._d = d
  37. self._sf = p if kind == 'sf' else 1 - p
  38. self._kind = kind
  39. f0 = 1 if kind == 'sf' else 0 # leftmost function value
  40. f1 = 1 - f0
  41. # fill_value can't handle edge cases at infinity
  42. x = np.insert(q, [0, len(q)], [-np.inf, np.inf])
  43. y = np.insert(p, [0, len(p)], [f0, f1])
  44. # `or` conditions handle the case of empty x, points
  45. self._f = interpolate.interp1d(x, y, kind='previous',
  46. assume_sorted=True)
  47. def evaluate(self, x):
  48. """Evaluate the empirical CDF/SF function at the input.
  49. Parameters
  50. ----------
  51. x : ndarray
  52. Argument to the CDF/SF
  53. Returns
  54. -------
  55. y : ndarray
  56. The CDF/SF evaluated at the input
  57. """
  58. return self._f(x)
  59. def plot(self, ax=None, **matplotlib_kwargs):
  60. """Plot the empirical distribution function
  61. Available only if ``matplotlib`` is installed.
  62. Parameters
  63. ----------
  64. ax : matplotlib.axes.Axes
  65. Axes object to draw the plot onto, otherwise uses the current Axes.
  66. **matplotlib_kwargs : dict, optional
  67. Keyword arguments passed directly to `matplotlib.axes.Axes.step`.
  68. Unless overridden, ``where='post'``.
  69. Returns
  70. -------
  71. lines : list of `matplotlib.lines.Line2D`
  72. Objects representing the plotted data
  73. """
  74. try:
  75. import matplotlib # noqa: F401
  76. except ModuleNotFoundError as exc:
  77. message = "matplotlib must be installed to use method `plot`."
  78. raise ModuleNotFoundError(message) from exc
  79. if ax is None:
  80. import matplotlib.pyplot as plt
  81. ax = plt.gca()
  82. kwargs = {'where': 'post'}
  83. kwargs.update(matplotlib_kwargs)
  84. delta = np.ptp(self.quantiles)*0.05 # how far past sample edge to plot
  85. q = self.quantiles
  86. q = [q[0] - delta] + list(q) + [q[-1] + delta]
  87. return ax.step(q, self.evaluate(q), **kwargs)
  88. def confidence_interval(self, confidence_level=0.95, *, method='linear'):
  89. """Compute a confidence interval around the CDF/SF point estimate
  90. Parameters
  91. ----------
  92. confidence_level : float, default: 0.95
  93. Confidence level for the computed confidence interval
  94. method : str, {"linear", "log-log"}
  95. Method used to compute the confidence interval. Options are
  96. "linear" for the conventional Greenwood confidence interval
  97. (default) and "log-log" for the "exponential Greenwood",
  98. log-negative-log-transformed confidence interval.
  99. Returns
  100. -------
  101. ci : ``ConfidenceInterval``
  102. An object with attributes ``low`` and ``high``, instances of
  103. `~scipy.stats._result_classes.EmpiricalDistributionFunction` that
  104. represent the lower and upper bounds (respectively) of the
  105. confidence interval.
  106. Notes
  107. -----
  108. Confidence intervals are computed according to the Greenwood formula
  109. (``method='linear'``) or the more recent "exponential Greenwood"
  110. formula (``method='log-log'``) as described in [1]_. The conventional
  111. Greenwood formula can result in lower confidence limits less than 0
  112. and upper confidence limits greater than 1; these are clipped to the
  113. unit interval. NaNs may be produced by either method; these are
  114. features of the formulas.
  115. References
  116. ----------
  117. .. [1] Sawyer, Stanley. "The Greenwood and Exponential Greenwood
  118. Confidence Intervals in Survival Analysis."
  119. https://www.math.wustl.edu/~sawyer/handouts/greenwood.pdf
  120. """
  121. message = ("Confidence interval bounds do not implement a "
  122. "`confidence_interval` method.")
  123. if self._n is None:
  124. raise NotImplementedError(message)
  125. methods = {'linear': self._linear_ci,
  126. 'log-log': self._loglog_ci}
  127. message = f"`method` must be one of {set(methods)}."
  128. if method.lower() not in methods:
  129. raise ValueError(message)
  130. message = "`confidence_level` must be a scalar between 0 and 1."
  131. confidence_level = np.asarray(confidence_level)[()]
  132. if confidence_level.shape or not (0 <= confidence_level <= 1):
  133. raise ValueError(message)
  134. method_fun = methods[method.lower()]
  135. low, high = method_fun(confidence_level)
  136. message = ("The confidence interval is undefined at some observations."
  137. " This is a feature of the mathematical formula used, not"
  138. " an error in its implementation.")
  139. if np.any(np.isnan(low) | np.isnan(high)):
  140. warnings.warn(message, RuntimeWarning, stacklevel=2)
  141. low, high = np.clip(low, 0, 1), np.clip(high, 0, 1)
  142. low = EmpiricalDistributionFunction(self.quantiles, low, None, None,
  143. self._kind)
  144. high = EmpiricalDistributionFunction(self.quantiles, high, None, None,
  145. self._kind)
  146. return ConfidenceInterval(low, high)
  147. def _linear_ci(self, confidence_level):
  148. sf, d, n = self._sf, self._d, self._n
  149. # When n == d, Greenwood's formula divides by zero.
  150. # When s != 0, this can be ignored: var == inf, and CI is [0, 1]
  151. # When s == 0, this results in NaNs. Produce an informative warning.
  152. with np.errstate(divide='ignore', invalid='ignore'):
  153. var = sf ** 2 * np.cumsum(d / (n * (n - d)))
  154. se = np.sqrt(var)
  155. z = special.ndtri(1 / 2 + confidence_level / 2)
  156. z_se = z * se
  157. low = self.probabilities - z_se
  158. high = self.probabilities + z_se
  159. return low, high
  160. def _loglog_ci(self, confidence_level):
  161. sf, d, n = self._sf, self._d, self._n
  162. with np.errstate(divide='ignore', invalid='ignore'):
  163. var = 1 / np.log(sf) ** 2 * np.cumsum(d / (n * (n - d)))
  164. se = np.sqrt(var)
  165. z = special.ndtri(1 / 2 + confidence_level / 2)
  166. with np.errstate(divide='ignore'):
  167. lnl_points = np.log(-np.log(sf))
  168. z_se = z * se
  169. low = np.exp(-np.exp(lnl_points + z_se))
  170. high = np.exp(-np.exp(lnl_points - z_se))
  171. if self._kind == "cdf":
  172. low, high = 1-high, 1-low
  173. return low, high
  174. @dataclass
  175. class ECDFResult:
  176. """ Result object returned by `scipy.stats.ecdf`
  177. Attributes
  178. ----------
  179. cdf : `~scipy.stats._result_classes.EmpiricalDistributionFunction`
  180. An object representing the empirical cumulative distribution function.
  181. sf : `~scipy.stats._result_classes.EmpiricalDistributionFunction`
  182. An object representing the complement of the empirical cumulative
  183. distribution function.
  184. """
  185. cdf: EmpiricalDistributionFunction
  186. sf: EmpiricalDistributionFunction
  187. def __init__(self, q, cdf, sf, n, d):
  188. self.cdf = EmpiricalDistributionFunction(q, cdf, n, d, "cdf")
  189. self.sf = EmpiricalDistributionFunction(q, sf, n, d, "sf")
  190. def _iv_CensoredData(
  191. sample: "npt.ArrayLike | CensoredData", param_name: str = "sample"
  192. ) -> CensoredData:
  193. """Attempt to convert `sample` to `CensoredData`."""
  194. if not isinstance(sample, CensoredData):
  195. try: # takes care of input standardization/validation
  196. sample = CensoredData(uncensored=sample)
  197. except ValueError as e:
  198. message = str(e).replace('uncensored', param_name)
  199. raise type(e)(message) from e
  200. return sample
  201. @xp_capabilities(np_only=True)
  202. def ecdf(sample: "npt.ArrayLike | CensoredData") -> ECDFResult:
  203. """Empirical cumulative distribution function of a sample.
  204. The empirical cumulative distribution function (ECDF) is a step function
  205. estimate of the CDF of the distribution underlying a sample. This function
  206. returns objects representing both the empirical distribution function and
  207. its complement, the empirical survival function.
  208. Parameters
  209. ----------
  210. sample : 1D array_like or `scipy.stats.CensoredData`
  211. Besides array_like, instances of `scipy.stats.CensoredData` containing
  212. uncensored and right-censored observations are supported. Currently,
  213. other instances of `scipy.stats.CensoredData` will result in a
  214. ``NotImplementedError``.
  215. Returns
  216. -------
  217. res : `~scipy.stats._result_classes.ECDFResult`
  218. An object with the following attributes.
  219. cdf : `~scipy.stats._result_classes.EmpiricalDistributionFunction`
  220. An object representing the empirical cumulative distribution
  221. function.
  222. sf : `~scipy.stats._result_classes.EmpiricalDistributionFunction`
  223. An object representing the empirical survival function.
  224. The `cdf` and `sf` attributes themselves have the following attributes.
  225. quantiles : ndarray
  226. The unique values in the sample that defines the empirical CDF/SF.
  227. probabilities : ndarray
  228. The point estimates of the probabilities corresponding with
  229. `quantiles`.
  230. And the following methods:
  231. evaluate(x) :
  232. Evaluate the CDF/SF at the argument.
  233. plot(ax) :
  234. Plot the CDF/SF on the provided axes.
  235. confidence_interval(confidence_level=0.95) :
  236. Compute the confidence interval around the CDF/SF at the values in
  237. `quantiles`.
  238. Notes
  239. -----
  240. When each observation of the sample is a precise measurement, the ECDF
  241. steps up by ``1/len(sample)`` at each of the observations [1]_.
  242. When observations are lower bounds, upper bounds, or both upper and lower
  243. bounds, the data is said to be "censored", and `sample` may be provided as
  244. an instance of `scipy.stats.CensoredData`.
  245. For right-censored data, the ECDF is given by the Kaplan-Meier estimator
  246. [2]_; other forms of censoring are not supported at this time.
  247. Confidence intervals are computed according to the Greenwood formula or the
  248. more recent "Exponential Greenwood" formula as described in [4]_.
  249. References
  250. ----------
  251. .. [1] Conover, William Jay. Practical nonparametric statistics. Vol. 350.
  252. John Wiley & Sons, 1999.
  253. .. [2] Kaplan, Edward L., and Paul Meier. "Nonparametric estimation from
  254. incomplete observations." Journal of the American statistical
  255. association 53.282 (1958): 457-481.
  256. .. [3] Goel, Manish Kumar, Pardeep Khanna, and Jugal Kishore.
  257. "Understanding survival analysis: Kaplan-Meier estimate."
  258. International journal of Ayurveda research 1.4 (2010): 274.
  259. .. [4] Sawyer, Stanley. "The Greenwood and Exponential Greenwood Confidence
  260. Intervals in Survival Analysis."
  261. https://www.math.wustl.edu/~sawyer/handouts/greenwood.pdf
  262. Examples
  263. --------
  264. **Uncensored Data**
  265. As in the example from [1]_ page 79, five boys were selected at random from
  266. those in a single high school. Their one-mile run times were recorded as
  267. follows.
  268. >>> sample = [6.23, 5.58, 7.06, 6.42, 5.20] # one-mile run times (minutes)
  269. The empirical distribution function, which approximates the distribution
  270. function of one-mile run times of the population from which the boys were
  271. sampled, is calculated as follows.
  272. >>> from scipy import stats
  273. >>> res = stats.ecdf(sample)
  274. >>> res.cdf.quantiles
  275. array([5.2 , 5.58, 6.23, 6.42, 7.06])
  276. >>> res.cdf.probabilities
  277. array([0.2, 0.4, 0.6, 0.8, 1. ])
  278. To plot the result as a step function:
  279. >>> import matplotlib.pyplot as plt
  280. >>> ax = plt.subplot()
  281. >>> res.cdf.plot(ax)
  282. >>> ax.set_xlabel('One-Mile Run Time (minutes)')
  283. >>> ax.set_ylabel('Empirical CDF')
  284. >>> plt.show()
  285. **Right-censored Data**
  286. As in the example from [1]_ page 91, the lives of ten car fanbelts were
  287. tested. Five tests concluded because the fanbelt being tested broke, but
  288. the remaining tests concluded for other reasons (e.g. the study ran out of
  289. funding, but the fanbelt was still functional). The mileage driven
  290. with the fanbelts were recorded as follows.
  291. >>> broken = [77, 47, 81, 56, 80] # in thousands of miles driven
  292. >>> unbroken = [62, 60, 43, 71, 37]
  293. Precise survival times of the fanbelts that were still functional at the
  294. end of the tests are unknown, but they are known to exceed the values
  295. recorded in ``unbroken``. Therefore, these observations are said to be
  296. "right-censored", and the data is represented using
  297. `scipy.stats.CensoredData`.
  298. >>> sample = stats.CensoredData(uncensored=broken, right=unbroken)
  299. The empirical survival function is calculated as follows.
  300. >>> res = stats.ecdf(sample)
  301. >>> res.sf.quantiles
  302. array([37., 43., 47., 56., 60., 62., 71., 77., 80., 81.])
  303. >>> res.sf.probabilities
  304. array([1. , 1. , 0.875, 0.75 , 0.75 , 0.75 , 0.75 , 0.5 , 0.25 , 0. ])
  305. To plot the result as a step function:
  306. >>> ax = plt.subplot()
  307. >>> res.sf.plot(ax)
  308. >>> ax.set_xlabel('Fanbelt Survival Time (thousands of miles)')
  309. >>> ax.set_ylabel('Empirical SF')
  310. >>> plt.show()
  311. """
  312. sample = _iv_CensoredData(sample)
  313. if sample.num_censored() == 0:
  314. res = _ecdf_uncensored(sample._uncensor())
  315. elif sample.num_censored() == sample._right.size:
  316. res = _ecdf_right_censored(sample)
  317. else:
  318. # Support additional censoring options in follow-up PRs
  319. message = ("Currently, only uncensored and right-censored data is "
  320. "supported.")
  321. raise NotImplementedError(message)
  322. t, cdf, sf, n, d = res
  323. return ECDFResult(t, cdf, sf, n, d)
  324. def _ecdf_uncensored(sample):
  325. sample = np.sort(sample)
  326. x, counts = np.unique(sample, return_counts=True)
  327. # [1].81 "the fraction of [observations] that are less than or equal to x
  328. events = np.cumsum(counts)
  329. n = sample.size
  330. cdf = events / n
  331. # [1].89 "the relative frequency of the sample that exceeds x in value"
  332. sf = 1 - cdf
  333. at_risk = np.concatenate(([n], n - events[:-1]))
  334. return x, cdf, sf, at_risk, counts
  335. def _ecdf_right_censored(sample):
  336. # It is conventional to discuss right-censored data in terms of
  337. # "survival time", "death", and "loss" (e.g. [2]). We'll use that
  338. # terminology here.
  339. # This implementation was influenced by the references cited and also
  340. # https://www.youtube.com/watch?v=lxoWsVco_iM
  341. # https://en.wikipedia.org/wiki/Kaplan%E2%80%93Meier_estimator
  342. # In retrospect it is probably most easily compared against [3].
  343. # Ultimately, the data needs to be sorted, so this implementation is
  344. # written to avoid a separate call to `unique` after sorting. In hope of
  345. # better performance on large datasets, it also computes survival
  346. # probabilities at unique times only rather than at each observation.
  347. tod = sample._uncensored # time of "death"
  348. tol = sample._right # time of "loss"
  349. times = np.concatenate((tod, tol))
  350. died = np.asarray([1]*tod.size + [0]*tol.size)
  351. # sort by times
  352. i = np.argsort(times)
  353. times = times[i]
  354. died = died[i]
  355. at_risk = np.arange(times.size, 0, -1)
  356. # logical indices of unique times
  357. j = np.diff(times, prepend=-np.inf, append=np.inf) > 0
  358. j_l = j[:-1] # first instances of unique times
  359. j_r = j[1:] # last instances of unique times
  360. # get number at risk and deaths at each unique time
  361. t = times[j_l] # unique times
  362. n = at_risk[j_l] # number at risk at each unique time
  363. cd = np.cumsum(died)[j_r] # cumulative deaths up to/including unique times
  364. d = np.diff(cd, prepend=0) # deaths at each unique time
  365. # compute survival function
  366. sf = np.cumprod((n - d) / n)
  367. cdf = 1 - sf
  368. return t, cdf, sf, n, d
  369. @dataclass
  370. class LogRankResult:
  371. """Result object returned by `scipy.stats.logrank`.
  372. Attributes
  373. ----------
  374. statistic : float ndarray
  375. The computed statistic (defined below). Its magnitude is the
  376. square root of the magnitude returned by most other logrank test
  377. implementations.
  378. pvalue : float ndarray
  379. The computed p-value of the test.
  380. """
  381. statistic: np.ndarray
  382. pvalue: np.ndarray
  383. @xp_capabilities(np_only=True)
  384. def logrank(
  385. x: "npt.ArrayLike | CensoredData",
  386. y: "npt.ArrayLike | CensoredData",
  387. alternative: Literal['two-sided', 'less', 'greater'] = "two-sided"
  388. ) -> LogRankResult:
  389. r"""Compare the survival distributions of two samples via the logrank test.
  390. Parameters
  391. ----------
  392. x, y : array_like or CensoredData
  393. Samples to compare based on their empirical survival functions.
  394. alternative : {'two-sided', 'less', 'greater'}, optional
  395. Defines the alternative hypothesis.
  396. The null hypothesis is that the survival distributions of the two
  397. groups, say *X* and *Y*, are identical.
  398. The following alternative hypotheses [4]_ are available (default is
  399. 'two-sided'):
  400. * 'two-sided': the survival distributions of the two groups are not
  401. identical.
  402. * 'less': survival of group *X* is favored: the group *X* failure rate
  403. function is less than the group *Y* failure rate function at some
  404. times.
  405. * 'greater': survival of group *Y* is favored: the group *X* failure
  406. rate function is greater than the group *Y* failure rate function at
  407. some times.
  408. Returns
  409. -------
  410. res : `~scipy.stats._result_classes.LogRankResult`
  411. An object containing attributes:
  412. statistic : float ndarray
  413. The computed statistic (defined below). Its magnitude is the
  414. square root of the magnitude returned by most other logrank test
  415. implementations.
  416. pvalue : float ndarray
  417. The computed p-value of the test.
  418. See Also
  419. --------
  420. scipy.stats.ecdf
  421. Notes
  422. -----
  423. The logrank test [1]_ compares the observed number of events to
  424. the expected number of events under the null hypothesis that the two
  425. samples were drawn from the same distribution. The statistic is
  426. .. math::
  427. Z_i = \frac{\sum_{j=1}^J(O_{i,j}-E_{i,j})}{\sqrt{\sum_{j=1}^J V_{i,j}}}
  428. \rightarrow \mathcal{N}(0,1)
  429. where
  430. .. math::
  431. E_{i,j} = O_j \frac{N_{i,j}}{N_j},
  432. \qquad
  433. V_{i,j} = E_{i,j} \left(\frac{N_j-O_j}{N_j}\right)
  434. \left(\frac{N_j-N_{i,j}}{N_j-1}\right),
  435. :math:`i` denotes the group (i.e. it may assume values :math:`x` or
  436. :math:`y`, or it may be omitted to refer to the combined sample)
  437. :math:`j` denotes the time (at which an event occurred),
  438. :math:`N` is the number of subjects at risk just before an event occurred,
  439. and :math:`O` is the observed number of events at that time.
  440. The ``statistic`` :math:`Z_x` returned by `logrank` is the (signed) square
  441. root of the statistic returned by many other implementations. Under the
  442. null hypothesis, :math:`Z_x**2` is asymptotically distributed according to
  443. the chi-squared distribution with one degree of freedom. Consequently,
  444. :math:`Z_x` is asymptotically distributed according to the standard normal
  445. distribution. The advantage of using :math:`Z_x` is that the sign
  446. information (i.e. whether the observed number of events tends to be less
  447. than or greater than the number expected under the null hypothesis) is
  448. preserved, allowing `scipy.stats.logrank` to offer one-sided alternative
  449. hypotheses.
  450. References
  451. ----------
  452. .. [1] Mantel N. "Evaluation of survival data and two new rank order
  453. statistics arising in its consideration."
  454. Cancer Chemotherapy Reports, 50(3):163-170, PMID: 5910392, 1966
  455. .. [2] Bland, Altman, "The logrank test", BMJ, 328:1073,
  456. :doi:`10.1136/bmj.328.7447.1073`, 2004
  457. .. [3] "Logrank test", Wikipedia,
  458. https://en.wikipedia.org/wiki/Logrank_test
  459. .. [4] Brown, Mark. "On the choice of variance for the log rank test."
  460. Biometrika 71.1 (1984): 65-74.
  461. .. [5] Klein, John P., and Melvin L. Moeschberger. Survival analysis:
  462. techniques for censored and truncated data. Vol. 1230. New York:
  463. Springer, 2003.
  464. Examples
  465. --------
  466. Reference [2]_ compared the survival times of patients with two different
  467. types of recurrent malignant gliomas. The samples below record the time
  468. (number of weeks) for which each patient participated in the study. The
  469. `scipy.stats.CensoredData` class is used because the data is
  470. right-censored: the uncensored observations correspond with observed deaths
  471. whereas the censored observations correspond with the patient leaving the
  472. study for another reason.
  473. >>> from scipy import stats
  474. >>> x = stats.CensoredData(
  475. ... uncensored=[6, 13, 21, 30, 37, 38, 49, 50,
  476. ... 63, 79, 86, 98, 202, 219],
  477. ... right=[31, 47, 80, 82, 82, 149]
  478. ... )
  479. >>> y = stats.CensoredData(
  480. ... uncensored=[10, 10, 12, 13, 14, 15, 16, 17, 18, 20, 24, 24,
  481. ... 25, 28,30, 33, 35, 37, 40, 40, 46, 48, 76, 81,
  482. ... 82, 91, 112, 181],
  483. ... right=[34, 40, 70]
  484. ... )
  485. We can calculate and visualize the empirical survival functions
  486. of both groups as follows.
  487. >>> import numpy as np
  488. >>> import matplotlib.pyplot as plt
  489. >>> ax = plt.subplot()
  490. >>> ecdf_x = stats.ecdf(x)
  491. >>> ecdf_x.sf.plot(ax, label='Astrocytoma')
  492. >>> ecdf_y = stats.ecdf(y)
  493. >>> ecdf_y.sf.plot(ax, label='Glioblastoma')
  494. >>> ax.set_xlabel('Time to death (weeks)')
  495. >>> ax.set_ylabel('Empirical SF')
  496. >>> plt.legend()
  497. >>> plt.show()
  498. Visual inspection of the empirical survival functions suggests that the
  499. survival times tend to be different between the two groups. To formally
  500. assess whether the difference is significant at the 1% level, we use the
  501. logrank test.
  502. >>> res = stats.logrank(x=x, y=y)
  503. >>> res.statistic
  504. -2.73799
  505. >>> res.pvalue
  506. 0.00618
  507. The p-value is less than 1%, so we can consider the data to be evidence
  508. against the null hypothesis in favor of the alternative that there is a
  509. difference between the two survival functions.
  510. """
  511. # Input validation. `alternative` IV handled in `_get_pvalue` below.
  512. x = _iv_CensoredData(sample=x, param_name='x')
  513. y = _iv_CensoredData(sample=y, param_name='y')
  514. # Combined sample. (Under H0, the two groups are identical.)
  515. xy = CensoredData(
  516. uncensored=np.concatenate((x._uncensored, y._uncensored)),
  517. right=np.concatenate((x._right, y._right))
  518. )
  519. # Extract data from the combined sample
  520. res = ecdf(xy)
  521. idx = res.sf._d.astype(bool) # indices of observed events
  522. times_xy = res.sf.quantiles[idx] # unique times of observed events
  523. at_risk_xy = res.sf._n[idx] # combined number of subjects at risk
  524. deaths_xy = res.sf._d[idx] # combined number of events
  525. # Get the number at risk within each sample.
  526. # First compute the number at risk in group X at each of the `times_xy`.
  527. # Could use `interpolate_1d`, but this is more compact.
  528. res_x = ecdf(x)
  529. i = np.searchsorted(res_x.sf.quantiles, times_xy)
  530. at_risk_x = np.append(res_x.sf._n, 0)[i] # 0 at risk after last time
  531. # Subtract from the combined number at risk to get number at risk in Y
  532. at_risk_y = at_risk_xy - at_risk_x
  533. # Compute the variance.
  534. num = at_risk_x * at_risk_y * deaths_xy * (at_risk_xy - deaths_xy)
  535. den = at_risk_xy**2 * (at_risk_xy - 1)
  536. # Note: when `at_risk_xy == 1`, we would have `at_risk_xy - 1 == 0` in the
  537. # numerator and denominator. Simplifying the fraction symbolically, we
  538. # would always find the overall quotient to be zero, so don't compute it.
  539. i = at_risk_xy > 1
  540. sum_var = np.sum(num[i]/den[i])
  541. # Get the observed and expected number of deaths in group X
  542. n_died_x = x._uncensored.size
  543. sum_exp_deaths_x = np.sum(at_risk_x * (deaths_xy/at_risk_xy))
  544. # Compute the statistic. This is the square root of that in references.
  545. statistic = (n_died_x - sum_exp_deaths_x)/np.sqrt(sum_var)
  546. # Equivalent to chi2(df=1).sf(statistic**2) when alternative='two-sided'
  547. norm = stats._stats_py._SimpleNormal()
  548. pvalue = stats._stats_py._get_pvalue(statistic, norm, alternative, xp=np)
  549. return LogRankResult(statistic=statistic[()], pvalue=pvalue[()])