clusterer.py 4.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146
  1. """Define plots for clustering models built with scikit-learn."""
  2. from warnings import simplefilter
  3. import pandas as pd
  4. import sklearn
  5. import wandb
  6. from wandb.integration.sklearn import calculate, utils
  7. # ignore all future warnings
  8. simplefilter(action="ignore", category=FutureWarning)
  9. def clusterer(model, X_train, cluster_labels, labels=None, model_name="Clusterer"): # noqa: N803
  10. """Generates all sklearn clusterer plots supported by W&B.
  11. The following plots are generated:
  12. elbow curve, silhouette plot.
  13. Should only be called with a fitted clusterer (otherwise an error is thrown).
  14. Args:
  15. model: (clusterer) Takes in a fitted clusterer.
  16. X_train: (arr) Training set features.
  17. cluster_labels: (list) Names for cluster labels. Makes plots easier to read
  18. by replacing cluster indexes with corresponding names.
  19. labels: (list) Named labels for target variable (y). Makes plots easier to
  20. read by replacing target values with corresponding index.
  21. For example if `labels=['dog', 'cat', 'owl']` all 0s are
  22. replaced by dog, 1s by cat.
  23. model_name: (str) Model name. Defaults to 'Clusterer'
  24. Returns:
  25. None: To see plots, go to your W&B run page then expand the 'media' tab
  26. under 'auto visualizations'.
  27. Example:
  28. ```python
  29. wandb.sklearn.plot_clusterer(kmeans, X, cluster_labels, labels, "KMeans")
  30. ```
  31. """
  32. wandb.termlog(f"\nPlotting {model_name}.")
  33. if isinstance(model, sklearn.cluster.KMeans):
  34. elbow_curve(model, X_train)
  35. wandb.termlog("Logged elbow curve.")
  36. silhouette(model, X_train, cluster_labels, labels=labels, kmeans=True)
  37. else:
  38. silhouette(model, X_train, cluster_labels, kmeans=False)
  39. wandb.termlog("Logged silhouette plot.")
  40. def elbow_curve(
  41. clusterer=None,
  42. X=None, # noqa: N803
  43. cluster_ranges=None,
  44. n_jobs=1,
  45. show_cluster_time=True,
  46. ):
  47. """Measures and plots variance explained as a function of the number of clusters.
  48. Useful in picking the optimal number of clusters.
  49. Should only be called with a fitted clusterer (otherwise an error is thrown).
  50. Please note this function fits the model on the training set when called.
  51. Args:
  52. model: (clusterer) Takes in a fitted clusterer.
  53. X: (arr) Training set features.
  54. Returns:
  55. None: To see plots, go to your W&B run page then expand the 'media' tab
  56. under 'auto visualizations'.
  57. Example:
  58. ```python
  59. wandb.sklearn.plot_elbow_curve(model, X_train)
  60. ```
  61. """
  62. if not hasattr(clusterer, "n_clusters"):
  63. wandb.termlog(
  64. "n_clusters attribute not in classifier. Cannot plot elbow method."
  65. )
  66. return
  67. not_missing = utils.test_missing(clusterer=clusterer)
  68. correct_types = utils.test_types
  69. is_fitted = utils.test_fitted(clusterer)
  70. if not_missing and correct_types and is_fitted:
  71. elbow_curve_chart = calculate.elbow_curve(
  72. clusterer, X, cluster_ranges, n_jobs, show_cluster_time
  73. )
  74. wandb.log({"elbow_curve": elbow_curve_chart})
  75. def silhouette(
  76. clusterer=None,
  77. X=None, # noqa: N803
  78. cluster_labels=None,
  79. labels=None,
  80. metric="euclidean",
  81. kmeans=True,
  82. ):
  83. """Measures & plots silhouette coefficients.
  84. Silhouette coefficients near +1 indicate that the sample is far away from
  85. the neighboring clusters. A value near 0 indicates that the sample is on or
  86. very close to the decision boundary between two neighboring clusters and
  87. negative values indicate that the samples might have been assigned to the wrong cluster.
  88. Should only be called with a fitted clusterer (otherwise an error is thrown).
  89. Please note this function fits the model on the training set when called.
  90. Args:
  91. model: (clusterer) Takes in a fitted clusterer.
  92. X: (arr) Training set features.
  93. cluster_labels: (list) Names for cluster labels. Makes plots easier to read
  94. by replacing cluster indexes with corresponding names.
  95. Returns:
  96. None: To see plots, go to your W&B run page then expand the 'media' tab
  97. under 'auto visualizations'.
  98. Example:
  99. ```python
  100. wandb.sklearn.plot_silhouette(model, X_train, ["spam", "not spam"])
  101. ```
  102. """
  103. not_missing = utils.test_missing(clusterer=clusterer)
  104. correct_types = utils.test_types(clusterer=clusterer)
  105. is_fitted = utils.test_fitted(clusterer)
  106. if not_missing and correct_types and is_fitted:
  107. if isinstance(X, (pd.DataFrame)):
  108. X = X.values # noqa: N806
  109. silhouette_chart = calculate.silhouette(
  110. clusterer, X, cluster_labels, labels, metric, kmeans
  111. )
  112. wandb.log({"silhouette_plot": silhouette_chart})