
Commit f994057

Pushing the docs to dev/ for branch: main, commit 4137ec60e9242744b72127af41dc61478fa0acc1

1 parent ce813df

File tree: 1,567 files changed, +6123 / -6167 lines

(Changed binary files not shown.)

dev/_downloads/3f7191b01d0103d1886c959ed7687c4d/plot_bicluster_newsgroups.ipynb

Lines changed: 1 addition & 1 deletion
@@ -15,7 +15,7 @@
    },
    "outputs": [],
    "source": [
-    "# Authors: The scikit-learn developers\n# SPDX-License-Identifier: BSD-3-Clause\n\nimport operator\nfrom collections import defaultdict\nfrom time import time\n\nimport numpy as np\n\nfrom sklearn.cluster import MiniBatchKMeans, SpectralCoclustering\nfrom sklearn.datasets import fetch_20newsgroups\nfrom sklearn.feature_extraction.text import TfidfVectorizer\nfrom sklearn.metrics.cluster import v_measure_score\n\n\ndef number_normalizer(tokens):\n    \"\"\"Map all numeric tokens to a placeholder.\n\n    For many applications, tokens that begin with a number are not directly\n    useful, but the fact that such a token exists can be relevant. By applying\n    this form of dimensionality reduction, some methods may perform better.\n    \"\"\"\n    return (\"#NUMBER\" if token[0].isdigit() else token for token in tokens)\n\n\nclass NumberNormalizingVectorizer(TfidfVectorizer):\n    def build_tokenizer(self):\n        tokenize = super().build_tokenizer()\n        return lambda doc: list(number_normalizer(tokenize(doc)))\n\n\n# exclude 'comp.os.ms-windows.misc'\ncategories = [\n    \"alt.atheism\",\n    \"comp.graphics\",\n    \"comp.sys.ibm.pc.hardware\",\n    \"comp.sys.mac.hardware\",\n    \"comp.windows.x\",\n    \"misc.forsale\",\n    \"rec.autos\",\n    \"rec.motorcycles\",\n    \"rec.sport.baseball\",\n    \"rec.sport.hockey\",\n    \"sci.crypt\",\n    \"sci.electronics\",\n    \"sci.med\",\n    \"sci.space\",\n    \"soc.religion.christian\",\n    \"talk.politics.guns\",\n    \"talk.politics.mideast\",\n    \"talk.politics.misc\",\n    \"talk.religion.misc\",\n]\nnewsgroups = fetch_20newsgroups(categories=categories)\ny_true = newsgroups.target\n\nvectorizer = NumberNormalizingVectorizer(stop_words=\"english\", min_df=5)\ncocluster = SpectralCoclustering(\n    n_clusters=len(categories), svd_method=\"arpack\", random_state=0\n)\nkmeans = MiniBatchKMeans(\n    n_clusters=len(categories), batch_size=20000, random_state=0, n_init=3\n)\n\nprint(\"Vectorizing...\")\nX = vectorizer.fit_transform(newsgroups.data)\n\nprint(\"Coclustering...\")\nstart_time = time()\ncocluster.fit(X)\ny_cocluster = cocluster.row_labels_\nprint(\n    \"Done in {:.2f}s. V-measure: {:.4f}\".format(\n        time() - start_time, v_measure_score(y_cocluster, y_true)\n    )\n)\n\nprint(\"MiniBatchKMeans...\")\nstart_time = time()\ny_kmeans = kmeans.fit_predict(X)\nprint(\n    \"Done in {:.2f}s. V-measure: {:.4f}\".format(\n        time() - start_time, v_measure_score(y_kmeans, y_true)\n    )\n)\n\nfeature_names = vectorizer.get_feature_names_out()\ndocument_names = list(newsgroups.target_names[i] for i in newsgroups.target)\n\n\ndef bicluster_ncut(i):\n    rows, cols = cocluster.get_indices(i)\n    if not (np.any(rows) and np.any(cols)):\n        import sys\n\n        return sys.float_info.max\n    row_complement = np.nonzero(np.logical_not(cocluster.rows_[i]))[0]\n    col_complement = np.nonzero(np.logical_not(cocluster.columns_[i]))[0]\n    # Note: the following is identical to X[rows[:, np.newaxis],\n    # cols].sum() but much faster in scipy <= 0.16\n    weight = X[rows][:, cols].sum()\n    cut = X[row_complement][:, cols].sum() + X[rows][:, col_complement].sum()\n    return cut / weight\n\n\ndef most_common(d):\n    \"\"\"Items of a defaultdict(int) with the highest values.\n\n    Like Counter.most_common in Python >=2.7.\n    \"\"\"\n    return sorted(d.items(), key=operator.itemgetter(1), reverse=True)\n\n\nbicluster_ncuts = list(bicluster_ncut(i) for i in range(len(newsgroups.target_names)))\nbest_idx = np.argsort(bicluster_ncuts)[:5]\n\nprint()\nprint(\"Best biclusters:\")\nprint(\"----------------\")\nfor idx, cluster in enumerate(best_idx):\n    n_rows, n_cols = cocluster.get_shape(cluster)\n    cluster_docs, cluster_words = cocluster.get_indices(cluster)\n    if not len(cluster_docs) or not len(cluster_words):\n        continue\n\n    # categories\n    counter = defaultdict(int)\n    for i in cluster_docs:\n        counter[document_names[i]] += 1\n    cat_string = \", \".join(\n        \"{:.0f}% {}\".format(float(c) / n_rows * 100, name)\n        for name, c in most_common(counter)[:3]\n    )\n\n    # words\n    out_of_cluster_docs = cocluster.row_labels_ != cluster\n    out_of_cluster_docs = np.where(out_of_cluster_docs)[0]\n    word_col = X[:, cluster_words]\n    word_scores = np.array(\n        word_col[cluster_docs, :].sum(axis=0)\n        - word_col[out_of_cluster_docs, :].sum(axis=0)\n    )\n    word_scores = word_scores.ravel()\n    important_words = list(\n        feature_names[cluster_words[i]] for i in word_scores.argsort()[:-11:-1]\n    )\n\n    print(\"bicluster {} : {} documents, {} words\".format(idx, n_rows, n_cols))\n    print(\"categories : {}\".format(cat_string))\n    print(\"words : {}\\n\".format(\", \".join(important_words)))"
+    "# Authors: The scikit-learn developers\n# SPDX-License-Identifier: BSD-3-Clause\nfrom collections import Counter\nfrom time import time\n\nimport numpy as np\n\nfrom sklearn.cluster import MiniBatchKMeans, SpectralCoclustering\nfrom sklearn.datasets import fetch_20newsgroups\nfrom sklearn.feature_extraction.text import TfidfVectorizer\nfrom sklearn.metrics.cluster import v_measure_score\n\n\ndef number_normalizer(tokens):\n    \"\"\"Map all numeric tokens to a placeholder.\n\n    For many applications, tokens that begin with a number are not directly\n    useful, but the fact that such a token exists can be relevant. By applying\n    this form of dimensionality reduction, some methods may perform better.\n    \"\"\"\n    return (\"#NUMBER\" if token[0].isdigit() else token for token in tokens)\n\n\nclass NumberNormalizingVectorizer(TfidfVectorizer):\n    def build_tokenizer(self):\n        tokenize = super().build_tokenizer()\n        return lambda doc: list(number_normalizer(tokenize(doc)))\n\n\n# exclude 'comp.os.ms-windows.misc'\ncategories = [\n    \"alt.atheism\",\n    \"comp.graphics\",\n    \"comp.sys.ibm.pc.hardware\",\n    \"comp.sys.mac.hardware\",\n    \"comp.windows.x\",\n    \"misc.forsale\",\n    \"rec.autos\",\n    \"rec.motorcycles\",\n    \"rec.sport.baseball\",\n    \"rec.sport.hockey\",\n    \"sci.crypt\",\n    \"sci.electronics\",\n    \"sci.med\",\n    \"sci.space\",\n    \"soc.religion.christian\",\n    \"talk.politics.guns\",\n    \"talk.politics.mideast\",\n    \"talk.politics.misc\",\n    \"talk.religion.misc\",\n]\nnewsgroups = fetch_20newsgroups(categories=categories)\ny_true = newsgroups.target\n\nvectorizer = NumberNormalizingVectorizer(stop_words=\"english\", min_df=5)\ncocluster = SpectralCoclustering(\n    n_clusters=len(categories), svd_method=\"arpack\", random_state=0\n)\nkmeans = MiniBatchKMeans(\n    n_clusters=len(categories), batch_size=20000, random_state=0, n_init=3\n)\n\nprint(\"Vectorizing...\")\nX = vectorizer.fit_transform(newsgroups.data)\n\nprint(\"Coclustering...\")\nstart_time = time()\ncocluster.fit(X)\ny_cocluster = cocluster.row_labels_\nprint(\n    f\"Done in {time() - start_time:.2f}s. V-measure: \\\n{v_measure_score(y_cocluster, y_true):.4f}\"\n)\n\n\nprint(\"MiniBatchKMeans...\")\nstart_time = time()\ny_kmeans = kmeans.fit_predict(X)\nprint(\n    f\"Done in {time() - start_time:.2f}s. V-measure: \\\n{v_measure_score(y_kmeans, y_true):.4f}\"\n)\n\n\nfeature_names = vectorizer.get_feature_names_out()\ndocument_names = list(newsgroups.target_names[i] for i in newsgroups.target)\n\n\ndef bicluster_ncut(i):\n    rows, cols = cocluster.get_indices(i)\n    if not (np.any(rows) and np.any(cols)):\n        import sys\n\n        return sys.float_info.max\n    row_complement = np.nonzero(np.logical_not(cocluster.rows_[i]))[0]\n    col_complement = np.nonzero(np.logical_not(cocluster.columns_[i]))[0]\n    # Note: the following is identical to X[rows[:, np.newaxis],\n    # cols].sum() but much faster in scipy <= 0.16\n    weight = X[rows][:, cols].sum()\n    cut = X[row_complement][:, cols].sum() + X[rows][:, col_complement].sum()\n    return cut / weight\n\n\nbicluster_ncuts = list(bicluster_ncut(i) for i in range(len(newsgroups.target_names)))\nbest_idx = np.argsort(bicluster_ncuts)[:5]\n\nprint()\nprint(\"Best biclusters:\")\nprint(\"----------------\")\nfor idx, cluster in enumerate(best_idx):\n    n_rows, n_cols = cocluster.get_shape(cluster)\n    cluster_docs, cluster_words = cocluster.get_indices(cluster)\n    if not len(cluster_docs) or not len(cluster_words):\n        continue\n\n    # categories\n    counter = Counter(document_names[doc] for doc in cluster_docs)\n\n    cat_string = \", \".join(\n        f\"{(c / n_rows * 100):.0f}% {name}\" for name, c in counter.most_common(3)\n    )\n\n    # words\n    out_of_cluster_docs = cocluster.row_labels_ != cluster\n    out_of_cluster_docs = np.where(out_of_cluster_docs)[0]\n    word_col = X[:, cluster_words]\n    word_scores = np.array(\n        word_col[cluster_docs, :].sum(axis=0)\n        - word_col[out_of_cluster_docs, :].sum(axis=0)\n    )\n    word_scores = word_scores.ravel()\n    important_words = list(\n        feature_names[cluster_words[i]] for i in word_scores.argsort()[:-11:-1]\n    )\n\n    print(f\"bicluster {idx} : {n_rows} documents, {n_cols} words\")\n    print(f\"categories : {cat_string}\")\n    print(f\"words : {', '.join(important_words)}\\n\")"
    ]
   }
  ],
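Note: the notebook cell change mirrors the plot_bicluster_newsgroups.py diff below. The str.format calls become f-strings, with a trailing backslash acting as a line continuation inside the string literal so the message still prints on one line. A minimal standalone sketch of the equivalence (the numeric values here are made up for illustration and are not part of the commit):

elapsed = 1.2345  # hypothetical timing value
v = 0.4567        # hypothetical V-measure value

# Old style: positional str.format with format specs in the template.
old = "Done in {:.2f}s. V-measure: {:.4f}".format(elapsed, v)

# New style: f-string. The backslash at the end of the first source line
# continues the string literal without inserting a newline, so the output
# is a single line.
new = f"Done in {elapsed:.2f}s. V-measure: \
{v:.4f}"

assert old == new == "Done in 1.23s. V-measure: 0.4567"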

dev/_downloads/e68419b513284db108081422c73a5667/plot_bicluster_newsgroups.py

Lines changed: 13 additions & 25 deletions
@@ -25,9 +25,7 @@
 
 # Authors: The scikit-learn developers
 # SPDX-License-Identifier: BSD-3-Clause
-
-import operator
-from collections import defaultdict
+from collections import Counter
 from time import time
 
 import numpy as np
@@ -95,20 +93,20 @@ def build_tokenizer(self):
 cocluster.fit(X)
 y_cocluster = cocluster.row_labels_
 print(
-    "Done in {:.2f}s. V-measure: {:.4f}".format(
-        time() - start_time, v_measure_score(y_cocluster, y_true)
-    )
+    f"Done in {time() - start_time:.2f}s. V-measure: \
+{v_measure_score(y_cocluster, y_true):.4f}"
 )
 
+
 print("MiniBatchKMeans...")
 start_time = time()
 y_kmeans = kmeans.fit_predict(X)
 print(
-    "Done in {:.2f}s. V-measure: {:.4f}".format(
-        time() - start_time, v_measure_score(y_kmeans, y_true)
-    )
+    f"Done in {time() - start_time:.2f}s. V-measure: \
+{v_measure_score(y_kmeans, y_true):.4f}"
 )
 
+
 feature_names = vectorizer.get_feature_names_out()
 document_names = list(newsgroups.target_names[i] for i in newsgroups.target)
 
@@ -128,14 +126,6 @@ def bicluster_ncut(i):
     return cut / weight
 
 
-def most_common(d):
-    """Items of a defaultdict(int) with the highest values.
-
-    Like Counter.most_common in Python >=2.7.
-    """
-    return sorted(d.items(), key=operator.itemgetter(1), reverse=True)
-
-
 bicluster_ncuts = list(bicluster_ncut(i) for i in range(len(newsgroups.target_names)))
 best_idx = np.argsort(bicluster_ncuts)[:5]
 
@@ -149,12 +139,10 @@ def most_common(d):
         continue
 
     # categories
-    counter = defaultdict(int)
-    for i in cluster_docs:
-        counter[document_names[i]] += 1
+    counter = Counter(document_names[doc] for doc in cluster_docs)
+
     cat_string = ", ".join(
-        "{:.0f}% {}".format(float(c) / n_rows * 100, name)
-        for name, c in most_common(counter)[:3]
+        f"{(c / n_rows * 100):.0f}% {name}" for name, c in counter.most_common(3)
     )
 
     # words
@@ -170,6 +158,6 @@ def most_common(d):
         feature_names[cluster_words[i]] for i in word_scores.argsort()[:-11:-1]
     )
 
-    print("bicluster {} : {} documents, {} words".format(idx, n_rows, n_cols))
-    print("categories : {}".format(cat_string))
-    print("words : {}\n".format(", ".join(important_words)))
+    print(f"bicluster {idx} : {n_rows} documents, {n_cols} words")
+    print(f"categories : {cat_string}")
+    print(f"words : {', '.join(important_words)}\n")

dev/_downloads/scikit-learn-docs.zip (-17.5 KB, binary file not shown)
