new commit · scikit-learn/scikit-learn@1c9f918

Commit 1c9f918
new commit
1 parent b495849 · commit 1c9f918
File tree: 4 files changed, +12 -136 lines

doc/modules/svm.rst

Lines changed: 1 addition & 1 deletion

@@ -529,7 +529,7 @@ test vectors must be provided.
     >>> from sklearn import svm
     >>> X = np.array([[0, 0], [1, 1]])
     >>> y = [0, 1]
-    >>> clf = svm.SVC(kernel='precomputed')
+    >>> clf = svm.SVC(gamma='auto', kernel='precomputed')
     >>> # linear kernel computation
     >>> gram = np.dot(X, X.T)
     >>> clf.fit(gram, y)  # doctest: +NORMALIZE_WHITESPACE
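The hunk adds an explicit gamma to the precomputed-kernel doctest. gamma itself is unused by a precomputed kernel; spelling it out presumably just keeps scikit-learn 0.20's FutureWarning about the changing default out of the doctest output. For context, a minimal runnable version of the snippet, extended with a hypothetical X_test (not part of the diff) to show the prediction side of the API:

import numpy as np
from sklearn import svm

X = np.array([[0, 0], [1, 1]])
y = [0, 1]
clf = svm.SVC(gamma='auto', kernel='precomputed')

# With kernel='precomputed', fit() expects the Gram matrix of the
# training data instead of the raw features.
gram = np.dot(X, X.T)
clf.fit(gram, y)

# At prediction time the kernel between test and training vectors is
# needed; X_test is a made-up matrix for illustration.
X_test = np.array([[2, 2], [0, 1]])
print(clf.predict(np.dot(X_test, X.T)))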

doc/tutorial/basic/tutorial.rst

Lines changed: 11 additions & 10 deletions

@@ -214,12 +214,12 @@ persistence model, namely `pickle <https://docs.python.org/2/library/pickle.html
 
     >>> from sklearn import svm
     >>> from sklearn import datasets
-    >>> clf = svm.SVC(gamma='auto')
+    >>> clf = svm.SVC(gamma='scale')
     >>> iris = datasets.load_iris()
     >>> X, y = iris.data, iris.target
     >>> clf.fit(X, y)  # doctest: +NORMALIZE_WHITESPACE
     SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
-        decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
+        decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
         max_iter=-1, probability=False, random_state=None, shrinking=True,
         tol=0.001, verbose=False)
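Every hunk in this file makes the same mechanical change: pass gamma explicitly so the doctest output stays stable while the default migrates from 'auto' to 'scale'. For reference, a small sketch of what the two settings evaluate to; this uses the variance-based definition of 'scale' from scikit-learn 0.22 onwards (the first 0.20 releases briefly used the standard deviation instead):

from sklearn import datasets

X = datasets.load_iris().data
n_features = X.shape[1]

# gamma='auto' is the old default: 1 / n_features
gamma_auto = 1.0 / n_features
# gamma='scale' is the new default: 1 / (n_features * X.var())
gamma_scale = 1.0 / (n_features * X.var())
print(gamma_auto, gamma_scale)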

@@ -290,10 +290,10 @@ maintained::
     >>> from sklearn import datasets
     >>> from sklearn.svm import SVC
     >>> iris = datasets.load_iris()
-    >>> clf = SVC()
+    >>> clf = SVC(gamma='scale')
     >>> clf.fit(iris.data, iris.target)  # doctest: +NORMALIZE_WHITESPACE
     SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
-        decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
+        decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
         max_iter=-1, probability=False, random_state=None, shrinking=True,
         tol=0.001, verbose=False)

@@ -302,7 +302,7 @@ maintained::
 
     >>> clf.fit(iris.data, iris.target_names[iris.target])  # doctest: +NORMALIZE_WHITESPACE
     SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
-        decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
+        decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
         max_iter=-1, probability=False, random_state=None, shrinking=True,
         tol=0.001, verbose=False)
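The two hunks above touch the tutorial passage on type casting: the type of y is preserved by fit/predict, so training on integer targets yields integer predictions while training on the string names yields string predictions. A hedged sketch of the behaviour being documented:

from sklearn import datasets
from sklearn.svm import SVC

iris = datasets.load_iris()
clf = SVC(gamma='scale')

# Integer targets in, integer predictions out.
print(clf.fit(iris.data, iris.target).predict(iris.data[:3]))

# String targets in, string predictions out.
print(clf.fit(iris.data, iris.target_names[iris.target]).predict(iris.data[:3]))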

@@ -328,25 +328,25 @@ more than once will overwrite what was learned by any previous ``fit()``::
     >>> y = rng.binomial(1, 0.5, 100)
     >>> X_test = rng.rand(5, 10)
 
-    >>> clf = SVC()
+    >>> clf = SVC(gamma='scale')
     >>> clf.set_params(kernel='linear').fit(X, y)  # doctest: +NORMALIZE_WHITESPACE
     SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
-        decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
+        decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
         max_iter=-1, probability=False, random_state=None, shrinking=True,
         tol=0.001, verbose=False)
     >>> clf.predict(X_test)
     array([1, 0, 1, 1, 0])
 
     >>> clf.set_params(kernel='rbf').fit(X, y)  # doctest: +NORMALIZE_WHITESPACE
     SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
-        decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
+        decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
         max_iter=-1, probability=False, random_state=None, shrinking=True,
         tol=0.001, verbose=False)
     >>> clf.predict(X_test)
     array([0, 0, 0, 1, 0])
 
 Here, the default kernel ``rbf`` is first changed to ``linear`` after the
-estimator has been constructed via ``SVC()``, and changed back to ``rbf`` to
+estimator has been constructed via ``SVC(gamma='scale')``, and changed back to ``rbf`` to
 refit the estimator and to make a second prediction.
 
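This hunk leans on set_params() overriding a constructor argument after the estimator has been built; get_params() is the read side of the same interface. A quick sketch showing that the explicit gamma survives later set_params() calls:

from sklearn.svm import SVC

clf = SVC(gamma='scale')
clf.set_params(kernel='linear')
params = clf.get_params()
print(params['kernel'])  # 'linear', overridden after construction
print(params['gamma'])   # 'scale', untouched by set_params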
Multiclass vs. multilabel fitting
@@ -363,7 +363,8 @@ the target data fit upon::
     >>> X = [[1, 2], [2, 4], [4, 5], [3, 2], [3, 1]]
     >>> y = [0, 0, 1, 1, 2]
 
-    >>> classif = OneVsRestClassifier(estimator=SVC(random_state=0))
+    >>> classif = OneVsRestClassifier(estimator=SVC(gamma='scale',
+    ...                                             random_state=0))
     >>> classif.fit(X, y).predict(X)
     array([0, 0, 1, 1, 2])
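The tutorial section this hunk belongs to contrasts multiclass with multilabel fitting: with a 1d y, OneVsRestClassifier predicts one class per sample, while a 2d binary label matrix switches it to multilabel output. A sketch of the multilabel half, assuming the same toy X and scikit-learn's MultiLabelBinarizer:

from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.svm import SVC

X = [[1, 2], [2, 4], [4, 5], [3, 2], [3, 1]]
y = [[0, 1], [0, 2], [1, 3], [0, 2, 3], [2, 4]]

# Turn the label sets into a 2d binary indicator matrix.
Y = MultiLabelBinarizer().fit_transform(y)
classif = OneVsRestClassifier(estimator=SVC(gamma='scale', random_state=0))
print(classif.fit(X, Y).predict(X))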

Lines changed: 0 additions & 62 deletions (file deleted)

@@ -1,62 +0,0 @@
-"""Build a language detector model
-
-The goal of this exercise is to train a linear classifier on text features
-that represent sequences of up to 3 consecutive characters so as to
-recognize natural languages by using the frequencies of short character
-sequences as 'fingerprints'.
-
-"""
-# Author: Olivier Grisel <olivier.grisel@ensta.org>
-# License: Simplified BSD
-
-import sys
-
-from sklearn.feature_extraction.text import TfidfVectorizer
-from sklearn.linear_model import Perceptron
-from sklearn.pipeline import Pipeline
-from sklearn.datasets import load_files
-from sklearn.model_selection import train_test_split
-from sklearn import metrics
-
-
-# The training data folder must be passed as first argument
-languages_data_folder = sys.argv[1]
-dataset = load_files(languages_data_folder)
-
-# Split the dataset into training and test sets:
-docs_train, docs_test, y_train, y_test = train_test_split(
-    dataset.data, dataset.target, test_size=0.5)
-
-
-# TASK: Build a vectorizer that splits strings into sequences of 1 to 3
-# characters instead of word tokens
-
-# TASK: Build a vectorizer / classifier pipeline using the previous analyzer;
-# the pipeline instance should be stored in a variable named clf
-
-# TASK: Fit the pipeline on the training set
-
-# TASK: Predict the outcome on the testing set in a variable named y_predicted
-
-# Print the classification report
-print(metrics.classification_report(y_test, y_predicted,
-                                    target_names=dataset.target_names))
-
-# Plot the confusion matrix
-cm = metrics.confusion_matrix(y_test, y_predicted)
-print(cm)
-
-# import matplotlib.pyplot as plt
-# plt.matshow(cm, cmap=plt.cm.jet)
-# plt.show()
-
-# Predict the result on some short new sentences:
-sentences = [
-    u'This is a language detection test.',
-    u'Ceci est un test de d\xe9tection de la langue.',
-    u'Dies ist ein Test, um die Sprache zu erkennen.',
-]
-predicted = clf.predict(sentences)
-
-for s, p in zip(sentences, predicted):
-    print(u'The language of "%s" is "%s"' % (s, dataset.target_names[p]))
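The deleted skeleton intentionally leaves clf and y_predicted undefined; filling them in is the exercise. One possible completion, sketched on a tiny made-up corpus rather than the load_files dataset the exercise expects, and not the repository's official solution:

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Perceptron
from sklearn.pipeline import Pipeline

# Stand-in corpus; the real exercise loads one folder per language.
docs_train = ['this is english text', 'ceci est du texte en francais',
              'dies ist deutscher text', 'another english sentence']
y_train = [0, 1, 2, 0]

# TASK: character n-grams of length 1 to 3 instead of word tokens.
vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(1, 3))

# TASK: vectorizer / classifier pipeline stored in a variable named clf.
clf = Pipeline([('vec', vectorizer), ('clf', Perceptron())])

# TASK: fit, then predict.
clf.fit(docs_train, y_train)
y_predicted = clf.predict(docs_train)
print(y_predicted)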
Lines changed: 0 additions & 63 deletions (file deleted)

@@ -1,63 +0,0 @@
-"""Build a sentiment analysis / polarity model
-
-Sentiment analysis can be cast as a binary text classification problem,
-that is, fitting a linear classifier on features extracted from the text
-of the user messages so as to guess whether the opinion of the author is
-positive or negative.
-
-In this example we will use a movie review dataset.
-
-"""
-# Author: Olivier Grisel <olivier.grisel@ensta.org>
-# License: Simplified BSD
-
-import sys
-from sklearn.feature_extraction.text import TfidfVectorizer
-from sklearn.svm import LinearSVC
-from sklearn.pipeline import Pipeline
-from sklearn.model_selection import GridSearchCV
-from sklearn.datasets import load_files
-from sklearn.model_selection import train_test_split
-from sklearn import metrics
-
-
-if __name__ == "__main__":
-    # NOTE: we put the following in a 'if __name__ == "__main__"' protected
-    # block to be able to use a multi-core grid search that also works under
-    # Windows, see: http://docs.python.org/library/multiprocessing.html#windows
-    # The multiprocessing module is used as the backend of joblib.Parallel
-    # that is used when n_jobs != 1 in GridSearchCV
-
-    # the training data folder must be passed as first argument
-    movie_reviews_data_folder = sys.argv[1]
-    dataset = load_files(movie_reviews_data_folder, shuffle=False)
-    print("n_samples: %d" % len(dataset.data))
-
-    # split the dataset into training and test sets:
-    docs_train, docs_test, y_train, y_test = train_test_split(
-        dataset.data, dataset.target, test_size=0.25, random_state=None)
-
-    # TASK: Build a vectorizer / classifier pipeline that filters out tokens
-    # that are too rare or too frequent
-
-    # TASK: Build a grid search to find out whether unigrams or bigrams are
-    # more useful.
-    # Fit the pipeline on the training set using grid search for the parameters
-
-    # TASK: print the cross-validated scores for each parameter set
-    # explored by the grid search
-
-    # TASK: Predict the outcome on the testing set and store it in a variable
-    # named y_predicted
-
-    # Print the classification report
-    print(metrics.classification_report(y_test, y_predicted,
-                                        target_names=dataset.target_names))
-
-    # Print and plot the confusion matrix
-    cm = metrics.confusion_matrix(y_test, y_predicted)
-    print(cm)
-
-    # import matplotlib.pyplot as plt
-    # plt.matshow(cm)
-    # plt.show()
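As with the previous skeleton, the TASK comments are the exercise itself. One possible completion, again on a toy corpus; the min_df/max_df and ngram_range values are illustrative guesses, not the official solution:

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

# Toy corpus standing in for the movie review dataset.
docs_train = ['great wonderful film', 'terrible boring movie',
              'loved every minute', 'hated every minute',
              'what a great movie', 'awful plot and acting']
y_train = [1, 0, 1, 0, 1, 0]

# TASK: pipeline that filters out tokens that are too rare or too frequent.
pipeline = Pipeline([
    ('vect', TfidfVectorizer(min_df=1, max_df=0.95)),
    ('clf', LinearSVC()),
])

# TASK: grid search comparing unigrams against unigrams + bigrams.
parameters = {'vect__ngram_range': [(1, 1), (1, 2)]}
grid_search = GridSearchCV(pipeline, parameters, cv=2)
grid_search.fit(docs_train, y_train)

# TASK: cross-validated score for each parameter set explored.
for params, score in zip(grid_search.cv_results_['params'],
                         grid_search.cv_results_['mean_test_score']):
    print(params, score)

y_predicted = grid_search.predict(docs_train)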

0 commit comments