Skip to content

Commit 5eadd8d

Browse files
authored
Update README.rst
1 parent 621969d commit 5eadd8d

1 file changed

Lines changed: 42 additions & 7 deletions

File tree

README.rst

Lines changed: 42 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -450,15 +450,50 @@ we start to review some random projection techniques.
450450

451451
.. code:: python
452452
453-
import numpy as np
454-
from sklearn import random_projection
455-
X = np.random.rand(100, 10000)
456-
transformer = random_projection.GaussianRandomProjection()
457-
X_new = transformer.fit_transform(X)
458-
X_new.shape
459-
(100, 3947)
453+
from sklearn.feature_extraction.text import TfidfVectorizer
454+
import numpy as np
455+
456+
def TFIDF(X_train, X_test, MAX_NB_WORDS=75000):
    """Convert raw training and test documents into dense TF-IDF matrices.

    The vectorizer is fitted on ``X_train`` only and then reused to
    transform ``X_test``, so both matrices share one vocabulary and the
    test documents do not influence the learned terms or IDF weights.

    Args:
        X_train: Iterable of raw training documents (strings).
        X_test: Iterable of raw test documents (strings).
        MAX_NB_WORDS: Upper bound on the vocabulary size, passed to
            ``TfidfVectorizer`` as ``max_features``.

    Returns:
        A ``(X_train, X_test)`` tuple of dense arrays with one row per
        document and one column per vocabulary term.
    """
    vectorizer_x = TfidfVectorizer(max_features=MAX_NB_WORDS)
    # NOTE: .toarray() densifies the sparse result, so memory grows with
    # n_documents * n_features; lower MAX_NB_WORDS if this does not fit.
    X_train = vectorizer_x.fit_transform(X_train).toarray()
    X_test = vectorizer_x.transform(X_test).toarray()
    # Read the shape directly: wrapping the matrix in np.array() would copy
    # the entire dense array just to report its column count.
    print("tf-idf with", X_train.shape[1], "features")
    return (X_train, X_test)
462+
463+
464+
from sklearn.datasets import fetch_20newsgroups
465+
466+
# Load the predefined train/test splits of the 20 Newsgroups corpus.
newsgroups_train = fetch_20newsgroups(subset='train')
newsgroups_test = fetch_20newsgroups(subset='test')
X_train = newsgroups_train.data
X_test = newsgroups_test.data
y_train = newsgroups_train.target
y_test = newsgroups_test.target

# Replace the raw documents with dense TF-IDF feature matrices.
X_train, X_test = TFIDF(X_train, X_test)

from sklearn import random_projection

# Fit the Gaussian random projection on the training matrix only, then apply
# the same projection to the test matrix so both end up with 2000 columns.
RandomProjection = random_projection.GaussianRandomProjection(n_components=2000)
X_train_new = RandomProjection.fit_transform(X_train)
X_test_new = RandomProjection.transform(X_test)

# Read .shape directly: wrapping each matrix in np.array() would copy the
# whole dense array just to report its dimensions.
print("train with old features: ", X_train.shape)
print("train with new features:", X_train_new.shape)

print("test with old features: ", X_test.shape)
print("test with new features:", X_test_new.shape)
486+
487+
output:
488+
489+
.. code:: text
490+
491+
tf-idf with 75000 features
492+
train with old features: (11314, 75000)
493+
train with new features: (11314, 2000)
494+
test with old features: (7532, 75000)
495+
test with new features: (7532, 2000)
496+
462497
~~~~~~~~~~~
463498
Autoencoder
464499
~~~~~~~~~~~

0 commit comments

Comments
 (0)