Update README.rst

kk7nc · web-flow · commit 4a1870bc815b · 2018-07-24T13:18:38.000-04:00
diff --git a/README.rst b/README.rst
@@ -376,6 +376,53 @@ Principal Component Analysis (PCA)
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 
+Example of PCA on a text dataset (20newsgroups), reducing tf-idf with 75000 features to 2000 components:
+
+.. code:: python
+
+    from sklearn.feature_extraction.text import TfidfVectorizer
+    import numpy as np
+
+    def TFIDF(X_train, X_test, MAX_NB_WORDS=75000):
+        vectorizer_x = TfidfVectorizer(max_features=MAX_NB_WORDS)
+        X_train = vectorizer_x.fit_transform(X_train).toarray()
+        X_test = vectorizer_x.transform(X_test).toarray()
+        print("tf-idf with", str(np.array(X_train).shape[1]), "features")
+        return (X_train, X_test)
+
+
+    from sklearn.datasets import fetch_20newsgroups
+
+    newsgroups_train = fetch_20newsgroups(subset='train')
+    newsgroups_test = fetch_20newsgroups(subset='test')
+    X_train = newsgroups_train.data
+    X_test = newsgroups_test.data
+    y_train = newsgroups_train.target
+    y_test = newsgroups_test.target
+
+    X_train,X_test = TFIDF(X_train,X_test)
+
+    from sklearn.decomposition import PCA
+    pca = PCA(n_components=2000)
+    X_train_new = pca.fit_transform(X_train)
+    X_test_new = pca.transform(X_test)
+
+    print("train with old features: ",np.array(X_train).shape)
+    print("train with old features:" ,np.array(X_train_new).shape)
+    
+    print("test with old features: ",np.array(X_test).shape)
+    print("test with old features:" ,np.array(X_test_new).shape)
+
+output:
+
+.. code:: python
+
+    tf-idf with 75000 features
+    train with old features:  (11314, 75000)
+    train with old features: (11314, 2000)
+    test with old features:  (7532, 75000)
+    test with old features: (7532, 2000)
+
+
 
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 Linear Discriminant Analysis (LDA)