@@ -376,6 +376,53 @@ Principal Component Analysis (PCA)
376376~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
377377
378378
379+ Example of PCA on a text dataset (20newsgroups), reducing the tf-idf representation from 75000 features to 2000 components:
380+ .. code:: python
381+
382+ from sklearn.feature_extraction.text import TfidfVectorizer
383+ import numpy as np
384+
def TFIDF(X_train, X_test, MAX_NB_WORDS=75000):
    """Convert raw training and test documents into dense tf-idf matrices.

    Args:
        X_train: Training documents; used both to fit the vectorizer and
            to produce the training matrix.
        X_test: Test documents; only transformed with the fitted vectorizer.
        MAX_NB_WORDS: Passed to ``TfidfVectorizer`` as ``max_features``.

    Returns:
        A ``(X_train, X_test)`` tuple holding the dense arrays produced by
        ``toarray()`` for the training and test documents respectively.
    """
    vectorizer_x = TfidfVectorizer(max_features=MAX_NB_WORDS)
    # The vectorizer is fitted on the training documents only; the test
    # documents are transformed with that same fitted vectorizer.
    X_train = vectorizer_x.fit_transform(X_train).toarray()
    X_test = vectorizer_x.transform(X_test).toarray()
    # Read the shape directly: wrapping the dense matrix in np.array() would
    # copy it in full just to look up its width.
    print(" tf-idf with", str(X_train.shape[1]), " features")
    return (X_train, X_test)
391+
392+
from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import PCA

# Load the two predefined splits. `subset` must be exactly 'train' or 'test'
# (no surrounding whitespace), otherwise fetch_20newsgroups rejects the value.
newsgroups_train = fetch_20newsgroups(subset='train')
newsgroups_test = fetch_20newsgroups(subset='test')
X_train = newsgroups_train.data
X_test = newsgroups_test.data
y_train = newsgroups_train.target
y_test = newsgroups_test.target

# Raw documents -> dense tf-idf matrices.
X_train, X_test = TFIDF(X_train, X_test)

# Fit the projection on the training matrix only, then apply the same
# projection to the test matrix.
pca = PCA(n_components=2000)
X_train_new = pca.fit_transform(X_train)
X_test_new = pca.transform(X_test)

# Report shapes before ("old") and after ("new") the reduction. The arrays
# already expose .shape, so no np.array() copy is needed to read it.
print(" train with old features: ", X_train.shape)
print(" train with new features:", X_train_new.shape)

print(" test with old features: ", X_test.shape)
print(" test with new features:", X_test_new.shape)
414+
415+ output:
416+
417+ .. code:: python
418+
419+ tf-idf with 75000 features
420+ train with old features: (11314, 75000)
421+ train with new features: (11314, 2000)
422+ test with old features: (7532, 75000)
423+ test with new features: (7532, 2000)
424+
425+
379426
380427~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
381428Linear Discriminant Analysis (LDA)
0 commit comments