from sklearn.ensemble import GradientBoostingClassifier from sklearn.pipeline import Pipeline from sklearn import metrics from sklearn.feature_extraction.text import CountVectorizer from sklearn.feature_extraction.text import TfidfTransformer from sklearn.datasets import fetch_20newsgroups newsgroups_train = fetch_20newsgroups(subset='train') newsgroups_test = fetch_20newsgroups(subset='test') X_train = newsgroups_train.data X_test = newsgroups_test.data y_train = newsgroups_train.target y_test = newsgroups_test.target text_clf = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('clf', GradientBoostingClassifier(n_estimators=50,verbose=2)), ]) text_clf.fit(X_train, y_train) predicted = text_clf.predict(X_test) print(metrics.classification_report(y_test, predicted))