Difficulty: Beginner
Estimated Time: 10 minutes

"""Train a Multinomial Naive Bayes text classifier on four 20-newsgroups
categories using a bag-of-words + TF-IDF pipeline, then classify three
new example documents."""

import ssl

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

# Work around SSL certificate-verification failures when fetch_20newsgroups
# downloads the corpus over HTTPS (common on hosts without a CA bundle).
# NOTE(review): this disables certificate checking process-wide — acceptable
# for a tutorial, not for production code.
ssl._create_default_https_context = ssl._create_unverified_context

# Restrict the corpus to four categories to keep the download and
# training fast; random_state pins the shuffle for reproducibility.
categories = ['alt.atheism', 'soc.religion.christian', 'comp.graphics', 'sci.med']
twenty_train = fetch_20newsgroups(
    subset='train',
    categories=categories,
    shuffle=True,
    random_state=42,
)

# Inspect what was loaded: category names, corpus size, and the first sample.
print(twenty_train.target_names)
print(len(twenty_train.data))
print(len(twenty_train.filenames))
print("\n".join(twenty_train.data[0].split("\n")[:3]))
print(twenty_train.target[0])
print(twenty_train.target_names[0])
print(twenty_train.filenames[0])

# Convert raw text to token-count vectors, then re-weight with TF-IDF.
count_vect = CountVectorizer()
x_train_counts = count_vect.fit_transform(twenty_train.data)
print(x_train_counts.shape)
tf_idf_transform = TfidfTransformer()
x_train_tfidf = tf_idf_transform.fit_transform(x_train_counts)
print(x_train_tfidf.shape)

# Fit the Naive Bayes classifier on the TF-IDF training features.
clf = MultinomialNB().fit(x_train_tfidf, twenty_train.target)

# Classify previously unseen documents. The NEW documents go through the
# same fitted vectorizer/transformer via transform() — never fit_transform(),
# which would rebuild the vocabulary and break feature alignment.
docs_new = ['God is Love', 'Nvidia card', 'Disbelieves in God']

x_new_counts = count_vect.transform(docs_new)
print(x_new_counts)
x_new_tfidf = tf_idf_transform.transform(x_new_counts)

predicted = clf.predict(x_new_tfidf)
print(predicted)
for doc, category in zip(docs_new, predicted):
    print('%r => %s' % (doc, twenty_train.target_names[category]))

Don’t stop now! The next scenario will only take about 10 minutes to complete.

newsgroup 1

App