from nltk.corpus import movie_reviews
from nltk import NaiveBayesClassifier  # the Naive Bayes classifier
from nltk import classify              # used to compute accuracy
import string                          # used to remove punctuation from the texts
from nltk.corpus import stopwords      # used to remove stopwords from the texts

# We will use the movie_reviews corpus. You can read more about it on the NLTK website.
# The corpus contains movie reviews.
# Each document is classified as either positive ("pos") or negative ("neg").
# In this code, we start by using bag-of-words as features.
# We then test our classifier on the development set.
# We extend our features and build a bag-of-words without stopwords.
# We test the classifier on the development set once more.
# We extend the features again and build bag-of-words features without both
# stopwords and punctuation, and test them on the development set.
# Finally we test bag-of-ngrams on the development set, before evaluating
# on the test data.

#################################
# Part 1 -- Bag-of-words features
#################################
print('#################################')
print('# Part 1 -- Bag-of-words features')
print('#################################')

# The idea here is to first fetch all positive and negative reviews from the
# movie_reviews corpus. We then use their words as features for our Naive Bayes.
# We split the data into train_set, dev_set, and test_set. We use train_set
# for training and dev_set to measure the effect of the features we use.

# Fetch all positive and negative reviews found in movie_reviews
def get_word_reviews(pos_reviews, neg_reviews):
    # Build a list of the words in every positive review
    for fileid in movie_reviews.fileids('pos'):
        words = movie_reviews.words(fileid)
        pos_reviews.append(words)
    # Build a list of the words in every negative review
    for fileid in movie_reviews.fileids('neg'):
        words = movie_reviews.words(fileid)
        neg_reviews.append(words)
    return pos_reviews, neg_reviews

# Here we choose our features. We start with a simple bag-of-words.
def bag_of_words(words):
    words_cleaned = []
    for word in words:
        word = word.lower()
        words_cleaned.append(word)
    # Using a dictionary here removes all duplicates for us.
    words_dict = {word: True for word in words_cleaned}
    return words_dict

pos_reviews = []
neg_reviews = []
pos_reviews, neg_reviews = get_word_reviews(pos_reviews, neg_reviews)

# Build features for the positive reviews
pos_reviews_feat = []
for words in pos_reviews:
    pos_reviews_feat.append((bag_of_words(words), 'pos'))

# Build features for the negative reviews
neg_reviews_feat = []
for words in neg_reviews:
    neg_reviews_feat.append((bag_of_words(words), 'neg'))

# Split our data into test_set, dev_set, and train_set
test_set = pos_reviews_feat[:200] + neg_reviews_feat[:200]
dev_set = pos_reviews_feat[200:300] + neg_reviews_feat[200:300]
train_set = pos_reviews_feat[300:] + neg_reviews_feat[300:]
print(len(test_set), len(dev_set), len(train_set))  # prints: 400 200 1400

# Now we train our Naive Bayes classifier
classifier = NaiveBayesClassifier.train(train_set)
# That's all it takes!

# Let's see how good it is by testing it on dev_set
accuracy = classify.accuracy(classifier, dev_set)
print("Accuracy on dev_set: %0.2f" % accuracy)  # Gives an accuracy of 0.675

# We can look at the 10 most informative features as follows.
# (show_most_informative_features prints its table itself and returns None,
# so we call it directly instead of wrapping it in print.)
classifier.show_most_informative_features(10)
# This shows, for example, that the word "sucks" is 13 times more indicative
# of the negative class than of the positive class.
# Most Informative Features
#     captures = True      pos : neg = 15.0 : 1.0
#        sucks = True      neg : pos = 13.0 : 1.0
#    ludicrous = True      neg : pos = 12.6 : 1.0
#    maintains = True      pos : neg = 11.7 : 1.0
# breathtaking = True      pos : neg = 11.4 : 1.0
#        anger = True      pos : neg = 11.0 : 1.0
#     depicted = True      pos : neg = 10.3 : 1.0
#     headache = True      neg : pos =  9.7 : 1.0
#   astounding = True      pos : neg =  9.7 : 1.0
#       avoids = True      pos : neg =  9.7 : 1.0
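# Before moving on, a quick illustrative sanity check of the feature extractor
# (hypothetical input, not part of the original walkthrough): lowercasing plus
# the dictionary conversion collapses duplicates, but punctuation is kept.
print(bag_of_words(["Great", "movie", ",", "great", "fun", "!"]))
# -> {'great': True, 'movie': True, ',': True, 'fun': True, '!': True}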
#################################
# Part 2 -- Bag-of-words features without stopwords
#################################
print('#################################')
print('# Part 2 -- Bag-of-words without stopwords as features')
print('#################################')

# Now we test whether our classifier does better if we remove all stopwords.
# We run exactly the same code; we only change the bag_of_words() function
# so that it removes stopwords.

def bag_of_words_no_stopwords(words):
    words_cleaned = []
    for word in words:
        word = word.lower()
        if word not in stopwords_english:
            words_cleaned.append(word)
    words_dict = {word: True for word in words_cleaned}
    return words_dict

# Fetch all English stopwords
stopwords_english = stopwords.words('english')

# Build features for the positive reviews
pos_reviews_feat = []
for words in pos_reviews:
    pos_reviews_feat.append((bag_of_words_no_stopwords(words), 'pos'))

# Build features for the negative reviews
neg_reviews_feat = []
for words in neg_reviews:
    neg_reviews_feat.append((bag_of_words_no_stopwords(words), 'neg'))

# Split our data into test_set, dev_set, and train_set
test_set = pos_reviews_feat[:200] + neg_reviews_feat[:200]
dev_set = pos_reviews_feat[200:300] + neg_reviews_feat[200:300]
train_set = pos_reviews_feat[300:] + neg_reviews_feat[300:]
print(len(test_set), len(dev_set), len(train_set))  # prints: 400 200 1400

# Train and evaluate on dev_set
classifier = NaiveBayesClassifier.train(train_set)
accuracy = classify.accuracy(classifier, dev_set)
print("Accuracy on dev_set: %0.2f" % accuracy)  # Gives an accuracy of 0.675

# The 10 most informative features
classifier.show_most_informative_features(10)
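# A quick peek at what is being filtered out (illustrative; the exact contents
# and length of the list depend on your NLTK data version):
print(len(stopwords_english), stopwords_english[:8])
# e.g. 179 ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves']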
#################################
# Part 3 -- Bag-of-words without stopwords and punctuation as features
#################################
print('#################################')
print('# Part 3 -- Bag-of-words without stopwords and punctuation as features')
print('#################################')

# Now we test whether the classifier does better if we remove both stopwords
# and punctuation. Again we run exactly the same code, changing only the
# bag_of_words() function so that it removes stopwords and punctuation.

def bag_of_words_no_stopwords_punct(words):
    words_cleaned = []
    for word in words:
        word = word.lower()
        if word not in stopwords_english and word not in string.punctuation:
            words_cleaned.append(word)
    words_dict = {word: True for word in words_cleaned}
    return words_dict

# Build features for the positive reviews
pos_reviews_feat = []
for words in pos_reviews:
    pos_reviews_feat.append((bag_of_words_no_stopwords_punct(words), 'pos'))

# Build features for the negative reviews
neg_reviews_feat = []
for words in neg_reviews:
    neg_reviews_feat.append((bag_of_words_no_stopwords_punct(words), 'neg'))

# Split our data into test_set, dev_set, and train_set
test_set = pos_reviews_feat[:200] + neg_reviews_feat[:200]
dev_set = pos_reviews_feat[200:300] + neg_reviews_feat[200:300]
train_set = pos_reviews_feat[300:] + neg_reviews_feat[300:]
print(len(test_set), len(dev_set), len(train_set))  # prints: 400 200 1400

# Train and evaluate on dev_set
classifier = NaiveBayesClassifier.train(train_set)
accuracy = classify.accuracy(classifier, dev_set)
print("Accuracy on dev_set: %0.2f" % accuracy)  # Gives an accuracy of 0.675

# The 10 most informative features
classifier.show_most_informative_features(10)

# We can see that this does not give much better results and does not really
# improve our classifier. So let's try something completely different: we test
# whether it helps to use bigrams in addition to single words. We therefore
# build a bag-of-words that also contains bigrams, i.e. a bag-of-ngrams
# (bigrams + unigrams).

#################################
# Part 4 -- Bag-of-ngrams features
#################################
print('#################################')
print('# Part 4 -- Bag-of-ngrams V.1 as features')
print('#################################')

from nltk import bigrams

def bag_of_ngrams(words):
    words_cleaned = []
    # Add all bigrams over the lowercased text (note: stopwords and
    # punctuation are NOT filtered out here)
    lowered = [w.lower() for w in words]
    for bi in bigrams(lowered):
        words_cleaned.append(bi)
    # Add the unigrams, without stopwords and punctuation
    for word in words:
        word = word.lower()
        if word not in stopwords_english and word not in string.punctuation:
            words_cleaned.append(word)
    words_dict = {word: True for word in words_cleaned}
    return words_dict

# Build features for the positive reviews
pos_reviews_feat = []
for words in pos_reviews:
    pos_reviews_feat.append((bag_of_ngrams(words), 'pos'))

# Build features for the negative reviews
neg_reviews_feat = []
for words in neg_reviews:
    neg_reviews_feat.append((bag_of_ngrams(words), 'neg'))

# Split our data into test_set, dev_set, and train_set
test_set = pos_reviews_feat[:200] + neg_reviews_feat[:200]
dev_set = pos_reviews_feat[200:300] + neg_reviews_feat[200:300]
train_set = pos_reviews_feat[300:] + neg_reviews_feat[300:]
print(len(test_set), len(dev_set), len(train_set))  # prints: 400 200 1400

# Train and evaluate on dev_set
classifier = NaiveBayesClassifier.train(train_set)
accuracy = classify.accuracy(classifier, dev_set)
print("Accuracy on dev_set: %0.2f" % accuracy)  # Prints 0.73 -- the best we have had so far!

# The 10 most informative features
classifier.show_most_informative_features(10)
# Most Informative Features
#       ('waste', 'of') = True    neg : pos = 29.0 : 1.0
#              captures = True    pos : neg = 15.0 : 1.0
#      ('not', 'funny') = True    neg : pos = 15.0 : 1.0
#      ('perfect', '.') = True    pos : neg = 15.0 : 1.0
# ('saving', 'private') = True    pos : neg = 13.0 : 1.0
#                 sucks = True    neg : pos = 13.0 : 1.0
#             ludicrous = True    neg : pos = 12.6 : 1.0
#        ('brings', 'a') = True   pos : neg = 12.3 : 1.0
#         ('care', '.') = True    neg : pos = 11.7 : 1.0
# ('the', 'ridiculous') = True    neg : pos = 11.7 : 1.0

# Now we are making progress: accuracy is up, and we can see that quite a few
# bigrams matter here. But note that we have not removed stopwords and
# punctuation from the bigrams! Maybe the results improve if we do.
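# Before building version 2, a quick illustration of what nltk.bigrams
# produces (illustrative input, not part of the original walkthrough):
print(list(bigrams(["not", "funny", "at", "all"])))
# -> [('not', 'funny'), ('funny', 'at'), ('at', 'all')]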
#################################
# Part 5 -- Bag-of-ngrams version 2 as features
#################################
print('#################################')
print('# Part 5 -- Bag-of-ngrams V.2 as features')
print('#################################')

def bag_of_ngrams_no_stopwords_punct(words):
    words_cleaned = []
    # This time the bigrams are built AFTER removing stopwords and punctuation.
    # (We compare the lowercased form against the stopword list, matching the
    # unigram loop below; the stopword list itself is all lowercase.)
    lowered = [w.lower() for w in words
               if w.lower() not in stopwords_english and w not in string.punctuation]
    for bi in bigrams(lowered):
        words_cleaned.append(bi)
    # Add the unigrams, without stopwords and punctuation
    for word in words:
        word = word.lower()
        if word not in stopwords_english and word not in string.punctuation:
            words_cleaned.append(word)
    words_dict = {word: True for word in words_cleaned}
    return words_dict

# Build features for the positive reviews
pos_reviews_feat = []
for words in pos_reviews:
    pos_reviews_feat.append((bag_of_ngrams_no_stopwords_punct(words), 'pos'))

# Build features for the negative reviews
neg_reviews_feat = []
for words in neg_reviews:
    neg_reviews_feat.append((bag_of_ngrams_no_stopwords_punct(words), 'neg'))

# Split our data into test_set, dev_set, and train_set
test_set = pos_reviews_feat[:200] + neg_reviews_feat[:200]
dev_set = pos_reviews_feat[200:300] + neg_reviews_feat[200:300]
train_set = pos_reviews_feat[300:] + neg_reviews_feat[300:]
print(len(test_set), len(dev_set), len(train_set))  # prints: 400 200 1400

# Train and evaluate on dev_set
classifier = NaiveBayesClassifier.train(train_set)
accuracy = classify.accuracy(classifier, dev_set)
print("Accuracy on dev_set: %0.2f" % accuracy)  # Prints 0.76

# Again, this is the best accuracy we have achieved so far. 0.76 is not bad!
# I think we could stop here, but let's take a look at the most informative features.
classifier.show_most_informative_features(10)
# Most Informative Features
#     ('waste', 'time') = True    neg : pos = 19.0 : 1.0
#              captures = True    pos : neg = 15.0 : 1.0
#      ('one', 'worst') = True    neg : pos = 15.0 : 1.0
# ('saving', 'private') = True    pos : neg = 13.0 : 1.0
#                 sucks = True    neg : pos = 13.0 : 1.0
#             ludicrous = True    neg : pos = 12.6 : 1.0
#             maintains = True    pos : neg = 11.7 : 1.0
#          breathtaking = True    pos : neg = 11.4 : 1.0
#                 anger = True    pos : neg = 11.0 : 1.0
#              depicted = True    pos : neg = 10.3 : 1.0

# Now we can finally evaluate our classifier on the test_set.
# Results are usually worse than on the dev_set, but that is to be expected.
accuracy = classify.accuracy(classifier, test_set)
print("Accuracy on our test set: %0.2f" % accuracy)
# Accuracy on our test set: 0.73
# Not bad at all for a very simple Naive Bayes classifier :D
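# For transparency: classify.accuracy simply compares the predicted label
# against the gold label for every example. A minimal hand-rolled equivalent
# (a sketch, assuming test_set keeps its (features, label) structure):
correct = sum(1 for feats, label in test_set
              if classifier.classify(feats) == label)
print("Manual accuracy check: %0.2f" % (correct / len(test_set)))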
#################################
# Extra tests
#################################
print('#################################')
print('Extra tests')
print('#################################')

# We can also test our classifier on individual sentences, as follows.
from nltk.tokenize import word_tokenize

sent = "Main actor: the worst! I hope he got paied well for embarassing himself like this!"
sent_tokens = word_tokenize(sent)
sent_features = bag_of_ngrams_no_stopwords_punct(sent_tokens)
print("Classifying the sentence: %s" % sent)
print("This sentence is: %s" % classifier.classify(sent_features))  # Output: neg

# Print the probabilities for each class
prob = classifier.prob_classify(sent_features)
print("The most likely class is: %s" % prob.max())  # Output: neg
print("Probability that the sentence is negative: %0.2f" % prob.prob("neg"))  # Output: 0.98
print("Probability that the sentence is positive: %0.2f" % prob.prob("pos"))  # Output: 0.01

# Test 2
sent = "He managed to not embarass himself despite the horrible script. Well done!"
sent_tokens = word_tokenize(sent)
sent_features = bag_of_ngrams_no_stopwords_punct(sent_tokens)
print("Classifying the sentence: %s" % sent)
print("This sentence is: %s" % classifier.classify(sent_features))  # Output: pos

# Print the probabilities for each class
prob = classifier.prob_classify(sent_features)
print("The most likely class is: %s" % prob.max())  # Output: pos
print("Probability that the sentence is negative: %0.2f" % prob.prob("neg"))  # Output: 0.35
print("Probability that the sentence is positive: %0.2f" % prob.prob("pos"))  # Output: 0.64
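# A small convenience wrapper around the final model for classifying raw text
# (a hypothetical helper, not part of the original walkthrough; the name
# classify_text is our own):
def classify_text(text):
    feats = bag_of_ngrams_no_stopwords_punct(word_tokenize(text))
    dist = classifier.prob_classify(feats)
    return dist.max(), dist.prob(dist.max())

# Example usage (illustrative sentence; your probabilities may differ):
label, confidence = classify_text("An absolute masterpiece from start to finish!")
print("Predicted: %s (p=%0.2f)" % (label, confidence))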