28 lines
831 B
Python
28 lines
831 B
Python
from textblob import TextBlob
|
|
import pandas as pd
|
|
import re
|
|
import preprocessor as p
|
|
import numpy as np
|
|
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
|
|
|
|
# Baseline experiment: use TextBlob's out-of-the-box polarity score as a
# two-class sentiment predictor for the tweets in ./data/sts_gold_tweets.csv
# and compare the result against the file's own "polarity" column.
df = pd.read_csv("./data/sts_gold_tweets.csv").sort_values(by=["polarity"])

reals = []  # values of the "polarity" column, in iteration order
preds = []  # labels derived from TextBlob: 4 when polarity > 0, otherwise 0

for _, record in df.iterrows():
    raw_text = record["tweet.text"]
    # Clean the text with the `preprocessor` package before scoring it.
    cleaned_text = p.clean(raw_text)
    print(cleaned_text, raw_text)

    blob = TextBlob(cleaned_text)
    # Map the continuous polarity score onto the dataset's label values.
    # NOTE(review): presumably 0 = negative and 4 = positive -- confirm
    # against the dataset. A score of exactly 0 is labelled 0.
    if blob.polarity > 0:
        predicted = 4
    else:
        predicted = 0
    print(record["polarity"], predicted, blob.subjectivity)

    reals.append(record["polarity"])
    preds.append(predicted)

# Original author's note: default accuracy is 72%.
print(accuracy_score(reals, preds))
print(confusion_matrix(reals, preds))
print(classification_report(reals, preds))
|