-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfilter_tweet.py
67 lines (57 loc) · 2.67 KB
/
filter_tweet.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
import re
class FilterTweets:
    '''
    Cleans raw tweets: lower-cases each one, drops URLs and @-mentions,
    strips punctuation and a few contractions ('s, n't, i'm, i've), and
    removes filler words such as a, an, the.

    filterTweets() returns a list with one cleaned string per input line
    (an input line that is filtered away entirely yields '').
    '''

    # File read by filterTweets() when tweet_testing is None. It is opened
    # lazily, so defining/importing the class performs no I/O.
    tweet_path = 'Tweets/Tweets_election_trial.txt'

    # Optional iterable of raw tweet lines (an open file, a list of
    # strings, ...). When set, it is used instead of tweet_path and is
    # neither opened nor closed by this class.
    tweet_testing = None

    # Result of the most recent filterTweets() call. filterTweets() rebinds
    # this name on the instance and never mutates the list in place, so
    # this class-level default is not shared between instances.
    final_tweets = []

    # Punctuation marks and contraction suffixes that are deleted outright.
    _PUNCTUATION = re.compile(r'[:;()!".,?]' r"|'s|n't|i'm|i've")

    # Characters that join words and are turned into a space.
    _SEPARATORS = re.compile(r'[-_]')

    # Filler words dropped from every tweet. Text is lower-cased before the
    # lookup, so only lower-case entries are needed.
    _STOP_WORDS = frozenset((
        'an', 'the', 'is', 'of', 'a', 'i', 'was', 'and',
        'on', 'in', 'off', 'all', 'it', 'me', 'you', 'to',
        'into', 'we', 'your', 'that', 'they', 'can', 'could',
        'should', 'do', 'does', 'for', 'my', 'at', 'so',
        'if', 'has', 'have', 'had', 'from', 'such', 'are',
        'not', 'this', 'now', 'but', 'go', 'day',
        'up', 'down', 'these', 'today', 'lol',
        'lmao', 'af', 'get', 'got', 'here', 'there', 'who',
        'what', 'am', 'no', 'why', 'with', 'us', 'our', 'bro',
        'too', 'then', 'ur', 'zero', 'ah', 'see', 'saw',
    ))

    def _clean(self, line):
        '''Return the cleaned form of a single raw tweet line.'''
        # Drop links and mentions first, while each one is still a single
        # whitespace-delimited token; splitting on '-'/'_' beforehand would
        # leave fragments of handles such as @real_user behind.
        kept = [
            token for token in line.lower().split()
            if 'http' not in token and '@' not in token
        ]
        text = self._PUNCTUATION.sub('', ' '.join(kept))
        text = self._SEPARATORS.sub(' ', text)
        # Filtering whole tokens (rather than matching ' word ' with a
        # regex) also removes filler words at the start or end of a tweet
        # and in consecutive runs, so a single pass is sufficient.
        return ' '.join(
            word for word in text.split() if word not in self._STOP_WORDS
        )

    def filterTweets(self):
        '''
        Clean every tweet and return the results as a list of strings.

        Lines come from self.tweet_testing when it is not None, otherwise
        from the file at self.tweet_path, which is opened for the duration
        of the call only (open() raises OSError, e.g. FileNotFoundError,
        if it cannot be read). The returned list is also stored in
        self.final_tweets; each call builds a fresh list, so repeated calls
        do not accumulate duplicates.
        '''
        if self.tweet_testing is not None:
            cleaned = [self._clean(line) for line in self.tweet_testing]
        else:
            # NOTE(review): assumes the tweet dump is UTF-8 encoded - confirm.
            with open(self.tweet_path, encoding='utf-8') as handle:
                cleaned = [self._clean(line) for line in handle]
        self.final_tweets = cleaned
        return self.final_tweets