Skip to content

Commit cf3a063

Browse files
updated preprocess text
1 parent 3939d3c commit cf3a063

File tree

6 files changed

+89
-34
lines changed

6 files changed

+89
-34
lines changed

__pycache__/model.cpython-311.pyc

640 Bytes
Binary file not shown.

model.py

Lines changed: 66 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -10,19 +10,55 @@
1010
from keras.preprocessing.text import Tokenizer
1111
from keras.preprocessing.sequence import pad_sequences
1212
from sklearn.preprocessing import LabelEncoder
13+
import re
1314

1415
import pickle
1516

1617

1718
nltk.download('stopwords')
1819
nltk.download('punkt')
1920

21+
# def preprocess_text(text):
22+
# # Remove punctuation, convert to lowercase
23+
# # text = ''.join([char.lower() for char in text if char.isalnum() and not char.isdigit() or char.isspace()])
24+
# text = ''.join([char.lower() for char in text if char.isalnum() or char.isspace()])
25+
26+
# # Tokenization
27+
# tokens = word_tokenize(text)
28+
29+
# # Remove stopwords
30+
# stop_words = set(stopwords.words('english'))
31+
# tokens = [word for word in tokens if word not in stop_words]
32+
33+
# # Stemming
34+
# stemmer = PorterStemmer()
35+
# tokens = [stemmer.stem(word) for word in tokens]
36+
37+
# return ' '.join(tokens)
38+
2039
def preprocess_text(text):
21-
# Remove punctuation and convert to lowercase
22-
text = ''.join([char.lower() for char in text if char.isalnum() or char.isspace()])
40+
# Convert to lowercase
41+
text = text.lower()
42+
43+
# Initialize an empty list to store processed characters
44+
processed_chars = []
45+
46+
i = 0
47+
while i < len(text):
48+
# If character is a digit, skip all characters until the next space
49+
if text[i].isdigit():
50+
while i < len(text) and text[i] != ' ':
51+
i += 1
52+
# If character is alphanumeric or space, add it to processed_chars
53+
elif text[i].isalnum() and not text[i].isdigit() or text[i].isspace():
54+
processed_chars.append(text[i])
55+
i += 1
56+
57+
# Join the processed characters into a string
58+
processed_text = ''.join(processed_chars)
2359

2460
# Tokenization
25-
tokens = word_tokenize(text)
61+
tokens = word_tokenize(processed_text)
2662

2763
# Remove stopwords
2864
stop_words = set(stopwords.words('english'))
@@ -34,6 +70,12 @@ def preprocess_text(text):
3470

3571
return ' '.join(tokens)
3672

73+
# Example usage:
74+
text = "This is an example text with some numbers like 12345 and punctuation! But we'll remove them."
75+
processed_text = preprocess_text(text)
76+
print(processed_text)
77+
78+
3779
def preprocess_text_list(text_list):
3880
preprocessed_texts = [preprocess_text(text) for text in text_list]
3981
return preprocessed_texts
@@ -70,16 +112,15 @@ def preprocess_text_list(text_list):
70112
"You have received UPI mandate collect request from TATA TECHNOLOGIES LI for INR 15000.00. Log into Google Pay app to authorize - Axis Bank",
71113
"ANURAG JAIN has requested money from you on Google Pay. On approving the request, INR 31.00 will be debited from your A/c - Axis Bank",
72114
"Flipkart Refund Processed: Refund of Rs. 237.0 for favoru Household wrap ... is successfully transferred and will be credited to your account by Oct 04, 2023.",
73-
"UPI mandate has been successfully created towards TATA TECHNOLOGIES LI for INR 15000.00. Funds blocked from A/c no. XX8926. 12e5d61d2ac145738241fbf117bb295c@okaxis - Axis Bank",
74-
"hi"
115+
"UPI mandate has been successfully created towards TATA TECHNOLOGIES LI for INR 15000.00. Funds blocked from A/c no. XX8926. 12e5d61d2ac145738241fbf117bb295c@okaxis - Axis Bank"
75116
]
76117

77118
# Preprocess the texts
78119
processed_texts = preprocess_text_list(texts)
79120

80121
# Example storage after cleaning
81122
data = {'text': processed_texts,
82-
'label': ['debited', 'debited', 'credited', 'credited', 'debited', 'credited', 'debited', 'credited', 'debited', 'credited', 'debited', 'debited', 'credited', 'debited', 'credited', 'credited','debited', 'requested', 'requested', 'requested', 'willcredit', 'blocked', 'x']}
123+
'label': ['debited', 'debited', 'credited', 'credited', 'debited', 'credited', 'debited', 'credited', 'debited', 'credited', 'debited', 'debited', 'credited', 'debited', 'credited', 'credited','debited', 'requested', 'requested', 'requested', 'willcredit', 'blocked']}
83124
df = pd.DataFrame(data)
84125
df.to_csv('processed_dataset.csv', index=False)
85126

@@ -92,7 +133,7 @@ def preprocess_text_list(text_list):
92133

93134
# Create a Tokenizer with an out-of-vocabulary (OOV) token
94135
tokenizer = Tokenizer(oov_token='<OOV>')
95-
print(tokenizer)
136+
# print(tokenizer)
96137
tokenizer.fit_on_texts(texts)
97138

98139
# Save the tokenizer to a file
@@ -101,7 +142,7 @@ def preprocess_text_list(text_list):
101142

102143
# Convert the text data to sequences of integers using the tokenizer
103144
sequences = tokenizer.texts_to_sequences(texts)
104-
print(sequences)
145+
# print(sequences)
105146
# Pad the sequences to ensure uniform length for neural network input
106147
padded_sequences = pad_sequences(sequences, padding='post')
107148

@@ -131,12 +172,27 @@ def preprocess_text_list(text_list):
131172
labels_np = np.array(encoded_labels)
132173
# Replace the lambda function with a named function
133174
def custom_sparse_softmax_cross_entropy(labels, logits):
134-
return tf.compat.v1.losses.sparse_softmax_cross_entropy(labels, logits)
175+
return tf.compat.v1.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)
135176

136177
# Compile the model with the named function
137-
model.compile(optimizer='adam', loss=custom_sparse_softmax_cross_entropy, metrics=['accuracy'])
178+
model.compile(optimizer='adam', loss=custom_sparse_softmax_cross_entropy, metrics=['accuracy', 'precision', 'recall'])
138179

139180
# Train the model
140181
model.fit(padded_sequences, labels_np, epochs=100)
141182
# Save the model in the recommended Keras format
142183
model.save('trained_model.keras')
184+
185+
186+
# One-hot encode labels (assuming labels are text strings)
187+
# label_encoder = LabelEncoder()
188+
# labels_encoded = label_encoder.fit_transform(labels)
189+
# labels_onehot = tf.keras.utils.to_categorical(labels_encoded, num_classes=len(set(labels))) # Adjust num_classes if needed
190+
191+
# # Compile the model with categorical crossentropy loss
192+
# model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy', 'precision', 'recall'])
193+
194+
# # Train the model
195+
# model.fit(padded_sequences, labels_onehot, epochs=100)
196+
197+
# # Save the model
198+
# model.save('trained_model.keras')

processed_dataset.csv

Lines changed: 22 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -1,24 +1,23 @@
11
text,label
2-
debit inr 50000 ac xx8926 121023 200219 upip2a328546155288anurag jain sm blockupi cust id 01351860002 axi bank,debited
3-
debit inr 10900 ac xx8926 270124 113657 upip2m6321837696198add money wallet sm blockupi cust id 919951860002 axi bank,debited
4-
inr 559000 credit ac xx8926 091123 115928 ist info upip2a334365332111anurag jainaxi bank axi bank,credited
5-
inr 21635 credit ac xx8926 060124 073216 ist info neftcms333334641nextbil avl bal inr 3347822 axi bank,credited
6-
ac xx8360 debit rs 1900 07022024 ucoupiavl bal rs3298 report disput httpsbitly3y39tlp feedback httpsrbgyfdfmda,debited
7-
ac xx8360 credit rs600 07022024 ucoupiavl bal rs5198 report disput httpsbitly3y39tlp feedback httpsrbgyfdfmda,credited
8-
dear upi user ac x0429 debit 200 date 22jan24 trf mr narayan badat refno 437652379634 u call 1800111109 sbi,debited
9-
dear sbi upi user ur acx0429 credit rs500 04feb24 ref 403585759002,credited
10-
ac 9172 debit rs5000 11022024 194440 mob bk ref 444816787760 avl bal rs187055if call 1800222243 union bank india,debited
11-
ac 9172 credit rs50100 23012024 200545 mob bk ref 402347890661 avl bal rs55600never share otppincvvunion bank india,credited
12-
rs 5000 debit ac use upi 03022024 164428 vpa abcd4321oksbi upi ref 403417856009feder bank,debited
13-
sent rs2000 kotak bank ac x8136 abcd2003oksbi 030224upi ref 403418725300 kotakcomfraud,debited
14-
receiv rs5000 kotak bank ac x8136 abcd4321oksbi 030224upi ref400653974000,credited
15-
updat inr 100000 debit hdfc bank xx2002 11dec23 info ft dr xxxxxxxxxx1498 anurag jain avl balinr 489100,debited
16-
hdfc bank rs 100 credit ac xxxxxx2002 230124 ac link vpa 9777777711fam upi ref 408888887329,credited
17-
jpb ac xxxx0956 credit rs2500 25aug2023 current account balanc rs25,credited
18-
rs550 sent abcd12341okicici ppbl ac 91xx8089upi ref439432479819balancehttpsmpaytmmepbcheckb helphttpmpytmcar,debited
19-
irctc cf request money googl pay upi app approv inr 103360 debit ac axi bank,requested
20-
receiv upi mandat collect request tata technolog li inr 1500000 log googl pay app author axi bank,requested
21-
anurag jain request money googl pay approv request inr 3100 debit ac axi bank,requested
22-
flipkart refund process refund rs 2370 favoru household wrap success transfer credit account oct 04 2023,willcredit
23-
upi mandat success creat toward tata technolog li inr 1500000 fund block ac xx8926 12e5d61d2ac145738241fbf117bb295cokaxi axi bank,blocked
24-
hi,x
2+
debit inr ac xxupipjain sm blockupi cust id axi bank,debited
3+
debit inr ac xxupipmoney wallet sm blockupi cust id axi bank,debited
4+
inr credit ac xxon ist info upipjainaxi bank axi bank,credited
5+
inr credit ac xxon ist info neftcmsavl bal inr axi bank,credited
6+
ac xxdebit rs ucoupiavl bal rsreport disput httpsbitlyfor feedback httpsrbgyfdfmda,debited
7+
ac xxcredit rson ucoupiavl bal rsreport disput httpsbitlyfor feedback httpsrbgyfdfmda,credited
8+
dear upi user ac xdebit date trf mr narayan badat refno u call sbi,debited
9+
dear sbi upi user ur acxcredit rson ref,credited
10+
ac debit rson mob bk ref avl bal rsnot call union bank india,debited
11+
ac credit rson mob bk ref avl bal rsshare otppincvvunion bank india,credited
12+
rs debit ac use upi vpa abcd upi ref bank,debited
13+
sent rsfrom kotak bank ac xto abcdon ref kotakcomfraud,debited
14+
receiv rsin kotak bank ac xfrom abcdon ref,credited
15+
updat inr debit hdfc bank xxon info ft dr xxxxxxxxxx anurag jain avl balinr,debited
16+
hdfc bank rs credit ac xxxxxxon ac link vpa upi ref,credited
17+
jpb ac xxxxi credit rson current account balanc rs,credited
18+
rssent abcdfrom ppbl ac refhelphttpmpytmcar,debited
19+
irctc cf request money googl pay upi app approv inr debit ac axi bank,requested
20+
receiv upi mandat collect request tata technolog li inr log googl pay app author axi bank,requested
21+
anurag jain request money googl pay approv request inr debit ac axi bank,requested
22+
flipkart refund process refund rs favoru household wrap success transfer credit account oct,willcredit
23+
upi mandat success creat toward tata technolog li inr fund block ac xx axi bank,blocked

runmodel.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ def custom_sparse_softmax_cross_entropy(labels, logits):
4040
"UPI mandate has been successfully created towards TATA TECHNOLOGIES LI for INR 15000.00. Funds blocked from A/c no. XX8926. 12e5d61d2ac145738241fbf117bb295c@okaxis - Axis Bank",
4141
"Dear Player, Rs.10,000* is credited to your RummyTime a/c Ref Id: RT210XX Download the app & make your 1st deposit now - http://gmg.im/bKSfALT&C Apply"]
4242

43-
similarity_threshold = 0.7
43+
similarity_threshold = 0.9
4444

4545
for text in new_texts:
4646
# Preprocess the new text using spaCy

tokenizer.pkl

-1.87 KB
Binary file not shown.

trained_model.keras

-24.4 KB
Binary file not shown.

0 commit comments

Comments
 (0)