from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
+ import re

import pickle


nltk.download('stopwords')
nltk.download('punkt')

+ # def preprocess_text(text):
+ #     # Remove punctuation, convert to lowercase
+ #     # text = ''.join([char.lower() for char in text if char.isalnum() and not char.isdigit() or char.isspace()])
+ #     text = ''.join([char.lower() for char in text if char.isalnum() or char.isspace()])
+
+ #     # Tokenization
+ #     tokens = word_tokenize(text)
+
+ #     # Remove stopwords
+ #     stop_words = set(stopwords.words('english'))
+ #     tokens = [word for word in tokens if word not in stop_words]
+
+ #     # Stemming
+ #     stemmer = PorterStemmer()
+ #     tokens = [stemmer.stem(word) for word in tokens]
+
+ #     return ' '.join(tokens)
+
def preprocess_text(text):
-     # Remove punctuation and convert to lowercase
-     text = ''.join([char.lower() for char in text if char.isalnum() or char.isspace()])
+     # Convert to lowercase
+     text = text.lower()
+
+     # Initialize an empty list to store processed characters
+     processed_chars = []
+
+     i = 0
+     while i < len(text):
+         # If character is a digit, skip all characters until the next space
+         if text[i].isdigit():
+             while i < len(text) and text[i] != ' ':
+                 i += 1
+         # If character is alphanumeric or space, add it to processed_chars
+         elif text[i].isalnum() and not text[i].isdigit() or text[i].isspace():
+             processed_chars.append(text[i])
+         i += 1
+
+     # Join the processed characters into a string
+     processed_text = ''.join(processed_chars)

    # Tokenization
-     tokens = word_tokenize(text)
+     tokens = word_tokenize(processed_text)

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
@@ -34,6 +70,12 @@ def preprocess_text(text):

    return ' '.join(tokens)

+ # Example usage:
+ text = "This is an example text with some numbers like 12345 and punctuation! But we'll remove them."
+ processed_text = preprocess_text(text)
+ print(processed_text)
+
+
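A side note on the hunk above: it adds `import re` but never uses it. A regex can express a similar digit-and-punctuation cleanup more compactly; the sketch below is only one possible equivalent (it drops the whole digit-containing token rather than just the tail after the first digit, so its output may differ slightly from the loop above):

    import re

    def preprocess_text_regex(text):
        text = text.lower()
        # Drop any whitespace-delimited token that contains a digit
        text = re.sub(r'\S*\d\S*', ' ', text)
        # Keep only letters and whitespace
        return re.sub(r'[^a-z\s]', '', text)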
def preprocess_text_list(text_list):
    preprocessed_texts = [preprocess_text(text) for text in text_list]
    return preprocessed_texts
@@ -70,16 +112,15 @@ def preprocess_text_list(text_list):
    "You have received UPI mandate collect request from TATA TECHNOLOGIES LI for INR 15000.00. Log into Google Pay app to authorize - Axis Bank",
    "ANURAG JAIN has requested money from you on Google Pay. On approving the request, INR 31.00 will be debited from your A/c - Axis Bank",
    "Flipkart Refund Processed: Refund of Rs. 237.0 for favoru Household wrap ... is successfully transferred and will be credited to your account by Oct 04, 2023.",
-     "UPI mandate has been successfully created towards TATA TECHNOLOGIES LI for INR 15000.00. Funds blocked from A/c no. XX8926. 12e5d61d2ac145738241fbf117bb295c@okaxis - Axis Bank",
-     "hi"
+     "UPI mandate has been successfully created towards TATA TECHNOLOGIES LI for INR 15000.00. Funds blocked from A/c no. XX8926. 12e5d61d2ac145738241fbf117bb295c@okaxis - Axis Bank"
]

# Preprocess the texts
processed_texts = preprocess_text_list(texts)

# Example storage after cleaning
data = {'text': processed_texts,
-         'label': ['debited', 'debited', 'credited', 'credited', 'debited', 'credited', 'debited', 'credited', 'debited', 'credited', 'debited', 'debited', 'credited', 'debited', 'credited', 'credited', 'debited', 'requested', 'requested', 'requested', 'willcredit', 'blocked', 'x']}
+         'label': ['debited', 'debited', 'credited', 'credited', 'debited', 'credited', 'debited', 'credited', 'debited', 'credited', 'debited', 'debited', 'credited', 'debited', 'credited', 'credited', 'debited', 'requested', 'requested', 'requested', 'willcredit', 'blocked']}
df = pd.DataFrame(data)
df.to_csv('processed_dataset.csv', index=False)

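Dropping the "hi" message and its 'x' label keeps the texts and labels aligned one-to-one; a quick guard (not part of the diff) makes that invariant explicit:

    assert len(processed_texts) == len(data['label']), "each text needs exactly one label"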
@@ -92,7 +133,7 @@ def preprocess_text_list(text_list):

# Create a Tokenizer with an out-of-vocabulary (OOV) token
tokenizer = Tokenizer(oov_token='<OOV>')
- print(tokenizer)
+ # print(tokenizer)
tokenizer.fit_on_texts(texts)

# Save the tokenizer to a file
@@ -101,7 +142,7 @@ def preprocess_text_list(text_list):

# Convert the text data to sequences of integers using the tokenizer
sequences = tokenizer.texts_to_sequences(texts)
- print(sequences)
+ # print(sequences)
# Pad the sequences to ensure uniform length for neural network input
padded_sequences = pad_sequences(sequences, padding='post')

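Because the fitted Tokenizer is pickled and the sequences are post-padded, later inference code has to repeat both steps. A minimal sketch, assuming a hypothetical save path 'tokenizer.pickle' (the real filename is set in a hunk not shown here) and padding new inputs to the training length:

    import pickle
    from keras.preprocessing.sequence import pad_sequences

    # Hypothetical path; reuse whatever filename the tokenizer was pickled under
    with open('tokenizer.pickle', 'rb') as f:
        tokenizer = pickle.load(f)

    # Preprocess and vectorize new messages exactly as at training time
    new_texts = preprocess_text_list(["INR 500.00 debited from A/c XX1234"])
    new_sequences = tokenizer.texts_to_sequences(new_texts)
    # maxlen should match the padded length used during training
    new_padded = pad_sequences(new_sequences, padding='post', maxlen=padded_sequences.shape[1])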
@@ -131,12 +172,27 @@ def preprocess_text_list(text_list):
labels_np = np.array(encoded_labels)
# Replace the lambda function with a named function
def custom_sparse_softmax_cross_entropy(labels, logits):
-     return tf.compat.v1.losses.sparse_softmax_cross_entropy(labels, logits)
+     return tf.compat.v1.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)

# Compile the model with the named function
- model.compile(optimizer='adam', loss=custom_sparse_softmax_cross_entropy, metrics=['accuracy'])
+ model.compile(optimizer='adam', loss=custom_sparse_softmax_cross_entropy, metrics=['accuracy', 'precision', 'recall'])

# Train the model
model.fit(padded_sequences, labels_np, epochs=100)
# Save the model in the recommended Keras format
model.save('trained_model.keras')
+
+
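One consequence of compiling with a named custom loss: reloading 'trained_model.keras' typically needs that function passed back via custom_objects, otherwise Keras cannot rebuild the compiled model. A minimal sketch (not part of this diff):

    import tensorflow as tf

    # Supply the named loss so the saved, compiled model can be deserialized
    reloaded = tf.keras.models.load_model(
        'trained_model.keras',
        custom_objects={'custom_sparse_softmax_cross_entropy': custom_sparse_softmax_cross_entropy},
    )
    predictions = reloaded.predict(padded_sequences)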
+ # One-hot encode labels (assuming labels are text strings)
+ # label_encoder = LabelEncoder()
+ # labels_encoded = label_encoder.fit_transform(labels)
+ # labels_onehot = tf.keras.utils.to_categorical(labels_encoded, num_classes=len(set(labels)))  # Adjust num_classes if needed
+
+ # # Compile the model with categorical crossentropy loss
+ # model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy', 'precision', 'recall'])
+
+ # # Train the model
+ # model.fit(padded_sequences, labels_onehot, epochs=100)
+
+ # # Save the model
+ # model.save('trained_model.keras')