from keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
import pickle
+ import matplotlib.pyplot as plt

nltk.download('stopwords')
nltk.download('punkt')
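# The two downloads above fetch the English stop-word list and the Punkt
# tokenizer models that the preprocessing below relies on. A minimal sketch
# of how they are typically used together (the body of preprocess_text sits
# outside this hunk):
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
stop_words = set(stopwords.words('english'))
demo_tokens = [t for t in word_tokenize("This is an example.") if t.lower() not in stop_words]
# -> ['example', '.']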
@@ -51,7 +52,7 @@ def preprocess_text(text):
    return ' '.join(tokens)

# Example usage:
- text = "This is an example text with some numbers like 12345 and punctuation! But we'll remove them."
+ text = "This is an example text with some numbers like 12345, email like [email protected] and punctuation! But we'll remove them."
processed_text = preprocess_text(text)
print(processed_text)

@@ -112,8 +113,8 @@ def preprocess_text_list(text_list):
labels = df['label'].tolist()

# Create a Tokenizer with an out-of-vocabulary (OOV) token
- tokenizer = Tokenizer(oov_token='<OOV>')
- # print(tokenizer)
+ # this will replace any unknown words with a token of our choosing
+ tokenizer = Tokenizer(num_words=95000, oov_token='OOV', filters='!"#$%&()*+,-./:;<=>@[\]^_`{|}~ ')
tokenizer.fit_on_texts(texts)

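# A quick check of the OOV behaviour on a hypothetical mini-corpus: after
# fitting, any word the tokenizer has never seen maps to the index of the
# 'OOV' token instead of being silently dropped.
# demo = Tokenizer(oov_token='OOV')
# demo.fit_on_texts(['the cat sat'])
# demo.texts_to_sequences(['the dog sat'])  # -> [[2, 1, 4]], 'dog' -> 1 ('OOV')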
# Save the tokenizer to a file
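# The save itself falls outside this hunk; with pickle (imported above) it
# presumably looks like this ('tokenizer.pickle' is an assumed filename):
# with open('tokenizer.pickle', 'wb') as handle:
#     pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
# ...and the tokenizer can later be restored with pickle.load(handle).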
@@ -122,7 +123,7 @@ def preprocess_text_list(text_list):

# Convert the text data to sequences of integers using the tokenizer
sequences = tokenizer.texts_to_sequences(texts)
- # print(sequences)
+

# Pad the sequences to ensure uniform length for neural network input
padded_sequences = pad_sequences(sequences, padding='post')

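# With padding='post' the zeros go at the end, so e.g.:
# pad_sequences([[1, 2, 3], [4, 5]], padding='post')
# -> array([[1, 2, 3],
#           [4, 5, 0]], dtype=int32)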
@@ -135,21 +136,22 @@ def preprocess_text_list(text_list):
    Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=32),

    # LSTM layer for processing sequential data
-   LSTM(100),
+   LSTM(50),

    # Dense output layer for classification
    Dense(num_classes, activation='softmax')
])
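# Sanity check (not part of the commit): building against the padded input
# length makes model.summary() print the layer shapes. Note input_dim is
# len(word_index) + 1 because index 0 is reserved for padding.
# model.build(input_shape=(None, padded_sequences.shape[1]))
# model.summary()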

# Assuming 'df' is your DataFrame containing the 'label' column
- label_encoder = LabelEncoder()
- df['encoded_label'] = label_encoder.fit_transform(df['label'])
+ label_encoder = LabelEncoder()  # will be used to convert categorical labels into numerical labels
+ df['encoded_label'] = label_encoder.fit_transform(df['label'])  # transform these labels into numerical format
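# For instance, on a hypothetical label column: classes are assigned
# integers in sorted order, so LabelEncoder().fit_transform(['spam', 'ham', 'spam'])
# returns [1, 0, 1], and inverse_transform recovers the original strings.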

# Extract the encoded labels
encoded_labels = df['encoded_label'].tolist()

# Convert labels to NumPy array
labels_np = np.array(encoded_labels)
+
# Replace the lambda function with a named function
def custom_sparse_softmax_cross_entropy(labels, logits):
    return tf.compat.v1.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)
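# Caveat: tf.compat.v1.losses.sparse_softmax_cross_entropy expects raw
# logits, while the Dense layer above already applies a softmax. If that
# is unintentional, the modern equivalent that accepts probabilities is:
# loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False)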
@@ -158,6 +160,26 @@ def custom_sparse_softmax_cross_entropy(labels, logits):
model.compile(optimizer='adam', loss=custom_sparse_softmax_cross_entropy, metrics=['accuracy', 'precision', 'recall'])

# Train the model
- model.fit(padded_sequences, labels_np, epochs=100)
+ # model.fit(padded_sequences, labels_np, epochs=100)
+
+ # Hold out 20% of the data for validation and keep the training history
+ history = model.fit(padded_sequences, labels_np, epochs=100, validation_split=0.2)
+
+ # Extract training and validation loss from the history
+ training_loss = history.history['loss']
+ validation_loss = history.history['val_loss']
+
+ # Plot the training and validation loss per epoch
+ epochs = range(1, len(training_loss) + 1)
+ plt.figure(figsize=(10, 6))
+ plt.plot(epochs, training_loss, 'bo-', label='Training Loss')
+ plt.plot(epochs, validation_loss, 'ro-', label='Validation Loss')
+ plt.title('Training and Validation Loss')
+ plt.xlabel('Epochs')
+ plt.ylabel('Loss')
+ plt.legend()
+ plt.grid(True)
+ plt.show()
+
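# If the validation curve turns upward while training loss keeps falling
# (overfitting), a common follow-up, not part of this commit, is early
# stopping on the validation loss:
# early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
# model.fit(padded_sequences, labels_np, epochs=100, validation_split=0.2, callbacks=[early_stop])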

# Save the model in the recommended Keras format
model.save('trained_model.keras')
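# Reloading later needs the custom loss passed back in (a sketch, assuming
# the function definition above is in scope at load time):
# loaded = tf.keras.models.load_model(
#     'trained_model.keras',
#     custom_objects={'custom_sparse_softmax_cross_entropy': custom_sparse_softmax_cross_entropy})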