Skip to content

Commit 11775eb

Browse files
update
1 parent 043bb79 commit 11775eb

19 files changed

+1723
-11
lines changed

__pycache__/model.cpython-311.pyc

-9 Bytes
Binary file not shown.

model.py

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -71,15 +71,14 @@ def preprocess_text_list(text_list):
7171
"ANURAG JAIN has requested money from you on Google Pay. On approving the request, INR 31.00 will be debited from your A/c - Axis Bank",
7272
"Flipkart Refund Processed: Refund of Rs. 237.0 for favoru Household wrap ... is successfully transferred and will be credited to your account by Oct 04, 2023.",
7373
"UPI mandate has been successfully created towards TATA TECHNOLOGIES LI for INR 15000.00. Funds blocked from A/c no. XX8926. 12e5d61d2ac145738241fbf117bb295c@okaxis - Axis Bank",
74-
"hi"
7574
]
7675

7776
# Preprocess the texts
7877
processed_texts = preprocess_text_list(texts)
7978

8079
# Example storage after cleaning
8180
data = {'text': processed_texts,
82-
'label': ['debited', 'debited', 'credited', 'credited', 'debited', 'credited', 'debited', 'credited', 'debited', 'credited', 'debited', 'debited', 'credited', 'debited', 'credited', 'credited','debited', 'requested', 'requested', 'requested', 'willcredit', 'blocked', 'x']}
81+
'label': ['debited', 'debited', 'credited', 'credited', 'debited', 'credited', 'debited', 'credited', 'debited', 'credited', 'debited', 'debited', 'credited', 'debited', 'credited', 'credited','debited', 'requested', 'requested', 'requested', 'willcredit', 'blocked']}
8382
df = pd.DataFrame(data)
8483
df.to_csv('processed_dataset.csv', index=False)
8584

@@ -92,7 +91,6 @@ def preprocess_text_list(text_list):
9291

9392
# Create a Tokenizer with an out-of-vocabulary (OOV) token
9493
tokenizer = Tokenizer(oov_token='<OOV>')
95-
# print(tokenizer)
9694
tokenizer.fit_on_texts(texts)
9795

9896
# Save the tokenizer to a file
@@ -101,7 +99,6 @@ def preprocess_text_list(text_list):
10199

102100
# Convert the text data to sequences of integers using the tokenizer
103101
sequences = tokenizer.texts_to_sequences(texts)
104-
# print(sequences)
105102
# Pad the sequences to ensure uniform length for neural network input
106103
padded_sequences = pad_sequences(sequences, padding='post')
107104

@@ -139,4 +136,4 @@ def custom_sparse_softmax_cross_entropy(labels, logits):
139136
# Train the model
140137
model.fit(padded_sequences, labels_np, epochs=100)
141138
# Save the model in the recommended Keras format
142-
model.save('trained_model.keras')
139+
model.save('trained_model.keras')

ner.py

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
import spacy
2+
from spacy.tokens import DocBin
3+
from spacy.training import Example
4+
5+
# Start from a blank English pipeline (tokenizer only, no pretrained components).
nlp = spacy.blank("en")

# Training data: (text, annotations) pairs. Each entity is a
# (start_char, end_char, label) triple with an exclusive end offset. The
# offsets must cover exactly the labelled substring and fall on token
# boundaries, otherwise the span cannot be aligned to tokens for training.
TRAIN_DATA = [
    (
        "Dear SBI UPI User, ur A/cX0304 debited by Rs91000 on 08Feb24 by (Ref no 403968023837)",
        {
            "entities": [
                (22, 30, "AC_NUMBER"),  # "A/cX0304"
                (42, 49, "AMOUNT"),  # "Rs91000"
                (53, 60, "DATE"),  # "08Feb24"
            ]
        },
    ),
    # Add more labeled examples here
]

# Pair each tokenized text with its gold-standard annotations.
examples = [
    Example.from_dict(nlp.make_doc(text), annotations)
    for text, annotations in TRAIN_DATA
]

# Reuse the NER component if the pipeline already has one, otherwise add it.
if "ner" not in nlp.pipe_names:
    ner = nlp.add_pipe("ner")
else:
    ner = nlp.get_pipe("ner")

# Register every entity label that occurs in the training data.
for _, annotations in TRAIN_DATA:
    for _start, _end, label in annotations.get("entities", []):
        ner.add_label(label)

# Only the NER component should be updated during training.
pipe_exceptions = ["ner"]
other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]

# Train the NER model
with nlp.select_pipes(disable=other_pipes):
    # Initialize the weights from the examples and obtain the optimizer.
    optimizer = nlp.initialize(lambda: examples)
    for itn in range(10):  # Adjust number of iterations
        losses = {}
        # Build the batches under a separate name: minibatch() returns a
        # one-shot generator, so rebinding `examples` to it would leave every
        # epoch after the first with nothing to train on.
        batches = spacy.util.minibatch(examples, size=4)  # Adjust batch size
        for batch in batches:
            nlp.update(batch, drop=0.5, sgd=optimizer, losses=losses)

# Save the trained model
nlp.to_disk("trained_ner_model")

# Reload it from disk to verify that the saved pipeline is usable.
nlp = spacy.load("trained_ner_model")

# Run the reloaded model on a sample message and print the entities it finds.
test_text = "Dear SBI UPI User, ur A/cX0304 debited by Rs91000 on 08Feb24 by (Ref no 403968023837)"
doc = nlp(test_text)
for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)

processed_dataset.csv

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,4 +21,3 @@ receiv upi mandat collect request tata technolog li inr 1500000 log googl pay ap
2121
anurag jain request money googl pay approv request inr 3100 debit ac axi bank,requested
2222
flipkart refund process refund rs 2370 favoru household wrap success transfer credit account oct 04 2023,willcredit
2323
upi mandat success creat toward tata technolog li inr 1500000 fund block ac xx8926 12e5d61d2ac145738241fbf117bb295cokaxi axi bank,blocked
24-
hi,x

runmodel.py

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
import tensorflow as tf
22
from keras.models import load_model
33
from model import preprocess_text # Import your preprocessing function
4-
import pickle
54
from model import processed_texts
5+
import pickle
66
from sklearn.feature_extraction.text import CountVectorizer
77
from sklearn.feature_extraction.text import TfidfVectorizer
88
from sklearn.metrics.pairwise import cosine_similarity
@@ -21,7 +21,7 @@ def custom_sparse_softmax_cross_entropy(labels, logits):
2121

2222
df = pd.read_csv('processed_dataset.csv')
2323

24-
# Load the processed_texts list
24+
# # Load the processed_texts list
2525
# with open('processed_texts.pkl', 'rb') as f:
2626
# processed_texts = pickle.load(f)
2727

@@ -35,15 +35,20 @@ def custom_sparse_softmax_cross_entropy(labels, logits):
3535
# Sample messages to classify. Every entry ends with a comma: without one,
# Python joins adjacent string literals into a single list element.
new_texts = [
    "Dear SBI UPI User, ur A/cX0304 debited by Rs91000 on 08Feb24 by (Ref no 403968023837)",
    "Dear SBI UPI User, ur A/cX0304 credited by Rs91000 on 08Feb24 by (Ref no 403968023837)",
    "Dear SBI UPI User, ur A/cX0429 debited by Rs500 on 04Feb24 by (Ref no 403585759002)",
    "Dear SBI UPI User, ur A/cX0429 credited by Rs500 on 04Feb24 by (Ref no 403585759002)",
    "Dear UPI user A/C X0429 debited by 20.0 on date 22Jan24 trf to Mr Narayan Badat Refno 437652379634. If not u? call 1800111109. -SBI",
    "Dear UPI user A/C X0429 credited by 20.0 on date 22Jan24 trf to Mr Narayan Badat Refno 437652379634. If not u? call 1800111109. -SBI",
    "Dear UPI user A/C X0304 debited by 70.0 on date 22Jan24 trf to TUSHAR KESHARI P Refno 402238694585. If not u? call 1800111109. -SBI",
    "Dear UPI user A/C X0304 credited by 70.0 on date 22Jan24 trf to TUSHAR KESHARI P Refno 402238694585. If not u? call 1800111109. -SBI",
    "UPI Bank account is credited with RS.25.00 on 25-Aug-2023",
    "credit INR refund 100",
    "Refund Processed: Refund of Rs. 237.0 for favoru Household wrap ... is successfully transferred and will be credited to your account by Oct 04, 2023.",
    "UPI mandate has been successfully created towards TATA TECHNOLOGIES LI for INR 15000.00. Funds blocked from A/c no. XX8926. 12e5d61d2ac145738241fbf117bb295c@okaxis - Axis Bank",
    "Dear Player, Rs.10,000* is credited to your RummyTime a/c Ref Id: RT210XX Download the app & make your 1st deposit now - http://gmg.im/bKSfALT&C Apply",
]
4550

46-
similarity_threshold = 0.7
51+
similarity_threshold = 0.5
4752

4853
for text in new_texts:
4954
# Preprocess the new text using spaCy
@@ -69,4 +74,3 @@ def custom_sparse_softmax_cross_entropy(labels, logits):
6974
is_relevant = any(score >= similarity_threshold for score in similarity_scores)
7075
relevance_status = "Relevant" if is_relevant else "Irrelevant"
7176
print(f"Text: {text} | Predicted Label: {predicted_class_labels[0]} | Relevance: {relevance_status}")
72-

texting.py

Lines changed: 178 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,178 @@
1+
import nltk
2+
import tensorflow as tf
3+
from nltk.corpus import stopwords
4+
from nltk.tokenize import word_tokenize
5+
from nltk.stem import PorterStemmer
6+
import pandas as pd
7+
import numpy as np
8+
from keras.layers import Embedding, LSTM, Dense
9+
from keras.models import Sequential
10+
from keras.preprocessing.text import Tokenizer
11+
from keras.preprocessing.sequence import pad_sequences
12+
from sklearn.preprocessing import LabelEncoder
13+
from sklearn.feature_extraction.text import CountVectorizer
14+
from sklearn.metrics.pairwise import cosine_similarity
15+
16+
# Fetch the NLTK resources this script relies on: the stop-word corpus read by
# stopwords.words('english') and the 'punkt' data for word_tokenize.
# NOTE(review): newer NLTK releases may look for 'punkt_tab' instead of
# 'punkt' — confirm against the installed version.
nltk.download('stopwords')
17+
nltk.download('punkt')
18+
19+
# Shared stemmer and lazily-loaded stop-word set, so repeated calls do not
# re-read the corpus or rebuild the stemmer for every message.
_STEMMER = PorterStemmer()
_STOP_WORDS = None


def _english_stop_words():
    """Return NLTK's English stop words as a set, loading the corpus once."""
    global _STOP_WORDS
    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(stopwords.words('english'))
    return _STOP_WORDS


def preprocess_text(text):
    """Normalize one message into a space-separated string of stemmed tokens.

    Steps: lowercase and drop every character that is neither alphanumeric
    nor whitespace, tokenize, remove English stop words, Porter-stem.

    NOTE: punctuation is deleted, not replaced by a space, so pieces joined
    only by punctuation collapse into one token (e.g. "15000.00" becomes
    "1500000"). Existing datasets and trained models were built with this
    behaviour, so it is kept as is.

    Args:
        text: Raw message text.

    Returns:
        The cleaned text with tokens joined by single spaces.
    """
    # Remove punctuation and convert to lowercase
    cleaned = ''.join(
        char.lower() for char in text if char.isalnum() or char.isspace()
    )

    # Tokenization
    tokens = word_tokenize(cleaned)

    # Remove stopwords, then stem what is left
    stop_words = _english_stop_words()
    stems = [_STEMMER.stem(word) for word in tokens if word not in stop_words]

    return ' '.join(stems)
35+
36+
def preprocess_text_list(text_list):
    """Apply preprocess_text to every entry of text_list, preserving order.

    Args:
        text_list: Iterable of raw message strings.

    Returns:
        A new list containing the preprocessed form of each message.
    """
    return list(map(preprocess_text, text_list))
39+
40+
def check_relevance(new_text, dataset_texts, similarity_threshold=0.85):
    """Report whether new_text resembles at least one text in the dataset.

    Both the new text and the dataset texts are run through preprocess_text,
    turned into bag-of-words count vectors, and compared by cosine similarity.

    Args:
        new_text: Raw message to check.
        dataset_texts: Iterable of reference messages.
        similarity_threshold: Minimum cosine similarity that counts as a match.

    Returns:
        True if any dataset text scores at or above the threshold. False
        otherwise, including when the dataset is empty or yields no vocabulary.
    """
    # Preprocess each text in the dataset
    preprocessed_dataset_texts = [preprocess_text(text) for text in dataset_texts]
    if not preprocessed_dataset_texts:
        # Nothing to compare against.
        return False

    # Fit the vocabulary and vectorize the dataset in one pass.
    vectorizer = CountVectorizer()
    try:
        dataset_texts_vectorized = vectorizer.fit_transform(preprocessed_dataset_texts)
    except ValueError:
        # Raised when no usable token is left to build a vocabulary from,
        # e.g. every dataset text was reduced to an empty string.
        return False

    # Vectorize the preprocessed new text with the same vocabulary.
    new_text_vectorized = vectorizer.transform([preprocess_text(new_text)])

    # Similarity between the new text and each text in the dataset
    similarity_scores = cosine_similarity(new_text_vectorized, dataset_texts_vectorized)[0]

    # Check if any text in the dataset is similar to the new text
    return bool((similarity_scores >= similarity_threshold).any())
55+
56+
# Raw sample messages grouped by bank. Their order must match the labels
# given in `data` below.
texts = [
    # Axis Bank
    "Debit INR 500.00 A/c no. XX8926 12-10-23 20:02:19 UPI/P2A/328546155288/ANURAG JAIN SMS BLOCKUPI Cust ID to 01351860002, if not you - Axis Bank",
    "Debit INR 109.00 A/c no. XX8926 27-01-24 11:36:57 UPI/P2M/6321837696198/Add Money to Wallet SMS BLOCKUPI Cust ID to 919951860002, if not you - Axis Bank",
    "INR 5590.00 credited to A/c no. XX8926 on 09-11-23 at 11:59:28 IST. Info- UPI/P2A/334365332111/ANURAG JAIN/Axis Bank - Axis Bank",
    "INR 216.35 credited to A/c no. XX8926 on 06-01-24 at 07:32:16 IST. Info- NEFT/CMS333334641/NEXTBIL. Avl Bal- INR 33478.22 - Axis Bank",
    # UCO Bank
    "A/c XX8360 Debited with Rs. 19.00 on 07-02-2024 by UCO-UPI.Avl Bal Rs.32.98. Report Dispute https://bit.ly/3y39tLP .For feedback https://rb.gy/fdfmda",
    "A/c XX8360 Credited with Rs.6.00 on 07-02-2024 by UCO-UPI.Avl Bal Rs.51.98. Report Dispute https://bit.ly/3y39tLP .For feedback https://rb.gy/fdfmda",
    # SBI
    "Dear UPI user A/C X0429 debited by 20.0 on date 22Jan24 trf to Mr Narayan Badat Refno 437652379634. If not u? call 1800111109. -SBI",
    "Dear SBI UPI User, ur A/cX0429 credited by Rs500 on 04Feb24 by (Ref no 403585759002)",
    # Union Bank
    "A/c *9172 Debited for Rs:50.00 on 11-02-2024 19:44:40 by Mob Bk ref no 444816787760 Avl Bal Rs:1870.55.If not you, Call 1800222243 -Union Bank of India",
    "A/c *9172 Credited for Rs:501.00 on 23-01-2024 20:05:45 by Mob Bk ref no 402347890661 Avl Bal Rs:556.00.Never Share OTP/PIN/CVV-Union Bank of India",
    # Federal Bank
    "Rs 50.00 debited from your A/c using UPI on 03-02-2024 16:44:28 to VPA abcd4321@oksbi - (UPI Ref No 403417856009)-Federal Bank",
    # Kotak Bank
    "Sent Rs.20.00 from Kotak Bank AC X8136 to abcd2003@oksbi on 03-02-24.UPI Ref 403418725300. Not you, kotak.com/fraud",
    "Received Rs.50.00 in your Kotak Bank AC X8136 from abcd4321@oksbi on 03-02-24.UPI Ref:400653974000.",
    # HDFC Bank
    "UPDATE: INR 1,000.00 debited from HDFC Bank XX2002 on 11-DEC-23. Info: FT - Dr - XXXXXXXXXX1498 - ANURAG JAIN. Avl bal:INR 4,891.00",
    "HDFC Bank: Rs. 1.00 credited to a/c XXXXXX2002 on 23-01-24 by a/c linked to VPA 9777777711@fam (UPI Ref No 408888887329).",
    # Jio Payments Bank
    "Your JPB A/c xxxx0956 is credited with Rs.25.00 on 25-Aug-2023. Your current account balance is Rs.25.",
    # Paytm Payments Bank
    "Rs.550 sent to abcd1234-1@okicici from PPBL a/c 91XX8089.UPI Ref:439432479819;Balance:https://m.paytm.me/pbCheckBal; Help:http://m.p-y.tm/care",
    # Extra
    "IRCTC CF has requested money on Google Pay UPI app. On approving, INR 1033.60 will be debited from your A/c - Axis Bank",
    "You have received UPI mandate collect request from TATA TECHNOLOGIES LI for INR 15000.00. Log into Google Pay app to authorize - Axis Bank",
    "ANURAG JAIN has requested money from you on Google Pay. On approving the request, INR 31.00 will be debited from your A/c - Axis Bank",
    "Flipkart Refund Processed: Refund of Rs. 237.0 for favoru Household wrap ... is successfully transferred and will be credited to your account by Oct 04, 2023.",
    "UPI mandate has been successfully created towards TATA TECHNOLOGIES LI for INR 15000.00. Funds blocked from A/c no. XX8926. 12e5d61d2ac145738241fbf117bb295c@okaxis - Axis Bank",
]

# Preprocess the texts
processed_texts = preprocess_text_list(texts)

# Store the cleaned texts with their labels (one label per entry of `texts`).
data = {'text': processed_texts,
        'label': ['debited', 'debited', 'credited', 'credited', 'debited', 'credited', 'debited', 'credited', 'debited', 'credited', 'debited', 'debited', 'credited', 'debited', 'credited', 'credited','debited', 'requested', 'requested', 'requested', 'willcredit', 'blocked']}
df = pd.DataFrame(data)
df.to_csv('processed_dataset.csv', index=False)

# Load the processed dataset from the CSV file
df = pd.read_csv('processed_dataset.csv')

# Extract the 'text' and 'label' columns from the DataFrame.
# The preprocessed column gets its own name so that `texts` keeps the raw
# messages: check_relevance() preprocesses its inputs itself, and handing it
# already-cleaned text would preprocess the dataset twice.
train_texts = df['text'].tolist()
labels = df['label'].tolist()

# Create a Tokenizer with an out-of-vocabulary (OOV) token
tokenizer = Tokenizer(oov_token='<OOV>')
tokenizer.fit_on_texts(train_texts)

# Convert the text data to sequences of integers using the tokenizer
sequences = tokenizer.texts_to_sequences(train_texts)
# Pad the sequences to ensure uniform length for neural network input
padded_sequences = pad_sequences(sequences, padding='post')

# Calculate the number of unique classes in the 'labels' list
num_classes = len(set(labels))

# Create a Sequential model
model = Sequential([
    # Embedding layer for word embeddings. Index 0 is the padding value, so
    # mask it to keep the LSTM from treating padding as real tokens.
    Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=32, mask_zero=True),

    # LSTM layer for processing sequential data
    LSTM(100),

    # Dense output layer for classification
    Dense(num_classes, activation='softmax')
])

# Encode the string labels as integer class ids.
label_encoder = LabelEncoder()
df['encoded_label'] = label_encoder.fit_transform(df['label'])

# Extract the encoded labels
encoded_labels = df['encoded_label'].tolist()

# Convert labels to NumPy array
labels_np = np.array(encoded_labels)

# Integer class ids pair with the sparse categorical cross-entropy loss.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(padded_sequences, labels_np, epochs=300)
146+
147+
# Assuming 'new_texts' is a list of new messages
148+
new_texts = [
149+
"Dear SBI UPI User, ur A/cX0304 debited by Rs91000 on 08Feb24 by (Ref no 403968023837)",
150+
"Dear SBI UPI User, ur A/cX0304 credited by Rs91000 on 08Feb24 by (Ref no 403968023837)",
151+
"Dear SBI UPI User, ur A/cX0429 debited by Rs500 on 04Feb24 by (Ref no 403585759002)",
152+
"Dear SBI UPI User, ur A/cX0429 credited by Rs500 on 04Feb24 by (Ref no 403585759002)",
153+
"Dear UPI user A/C X0429 debited by 20.0 on date 22Jan24 trf to Mr Narayan Badat Refno 437652379634. If not u? call 1800111109. -SBI",
154+
"Dear UPI user A/C X0429 credited by 20.0 on date 22Jan24 trf to Mr Narayan Badat Refno 437652379634. If not u? call 1800111109. -SBI",
155+
"Dear UPI user A/C X0304 debited by 70.0 on date 22Jan24 trf to TUSHAR KESHARI P Refno 402238694585. If not u? call 1800111109. -SBI",
156+
"Dear UPI user A/C X0304 credited by 70.0 on date 22Jan24 trf to TUSHAR KESHARI P Refno 402238694585. If not u? call 1800111109. -SBI",
157+
"UPI Bank account is credited with RS.25.00 on 25-Aug-2023",
158+
"credit INR refund 100",
159+
"Refund Processed: Refund of Rs. 237.0 for favoru Household wrap ... is successfully transferred and will be credited to your account by Oct 04, 2023.",
160+
"UPI mandate has been successfully created towards TATA TECHNOLOGIES LI for INR 15000.00. Funds blocked from A/c no. XX8926. 12e5d61d2ac145738241fbf117bb295c@okaxis - Axis Bank",
161+
"Dear Player, Rs.10,000* is credited to your RummyTime a/c Ref Id: RT210XX Download the app & make your 1st deposit now - http://gmg.im/bKSfALT&C Apply"
162+
]
163+
# Check relevance and print the result
164+
for text in new_texts:
165+
new_sequences = tokenizer.texts_to_sequences([text])
166+
new_padded_sequences = pad_sequences(new_sequences, padding='post')
167+
168+
# Predictions
169+
predictions = model.predict(new_padded_sequences)
170+
predicted_labels = [label for label in predictions.argmax(axis=1)]
171+
172+
# Inverse transform predicted labels to original class labels
173+
predicted_class_labels = label_encoder.inverse_transform(predicted_labels)
174+
175+
# Check relevance and print the result
176+
is_relevant = check_relevance(text, texts)
177+
relevance_status = "Relevant" if is_relevant else "Irrelevant"
178+
print(f"Text: {text} | Predicted Label: {predicted_class_labels[0]} | Relevance: {relevance_status}")

tokenizer.pkl

-23 Bytes
Binary file not shown.

trained_model.keras

-1.56 KB
Binary file not shown.

0 commit comments

Comments
 (0)