Skip to content

Commit b236f14

Browse files
author
Ra Inta
committed
Added synthetic generation of customer database
1 parent 68657f2 commit b236f14

File tree

2 files changed

+224
-0
lines changed

2 files changed

+224
-0
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,223 @@
1+
from mimesis import Person, Address, Business, Payment, Text
2+
3+
from scipy.stats import pareto
4+
import pandas as pd
5+
import numpy as np
6+
7+
import sqlite3
8+
import os
9+
10+
# Note: we don't ever store user passwords as clear text!!!
11+
# To emulate salting and hashing the user passwords:
12+
import hashlib
13+
import uuid
14+
15+
# In practice we should use a dedicated password hashing package,
# such as passlib, but that is out of scope for this script. E.g.:
# import passlib
19+
20+
# Seed every random source we control so repeated runs give the same
# synthetic customers. Seeding numpy alone is not enough: each mimesis
# provider keeps its own random generator, so each one is seeded too.
# NOTE: hashed_passwd() salts with uuid4, so the password hashes will
# still differ from run to run.
np.random.seed(42)  # To make our analysis reproducible

person = Person(seed=42)
address = Address(seed=42)
business = Business(seed=42)
payment = Payment(seed=42)
text = Text(seed=42)
27+
28+
##################################################
29+
### Define a couple of convenience functions:
30+
##################################################
31+
32+
33+
def hashed_passwd(passwd):
    """Return a salted SHA-512 hex digest of the string ``passwd``.

    This only emulates the salting and hashing of a password so that the
    synthetic data never contains plaintext passwords. It must *never* be
    used in a production setting; use a dedicated package such as passlib
    if you need to store passwords securely.

    A fresh random salt (a UUID4 hex string) is generated on every call
    and appended to the password before hashing. The salt is not
    returned, so two calls with the same password give different digests.
    """
    salt_bytes = uuid.uuid4().hex.encode('utf-8')
    hasher = hashlib.sha512()
    hasher.update(passwd.encode('utf-8'))
    hasher.update(salt_bytes)
    return hasher.hexdigest()
42+
43+
44+
def account_balance():
    """Draw one account balance from a Pareto distribution.

    Balances are expected to be distributed like other income-type
    quantities. The shape (power exponent) of 1.161 is chosen to
    replicate the 80-20 rule.
    """
    shape = 1.161
    return pareto.rvs(shape)
50+
51+
52+
def generate_sales(df, age='age', account_balance='account_balance',
                   marketing_level='marketing_level', min_age=25,
                   max_age=35, noise_ampl=10):
    """Generate a noisy sales figure for every row of ``df``.

    The result is the sum of:
      * an age term, ``0.01*|age - 30|**2.5 + age``;
      * a campaign term, ``50*marketing_level``, applied only to ages
        inside the bracket the campaign targeted (``min_age`` to
        ``max_age``; rows exactly on a boundary get half weight);
      * a balance term, ``2*account_balance``;
      * Gaussian noise (mean 0.01, standard deviation 1.7) scaled by
        ``noise_ampl``.

    ``age``, ``account_balance`` and ``marketing_level`` are the names
    of the columns of ``df`` to read.
    """
    # Draw the noise first so the random state is consumed up front.
    noise = noise_ampl*np.random.normal(0.01, 1.7, df.shape[0])

    ages = df[age]
    # Difference of two step functions: 1 inside the bracket, 0 outside.
    in_bracket = (np.heaviside(ages - min_age, 0.5)
                  - np.heaviside(ages - max_age, 0.5))

    age_term = 0.01*pow(np.abs(ages - 30), 2.5) + ages
    campaign_term = 50*df[marketing_level]*in_bracket
    balance_term = 2*df[account_balance]
    return age_term + campaign_term + balance_term + noise
61+
62+
63+
##################################################
64+
65+
##################################################
66+
### Generate a DataFrame of user information
67+
##################################################
68+
# Generate 10,000 rows, one per synthetic customer, holding:
# user_id, first_name, last_name, gender, email, password_hashed,
# address, age, credit_card_num, credit_card_exp, security_answer,
# account_balance, marketing_level

user_df = pd.DataFrame(
    [[user_id, person.name(), person.surname(), person.gender(),
      person.email(), hashed_passwd(person.password()),
      address.address(), person.age(),
      payment.credit_card_number(),
      payment.credit_card_expiration_date(), text.word(),
      account_balance(), np.random.randint(1, 11)]
     for user_id in range(10000)],
    columns=["user_id", "first_name", "last_name",
             "gender", "email", "password_hashed", "address",
             "age", "credit_card_num", "credit_card_exp",
             "security_answer", "account_balance",
             "marketing_level"])

# Generate sales, based on a noisy linear model, then shift them so the
# smallest value is zero and scale them down by a factor of 40.
raw_sales = generate_sales(user_df)
user_df['sales'] = (raw_sales - raw_sales.min()) / 40

print("Summary statistics on numerical data:")
print(user_df.describe())
94+
95+
##################################################
96+
97+
##################################################
98+
### Scuff the data up a bit!
99+
##################################################
100+
# We'll 'disappear' 10% of some columns, and create
101+
# some dupes
102+
103+
104+
def makeDataMissing(df, col_name, frac=0.1):
    """Return a copy of column ``col_name`` with some values set to NaN.

    A fraction ``frac`` of the rows of the dataframe ``df`` is picked at
    random with its ``sample`` method, and the entries of the column at
    those row labels are replaced by np.nan.

    ``df`` itself is left untouched; only the copy is modified.
    """
    chosen_rows = df.sample(frac=frac).index
    blanked = df[col_name].copy()
    blanked.loc[chosen_rows] = np.nan
    return blanked
115+
116+
117+
def makeDupes(df, frac=0.1):
    """Return ``df`` with a random fraction of its own rows appended.

    A fraction ``frac`` of the rows is sampled at random and added to
    the end of a new DataFrame; ``df`` itself is not modified. The
    duplicated rows keep their original index labels, so the result has
    repeated labels in its index.
    """
    rnd_idx = df.sample(frac=frac).index
    # DataFrame.append was removed in pandas 2.0; pd.concat gives the
    # same result and works on old and new pandas versions alike.
    return pd.concat([df, df.loc[rnd_idx, :]])
122+
123+
# Ten percent of customers weren't comfortable with volunteering their
# gender, and others couldn't be bothered with the address. Each column
# gets its own independent random selection of rows to blank out.
for sparse_col in ('gender', 'address'):
    user_df[sparse_col] = makeDataMissing(user_df, sparse_col)
128+
129+
# We'll apply duplicates later.
130+
131+
##################################################
132+
133+
##################################################
134+
### Perform some Exploratory Data Analysis
135+
##################################################
136+
137+
# Look at a few random rows, then at the numeric summary.
# NOTE: sample() without a random_state advances numpy's global random
# state, so this line also influences the later random sampling calls.
user_df.sample(5)

user_df.describe()

# Note the median balance is 1.8, while the mean is 5.3
# Recall we generated a heavily skewed distribution!

# We designed it according to the famous "80-20 rule"
# The top twenty percent own 80% of the balances.
# Let's test it. Take the 80th percentile:
critical80 = np.quantile(user_df["account_balance"], 0.8)
## 4.013269256450965

# Total balance held by accounts above that threshold, versus everyone:
balances = user_df["account_balance"]
the_few = balances[balances > critical80].sum()
tot_balance = balances.sum()

the_few/tot_balance
## 0.7298469832819879
# So here, the top 20% 'only' have 73% of the account balance

# Plot the Pareto distribution (on a log scale, as it is so skewed)
user_df['log_account_balance'] = np.log10(user_df['account_balance'])
user_df['log_account_balance'].hist(bins=20)

# Some limitations of mimesis
# If you want realistic distributions of certain numerical variables
# then you should simulate populations yourself. E.g.:

user_df["age"].plot.kde()

# The way ages are generated are not exactly samples of any real population!
# This will depend on the underlying demographic dynamics.

# Pairwise relationships between the numeric columns, first with pandas
# and then with seaborn (coloured by marketing level).
eda_cols = ['age', 'account_balance', 'marketing_level', 'sales']

from pandas.plotting import scatter_matrix
scatter_matrix(user_df[eda_cols])

import seaborn as sns

sns.pairplot(user_df[eda_cols], hue='marketing_level')
179+
180+
181+
##################################################
182+
183+
184+
##################################################
185+
### Export data to SQL, Excel and print summary
186+
##################################################
187+
188+
# Report the 80th-percentile balance and the share of the total balance
# held by the accounts above it.
summary_msg = ("Account balance for top 20% of users: {} \n"
               "Fraction of total balance owned by top 20%: {}%\n")
print(summary_msg.format(critical80, 100*the_few/tot_balance))

# Generate user info, along with 10% dupes. Only the account columns
# are kept here; the campaign columns are exported separately.
account_cols = ['user_id', 'first_name', 'last_name', 'email',
                'password_hashed', 'gender', 'address', 'age',
                'credit_card_num', 'credit_card_exp',
                'security_answer', 'account_balance']
main_user_df = makeDupes(user_df[account_cols])
197+
198+
199+
def df_sql_write(df, file_name="test.sql", table_name="test_table"):
    """Write the pandas dataframe ``df`` to a fresh sqlite3 database file.

    Any existing file at ``file_name`` is deleted first, then ``df`` is
    stored (without its index) in a table called ``table_name``.

    This is modified directly from the pandas documentation on
    connecting to databases:
    https://pandas.pydata.org/pandas-docs/stable/io.html#reading-tables
    """
    # Start from a clean slate. Removing directly (rather than checking
    # for existence first) avoids a race between the check and the delete.
    try:
        os.remove(file_name)
    except FileNotFoundError:
        pass

    sql_db = sqlite3.connect(file_name)
    try:
        df.to_sql(name=table_name, con=sql_db, index=False)
    finally:
        # Always release the connection, even if the write fails.
        sql_db.close()
209+
210+
211+
# Write out user info to SQL database (in random order, so the
# duplicated rows are not all sitting at the end of the table)
shuffled_accounts = main_user_df.sample(frac=1.0)
df_sql_write(shuffled_accounts, 'user_data.sql', table_name='user_accounts')

# Write out campaign data to Excel spreadsheet (also shuffled)
campaign_cols = ['user_id', 'marketing_level', 'sales']
campaign_df = user_df[campaign_cols].sample(frac=1.0)

campaign_df.to_excel('advertising_campaign.xlsx', index=False)
218+
219+
## Extract DB thus:
220+
#with sqlite3.connect('user_data.sql') as cnx:
221+
# df1 = pd.read_sql_query("SELECT * FROM user_accounts", cnx)
222+
#
223+
#cnx.close()

README.md

+1
Original file line numberDiff line numberDiff line change
@@ -42,3 +42,4 @@ This is a collection of short Python scripts to solve and automate tasks and sim
4242
22 | [**E-Certificate Writer**](https://github.com/fnplus/Python-scripts-collection/tree/master/E-Certificate-Writer) | Useful for writing names of participants on E-Certificates using Python3. Returns the certificates in PDF format. | img2pdf==0.3.3, numpy==1.17.2, pandas==0.25.1, Pillow==6.2.0, python-dateutil==2.8.0, pytz==2019.3, six==1.12.0
4343
23 | [**YouTubeDownloader**](https://github.com/fnplus/Python-scripts-collection/tree/master/YouTubeDownloader) | You can download YouTube videos with the URLs provided. | pytube==9.5.3 |
4444
24 | [**SendEmail**](https://github.com/fnplus/Python-scripts-collection/tree/master/SendEmail) | Send email using Python. Prompts the user for their email address and sends to address input. Handles user's password securely using `getpass`. | None |
45+
25 | [**GenerateSyntheticCustomerDatabase**](https://github.com/fnplus/Python-scripts-collection/tree/master/GenerateSyntheticCustomerDatabase) | Generate a mock (synthetic) dataset of arbitrary length with 'customers'. This includes their names, credit card details and hashed passwords. This is all synthetic, so no data breach here! | mimesis, scipy, pandas, numpy, seaborn |

0 commit comments

Comments
 (0)