
Feature text generation extensions #214


Open

wants to merge 32 commits into base: master from feature_text_generation_extensions
32 commits
aebbb06
wip
ronanstokes-db Mar 26, 2023
5f0ffc0
merge from origin
ronanstokes-db Mar 27, 2023
1eda552
wip
ronanstokes-db Apr 7, 2023
7de014c
Merge branch 'master' of https://github.com/databrickslabs/dbldatagen
ronanstokes-db Apr 7, 2023
c859475
Merge branch 'master' of https://github.com/databrickslabs/dbldatagen
ronanstokes-db Apr 9, 2023
3094e96
Merge branch 'master' of https://github.com/databrickslabs/dbldatagen
ronanstokes-db Apr 13, 2023
3bf6e9b
Merge branch 'master' of https://github.com/databrickslabs/dbldatagen
ronanstokes-db Apr 17, 2023
caaff18
Merge branch 'master' of https://github.com/databrickslabs/dbldatagen
ronanstokes-db Apr 18, 2023
87d5c50
Merge branch 'master' of https://github.com/databrickslabs/dbldatagen
ronanstokes-db Apr 18, 2023
4536794
Merge branch 'master' of https://github.com/databrickslabs/dbldatagen
ronanstokes-db Apr 19, 2023
eba6193
Merge branch 'master' of https://github.com/databrickslabs/dbldatagen
ronanstokes-db Apr 21, 2023
c4fdc3b
wip
ronanstokes-db May 9, 2023
46969e7
wip
ronanstokes-db May 9, 2023
cf9e909
wip
ronanstokes-db May 9, 2023
ba929ac
wip
ronanstokes-db May 9, 2023
03245c3
wip
ronanstokes-db May 9, 2023
4b94465
wip
ronanstokes-db May 9, 2023
ff1574c
wip
ronanstokes-db May 9, 2023
97616ed
wip
ronanstokes-db May 9, 2023
064ab0a
wip
ronanstokes-db May 9, 2023
613bf9c
wip
ronanstokes-db May 9, 2023
1b90eb3
wip
ronanstokes-db May 9, 2023
65f4d54
updated build version
ronanstokes-db May 9, 2023
c81cc66
updated build version
ronanstokes-db May 9, 2023
794d88d
wip
ronanstokes-db May 9, 2023
c148246
Merge branch 'master' into feature_text_generation_extensions
ronanstokes-db May 9, 2023
9ff9cfe
wip
ronanstokes-db May 9, 2023
6111560
wip
ronanstokes-db May 9, 2023
16284e0
added GenerateStr and html output to data analyzer
ronanstokes-db Jun 1, 2023
df4dd55
wip
ronanstokes-db Jun 1, 2023
0a93d0f
wip
ronanstokes-db Jun 1, 2023
6e41061
Merge branch 'master' into feature_text_generation_extensions
ronanstokes-db Aug 26, 2023
3 changes: 2 additions & 1 deletion dbldatagen/__init__.py
@@ -42,12 +42,13 @@
from .spark_singleton import SparkSingleton
from .text_generators import TemplateGenerator, ILText, TextGenerator
from .text_generator_plugins import PyfuncText, PyfuncTextFactory, FakerTextFactory, fakerText
from .text_generatestring import GenerateString
from .html_utils import HtmlUtils

__all__ = ["data_generator", "data_analyzer", "schema_parser", "daterange", "nrange",
"column_generation_spec", "utils", "function_builder",
"spark_singleton", "text_generators", "datarange", "datagen_constants",
"text_generator_plugins", "html_utils"
"text_generator_plugins", "html_utils", "text_generatestring"
]


5 changes: 4 additions & 1 deletion dbldatagen/column_generation_spec.py
@@ -1142,7 +1142,7 @@ def _applyPrefixSuffixExpressions(self, cprefix, csuffix, new_def):
new_def = concat(new_def.astype(IntegerType()), lit(text_separator), lit(csuffix))
return new_def

def _applyTextGenerationExpression(self, new_def, use_pandas_optimizations):
def _applyTextGenerationExpression(self, new_def, use_pandas_optimizations=True):
"""Apply text generation expression to column expression

:param new_def : column definition being created
@@ -1153,6 +1153,9 @@ def _applyTextGenerationExpression(self, new_def, use_pandas_optimizations):
# while it seems like this could use a shared instance, this does not work if initialized
# in a class method
tg = self.textGenerator

new_def = tg.prepareBaseValue(new_def)

if use_pandas_optimizations:
self.executionHistory.append(f".. text generation via pandas scalar udf `{tg}`")
u_value_from_generator = pandas_udf(tg.pandasGenerateText,
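Note: the new `prepareBaseValue` hook lets each text generator rewrite the base column expression before it is handed to the generation UDF. A minimal sketch of that wiring follows - it is not the library's exact code, and the UDF construction and string return type here are simplified assumptions:

from pyspark.sql.functions import pandas_udf
from pyspark.sql.types import StringType

def apply_text_generator(tg, base_col):
    # let the generator rewrite the base expression, e.g. abs(hash(...)) for GenerateString
    prepared = tg.prepareBaseValue(base_col)
    # vectorized path: wrap the generator's pandas implementation in a scalar pandas UDF
    u_generate = pandas_udf(tg.pandasGenerateText, returnType=StringType())
    return u_generate(prepared)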
45 changes: 31 additions & 14 deletions dbldatagen/data_analyzer.py
@@ -14,6 +14,7 @@
import pyspark.sql.functions as F

from .utils import strip_margins
from .html_utils import HtmlUtils
from .spark_singleton import SparkSingleton


@@ -359,7 +360,7 @@ def _scriptDataGeneratorCode(cls, schema, dataSummary=None, sourceDf=None, suppr
return "\n".join(stmts)

@classmethod
def scriptDataGeneratorFromSchema(cls, schema, suppressOutput=False, name=None):
def scriptDataGeneratorFromSchema(cls, schema, suppressOutput=False, name=None, asHtml=False):
"""
Generate outline data generator code from an existing dataframe

@@ -373,16 +374,24 @@ def scriptDataGeneratorFromSchema(cls, schema, suppressOutput=False, name=None):
The dataframe to be analyzed is the dataframe passed to the constructor of the DataAnalyzer object.

:param schema: Pyspark schema - i.e manually constructed StructType or return value from `dataframe.schema`
:param suppressOutput: Suppress printing of generated code if True
:param suppressOutput: Suppress printing of generated code if True. If asHtml is True, output is suppressed
:param name: Optional name for data generator
:return: String containing skeleton code
:param asHtml: If True, generates HTML output suitable for use with the notebook ``displayHtml`` function.
:return: String containing skeleton code (in Html form if `asHtml` is True)

"""
return cls._scriptDataGeneratorCode(schema,
suppressOutput=suppressOutput,
name=name)
omit_output_printing = suppressOutput or asHtml

generated_code = cls._scriptDataGeneratorCode(schema,
suppressOutput=omit_output_printing,
name=name)

if asHtml:
generated_code = HtmlUtils.formatCodeAsHtml(generated_code)

return generated_code

def scriptDataGeneratorFromData(self, suppressOutput=False, name=None):
def scriptDataGeneratorFromData(self, suppressOutput=False, name=None, asHtml=False):
"""
Generate outline data generator code from an existing dataframe

@@ -395,14 +404,17 @@ def scriptDataGeneratorFromData(self, suppressOutput=False, name=None):

The dataframe to be analyzed is the Spark dataframe passed to the constructor of the DataAnalyzer object

:param suppressOutput: Suppress printing of generated code if True
:param suppressOutput: Suppress printing of generated code if True. If asHtml is True, output is suppressed
:param name: Optional name for data generator
:return: String containing skeleton code
:param asHtml: If True, generates HTML output suitable for use with the notebook ``displayHtml`` function.
:return: String containing skeleton code (in Html form if `asHtml` is True)

"""
assert self._df is not None
assert type(self._df) is ssql.DataFrame, "sourceDf must be a valid Pyspark dataframe"

omit_output_printing = suppressOutput or asHtml

if self._dataSummary is None:
df_summary = self.summarizeToDF()

@@ -411,8 +423,13 @@
row_key_pairs = row.asDict()
self._dataSummary[row['measure_']] = row_key_pairs

return self._scriptDataGeneratorCode(self._df.schema,
suppressOutput=suppressOutput,
name=name,
dataSummary=self._dataSummary,
sourceDf=self._df)
generated_code = self._scriptDataGeneratorCode(self._df.schema,
suppressOutput=omit_output_printing,
name=name,
dataSummary=self._dataSummary,
sourceDf=self._df)

if asHtml:
generated_code = HtmlUtils.formatCodeAsHtml(generated_code)

return generated_code
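Note: a hedged usage sketch for the new `asHtml` option - the `DataAnalyzer` constructor argument names, `source_df`, and the notebook `displayHTML` helper are assumptions for illustration, not part of this diff:

import dbldatagen as dg

analyzer = dg.DataAnalyzer(df=source_df, sparkSession=spark)   # argument names assumed

# with asHtml=True the skeleton code is returned as HTML and console printing is suppressed
html_code = analyzer.scriptDataGeneratorFromData(asHtml=True)
displayHTML(html_code)   # notebook display helper

# the schema-based classmethod behaves the same way
html_from_schema = dg.DataAnalyzer.scriptDataGeneratorFromSchema(source_df.schema, asHtml=True)
displayHTML(html_from_schema)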
181 changes: 181 additions & 0 deletions dbldatagen/text_generatestring.py
@@ -0,0 +1,181 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""
This file defines the RandomStr text generator
"""

import math
import random

import numpy as np
import pandas as pd

import pyspark.sql.functions as F

from .text_generators import TextGenerator
from .text_generators import _DIGITS_ZERO, _LETTERS_UPPER, _LETTERS_LOWER, _LETTERS_ALL


class GenerateString(TextGenerator): # lgtm [py/missing-equals]
"""This class handles the generation of string text of specified length drawn from alphanumeric characters.

The set of chars to be used can be modified based on the parameters

This will generate deterministic strings chosen from the pool of characters `0-9`, `a-z`, `A-Z`, or from a
custom character range if specified.

:param length: length of string. Can be integer, or tuple (min, max)
:param leadingAlpha: If True, the leading character will be in the range a-zA-Z
:param allUpper: If True, any alpha chars will be uppercase
:param allLower: If True, any alpha chars will be lowercase
:param allAlpha: If True, all chars will be non numeric
:param customChars: If supplied, specifies a list of chars to use, or string of chars to use.

This class will generate deterministic strings varying in length from the minimum to the maximum length.
The characters chosen will be in the range `0-9`, `a-z`, `A-Z` unless modified using the `leadingAlpha`,
`allUpper`, `allLower`, `allAlpha` or `customChars` parameters.

The modifiers can be combined - for example GenerateString((1, 5), leadingAlpha=True, allUpper=True)

When the length is specified as a tuple, it will generate variable-length strings with lengths from the lower bound
to the upper bound inclusive.

The strings are generated deterministically so that they can be used for predictable primary and foreign keys.

If the column definition that includes this text generator specifies `random`, then string generation will be
driven by a seeded random number generator, following the rules for random numbers and random seeds used by other columns.

If `random` is False, then the string will be generated from a pseudo-random sequence derived purely from the
SQL hash of the `baseColumns`.

.. note::
If customChars are specified, then the flag `allAlpha` will only remove digits.

"""

def __init__(self, length, leadingAlpha=True, allUpper=False, allLower=False, allAlpha=False, customChars=None):
super().__init__()

assert not customChars or isinstance(customChars, (list, str)), \
"`customChars` should be list of characters or string containing custom chars"

assert not allUpper or not allLower, "allUpper and allLower cannot both be True"

if isinstance(customChars, str):
assert len(customChars) > 0, "string of customChars must be non-empty"
elif isinstance(customChars, list):
assert all(isinstance(c, str) for c in customChars)
assert len(customChars) > 0, "list of customChars must be non-empty"

self.leadingAlpha = leadingAlpha
self.allUpper = allUpper
self.allLower = allLower
self.allAlpha = allAlpha

# determine base alphabet
if isinstance(customChars, list):
charAlphabet = set("".join(customChars))
elif isinstance(customChars, str):
charAlphabet = set(customChars)
else:
charAlphabet = set(_LETTERS_ALL).union(set(_DIGITS_ZERO))

if allLower:
charAlphabet = charAlphabet.difference(set(_LETTERS_UPPER))
elif allUpper:
charAlphabet = charAlphabet.difference(set(_LETTERS_LOWER))

if allAlpha:
charAlphabet = charAlphabet.difference(set(_DIGITS_ZERO))

self._charAlphabet = np.array(list(charAlphabet))

if leadingAlpha:
self._firstCharAlphabet = np.array(list(charAlphabet.difference(set(_DIGITS_ZERO))))
else:
self._firstCharAlphabet = self._charAlphabet

# compute string lengths
if isinstance(length, int):
self._minLength = length
self._maxLength = length
elif isinstance(length, tuple):
assert len(length) == 2, "only 2 elements can be specified if length is a tuple"
assert all(isinstance(el, int) for el in length)
self._minLength, self._maxLength = length
else:
raise ValueError("`length` must be an integer or a tuple of two integers")

# compute bounds for generated strings
bounds = [len(self._firstCharAlphabet)]
for ix in range(1, self._maxLength):
bounds.append(len(self._charAlphabet))

self._bounds = bounds

def __repr__(self):
return f"GenerateString(length={(self._minLength, self._maxLength)}, leadingAlpha={self.leadingAlpha})"

def make_variable_length_mask(self, v, lengths):
""" given 2-d array of dimensions[r, c] and lengths of dimensions[r]

generate mask for each row where col_index[r,c] < lengths[r]
"""
print(v.shape, lengths.shape)
assert v.shape[0] == lengths.shape[0], "values and lengths must agree on dimension 0"
_, c_ix = np.indices(v.shape)

return (c_ix.T < lengths.T).T

def mk_bounds(self, v, minLength, maxLength):
rng = np.random.default_rng(42)
v_bounds = np.full(v.shape[0], (maxLength - minLength) + 1)
return rng.integers(v_bounds) + minLength

def prepareBaseValue(self, baseDef):
""" Prepare the base value for processing

:param baseDef: base value expression
:return: modified base value expression

For generate-string processing, we use the SQL expression `abs(hash(baseDef))`

This will ensure that even if there are multiple base values, only a single value is passed to the UDF
"""
return F.abs(F.hash(baseDef))

def pandasGenerateText(self, v):
""" entry point to use for pandas udfs

Implementation uses vectorized implementation of process

:param v: Pandas series of values passed as base values
:return: Pandas series of expanded templates

"""
# placeholders is numpy array used to hold results

rnds = np.full((v.shape[0], self._maxLength), len(self._charAlphabet), dtype=np.object_)

rng = self.getNPRandomGenerator()
rnds2 = rng.integers(rnds)

placeholders = np.full((v.shape[0], self._maxLength), '', dtype=np.object_)

lengths = v.to_numpy() % (self._maxLength - self._minLength) + self._minLength

v1 = np.full((v.shape[0], self._maxLength), -1)

placeholder_mask = self.make_variable_length_mask(placeholders, lengths)
masked_placeholders = np.ma.MaskedArray(placeholders, mask=placeholder_mask)

masked_placeholders[~placeholder_mask] = self._charAlphabet[rnds2[~placeholder_mask]]

output = pd.Series(list(placeholders))

# join strings in placeholders
results = output.apply(lambda placeholder_items: "".join([str(elem) for elem in placeholder_items]))

return results
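Note: a usage sketch for `GenerateString`. It assumes the generator is supplied through the same `text=` option used for the existing text generators; the column names, row count, and the `spark` session are illustrative only:

import dbldatagen as dg

spec = (dg.DataGenerator(spark, name="key_example", rows=100000)
        .withColumn("device_key", "string",
                    text=dg.GenerateString((8, 12), leadingAlpha=True, allUpper=True))
        .withColumn("serial", "string",
                    text=dg.GenerateString(6, allAlpha=True), random=True)
        )

df = spec.build()

The vectorized generation in `pandasGenerateText` builds a [rows, maxLength] array of candidate characters and then masks each row down to its target length. A simplified, self-contained numpy sketch of that masking idea (it ignores the separate leading-character alphabet and uses its own length arithmetic):

import numpy as np

alphabet = np.array(list("ABCDEF0123456789"))
seeds = np.array([101, 2002, 30003])                  # stand-ins for hashed base values
min_len, max_len = 3, 6

lengths = seeds % (max_len - min_len + 1) + min_len   # per-row target lengths: [4, 5, 6]
idx = np.random.default_rng(42).integers(0, len(alphabet), size=(len(seeds), max_len))
chars = alphabet[idx]                                 # candidate characters, max_len per row

_, col_ix = np.indices(chars.shape)
mask = col_ix < lengths[:, None]                      # keep only the first lengths[r] columns
results = ["".join(row[m]) for row, m in zip(chars, mask)]
print(results)                                        # three strings of lengths 4, 5 and 6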
10 changes: 10 additions & 0 deletions dbldatagen/text_generators.py
@@ -161,6 +161,16 @@ def getAsTupleOrElse(v, defaultValue, valueName):

return defaultValue

def prepareBaseValue(self, baseDef):
""" Prepare the base value for processing

:param baseDef: base value expression
:return: base value expression unchanged

Derived classes are expected to override this if needed
"""
return baseDef


class TemplateGenerator(TextGenerator): # lgtm [py/missing-equals]
"""This class handles the generation of text from templates
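Note: the base-class `prepareBaseValue` returns the expression unchanged, so existing generators keep their behaviour; derived generators can override it. A hypothetical subclass, written only to illustrate the override point (the class name and token format are invented, not part of this PR):

import pyspark.sql.functions as F

from dbldatagen import TextGenerator

class HashSeededText(TextGenerator):
    def prepareBaseValue(self, baseDef):
        # collapse the base column(s) into a single non-negative numeric seed
        return F.abs(F.hash(baseDef))

    def pandasGenerateText(self, v):
        # trivially render each seed as a short deterministic token
        return v.apply(lambda x: f"tok_{int(x) % 0xFFFF:04x}")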
2 changes: 2 additions & 0 deletions docs/utils/mk_quick_index.py
@@ -33,6 +33,8 @@
"grouping": "main classes"},
"text_generator_plugins.py": {"briefDesc": "Text data generation",
"grouping": "main classes"},
"text_generatestring.py": {"briefDesc": "Text data generation",
"grouping": "main classes"},
"data_analyzer.py": {"briefDesc": "Analysis of existing data",
"grouping": "main classes"},
"function_builder.py": {"briefDesc": "Internal utilities to create functions related to weights",
7 changes: 7 additions & 0 deletions tests/test_generation_from_data.py
@@ -71,6 +71,9 @@ def test_code_generation1(self, generation_spec, setupLogging):
ast_tree = ast.parse(generatedCode)
assert ast_tree is not None

generatedCode2 = analyzer.scriptDataGeneratorFromData(asHtml=True)
assert generatedCode in generatedCode2

def test_code_generation_from_schema(self, generation_spec, setupLogging):
df_source_data = generation_spec.build()
generatedCode = dg.DataAnalyzer.scriptDataGeneratorFromSchema(df_source_data.schema)
@@ -82,6 +85,10 @@ def test_code_generation_from_schema(self, generation_spec, setupLogging):
ast_tree = ast.parse(generatedCode)
assert ast_tree is not None

generatedCode2 = dg.DataAnalyzer.scriptDataGeneratorFromSchema(df_source_data.schema, asHtml=True)

assert generatedCode in generatedCode2

def test_summarize(self, testLogger, generation_spec):
testLogger.info("Building test data")
