
Feature text generation extensions #214


Open

wants to merge 32 commits into base: master from feature_text_generation_extensions
32 commits
aebbb06
wip
ronanstokes-db Mar 26, 2023
5f0ffc0
merge from origin
ronanstokes-db Mar 27, 2023
1eda552
wip
ronanstokes-db Apr 7, 2023
7de014c
Merge branch 'master' of https://github.com/databrickslabs/dbldatagen
ronanstokes-db Apr 7, 2023
c859475
Merge branch 'master' of https://github.com/databrickslabs/dbldatagen
ronanstokes-db Apr 9, 2023
3094e96
Merge branch 'master' of https://github.com/databrickslabs/dbldatagen
ronanstokes-db Apr 13, 2023
3bf6e9b
Merge branch 'master' of https://github.com/databrickslabs/dbldatagen
ronanstokes-db Apr 17, 2023
caaff18
Merge branch 'master' of https://github.com/databrickslabs/dbldatagen
ronanstokes-db Apr 18, 2023
87d5c50
Merge branch 'master' of https://github.com/databrickslabs/dbldatagen
ronanstokes-db Apr 18, 2023
4536794
Merge branch 'master' of https://github.com/databrickslabs/dbldatagen
ronanstokes-db Apr 19, 2023
eba6193
Merge branch 'master' of https://github.com/databrickslabs/dbldatagen
ronanstokes-db Apr 21, 2023
c4fdc3b
wip
ronanstokes-db May 9, 2023
46969e7
wip
ronanstokes-db May 9, 2023
cf9e909
wip
ronanstokes-db May 9, 2023
ba929ac
wip
ronanstokes-db May 9, 2023
03245c3
wip
ronanstokes-db May 9, 2023
4b94465
wip
ronanstokes-db May 9, 2023
ff1574c
wip
ronanstokes-db May 9, 2023
97616ed
wip
ronanstokes-db May 9, 2023
064ab0a
wip
ronanstokes-db May 9, 2023
613bf9c
wip
ronanstokes-db May 9, 2023
1b90eb3
wip
ronanstokes-db May 9, 2023
65f4d54
updated build version
ronanstokes-db May 9, 2023
c81cc66
updated build version
ronanstokes-db May 9, 2023
794d88d
wip
ronanstokes-db May 9, 2023
c148246
Merge branch 'master' into feature_text_generation_extensions
ronanstokes-db May 9, 2023
9ff9cfe
wip
ronanstokes-db May 9, 2023
6111560
wip
ronanstokes-db May 9, 2023
16284e0
added GenerateStr and html output to data analyzer
ronanstokes-db Jun 1, 2023
df4dd55
wip
ronanstokes-db Jun 1, 2023
0a93d0f
wip
ronanstokes-db Jun 1, 2023
6e41061
Merge branch 'master' into feature_text_generation_extensions
ronanstokes-db Aug 26, 2023
3 changes: 2 additions & 1 deletion dbldatagen/__init__.py
@@ -42,12 +42,13 @@
from .spark_singleton import SparkSingleton
from .text_generators import TemplateGenerator, ILText, TextGenerator
from .text_generator_plugins import PyfuncText, PyfuncTextFactory, FakerTextFactory, fakerText
from .text_generatestring import GenerateString
from .html_utils import HtmlUtils

__all__ = ["data_generator", "data_analyzer", "schema_parser", "daterange", "nrange",
"column_generation_spec", "utils", "function_builder",
"spark_singleton", "text_generators", "datarange", "datagen_constants",
"text_generator_plugins", "html_utils"
"text_generator_plugins", "html_utils", "text_generatestring"
]


5 changes: 4 additions & 1 deletion dbldatagen/column_generation_spec.py
@@ -1142,7 +1142,7 @@ def _applyPrefixSuffixExpressions(self, cprefix, csuffix, new_def):
new_def = concat(new_def.astype(IntegerType()), lit(text_separator), lit(csuffix))
return new_def

def _applyTextGenerationExpression(self, new_def, use_pandas_optimizations):
def _applyTextGenerationExpression(self, new_def, use_pandas_optimizations=True):
"""Apply text generation expression to column expression

:param new_def : column definition being created
@@ -1153,6 +1153,9 @@ def _applyTextGenerationExpression(self, new_def, use_pandas_optimizations):
# while it seems like this could use a shared instance, this does not work if initialized
# in a class method
tg = self.textGenerator

new_def = tg.prepareBaseValue(new_def)

if use_pandas_optimizations:
self.executionHistory.append(f".. text generation via pandas scalar udf `{tg}`")
u_value_from_generator = pandas_udf(tg.pandasGenerateText,
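Note: the new `prepareBaseValue` hook lets each text generator rewrite the base column expression before it is handed to the generation UDF. A minimal sketch of that wiring follows - it is not the library's exact code, and the UDF construction and string return type here are simplified assumptions:

from pyspark.sql.functions import pandas_udf
from pyspark.sql.types import StringType

def apply_text_generator(tg, base_col):
    # let the generator rewrite the base expression, e.g. abs(hash(...)) for GenerateString
    prepared = tg.prepareBaseValue(base_col)
    # vectorized path: wrap the generator's pandas implementation in a scalar pandas UDF
    u_generate = pandas_udf(tg.pandasGenerateText, returnType=StringType())
    return u_generate(prepared)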
45 changes: 31 additions & 14 deletions dbldatagen/data_analyzer.py
@@ -14,6 +14,7 @@
import pyspark.sql.functions as F

from .utils import strip_margins
from .html_utils import HtmlUtils
from .spark_singleton import SparkSingleton


@@ -359,7 +360,7 @@ def _scriptDataGeneratorCode(cls, schema, dataSummary=None, sourceDf=None, suppr
return "\n".join(stmts)

@classmethod
def scriptDataGeneratorFromSchema(cls, schema, suppressOutput=False, name=None):
def scriptDataGeneratorFromSchema(cls, schema, suppressOutput=False, name=None, asHtml=False):
"""
Generate outline data generator code from an existing dataframe

@@ -373,16 +374,24 @@ def scriptDataGeneratorFromSchema(cls, schema, suppressOutput=False, name=None):
The dataframe to be analyzed is the dataframe passed to the constructor of the DataAnalyzer object.

:param schema: Pyspark schema - i.e manually constructed StructType or return value from `dataframe.schema`
:param suppressOutput: Suppress printing of generated code if True
:param suppressOutput: Suppress printing of generated code if True. If asHtml is True, output is suppressed
:param name: Optional name for data generator
:return: String containing skeleton code
:param asHtml: If True, generates HTML output suitable for use with the notebook ``displayHtml`` function.
:return: String containing skeleton code (in Html form if `asHtml` is True)

"""
return cls._scriptDataGeneratorCode(schema,
suppressOutput=suppressOutput,
name=name)
omit_output_printing = suppressOutput or asHtml

generated_code = cls._scriptDataGeneratorCode(schema,
suppressOutput=omit_output_printing,
name=name)

if asHtml:
generated_code = HtmlUtils.formatCodeAsHtml(generated_code)

return generated_code

def scriptDataGeneratorFromData(self, suppressOutput=False, name=None):
def scriptDataGeneratorFromData(self, suppressOutput=False, name=None, asHtml=False):
"""
Generate outline data generator code from an existing dataframe

@@ -395,14 +404,17 @@ def scriptDataGeneratorFromData(self, suppressOutput=False, name=None):

The dataframe to be analyzed is the Spark dataframe passed to the constructor of the DataAnalyzer object

:param suppressOutput: Suppress printing of generated code if True
:param suppressOutput: Suppress printing of generated code if True. If asHtml is True, output is suppressed
:param name: Optional name for data generator
:return: String containing skeleton code
:param asHtml: If True, generates HTML output suitable for use with the notebook ``displayHtml`` function.
:return: String containing skeleton code (in Html form if `asHtml` is True)

"""
assert self._df is not None
assert type(self._df) is ssql.DataFrame, "sourceDf must be a valid Pyspark dataframe"

omit_output_printing = suppressOutput or asHtml

if self._dataSummary is None:
df_summary = self.summarizeToDF()

@@ -411,8 +423,13 @@
row_key_pairs = row.asDict()
self._dataSummary[row['measure_']] = row_key_pairs

return self._scriptDataGeneratorCode(self._df.schema,
suppressOutput=suppressOutput,
name=name,
dataSummary=self._dataSummary,
sourceDf=self._df)
generated_code = self._scriptDataGeneratorCode(self._df.schema,
suppressOutput=omit_output_printing,
name=name,
dataSummary=self._dataSummary,
sourceDf=self._df)

if asHtml:
generated_code = HtmlUtils.formatCodeAsHtml(generated_code)

return generated_code
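Note: a hedged usage sketch for the new `asHtml` option - the `DataAnalyzer` constructor argument names, `source_df`, and the notebook `displayHTML` helper are assumptions for illustration, not part of this diff:

import dbldatagen as dg

analyzer = dg.DataAnalyzer(df=source_df, sparkSession=spark)   # argument names assumed

# with asHtml=True the skeleton code is returned as HTML and console printing is suppressed
html_code = analyzer.scriptDataGeneratorFromData(asHtml=True)
displayHTML(html_code)   # notebook display helper

# the schema-based classmethod behaves the same way
html_from_schema = dg.DataAnalyzer.scriptDataGeneratorFromSchema(source_df.schema, asHtml=True)
displayHTML(html_from_schema)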
181 changes: 181 additions & 0 deletions dbldatagen/text_generatestring.py
@@ -0,0 +1,181 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""
This file defines the RandomStr text generator
"""

import math
import random

import numpy as np
import pandas as pd

import pyspark.sql.functions as F

from .text_generators import TextGenerator
from .text_generators import _DIGITS_ZERO, _LETTERS_UPPER, _LETTERS_LOWER, _LETTERS_ALL


class GenerateString(TextGenerator): # lgtm [py/missing-equals]
"""This class handles the generation of string text of specified length drawn from alphanumeric characters.

The set of chars to be used can be modified based on the parameters

This will generate deterministic strings chosen from the pool of characters `0-9`, `a-z`, `A-Z`, or from a
custom character range if specified.

:param length: length of string. Can be integer, or tuple (min, max)
:param leadingAlpha: If True, the leading character will be in the range a-zA-Z
:param allUpper: If True, any alpha chars will be uppercase
:param allLower: If True, any alpha chars will be lowercase
:param allAlpha: If True, all chars will be non numeric
:param customChars: If supplied, specifies a list of chars to use, or string of chars to use.

This class will generate deterministic strings varying in length from the minimum to the maximum length.
The characters chosen will be in the range `0-9`, `a-z`, `A-Z` unless modified using the `leadingAlpha`,
`allUpper`, `allLower`, `allAlpha` or `customChars` parameters.

The modifiers can be combined - for example GenerateString((1, 5), leadingAlpha=True, allUpper=True)

When the length is specified as a tuple, it will generate variable-length strings with lengths from the lower bound
to the upper bound inclusive.

The strings are generated deterministically so that they can be used for predictable primary and foreign keys.

If the column definition that includes this text generator specifies `random`, then string generation will be
driven by a seeded random number generator, following the rules for random numbers and random seeds used by other columns.

If `random` is False, then the string will be generated from a pseudo-random sequence derived purely from the
SQL hash of the `baseColumns`.

.. note::
If customChars are specified, then the flag `allAlpha` will only remove digits.

"""

def __init__(self, length, leadingAlpha=True, allUpper=False, allLower=False, allAlpha=False, customChars=None):
super().__init__()

assert not customChars or isinstance(customChars, (list, str)), \
"`customChars` should be list of characters or string containing custom chars"

assert not allUpper or not allLower, "allUpper and allLower cannot both be True"

if isinstance(customChars, str):
assert len(customChars) > 0, "string of customChars must be non-empty"
elif isinstance(customChars, list):
assert all(isinstance(c, str) for c in customChars)
assert len(customChars) > 0, "list of customChars must be non-empty"

self.leadingAlpha = leadingAlpha
self.allUpper = allUpper
self.allLower = allLower
self.allAlpha = allAlpha

# determine base alphabet
if isinstance(customChars, list):
charAlphabet = set("".join(customChars))
elif isinstance(customChars, str):
charAlphabet = set(customChars)
else:
charAlphabet = set(_LETTERS_ALL).union(set(_DIGITS_ZERO))

if allLower:
charAlphabet = charAlphabet.difference(set(_LETTERS_UPPER))
elif allUpper:
charAlphabet = charAlphabet.difference(set(_LETTERS_LOWER))

if allAlpha:
charAlphabet = charAlphabet.difference(set(_DIGITS_ZERO))

self._charAlphabet = np.array(list(charAlphabet))

if leadingAlpha:
self._firstCharAlphabet = np.array(list(charAlphabet.difference(set(_DIGITS_ZERO))))
else:
self._firstCharAlphabet = self._charAlphabet

# compute string lengths
if isinstance(length, int):
self._minLength = length
self._maxLength = length
elif isinstance(length, tuple):
assert len(length) == 2, "only 2 elements can be specified if length is a tuple"
assert all(isinstance(el, int) for el in length)
self._minLength, self._maxLength = length
else:
raise ValueError("`length` must be an integer or a tuple of two integers")

# compute bounds for generated strings
bounds = [len(self._firstCharAlphabet)]
for ix in range(1, self._maxLength):
bounds.append(len(self._charAlphabet))

self._bounds = bounds

def __repr__(self):
return f"GenerateString(length={(self._minLength, self._maxLength)}, leadingAlpha={self.leadingAlpha})"

def make_variable_length_mask(self, v, lengths):
""" given 2-d array of dimensions[r, c] and lengths of dimensions[r]

generate mask for each row where col_index[r,c] < lengths[r]
"""
print(v.shape, lengths.shape)
assert v.shape[0] == lengths.shape[0], "values and lengths must agree on dimension 0"
_, c_ix = np.indices(v.shape)

return (c_ix.T < lengths.T).T

def mk_bounds(self, v, minLength, maxLength):
rng = np.random.default_rng(42)
v_bounds = np.full(v.shape[0], (maxLength - minLength) + 1)
return rng.integers(v_bounds) + minLength

def prepareBaseValue(self, baseDef):
""" Prepare the base value for processing

:param baseDef: base value expression
:return: modified base value expression

For generate-string processing, we use the SQL expression `abs(hash(baseDef))`

This will ensure that even if there are multiple base values, only a single value is passed to the UDF
"""
return F.abs(F.hash(baseDef))

def pandasGenerateText(self, v):
""" entry point to use for pandas udfs

Implementation uses vectorized implementation of process

:param v: Pandas series of values passed as base values
:return: Pandas series of expanded templates

"""
# placeholders is numpy array used to hold results

rnds = np.full((v.shape[0], self._maxLength), len(self._charAlphabet), dtype=np.object_)

rng = self.getNPRandomGenerator()
rnds2 = rng.integers(rnds)

placeholders = np.full((v.shape[0], self._maxLength), '', dtype=np.object_)

lengths = v.to_numpy() % (self._maxLength - self._minLength) + self._minLength

v1 = np.full((v.shape[0], self._maxLength), -1)

placeholder_mask = self.make_variable_length_mask(placeholders, lengths)
masked_placeholders = np.ma.MaskedArray(placeholders, mask=placeholder_mask)

masked_placeholders[~placeholder_mask] = self._charAlphabet[rnds2[~placeholder_mask]]

output = pd.Series(list(placeholders))

# join strings in placeholders
results = output.apply(lambda placeholder_items: "".join([str(elem) for elem in placeholder_items]))

return results
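Note: a usage sketch for `GenerateString`. It assumes the generator is supplied through the same `text=` option used for the existing text generators; the column names, row count, and the `spark` session are illustrative only:

import dbldatagen as dg

spec = (dg.DataGenerator(spark, name="key_example", rows=100000)
        .withColumn("device_key", "string",
                    text=dg.GenerateString((8, 12), leadingAlpha=True, allUpper=True))
        .withColumn("serial", "string",
                    text=dg.GenerateString(6, allAlpha=True), random=True)
        )

df = spec.build()

The vectorized generation in `pandasGenerateText` builds a [rows, maxLength] array of candidate characters and then masks each row down to its target length. A simplified, self-contained numpy sketch of that masking idea (it ignores the separate leading-character alphabet and uses its own length arithmetic):

import numpy as np

alphabet = np.array(list("ABCDEF0123456789"))
seeds = np.array([101, 2002, 30003])                  # stand-ins for hashed base values
min_len, max_len = 3, 6

lengths = seeds % (max_len - min_len + 1) + min_len   # per-row target lengths: [4, 5, 6]
idx = np.random.default_rng(42).integers(0, len(alphabet), size=(len(seeds), max_len))
chars = alphabet[idx]                                 # candidate characters, max_len per row

_, col_ix = np.indices(chars.shape)
mask = col_ix < lengths[:, None]                      # keep only the first lengths[r] columns
results = ["".join(row[m]) for row, m in zip(chars, mask)]
print(results)                                        # three strings of lengths 4, 5 and 6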
10 changes: 10 additions & 0 deletions dbldatagen/text_generators.py
@@ -161,6 +161,16 @@ def getAsTupleOrElse(v, defaultValue, valueName):

return defaultValue

def prepareBaseValue(self, baseDef):
""" Prepare the base value for processing

:param baseDef: base value expression
:return: base value expression unchanged

Derived classes are expected to override this if needed
"""
return baseDef


class TemplateGenerator(TextGenerator): # lgtm [py/missing-equals]
"""This class handles the generation of text from templates
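Note: the base-class `prepareBaseValue` returns the expression unchanged, so existing generators keep their behaviour; derived generators can override it. A hypothetical subclass, written only to illustrate the override point (the class name and token format are invented, not part of this PR):

import pyspark.sql.functions as F

from dbldatagen import TextGenerator

class HashSeededText(TextGenerator):
    def prepareBaseValue(self, baseDef):
        # collapse the base column(s) into a single non-negative numeric seed
        return F.abs(F.hash(baseDef))

    def pandasGenerateText(self, v):
        # trivially render each seed as a short deterministic token
        return v.apply(lambda x: f"tok_{int(x) % 0xFFFF:04x}")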
2 changes: 2 additions & 0 deletions docs/utils/mk_quick_index.py
@@ -33,6 +33,8 @@
"grouping": "main classes"},
"text_generator_plugins.py": {"briefDesc": "Text data generation",
"grouping": "main classes"},
"text_generatestring.py": {"briefDesc": "Text data generation",
"grouping": "main classes"},
"data_analyzer.py": {"briefDesc": "Analysis of existing data",
"grouping": "main classes"},
"function_builder.py": {"briefDesc": "Internal utilities to create functions related to weights",
7 changes: 7 additions & 0 deletions tests/test_generation_from_data.py
@@ -71,6 +71,9 @@ def test_code_generation1(self, generation_spec, setupLogging):
ast_tree = ast.parse(generatedCode)
assert ast_tree is not None

generatedCode2 = analyzer.scriptDataGeneratorFromData(asHtml=True)
assert generatedCode in generatedCode2

def test_code_generation_from_schema(self, generation_spec, setupLogging):
df_source_data = generation_spec.build()
generatedCode = dg.DataAnalyzer.scriptDataGeneratorFromSchema(df_source_data.schema)
@@ -82,6 +85,10 @@ def test_code_generation_from_schema(self, generation_spec, setupLogging):
ast_tree = ast.parse(generatedCode)
assert ast_tree is not None

generatedCode2 = dg.DataAnalyzer.scriptDataGeneratorFromSchema(df_source_data.schema, asHtml=True)

assert generatedCode in generatedCode2

def test_summarize(self, testLogger, generation_spec):
testLogger.info("Building test data")
