From 71ee2f1ec13686cef9b84d62c7aa929cdd4a2924 Mon Sep 17 00:00:00 2001
From: David Coles <coles.david@gmail.com>
Date: Mon, 27 May 2019 22:43:04 -0700
Subject: [PATCH] Add RegEx support using RE2

Introduces 5 new built-in methods to the stdlib:
- `regexFullMatch(pattern, str)` -- Full match regex
- `regexPartialMatch(pattern, str)` -- Partial match regex
- `regexQuoteMeta(str)` -- Escape regex metacharacters
- `regexReplace(str, pattern, to)` -- Replace single occurrence using regex
- `regexGlobalReplace(str, pattern, to)` -- Replace globally using regex

Since both `regexFullMatch` and `regexPartialMatch` can perform captures
these functions return a "match" object upon match or `null` otherwise.
For example:

```
$ ./jsonnet -e 'std.regexFullMatch("h(?P<mid>.*)o", "hello")'
{
   "captures": [
      "ell"
   ],
   "namedCaptures": {
      "mid": "ell"
   },
   "string": "hello"
}
```

Introduces a dependency on RE2 2019-06-01.
Builds tested using make, CMake and Bazel on Ubuntu 18.04.
---
 .travis.yml                                   |   4 +-
 CMakeLists.txt                                |  46 +++++-
 ...ists.txt.in => GoogleTestCMakeLists.txt.in |   0
 Makefile                                      |   6 +-
 RE2CMakeLists.txt.in                          |  18 +++
 WORKSPACE                                     |  10 +-
 core/BUILD                                    |   1 +
 core/CMakeLists.txt                           |   8 +-
 core/desugarer.cpp                            |   7 +-
 core/vm.cpp                                   | 133 ++++++++++++++++++
 test_suite/stdlib.jsonnet                     |  70 +++++++++
 11 files changed, 291 insertions(+), 12 deletions(-)
 rename CMakeLists.txt.in => GoogleTestCMakeLists.txt.in (100%)
 create mode 100644 RE2CMakeLists.txt.in

diff --git a/.travis.yml b/.travis.yml
index 361501ea8..f9c567360 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -7,6 +7,7 @@ matrix:
         apt:
           packages:
             - g++-4.9
+            - libre2-dev
           sources: &sources
             - llvm-toolchain-precise-3.8
             - ubuntu-toolchain-r-test
@@ -16,6 +17,7 @@ matrix:
         apt:
           packages:
             - clang-3.8
+            - libre2-dev
           sources: *sources
     - os: osx
       osx_image: xcode8
@@ -49,4 +51,4 @@ notifications:
     channels:
       - "chat.freenode.net#jsonnet"
     template:
-      - "%{repository}/%{branch} (%{commit} - %{author}): %{message}"
\ No newline at end of file
+      - "%{repository}/%{branch} (%{commit} - %{author}): %{message}"
diff --git a/CMakeLists.txt b/CMakeLists.txt
index cdd5367eb..c195c09f9 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -29,8 +29,50 @@ set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${GLOBAL_OUTPUT_PATH})
 set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${GLOBAL_OUTPUT_PATH})
 set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${GLOBAL_OUTPUT_PATH})
 
+# Include external RE2 project. This runs a CMake sub-script
+# (RE2CMakeLists.txt.in) that downloads RE2 source. It's then built as part
+# of the jsonnet project. The conventional way of handling CMake dependencies is
+# to use a find_package script, which finds and installs the library from
+# known locations on the local machine. Downloading the library ourselves
+# allows us to pin to a specific version and makes things easier for users
+# who don't have package managers.
+
+# Generate and download RE2 project.
+set(RE2_DIR ${GLOBAL_OUTPUT_PATH}/re2-download)
+configure_file(RE2CMakeLists.txt.in ${RE2_DIR}/CMakeLists.txt)
+execute_process(COMMAND ${CMAKE_COMMAND} -G "${CMAKE_GENERATOR}" .
+        RESULT_VARIABLE result
+        WORKING_DIRECTORY ${RE2_DIR}
+)
+if(result)
+    message(FATAL_ERROR "RE2 download failed: ${result}")
+endif()
+
+# Run the RE2 download project (fetches the sources; RE2 is compiled later).
+execute_process(COMMAND ${CMAKE_COMMAND} --build .
+    RESULT_VARIABLE result
+    WORKING_DIRECTORY ${RE2_DIR})
+if(result)
+    message(FATAL_ERROR "Build step for re2 failed: ${result}")
+endif()
+
+# Add RE2 directly to our build. This defines
+# the re2 target.
+add_subdirectory(${GLOBAL_OUTPUT_PATH}/re2-src
+                 ${GLOBAL_OUTPUT_PATH}/re2-build)
+
+# Include RE2 headers ("re2/re2.h" lives directly under the RE2 source root).
+include_directories("${RE2_SOURCE_DIR}")
+
+# Allow linking into a shared library.
+set_property(TARGET re2 PROPERTY POSITION_INDEPENDENT_CODE ON)
+
+# RE2 requires pthreads. $<BOOL:...> keeps the genex valid when UNIX is unset.
+set_property(TARGET re2 APPEND PROPERTY INTERFACE_COMPILE_OPTIONS "$<$<BOOL:${UNIX}>:-pthread>")
+set_property(TARGET re2 APPEND PROPERTY INTERFACE_LINK_LIBRARIES "$<$<BOOL:${UNIX}>:-pthread>")
+
 # Include external googletest project. This runs a CMake sub-script
-# (CMakeLists.txt.in) that downloads googletest source. It's then built as part
+# (GoogleTestCMakeLists.txt.in) that downloads googletest source. It's then built as part
 # of the jsonnet project. The conventional way of handling CMake dependencies is
 # to use a find_package script, which finds and installs the library from
 # known locations on the local machine. Downloading the library ourselves
@@ -41,7 +83,7 @@ if (BUILD_TESTS AND NOT USE_SYSTEM_GTEST)
 
     # Generate and download googletest project.
     set(GOOGLETEST_DIR ${GLOBAL_OUTPUT_PATH}/googletest-download)
-    configure_file(CMakeLists.txt.in ${GOOGLETEST_DIR}/CMakeLists.txt)
+    configure_file(GoogleTestCMakeLists.txt.in ${GOOGLETEST_DIR}/CMakeLists.txt)
     execute_process(COMMAND ${CMAKE_COMMAND} -G "${CMAKE_GENERATOR}" .
             RESULT_VARIABLE result
             WORKING_DIRECTORY ${GOOGLETEST_DIR}
diff --git a/CMakeLists.txt.in b/GoogleTestCMakeLists.txt.in
similarity index 100%
rename from CMakeLists.txt.in
rename to GoogleTestCMakeLists.txt.in
diff --git a/Makefile b/Makefile
index 97702eb5a..3db6e4542 100644
--- a/Makefile
+++ b/Makefile
@@ -34,7 +34,7 @@ CFLAGS ?= -g $(OPT) -Wall -Wextra -pedantic -std=c99 -fPIC -Iinclude
 MAKEDEPENDFLAGS ?= -Iinclude -Ithird_party/md5 -Ithird_party/json
 EMCXXFLAGS = $(CXXFLAGS) -g0 -Os --memory-init-file 0 -s DISABLE_EXCEPTION_CATCHING=0 -s OUTLINING_LIMIT=10000 -s RESERVED_FUNCTION_POINTERS=20 -s ASSERTIONS=1 -s ALLOW_MEMORY_GROWTH=1
 EMCFLAGS = $(CFLAGS) --memory-init-file 0 -s DISABLE_EXCEPTION_CATCHING=0 -s ASSERTIONS=1 -s ALLOW_MEMORY_GROWTH=1
-LDFLAGS ?=
+LDFLAGS ?= -lre2
 
 SHARED_LDFLAGS ?= -shared
 
@@ -121,11 +121,11 @@ core/desugarer.cpp: core/std.jsonnet.h
 
 # Commandline executable.
 jsonnet: cmd/jsonnet.cpp cmd/utils.cpp $(LIB_OBJ)
-	$(CXX) $(CXXFLAGS) $(LDFLAGS) $< cmd/utils.cpp $(LIB_SRC:.cpp=.o) -o $@
+	$(CXX) $(CXXFLAGS) $< cmd/utils.cpp $(LIB_SRC:.cpp=.o) -o $@ $(LDFLAGS)
 
 # Commandline executable (reformatter).
 jsonnetfmt: cmd/jsonnetfmt.cpp cmd/utils.cpp $(LIB_OBJ)
-	$(CXX) $(CXXFLAGS) $(LDFLAGS) $< cmd/utils.cpp $(LIB_SRC:.cpp=.o) -o $@
+	$(CXX) $(CXXFLAGS) $< cmd/utils.cpp $(LIB_SRC:.cpp=.o) -o $@ $(LDFLAGS)
 
 # C binding.
 libjsonnet.so: $(LIB_OBJ)
diff --git a/RE2CMakeLists.txt.in b/RE2CMakeLists.txt.in
new file mode 100644
index 000000000..808b92359
--- /dev/null
+++ b/RE2CMakeLists.txt.in
@@ -0,0 +1,18 @@
+# CMake script run at generation time. This must be separate from the main
+# CMakeLists.txt file to allow downloading the RE2 sources at generation
+# time.
+cmake_minimum_required(VERSION 2.8.2)
+
+project(re2-download NONE)
+
+include(ExternalProject)
+ExternalProject_Add(re2
+		GIT_REPOSITORY    https://github.com/google/re2.git
+		GIT_TAG           2019-06-01
+		SOURCE_DIR        "${GLOBAL_OUTPUT_PATH}/re2-src"
+		BINARY_DIR        "${GLOBAL_OUTPUT_PATH}/re2-build"
+		CONFIGURE_COMMAND ""
+		BUILD_COMMAND     ""
+		INSTALL_COMMAND   ""
+		TEST_COMMAND      ""
+)
diff --git a/WORKSPACE b/WORKSPACE
index c4dc885b4..36653bbba 100644
--- a/WORKSPACE
+++ b/WORKSPACE
@@ -12,11 +12,19 @@ git_repository(
 git_repository(
     name = "com_google_googletest",
     remote = "https://github.com/google/googletest.git",
-    # If updating googletest version, also update CMakeLists.txt.in.
+    # If updating googletest version, also update GoogleTestCMakeLists.txt.in.
     commit = "2fe3bd994b3189899d93f1d5a881e725e046fdc2", # release: release-1.8.1
     shallow_since = "1535728917 -0400",
 )
 
+git_repository(
+    name = "com_googlesource_code_re2",
+    remote = "https://github.com/google/re2.git",
+    # If updating RE2 version, also update RE2CMakeLists.txt.in.
+    commit = "0c95bcce2f1f0f071a786ca2c42384b211b8caba", # release: 2019-06-01
+    shallow_since = "1558525654 +0000",
+)
+
 load("//tools/build_defs:python_repo.bzl", "python_interpreter")
 
 python_interpreter(name = "default_python")
diff --git a/core/BUILD b/core/BUILD
index 6a0e9cb50..b76feb953 100644
--- a/core/BUILD
+++ b/core/BUILD
@@ -36,6 +36,7 @@ cc_library(
         "//stdlib:std",
         "//third_party/json",
         "//third_party/md5:libmd5",
+        "@com_googlesource_code_re2//:re2",
     ],
 )
 
diff --git a/core/CMakeLists.txt b/core/CMakeLists.txt
index e877015cc..fa9bdcf13 100644
--- a/core/CMakeLists.txt
+++ b/core/CMakeLists.txt
@@ -29,8 +29,8 @@ set(LIBJSONNET_SOURCE
     vm.cpp)
 
 add_library(libjsonnet SHARED ${LIBJSONNET_HEADERS} ${LIBJSONNET_SOURCE})
-add_dependencies(libjsonnet md5 stdlib)
-target_link_libraries(libjsonnet md5)
+add_dependencies(libjsonnet md5 re2 stdlib)
+target_link_libraries(libjsonnet md5 re2)
 
 # CMake prepends CMAKE_SHARED_LIBRARY_PREFIX to shared libraries, so without
 # this step the output would be |liblibjsonnet|.
@@ -45,8 +45,8 @@ install(TARGETS libjsonnet
 
 # Static library for jsonnet command-line tool.
 add_library(libjsonnet_static STATIC ${LIBJSONNET_SOURCE})
-add_dependencies(libjsonnet_static md5 stdlib)
-target_link_libraries(libjsonnet_static md5)
+add_dependencies(libjsonnet_static md5 re2 stdlib)
+target_link_libraries(libjsonnet_static md5 re2)
 set_target_properties(libjsonnet_static PROPERTIES OUTPUT_NAME jsonnet)
 install(TARGETS libjsonnet_static DESTINATION "${CMAKE_INSTALL_LIBDIR}")
 
diff --git a/core/desugarer.cpp b/core/desugarer.cpp
index d49a73b05..37eab5de0 100644
--- a/core/desugarer.cpp
+++ b/core/desugarer.cpp
@@ -34,7 +34,7 @@ struct BuiltinDecl {
     std::vector<UString> params;
 };
 
-static unsigned long max_builtin = 37;
+static unsigned long max_builtin = 42;
 BuiltinDecl jsonnet_builtin_decl(unsigned long builtin)
 {
     switch (builtin) {
@@ -76,6 +76,11 @@ BuiltinDecl jsonnet_builtin_decl(unsigned long builtin)
         case 35: return {U"parseJson", {U"str"}};
         case 36: return {U"encodeUTF8", {U"str"}};
         case 37: return {U"decodeUTF8", {U"arr"}};
+        case 38: return {U"regexFullMatch", {U"pattern", U"str"}};
+        case 39: return {U"regexPartialMatch", {U"pattern", U"str"}};
+        case 40: return {U"regexQuoteMeta", {U"str"}};
+        case 41: return {U"regexReplace", {U"str", U"pattern", U"to"}};
+        case 42: return {U"regexGlobalReplace", {U"str", U"pattern", U"to"}};
         default:
             std::cerr << "INTERNAL ERROR: Unrecognized builtin function: " << builtin << std::endl;
             std::abort();
diff --git a/core/vm.cpp b/core/vm.cpp
index 0cf06fa94..94cda61af 100644
--- a/core/vm.cpp
+++ b/core/vm.cpp
@@ -26,6 +26,7 @@ limitations under the License.
 #include "json.hpp"
 #include "md5.h"
 #include "parser.h"
+#include "re2/re2.h"
 #include "state.h"
 #include "static_analysis.h"
 #include "string_utils.h"
@@ -35,6 +36,10 @@ using json = nlohmann::json;
 
 namespace {
 
+static const Fodder EF;  // Empty fodder.
+
+static const LocationRange E;  // Empty.
+
 /** Turn a path e.g. "/a/b/c" into a dir, e.g. "/a/b/".  If there is no path returns "".
  */
 std::string dir_name(const std::string &path)
@@ -881,6 +886,11 @@ class Interpreter {
         builtins["parseJson"] = &Interpreter::builtinParseJson;
         builtins["encodeUTF8"] = &Interpreter::builtinEncodeUTF8;
         builtins["decodeUTF8"] = &Interpreter::builtinDecodeUTF8;
+        builtins["regexFullMatch"] = &Interpreter::builtinRegexFullMatch;
+        builtins["regexPartialMatch"] = &Interpreter::builtinRegexPartialMatch;
+        builtins["regexQuoteMeta"] = &Interpreter::builtinRegexQuoteMeta;
+        builtins["regexReplace"] = &Interpreter::builtinRegexReplace;
+        builtins["regexGlobalReplace"] = &Interpreter::builtinRegexGlobalReplace;
     }
 
     /** Clean up the heap, stack, stash, and builtin function ASTs. */
@@ -1373,6 +1383,129 @@ class Interpreter {
         return decodeUTF8();
     }
 
+    const AST *regexMatch(const std::string &pattern, const std::string &string, bool full)
+    {
+        RE2 re(pattern, RE2::CannedOptions::Quiet);
+        if (!re.ok()) {
+            std::stringstream ss;
+            ss << "Invalid regex '" << re.pattern() << "': " << re.error();
+            throw makeError(stack.top().location, ss.str());
+        }
+
+        int num_groups = re.NumberOfCapturingGroups();
+
+        std::vector<std::string> rcaptures(num_groups);
+        std::vector<RE2::Arg> rargv(num_groups);
+        std::vector<const RE2::Arg*> rargs(num_groups);
+        for (int i = 0; i < num_groups; ++i) {
+            rargs[i] = &rargv[i];
+            rargv[i] = &rcaptures[i];
+        }
+
+        if (full ? RE2::FullMatchN(string, re, rargs.data(), num_groups)
+                 : RE2::PartialMatchN(string, re, rargs.data(), num_groups)) {
+            std::map<const Identifier *, HeapSimpleObject::Field> fields;
+
+            const Identifier *fid = alloc->makeIdentifier(U"string");
+            fields[fid].hide = ObjectField::VISIBLE;
+            fields[fid].body = alloc->make<LiteralString>(E, EF, decode_utf8(string), LiteralString::DOUBLE, "", "");
+
+            fid = alloc->makeIdentifier(U"captures");
+            fields[fid].hide = ObjectField::VISIBLE;
+            std::vector<Array::Element> captures;
+            for (int i = 0; i < num_groups; ++i) {
+                captures.push_back(Array::Element(
+                    alloc->make<LiteralString>(E, EF, decode_utf8(rcaptures[i]), LiteralString::DOUBLE, "", ""),
+                    EF));
+            }
+            fields[fid].body = alloc->make<Array>(E, EF, captures, false, EF);
+
+            fid = alloc->makeIdentifier(U"namedCaptures");
+            fields[fid].hide = ObjectField::VISIBLE;
+            DesugaredObject::Fields named_captures;
+            const std::map<std::string, int> &named_groups = re.NamedCapturingGroups();
+            for (auto it = named_groups.cbegin(); it != named_groups.cend(); ++it) {
+                named_captures.push_back(DesugaredObject::Field(
+                    ObjectField::VISIBLE,
+                    alloc->make<LiteralString>(E, EF, decode_utf8(it->first), LiteralString::DOUBLE, "", ""),
+                    alloc->make<LiteralString>(E, EF, decode_utf8(rcaptures[it->second-1]), LiteralString::DOUBLE, "", "")));
+            }
+            fields[fid].body = alloc->make<DesugaredObject>(E, ASTs{}, named_captures);
+
+            scratch = makeObject<HeapSimpleObject>(BindingFrame{}, fields, ASTs{});
+        } else {
+            scratch = makeNull();
+        }
+        return nullptr;
+    }
+
+    const AST *builtinRegexFullMatch(const LocationRange &loc, const std::vector<Value> &args)
+    {
+        validateBuiltinArgs(loc, "regexFullMatch", args, {Value::STRING, Value::STRING});
+
+        std::string pattern = encode_utf8(static_cast<HeapString *>(args[0].v.h)->value);
+        std::string string = encode_utf8(static_cast<HeapString *>(args[1].v.h)->value);
+
+        return regexMatch(pattern, string, true);
+    }
+
+    const AST *builtinRegexPartialMatch(const LocationRange &loc, const std::vector<Value> &args)
+    {
+        validateBuiltinArgs(loc, "regexPartialMatch", args, {Value::STRING, Value::STRING});
+
+        std::string pattern = encode_utf8(static_cast<HeapString *>(args[0].v.h)->value);
+        std::string string = encode_utf8(static_cast<HeapString *>(args[1].v.h)->value);
+
+        return regexMatch(pattern, string, false);
+    }
+
+    const AST *builtinRegexQuoteMeta(const LocationRange &loc, const std::vector<Value> &args)
+    {
+        validateBuiltinArgs(loc, "regexQuoteMeta", args, {Value::STRING});
+        scratch = makeString(decode_utf8(RE2::QuoteMeta(encode_utf8(static_cast<HeapString *>(args[0].v.h)->value))));
+        return nullptr;
+    }
+
+    const AST *builtinRegexReplace(const LocationRange &loc, const std::vector<Value> &args)
+    {
+        validateBuiltinArgs(loc, "regexReplace", args, {Value::STRING, Value::STRING, Value::STRING});
+
+        std::string string = encode_utf8(static_cast<HeapString *>(args[0].v.h)->value);
+        std::string pattern = encode_utf8(static_cast<HeapString *>(args[1].v.h)->value);
+        std::string replace = encode_utf8(static_cast<HeapString *>(args[2].v.h)->value);
+
+        RE2 re(pattern, RE2::CannedOptions::Quiet);
+        if(!re.ok()) {
+            std::stringstream ss;
+            ss << "Invalid regex '" << re.pattern() << "': " << re.error();
+            throw makeError(stack.top().location, ss.str());
+        }
+
+        RE2::Replace(&string, re, replace);
+        scratch = makeString(decode_utf8(string));
+        return nullptr;
+    }
+
+    const AST *builtinRegexGlobalReplace(const LocationRange &loc, const std::vector<Value> &args)
+    {
+        validateBuiltinArgs(loc, "regexGlobalReplace", args, {Value::STRING, Value::STRING, Value::STRING});
+
+        std::string string = encode_utf8(static_cast<HeapString *>(args[0].v.h)->value);
+        std::string pattern = encode_utf8(static_cast<HeapString *>(args[1].v.h)->value);
+        std::string replace = encode_utf8(static_cast<HeapString *>(args[2].v.h)->value);
+
+        RE2 re(pattern, RE2::CannedOptions::Quiet);
+        if(!re.ok()) {
+            std::stringstream ss;
+            ss << "Invalid regex '" << re.pattern() << "': " << re.error();
+            throw makeError(stack.top().location, ss.str());
+        }
+
+        RE2::GlobalReplace(&string, re, replace);
+        scratch = makeString(decode_utf8(string));
+        return nullptr;
+    }
+
     const AST *builtinTrace(const LocationRange &loc, const std::vector<Value> &args)
     {
         if(args[0].t != Value::STRING) {
diff --git a/test_suite/stdlib.jsonnet b/test_suite/stdlib.jsonnet
index 7ba684790..e2f6ed812 100644
--- a/test_suite/stdlib.jsonnet
+++ b/test_suite/stdlib.jsonnet
@@ -925,4 +925,74 @@ std.assertEqual(std.decodeUTF8([65 + 1 - 1]), 'A') &&
 std.assertEqual(std.decodeUTF8([90, 97, 197, 188, 195, 179, 197, 130, 196, 135, 32, 103, 196, 153, 197, 155, 108, 196, 133, 32, 106, 97, 197, 186, 197, 132]), 'Zażółć gęślą jaźń') &&
 std.assertEqual(std.decodeUTF8([240, 159, 152, 131]), '😃') &&
 
+std.assertEqual(std.regexFullMatch(@'e', 'hello'), null) &&
+
+std.assertEqual(
+  std.regexFullMatch(@'h.*o', 'hello'),
+  {
+    string: 'hello',
+    captures: [],
+    namedCaptures: {},
+  }
+) &&
+
+std.assertEqual(
+  std.regexFullMatch(@'h(.*)o', 'hello'),
+  {
+    string: 'hello',
+    captures: ['ell'],
+    namedCaptures: {},
+  }
+) &&
+
+std.assertEqual(
+  std.regexFullMatch(@'h(?P<mid>.*)o', 'hello'),
+  {
+    string: 'hello',
+    captures: ['ell'],
+    namedCaptures: {
+      mid: 'ell',
+    },
+  }
+) &&
+
+std.assertEqual(std.regexPartialMatch(@'world', 'hello'), null) &&
+
+std.assertEqual(
+  std.regexPartialMatch(@'e', 'hello'),
+  {
+    string: 'hello',
+    captures: [],
+    namedCaptures: {},
+  }
+) &&
+
+std.assertEqual(
+  std.regexPartialMatch(@'e(.*)o', 'hello'),
+  {
+    string: 'hello',
+    captures: ['ll'],
+    namedCaptures: {},
+  }
+) &&
+
+std.assertEqual(
+  std.regexPartialMatch(@'e(?P<mid>.*)o', 'hello'),
+  {
+    string: 'hello',
+    captures: ['ll'],
+    namedCaptures: {
+      mid: 'll',
+    },
+  }
+) &&
+
+std.assertEqual(std.regexQuoteMeta(@'1.5-2.0?'), '1\\.5\\-2\\.0\\?') &&
+
+std.assertEqual(std.regexReplace('wishyfishyisishy', @'ish', 'and'), 'wandyfishyisishy') &&
+std.assertEqual(std.regexReplace('yabba dabba doo', @'b+', 'd'), 'yada dabba doo') &&
+
+std.assertEqual(std.regexGlobalReplace('wishyfishyisishy', @'ish', 'and'), 'wandyfandyisandy') &&
+std.assertEqual(std.regexGlobalReplace('yabba dabba doo', @'b+', 'd'), 'yada dada doo') &&
+
 true