diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 8fa0a12..36ab011 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -16,10 +16,10 @@ jobs: matrix: include: - - version: '1.8' + - version: '1.10' # LTS os: ubuntu-latest arch: x64 - - version: '1.8' + - version: '1.10' # LTS os: windows-latest arch: x64 - version: '1' diff --git a/Project.toml b/Project.toml index 3d6feda..3f91bbe 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "SourceCodeMcCormick" uuid = "a7283dc5-4ecf-47fb-a95b-1412723fc960" authors = ["Robert Gottlieb "] -version = "0.3.1" +version = "0.4.0" [deps] CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" @@ -17,9 +17,9 @@ CUDA = "5" DocStringExtensions = "0.8 - 0.9" Graphs = "1" IfElse = "0.1.0 - 0.1.1" -SymbolicUtils = "1" -Symbolics = "5" -julia = "1.6" +SymbolicUtils = "3" +Symbolics = "6" +julia = "1.10" [extras] McCormick = "53c679d3-6890-5091-8386-c291e8c8aaa1" diff --git a/README.md b/README.md index ca22531..db48eb5 100644 --- a/README.md +++ b/README.md @@ -2,185 +2,716 @@ | **PSOR Lab** | **Build Status** | |:------------:|:-----------------------------------------------------------------------------------------------:| -| [![](https://img.shields.io/badge/Developed_by-PSOR_Lab-342674)](https://psor.uconn.edu/) | [![Build Status](https://github.com/PSORLab/SourceCodeMcCormick.jl/actions/workflows/ci.yml/badge.svg)](https://github.com/PSORLab/SourceCodeMcCormick.jl/actions?query=workflow%3ACI) [![codecov](https://codecov.io/gh/PSORLab/SourceCodeMcCormick.jl/branch/master/graph/badge.svg)](https://codecov.io/gh/PSORLab/SourceCodeMcCormick.jl)| +| [![](https://img.shields.io/badge/Developed_by-PSOR_Lab-342674)](https://psor.uconn.edu/) | [![Build Status](https://github.com/PSORLab/SourceCodeMcCormick.jl/workflows/CI/badge.svg?branch=master)](https://github.com/PSORLab/SourceCodeMcCormick.jl/actions?query=workflow%3ACI) [![codecov](https://codecov.io/gh/PSORLab/SourceCodeMcCormick.jl/branch/master/graph/badge.svg)](https://codecov.io/gh/PSORLab/SourceCodeMcCormick.jl)| This package uses source-code transformation to construct McCormick-based relaxations. Expressions composed of `Symbolics.jl`-type variables can be passed into `SourceCodeMcCormick.jl` (`SCMC`) functions, after which the expressions are factored, generalized McCormick relaxation rules and inclusion monotonic interval extensions are applied to the factors, and the factors are recombined symbolically to create expressions -representing convex and concave relaxations and inclusion monotonic interval extensions of the original -expression. The new expressions are compiled into functions that return pointwise values of these -four elements, which can be used in, e.g., a branch-and-bound routine. These functions can be used with -floating-point values, vectors of floating-point values, or CUDA arrays of floating point values (using -`CUDA.jl`) to return outputs of the same type. +representing inclusion monotonic interval extensions, convex and concave relaxations, and subgradients of +convex and concave relaxations of the original expression. The new expressions are compiled into functions +that return pointwise values of these elements, which can be used in, e.g., a branch-and-bound routine. +These functions can be used with floating-point values, vectors of floating-point values, or CUDA arrays +of floating point values (using `CUDA.jl`) to return outputs of the same type. 
64-bit (double-precision)
+numbers are recommended for relaxations to maintain accuracy.
+
+
+## Basic Functionality
+
+The primary user-facing function is `fgen` ("function generator"). The `fgen` function returns a source-code-generated
+function that provides evaluations of convex and concave relaxations, inclusion monotonic interval extensions,
+convex relaxation subgradients, and concave relaxation subgradients, for an input symbolic expression. The input
+list for the newly generated function is currently printed to the REPL; inputs are sorted primarily by variable name
+and secondarily by the order {cv, cc, lo, hi}. E.g., if variables `x` and `y` are used, the inputs to the generated
+function will be `{x_cv, x_cc, x_lo, x_hi, y_cv, y_cc, y_lo, y_hi}`. A demonstration of how to use `fgen` is shown
+here, with the output compared to the multiple-dispatch-based `McCormick.jl`:
+
+```julia
+using SourceCodeMcCormick, Symbolics
+Symbolics.@variables x, y
+expr = exp(x/y) - (x*y^2)/(y+1)
+new_func = fgen(expr)
+xcv, xcc, xlo, xhi = 1.0, 1.0, 0.5, 3.0
+ycv, ycc, ylo, yhi = 0.7, 0.7, 0.1, 2.0
+using McCormick
+xMC = MC{2,NS}(1.0, Interval(0.5, 3.0), 1)
+yMC = MC{2,NS}(0.7, Interval(0.1, 2.0), 2)
-## Algebraic Systems
+
+julia> new_func(xcv, xcc, xlo, xhi, ycv, ycc, ylo, yhi) # SourceCodeMcCormick.jl
+(0.22836802303235837, 2.963476144457207e12, -9.625065492403166, 1.068647458152446e13, -2.32490689757473, -3.629472627089296, 3.592092296310309e12, -8.980230740778097e11)
-For a given algebraic equation or system of equations, `SCMC` is designed to provide symbolic transformations
-that represent the lower/upper bounds and convex/concave relaxations of the provided equation(s). Most notably,
-`SCMC` uses this symbolic transformation to generate "evaluation functions" which, for a given expression,
-return the natural interval extension and convex/concave relaxations of an expression. E.g.:
+
+julia> exp(xMC/yMC) - (xMC*yMC^2)/(yMC+1) # McCormick.jl
+MC{2, NS}(0.22836802303235793, 2.963476144457207e12, [-9.62507, 1.06865e+13], [-2.3249068975747305, -3.6294726270892967], [3.592092296310309e12, -8.980230740778097e11], false)
+```
+In this example, the symbolic expression `exp(x/y) - (x*y^2)/(y+1)` is passed into `fgen`, which returns the
+function `new_func`. Passing inputs representing the McCormick tuples associated with `x` and `y` (i.e., `cv`
+and `cc` being the point to evaluate for that variable, and `lo` and `hi` being the variable bounds) into
+`new_func` returns pointwise evaluations of {cv, cc, lo, hi, [cvgrad]..., [ccgrad]...} for the original
+expression `exp(x/y) - (x*y^2)/(y+1)` on the specified domain of `x` and `y`.
+
+Functions generated using `fgen` are also built to be compatible with vectors and `CuArray`s. An example
+demonstrating this capability is shown here:
+```julia
+using SourceCodeMcCormick, Symbolics, CUDA
+@variables x
+expr = (x-0.5)^2+x
+new_func = fgen(expr)
+
+xcv_GPU = CUDA.rand(Float64, 1000000)
+xcc_GPU = copy(xcv_GPU)
+xlo_GPU = CUDA.zeros(Float64, 1000000)
+xhi_GPU = CUDA.ones(Float64, 1000000)
+
+outputs = new_func(xcv_GPU, xcc_GPU, xlo_GPU, xhi_GPU)
+```
-using SourceCodeMcCormick, Symbolics
-@variables x, y
+This example shows how the generated functions can be used with CUDA arrays. The outputs generated in this
+example are in the same format as for floating-point inputs (`{cv, cc, lo, hi, [cvgrad]..., [ccgrad]...}`),
+but instead of the outputs being floating-point values, they are each vectors of the same length as the inputs. 
+This capability is useful when you want to evaluate relaxation or interval information for a large number +of points, or points on different domains, simultaneously. In this example, the domain of `x` is left +unchanged for all points for simplicity, but this is not a requirement: points on any domain can be passed +within the same function evaluation; all points are evaluated independently. + +Evaluating large numbers of points simultaneously, and particularly using a GPU, allows for faster relaxation +calculations than performing individual calculations using a CPU. This is demonstrated in the following +example: + +```julia +using SourceCodeMcCormick, Symbolics, McCormick, CUDA, BenchmarkTools +Symbolics.@variables x, y expr = exp(x/y) - (x*y^2)/(y+1) -expr_lo_eval, expr_hi_eval, expr_cv_eval, expr_cc_eval, order = all_evaluators(expr) -``` +new_func = fgen(expr) + +# CuArray values for SCMC timing +xcv_GPU = CuArray(2.5*rand(1000000) .+ 0.5) +xcc_GPU = copy(xcv_GPU) +xlo_GPU = 0.5 .* CUDA.ones(Float64, 1000000) +xhi_GPU = 3.0 .* CUDA.ones(Float64, 1000000) +ycv_GPU = CuArray(1.9*rand(1000000) .+ 0.1) +ycc_GPU = copy(ycv_GPU) +ylo_GPU = 0.1 .* CUDA.ones(Float64, 1000000) +yhi_GPU = 2.0 .* CUDA.ones(Float64, 1000000) + +# McCormick objects for McCormick.jl timing +xMC = MC{2,NS}(1.0, Interval(0.5, 3.0), 1) +yMC = MC{2,NS}(0.7, Interval(0.1, 2.0), 2) + +######################################################## +##### Timing +# CPU: Intel i7-9850H +# GPU: NVIDIA Quadro T2000 + + +##### McCormick.jl +@btime exp($xMC/$yMC) - ($xMC*$yMC^2)/($yMC+1); +# 258.457 ns (0 allocations: 0 bytes) + + +##### SourceCodeMcCormick.jl (using GPU) + +# (Outputs still on GPU) +@btime CUDA.@sync new_func($xcv_GPU, $xcc_GPU, $xlo_GPU, $xhi_GPU, $ycv_GPU, $ycc_GPU, $ylo_GPU, $yhi_GPU); +# 48.756 ms (12795 allocations: 252.33 KiB) -Here, the outputs marked `_eval` are the evaluation functions for the lower bound (`lo`), upper -bound (`hi`), convex underestimator (`cv`), and concave overestimator (`cc`) of the symbolic -expression given by `expr`. The inputs to each of these functions are described by the `order` -vector, which in this case is `[x_cc, x_cv, x_hi, x_lo, y_cc, y_cv, y_hi, y_lo]`, representing -the concave/convex relaxations and interval bounds of the variables `x` and `y`. E.g., if being -used in a branch-and-bound (B&B) scheme, the interval bounds for each variable will be the lower and -upper bounds of the B&B node for that variable, and the convex/concave relaxations will take the -value where the relaxation of the original expression is desired. +# Average time per evaluation = 48.756 ms / 1000000 = 48.756 ns +# Note: Output is `NTuple{8, CuArray{Float64, 1, CUDA.DeviceMemory}}` -One benefit of using a source code transformation approach such as this over a multiple dispatch -approach like `McCormick.jl` is speed. When McCormick relaxations of functions are evaluated using -`McCormick.jl`, there is overhead associated with finding the correct functions to call for each -overloaded math operation. The functions generated by `SCMC`, however, eliminate this overhead by -creating static functions with the correct McCormick rules already applied. While the `McCormick.jl` -approach is flexible in quickly evaluating any new expression you provide, in the `SCMC` approach, -one expression is selected up front, and relaxations and interval extension values are calculated -for that expression quickly. 
For example:
+
+# (Outputs moved to CPU memory after calculation)
+@btime CUDA.@sync Array.(new_func($xcv_GPU, $xcc_GPU, $xlo_GPU, $xhi_GPU, $ycv_GPU, $ycc_GPU, $ylo_GPU, $yhi_GPU));
+# 68.897 ms (12853 allocations: 61.28 MiB)
+
+# Average time per evaluation = 68.897 ms / 1000000 = 68.897 ns
+# Note: Output is `NTuple{8, Vector{Float64}}`
+```
-using BenchmarkTools, McCormick
-xMC = MC{2, NS}(2.5, Interval(-1.0, 4.0), 1)
-yMC = MC{2, NS}(1.5, Interval(0.5, 3.0), 2)
+
+As shown in the example, the functions generated by `fgen` can be very fast when used with `CuArray`s,
+and as shown in the previous example, they can provide the same information as `McCormick.jl` for a given
+input function. It is worth noting that if `CuArray`s are passed as inputs, the outputs of the generated
+function are also of type `CuArray`. I.e., if data stored in GPU memory is passed as inputs, the calculations
+occur on the GPU, and the outputs are also stored in GPU memory. This is most useful if you can make use
+of the results of the relaxation calculations directly on the GPU. If you must move the results
+back into CPU memory, this incurs a large time cost (as in the example: moving the results adds roughly 40%
+to the total time). An active research project is focused on improving the underlying
+SourceCodeMcCormick functionality that creates these functions, which will further improve the
+calculation speed.
+
+
+## Arguments for `fgen`
+
+The `fgen` function can be called with only a `Num`-type expression as an input (as in the examples in the
+previous section), or with many other possible arguments that affect `fgen`'s behavior. Some of these
+functionalities and their use cases are shown in this section.
+
+### 1) Constants
+
+For any call to `fgen`, the keyword argument `constants` may be used to specify that a symbolic variable
+is not meant to be treated as a McCormick object but rather as an adjustable parameter that will always
+take on a specific, constant value. This affects how the symbol is treated by SourceCodeMcCormick internally,
+and it also affects how the symbol is passed to the generated function.
-@btime exp(xMC/yMC) - (xMC*yMC^2)/(yMC+1)
-# 497.382 ns (7 allocations: 560 bytes)
+
+```julia
+using SourceCodeMcCormick, Symbolics
+Symbolics.@variables x, y
+expr = exp(x/y) - (x*y^2)/(y+1)
-@btime expr_cv_eval(2.5, 2.5, 4.0, -1.0, 1.5, 1.5, 3.0, 0.5)
-# 184.964 ns (1 allocation: 16 bytes)
+
+# Standard way of calling fgen:
+new_func = fgen(expr)
+xcv, xcc, xlo, xhi = 1.0, 1.0, 0.5, 3.0
+ycv, ycc, ylo, yhi = 0.7, 0.7, 0.1, 2.0
+
+outputs = new_func(xcv, xcc, xlo, xhi, ycv, ycc, ylo, yhi)
+# Note that the order of inputs is {x, y}, where each is
+# split into a McCormick tuple of {cv, cc, lo, hi}
+
+# Treating "y" as a constant or adjustable parameter
+new_func = fgen(expr, constants=[y])
+y_fixed_val = 0.7
+
+outputs = new_func(y_fixed_val, xcv, xcc, xlo, xhi)
+# Note that the order of the inputs is adjusted so that
+# the constants come first. I.e., the ordering goes:
+# {{constants}, {McCormick variables}} = {y, x}
+# and then the McCormick variables are split into
+# their McCormick tuples, giving: {y, x_cv, x_cc, x_lo, x_hi}
+```
-Note that this is not an entirely fair comparison because `McCormick.jl`, by using the `MC` type and
-multiple dispatch, simultaneously calculates all of the following: natural interval extensions,
-convex and concave relaxations, and corresponding subgradients. 
+Marking variables as constants makes the calculations simpler and eliminates the need to supply expanded
+McCormick tuples for symbols that will always represent a single value.
+
+
+### 2) Specifying an Output Subset
+
+In some cases, you may not want or need the full list of outputs `{cv, cc, lo, hi, [cvgrad]..., [ccgrad]...}`.
+E.g., if the generated function is being used in a subgradient-free lower-bounding routine within a
+branch-and-bound algorithm, you may only want `{cv}` or `{cv, lo}` as the outputs. Reducing the required
+outputs allows you to save space by not allocating unnecessary vectors or `CuArray`s, and it also
+speeds up the overall computation of the generated function by not performing unnecessary calculations.
+E.g., if the concave relaxation of the output is not required, the generated function will not calculate
+the concave relaxation in the final calculation step (though calculating the concave relaxation in
+intermediate steps may still be required to implement relaxation rules). Additionally, if no subgradient
+information is requested, the generated function can skip all subgradient calculations, since they do
+not impact the relaxation or inclusion monotonic interval calculations. Specifying the outputs can be
+done as follows:
+
+```julia
+using SourceCodeMcCormick, Symbolics
+Symbolics.@variables x, y
+expr = exp(x/y) - (x*y^2)/(y+1)
+
+# When fgen is called, any combination of the following outputs can be used:
+#  Symbol  | Interpretation
+# -----------------------------------------
+#  :cv     | Convex relaxation
+#  :cc     | Concave relaxation
+#  :lo     | Lower bound of the interval extension
+#  :hi     | Upper bound of the interval extension
+#  :cvgrad | Subgradient of the convex relaxation
+#  :ccgrad | Subgradient of the concave relaxation
+#  :MC     | All of {cv, cc, lo, hi}
+#  :grad   | Both {cvgrad, ccgrad}
+#  :all    | All of {cv, cc, lo, hi, cvgrad, ccgrad} (DEFAULT)
+
+# For example, to get only the convex relaxation and lower bound:
+new_func = fgen(expr, [:cv, :lo])
+```
+
+### 3) Mutated Inputs
+
+For any call to `fgen`, the keyword argument `mutate` may be used to specify that the generated function
+should modify a set of inputs rather than return newly generated vectors (or `CuArray`s). This functionality
+is meant to be used if generated functions are incorporated into a larger script and the space for the
+outputs has already been preallocated. Mutating functions will require extra inputs at the start of the
+input list that correspond to the desired outputs. This works in the following way:
+
+```julia
+using SourceCodeMcCormick, Symbolics
+Symbolics.@variables x, y
+expr = exp(x/y) - (x*y^2)/(y+1)
+new_func! = fgen(expr, [:cv, :lo, :cvgrad], mutate=true)
+
+# Inputs to new_func! are as follows:
+# {OUT_cv, OUT_lo, OUT_∂x_cv, OUT_∂y_cv, x_cv, x_cc, x_lo, x_hi, y_cv, y_cc, y_lo, y_hi}
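+
+# A minimal usage sketch (an illustrative assumption, not taken from the package
+# docs): preallocate one array per requested output, then call new_func! to fill
+# them in place. All values below are arbitrary; points must lie within bounds.
+n = 4
+out_cv, out_lo = Vector{Float64}(undef, n), Vector{Float64}(undef, n)
+out_dx, out_dy = Vector{Float64}(undef, n), Vector{Float64}(undef, n)
+xcv = rand(n);              xcc = copy(xcv); xlo = zeros(n);     xhi = ones(n)
+ycv = 0.1 .+ 1.9 .* rand(n); ycc = copy(ycv); ylo = fill(0.1, n); yhi = fill(2.0, n)
+new_func!(out_cv, out_lo, out_dx, out_dy, xcv, xcc, xlo, xhi, ycv, ycc, ylo, yhi)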
+```
+In this example, `new_func!` does not return any information; the results of the calculations are
+instead stored in the corresponding preallocated inputs. Note that this only works with vectors and `CuArray`s,
+since you cannot preallocate a Float64. Although not necessary for `mutate` to work, this example also
+shows that specifying the outputs as `[:cv, :lo, :cvgrad]` means that mutating inputs are only required
+for the corresponding set of desired outputs. In this case, storage for the convex relaxation, lower
+bound, subgradient with respect to `x`, and subgradient with respect to `y` must be provided as inputs.
+The usual McCormick tuples associated with the variables `x` and `y` are still required, and will always
+appear after the mutating inputs. If any constant symbols are used (see `constants` earlier in this
+section), the order will be `{{mutating inputs}, {constants}, {McCormick variables}}`, each of which
+is sorted alphabetically (and then expanded, for the McCormick variables, as usual).
+
+
+### 4) Base-Level Variables
+
+In some cases, it may be useful to use `fgen` multiple times to represent a very complicated expression,
+with the results connected together, or fed into one another, using generalized McCormick theory. This
+approach is not recommended in general, due to the added complexity it entails, but allowances for this
+approach are nonetheless incorporated into `fgen`.
+
+Specifically, `fgen` works by compiling a list of variables in the input expression, and then assumes
+that each input variable is a base-level variable (i.e., its relaxation subgradients are 0 in every
+dimension but its own, and 1 for its own dimension). For this reason, subgradient information is not
+a required input for generated functions (because it is assumed), but if you use a variable that is
+a composite of base-level variables, its subgradients cannot be so simply assumed. 
Here is an example
+of how this functionality may be used (though note that the example expression can easily be handled
+with a single call to `fgen`; this is only to show what this functionality does):
+
+```julia
+using SourceCodeMcCormick, Symbolics
+Symbolics.@variables x, sub
-# Using SourceCodeMcCormick.jl, broadcast using CPU
-xcc = rand(10000)
-xcv = copy(xcc)
-xhi = ones(10000)
-xlo = zeros(10000)
+
+### Full expression: (x+1)^2 * (x-1)
-ycc = rand(10000)
-ycv = copy(xcv)
-yhi = ones(10000)
-ylo = zeros(10000)
+
+sub_expression = (x+1)^2
+new_func1 = fgen(sub_expression) # Assumes that "x" is a base-level variable
-@btime expr_cv_eval.(xcc, xcv, xhi, xlo, ycc, ycv, yhi, ylo);
-# 1.366 ms (20 allocations: 78.84 KiB)
+
+# Inputs to new_func1:
+# {x_cv, x_cc, x_lo, x_hi}
-# Using SourceCodeMcCormick.jl and CUDA.jl, broadcast using GPU
-xcc_GPU = CuArray(xcc)
-xcv_GPU = CuArray(xcv)
-xhi_GPU = CuArray(xhi)
-xlo_GPU = CuArray(xlo)
-ycc_GPU = CuArray(ycc)
-ycv_GPU = CuArray(ycv)
-yhi_GPU = CuArray(yhi)
-ylo_GPU = CuArray(ylo)
+
+new_expression = sub * (x-1)
+new_func2 = fgen(new_expression, [x]) # Tells fgen that "sub" depends on x
-@btime CUDA.@sync expr_cv_eval.(xcc_GPU, xcv_GPU, xhi_GPU, xlo_GPU, ycc_GPU, ycv_GPU, yhi_GPU, ylo_GPU);
-# 29.800 μs (52 allocations: 3.88 KiB)
+
+# Inputs to new_func2:
+# {sub_cv, sub_cc, sub_lo, sub_hi, ∂sub∂x_cv, ∂sub∂x_cc, x_cv, x_cc, x_lo, x_hi}
+```
+In particular, notice that `new_func2` requires the inputs `{sub, x}` split into
+their McCormick tuples, but in addition, `fgen` is told that `sub` is not a base-level
+variable. I.e., its subgradient cannot be assumed to be 1 in its dimension, as it doesn't
+have its own unique dimension. Instead, it is some composite of the true base-level
+variable `x`, and thus, the subgradients of `sub`'s convex and concave relaxations in the
+`x` dimension must also be provided as inputs.
+
+Relatedly, there may be cases where you would want extra inputs to the generated functions
+even if they are not participating in the expression. For example, suppose you are incorporating
+constraints and an objective function into an optimization problem and are using `fgen` to
+calculate the relevant relaxations, but the constraints and objective function do not all
+use the same set of variables (e.g., some may participate in the objective function but not
+a constraint, or vice versa). In that case, it may be difficult for the optimizer to know
+ahead of time what variables participate in what constraint. One solution to this problem is
+to give each generated function full information about all participating variables across all
+constraints and the objective function. This is accomplished by specifying base-level variables
+and then passing the `all_inputs` keyword argument as `true`. For example:
+
+```julia
+using SourceCodeMcCormick, Symbolics
+Symbolics.@variables x, y
+
+# Constraint 1: (x + 2)^2 - 3 <= 0
+cons1 = (x+2)^2 - 3
+f_cons1 = fgen(cons1, [x, y], all_inputs=true)
+
+# Inputs are:
+# {x_cv, x_cc, x_lo, x_hi, y_cv, y_cc, y_lo, y_hi}
-## Dynamic Systems
-(In development) For dynamic systems, `SCMC` assumes a differential inequalities approach where the
-relaxations of derivatives are calculated in advance and the resulting (larger) differential equation
-system, with explicit definitions of the relaxations of derivatives, can be solved. For algebraic
-systems, the main product of this package is the broadcastable evaluation functions. 
For dynamic
-systems, this package follows the same idea as in algebraic systems but stops at the symbolic
-representations of relaxations. This functionality is designed to work with a `ModelingToolkit`-type
-`ODESystem` with factorable equations [3]--`SCMC` will take such a system and return a new `ODESystem`
-with expanded equations to provide interval extensions and (if desired) McCormick relaxations. E.g.:
+
+# Constraint 2: (y - 1)/(y + 1) - 0.9 <= 0
+cons2 = (y-1)/(y+1) - 0.9
+f_cons2 = fgen(cons2, [x, y], all_inputs=true)
+
+# Inputs are:
+# {x_cv, x_cc, x_lo, x_hi, y_cv, y_cc, y_lo, y_hi}
+```
-using SourceCodeMcCormick, ModelingToolkit
-@parameters p[1:2] t
-@variables x[1:2](t)
-D = Differential(t)
-tspan = (0.0, 35.0)
-x0 = [1.0; 0.0]
-x_dict = Dict(x[i] .=> x0[i] for i in 1:2)
-p_start = [0.020; 0.025]
-p_dict = Dict(p[i] .=> p_start[i] for i in 1:2)
+Notice here that although only `x` participated in constraint 1, and only `y` participated in
+constraint 2, the input list is the same for both constraints. This is because `[x, y]` was
+specified as the base-level variables, and `all_inputs` was set to `true`. By calling `fgen`
+in this way, an optimizer that used the generated functions for these constraints would only
+need to know that `x` and `y` participated in any/all of the expressions (i.e., the optimizer
+could be informed that there are 2 total variables, with no other extra information about the
+constraints or objective specifically). The optimizer might then loop over all the constraints
+and call the generated functions using the exact same list of inputs, and the generated functions
+would simply ignore any of the inputs that are not relevant for the given expression. Adding
+these extra inputs does not impact the speed of the generated functions, since the irrelevant
+inputs aren't used in any calculations.
+
+
+## The ParBB Algorithm
+
+The intended use of SourceCodeMcCormick is in conjunction with a branch-and-bound algorithm
+that is able to make use of relaxations and subgradient information that are calculated in
+parallel on a GPU. See the paper reference in the section "Citing SourceCodeMcCormick" for
+a more complete description of the ParBB algorithm. Briefly, ParBB is built as an extension
+of the EAGO solver, and it works by parallelizing the node processing routines in such a way
+that tasks may be performed by a GPU. This section will demonstrate how SourceCodeMcCormick
+may be used in conjunction with the ParBB algorithm to accelerate global optimization.
+
+### A) Problem solved using base EAGO
+
+The following example comes from an [EAGO-notebooks page](https://github.com/PSORLab/EAGO-notebooks/blob/master/notebooks/nlpopt_explicit_ann.ipynb)
+and involves the optimization of an ANN surrogate model. 
Here is how the problem is solved +using the base implementation of EAGO, as given in the linked Jupyter Notebook: + +```julia +using JuMP, EAGO, GLPK + +# Weights associated with the hidden layer +W1 = [ 0.54 -1.97 0.09 -2.14 1.01 -0.58 0.45 0.26; + -0.81 -0.74 0.63 -1.60 -0.56 -1.05 1.23 0.93; + -0.11 -0.38 -1.19 0.43 1.21 2.78 -0.06 0.40] + +# Weights associated with the output layer +W2 = [-0.91 0.11 0.52] + +# Bias associated with the hidden layer +B1 = [-2.698 0.012 2.926] + +# Bias associated with the output layer +B2 = -0.46 + +# Variable bounds (Used to scale variables after optimization) +xLBD = [0.623, 0.093, 0.259, 6.56, 1114, 0.013, 0.127, 0.004] +xUBD = [5.89, 0.5, 1.0, 90, 25000, 0.149, 0.889, 0.049]; + +# Create the objective function +ann_cpu(p1::T, p2::T, p3::T, p4::T, p5::T, p6::T, p7::T, p8::T) where {T<:Real} = + ann_cpu(W1, W2, B1, B2, p1, p2, p3, p4, p5, p6, p7, p8) +function ann_cpu(W1::Matrix{Float64}, W2::Matrix{Float64}, B1::Matrix{Float64}, B2::Float64, + p1::T, p2::T, p3::T, p4::T, p5::T, p6::T, p7::T, p8::T) where {T<:Real} + y1 = W1[1,1]*p1 + W1[1,2]*p2 + W1[1,3]*p3 + W1[1,4]*p4 + W1[1,5]*p5 + W1[1,6]*p6 + W1[1,7]*p7 + W1[1,8]*p8 + y2 = W1[2,1]*p1 + W1[2,2]*p2 + W1[2,3]*p3 + W1[2,4]*p4 + W1[2,5]*p5 + W1[2,6]*p6 + W1[2,7]*p7 + W1[2,8]*p8 + y3 = W1[3,1]*p1 + W1[3,2]*p2 + W1[3,3]*p3 + W1[3,4]*p4 + W1[3,5]*p5 + W1[3,6]*p6 + W1[3,7]*p7 + W1[3,8]*p8 + + # Note: the objective is already minimized here, relative to the Jupyter notebook + return -(B2 + W2[1]*(2/(1+exp(-2*y1+B1[1]))) + W2[2]*(2/(1+exp(-2*y2+B1[2]))) + W2[3]*(2/(1+exp(-2*y3+B1[3])))) +end + +# Model construction +factory = () -> EAGO.Optimizer(SubSolvers(r = GLPK.Optimizer())) +model = Model(optimizer_with_attributes(factory, "absolute_tolerance" => 0.001, + "output_iterations" => 10000)) +register(model,:ann_cpu,8,ann_cpu,autodiff=true) +@variable(model, -1.0 <= p[i=1:8] <= 1.0) +@NLobjective(model, Min, ann_cpu(p[1], p[2], p[3], p[4], p[5], p[6], p[7], p[8])) + +# Solve the model +optimize!(model) +``` -eqns = [D(x[1]) ~ p[1]+x[1], - D(x[2]) ~ p[2]+x[2]] +Running this code (after an initial compilation run) generates the following output: +``` +--------------------------------------------------------------------------------------------------------------------------------- +| Iteration # | Nodes | Lower Bound | Upper Bound | Gap | Ratio | Timer | Time Left | +--------------------------------------------------------------------------------------------------------------------------------- +| 10000 | 8059 | -7.680E-01 | -7.048E-01 | 6.319E-02 | 8.228E-02 | 13.19 | 3586.81 | +| 20000 | 14199 | -7.495E-01 | -7.048E-01 | 4.470E-02 | 5.964E-02 | 17.50 | 3582.50 | +| 30000 | 19415 | -7.375E-01 | -7.048E-01 | 3.268E-02 | 4.432E-02 | 20.71 | 3579.29 | +| 40000 | 24013 | -7.297E-01 | -7.048E-01 | 2.494E-02 | 3.418E-02 | 23.85 | 3576.15 | +| 50000 | 28091 | -7.243E-01 | -7.048E-01 | 1.953E-02 | 2.697E-02 | 26.78 | 3573.22 | +| 60000 | 31383 | -7.203E-01 | -7.048E-01 | 1.551E-02 | 2.153E-02 | 29.45 | 3570.55 | +| 70000 | 33939 | -7.172E-01 | -7.048E-01 | 1.240E-02 | 1.729E-02 | 32.50 | 3567.50 | +| 80000 | 34973 | -7.147E-01 | -7.048E-01 | 9.958E-03 | 1.393E-02 | 35.00 | 3565.00 | +| 90000 | 34771 | -7.127E-01 | -7.048E-01 | 7.905E-03 | 1.109E-02 | 37.48 | 3562.52 | +--------------------------------------------------------------------------------------------------------------------------------- +| Iteration # | Nodes | Lower Bound | Upper Bound | Gap | Ratio | Timer | Time Left | 
+---------------------------------------------------------------------------------------------------------------------------------
+| 100000 | 32945 | -7.110E-01 | -7.048E-01 | 6.177E-03 | 8.688E-03 | 39.96 | 3560.04 |
+| 110000 | 28647 | -7.095E-01 | -7.048E-01 | 4.708E-03 | 6.635E-03 | 42.31 | 3557.69 |
+| 120000 | 22075 | -7.082E-01 | -7.048E-01 | 3.463E-03 | 4.889E-03 | 44.65 | 3555.35 |
+| 130000 | 14571 | -7.072E-01 | -7.048E-01 | 2.416E-03 | 3.417E-03 | 46.84 | 3553.16 |
+| 140000 | 6497 | -7.063E-01 | -7.048E-01 | 1.536E-03 | 2.175E-03 | 48.94 | 3551.06 |
+| 147217 | 0 | -7.058E-01 | -7.048E-01 | 1.000E-03 | 1.417E-03 | 50.38 | 3549.62 |
+---------------------------------------------------------------------------------------------------------------------------------
+
+Empty Stack: Exhaustive Search Finished
+Optimal Solution Found at Node 2725
+Lower Bound: -0.705776904710031
+Upper Bound: -0.704776806738834
+Solution:
+   p[1] = -0.9999999998903418
+   p[2] = 0.9999999864966321
+   p[3] = -0.9999999997346657
+   p[4] = 0.166800526340091
+   p[5] = -0.7772932141668214
+   p[6] = 0.9999999999319428
+   p[7] = 0.9999999998522763
+   p[8] = 0.9999999998842807
+```
-@named syst = ODESystem(eqns, t, x, p, defaults=merge(x_dict, p_dict))
-new_syst = apply_transform(McCormickIntervalTransform(), syst)
+
+In particular, note that this problem was solved in 147,217 iterations after a total time of 50.38
+seconds. These times were obtained on a workstation with an Intel i7-9850H processor.
+
+### B) Problem solved with ParBB (subgradient-free)
+
+Next, the same example will be solved by making use of a function generated by
+SourceCodeMcCormick and the subgradient-free routines present in ParBB. This functionality
+is described in greater detail in the referenced paper (see "Citing SourceCodeMcCormick").
+Briefly, pointwise evaluations of the convex relaxation of the objective function are used
+to calculate a valid lower bound in each iteration. By grouping together pointwise evaluations
+for a large number of branch-and-bound nodes at the same time and calculating them using
+SourceCodeMcCormick-generated functions, we can make use of the massive parallelization
+available using GPUs. 
+ +```julia +using JuMP, EAGO, SourceCodeMcCormick, CUDA + +# Import the ParBB algorithm (Note: path may vary depending on where +# this file is in relation to SourceCodeMcCormick) +include(joinpath(@__DIR__, "ParBB", "extension.jl")) +include(joinpath(@__DIR__, "ParBB", "subroutines.jl")) +include(joinpath(@__DIR__, "ParBB", "kernels.jl")) + +# Weights associated with the hidden layer +W1 = [ 0.54 -1.97 0.09 -2.14 1.01 -0.58 0.45 0.26; + -0.81 -0.74 0.63 -1.60 -0.56 -1.05 1.23 0.93; + -0.11 -0.38 -1.19 0.43 1.21 2.78 -0.06 0.40] + +# Weights associated with the output layer +W2 = [-0.91 0.11 0.52] + +# Bias associated with the hidden layer +B1 = [-2.698 0.012 2.926] + +# Bias associated with the output layer +B2 = -0.46 + +# Variable bounds (Used to scale variables after optimization) +xLBD = [0.623, 0.093, 0.259, 6.56, 1114, 0.013, 0.127, 0.004] +xUBD = [5.89, 0.5, 1.0, 90, 25000, 0.149, 0.889, 0.049]; + +# Create a SourceCodeMcCormick generated function for the objective function +Symbolics.@variables x[1:8] +ann_function = fgen(-(B2 + W2[1]*(2/(1+exp(-2*sum(W1[1,i]*x[i] for i=1:8)+B1[1]))) + + W2[2]*(2/(1+exp(-2*sum(W1[2,i]*x[i] for i=1:8)+B1[2]))) + + W2[3]*(2/(1+exp(-2*sum(W1[3,i]*x[i] for i=1:8)+B1[3])))), [:cv]) + +# Create the objective function +ann_cpu(p1::T, p2::T, p3::T, p4::T, p5::T, p6::T, p7::T, p8::T) where {T<:Real} = + ann_cpu(W1, W2, B1, B2, p1, p2, p3, p4, p5, p6, p7, p8) +function ann_cpu(W1::Matrix{Float64}, W2::Matrix{Float64}, B1::Matrix{Float64}, B2::Float64, + p1::T, p2::T, p3::T, p4::T, p5::T, p6::T, p7::T, p8::T) where {T<:Real} + y1 = W1[1,1]*p1 + W1[1,2]*p2 + W1[1,3]*p3 + W1[1,4]*p4 + W1[1,5]*p5 + W1[1,6]*p6 + W1[1,7]*p7 + W1[1,8]*p8 + y2 = W1[2,1]*p1 + W1[2,2]*p2 + W1[2,3]*p3 + W1[2,4]*p4 + W1[2,5]*p5 + W1[2,6]*p6 + W1[2,7]*p7 + W1[2,8]*p8 + y3 = W1[3,1]*p1 + W1[3,2]*p2 + W1[3,3]*p3 + W1[3,4]*p4 + W1[3,5]*p5 + W1[3,6]*p6 + W1[3,7]*p7 + W1[3,8]*p8 + + # Note: the objective is already minimized here, relative to the Jupyter notebook + return -(B2 + W2[1]*(2/(1+exp(-2*y1+B1[1]))) + W2[2]*(2/(1+exp(-2*y2+B1[2]))) + W2[3]*(2/(1+exp(-2*y3+B1[3])))) +end + +# Model construction +factory = () -> EAGO.Optimizer(SubSolvers(t = PointwiseGPU(ann_function, # Function returning [:cv] for the objective function + 8, # Dimensionality of ann_function + node_limit=8192, # Number of nodes to process per iteration + gc_freq=50))) # Frequency of garbage collection +model = Model(optimizer_with_attributes(factory, "absolute_tolerance" => 0.001, + "enable_optimize_hook" => true, # Enables PointwiseGPU extension + "branch_variable" => Bool[true for i in 1:8], # Explicitly tell EAGO to branch on all variables + "force_global_solve" => true, # Ignore EAGO's problem type detection + "output_iterations" => 10)) +register(model,:ann_cpu,8,ann_cpu,autodiff=true) +@variable(model, -1.0 <= p[i=1:8] <= 1.0) +@NLobjective(model, Min, ann_cpu(p[1], p[2], p[3], p[4], p[5], p[6], p[7], p[8])) + +# Solve the model +optimize!(model) ``` -This takes the original ODE system (`syst`) with equations: +Running this code (after an initial compilation run) generates the following output: +``` +--------------------------------------------------------------------------------------------------------------------------------- +| Iteration # | Nodes | Lower Bound | Upper Bound | Gap | Ratio | Timer | Time Left | +--------------------------------------------------------------------------------------------------------------------------------- +| 10 | 47818 | -8.337E-01 | -7.048E-01 | 1.289E-01 | 1.546E-01 | 
0.74 | 3599.26 |
+| 20 | 117316 | -7.703E-01 | -7.048E-01 | 6.549E-02 | 8.503E-02 | 1.47 | 3598.53 |
+| 30 | 162052 | -7.608E-01 | -7.048E-01 | 5.604E-02 | 7.365E-02 | 1.90 | 3598.10 |
+| 40 | 177018 | -7.473E-01 | -7.048E-01 | 4.254E-02 | 5.693E-02 | 2.38 | 3597.62 |
+| 50 | 161712 | -7.411E-01 | -7.048E-01 | 3.632E-02 | 4.901E-02 | 3.24 | 3596.76 |
+| 60 | 128062 | -7.293E-01 | -7.048E-01 | 2.453E-02 | 3.363E-02 | 4.10 | 3595.90 |
+| 70 | 73936 | -7.211E-01 | -7.048E-01 | 1.631E-02 | 2.262E-02 | 5.03 | 3594.97 |
+| 80 | 11916 | -7.180E-01 | -7.048E-01 | 1.321E-02 | 1.841E-02 | 5.59 | 3594.41 |
+| 83 | 4560 | -7.051E-01 | -7.048E-01 | 3.648E-04 | 5.173E-04 | 5.64 | 3594.36 |
+---------------------------------------------------------------------------------------------------------------------------------
+
+Relative Tolerance Achieved
+Optimal Solution Found at Node 1
+Lower Bound: -0.7051415840592385
+Upper Bound: -0.7047768067379733
+Solution:
+   p[1] = -0.9999999998903418
+   p[2] = 0.9999999864976372
+   p[3] = -0.9999999997346648
+   p[4] = 0.16680027211461484
+   p[5] = -0.777292487172587
+   p[6] = 0.9999999999319427
+   p[7] = 0.9999999998522755
+   p[8] = 0.9999999998842802
+```
+
+Using the subgradient-free method and processing 8192 nodes per iteration, this problem converged in 83
+iterations (roughly 679,936 nodes explored), with a total time of 5.64 seconds. These times were obtained
+on a workstation with an Intel i7-9850H processor and an NVIDIA Quadro T2000 GPU. Using a GPU with a
+greater capacity for double-precision floating-point calculations will, of course, improve the overall
+performance of the algorithm, but even with a fairly "typical" GPU such as this, relatively competitive
+speed can be obtained. As compared to the base version of EAGO (which makes use of subgradients), this
+example ran roughly 9x faster.
+
+It is also important to note that, because a subgradient-free method was used, the lower bounds for any
+individual branch-and-bound node are not as tight as those that can be obtained using subgradient-based
+methods. This is one reason why ParBB needed to explore significantly more nodes than the base version
+of EAGO (nearly 5x as many nodes were explored in this case). The limitation of not using subgradients
+can also cause some optimization problems to converge extremely slowly (see the examples in the paper
+referenced in "Citing SourceCodeMcCormick"). For this reason, all standard global optimizers make use
+of subgradient information.
+
+### C) Problem solved with ParBB (using subgradients)
+
+NOTE: The use of subgradients in ParBB is an active area of research and remains under development. Associated
+papers are forthcoming regarding the incorporation of subgradients into SourceCodeMcCormick and their use
+within ParBB.
+
+One of the latest developments for SourceCodeMcCormick is the incorporation of subgradient information
+into the functions generated by `fgen`. The ability to calculate subgradients using a GPU enables
+subgradient-based lower-bounding methods to be used, provided that a routine is available that can
+make use of them. Typically, subgradients are used to generate a linear program (LP) that underapproximates
+the convex relaxation(s) in a given problem. The LP for a given node is then passed to a dedicated LP
+solver such as GLPK, as in part (A) of this section. 
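+
+To make the preceding paragraph concrete, here is a minimal sketch of how pointwise convex relaxation
+values and subgradients define such an LP for a single node. This sketch is illustrative only (it is not
+ParBB's actual implementation; the use of JuMP with GLPK and all numerical values are assumptions made
+for this example). Each evaluation point `p_i`, with convex relaxation value `cv_i` and subgradient `s_i`,
+contributes the cut `eta >= cv_i + s_i'*(p - p_i)`; minimizing `eta` over the node's box then yields a
+valid lower bound for that node:
+
+```julia
+using JuMP, GLPK
+
+# One node with box [lo, hi] and two evaluated points (all values illustrative)
+lo, hi = [-1.0, -1.0], [1.0, 1.0]
+pts  = [[0.0, 0.0], [0.5, -0.5]]    # points where the relaxation was evaluated
+cvs  = [-0.8, -0.6]                 # convex relaxation values at those points
+subs = [[0.3, -0.2], [0.5, 0.1]]    # corresponding subgradients
+
+lp = Model(GLPK.Optimizer)
+@variable(lp, lo[i] <= p[i = 1:2] <= hi[i])
+@variable(lp, eta)
+for (pt, cv, s) in zip(pts, cvs, subs)
+    # Each subgradient cut underestimates the convex relaxation everywhere
+    @constraint(lp, eta >= cv + sum(s[i]*(p[i] - pt[i]) for i = 1:2))
+end
+@objective(lp, Min, eta)
+optimize!(lp)
+node_lower_bound = objective_value(lp)  # valid lower bound for this node
+```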
+In ParBB, a large number of branch-and-bound nodes are evaluated simultaneously, so subgradients
+for many nodes are generated at once, with the results stored in GPU memory. To use these
+subgradients, ParBB needs a GPU-based LP solver that is capable of handling many LPs simultaneously: one
+for each branch-and-bound node. This has been accomplished by adapting the two-phase simplex method
+to perform each individual step on many separate LPs stacked together in the same matrix (similar to
+many simplex tableaus stacked on top of each other, or a "stacked tableau"). The individual simplex
+steps are performed using custom kernels that parallelize the operations on the GPU.
+
+Note that the use of the two-phase simplex method is not meant to imply that this method of solving
+LPs is superior to other possible methods. It was implemented primarily as a proof-of-concept
+tool to demonstrate how subgradient information could be used within a GPU-accelerated branch-and-bound
+algorithm. Other options are being explored that may be significantly more efficient than the current
+implementation. The following shows how SourceCodeMcCormick's new subgradient feature may be used
+with ParBB.
+
+```julia
+using JuMP, EAGO, SourceCodeMcCormick, CUDA
+
+# Import the ParBB algorithm (Note: path may vary depending on where
+# this file is in relation to SourceCodeMcCormick)
+include(joinpath(@__DIR__, "ParBB", "extension.jl"))
+include(joinpath(@__DIR__, "ParBB", "subroutines.jl"))
+include(joinpath(@__DIR__, "ParBB", "kernels.jl"))
+
+# Weights associated with the hidden layer
+W1 = [ 0.54 -1.97 0.09 -2.14 1.01 -0.58 0.45 0.26;
+      -0.81 -0.74 0.63 -1.60 -0.56 -1.05 1.23 0.93;
+      -0.11 -0.38 -1.19 0.43 1.21 2.78 -0.06 0.40]
+
+# Weights associated with the output layer
+W2 = [-0.91 0.11 0.52]
+
+# Bias associated with the hidden layer
+B1 = [-2.698 0.012 2.926]
+
+# Bias associated with the output layer
+B2 = -0.46
+
+# Variable bounds (Used to scale variables after optimization)
+xLBD = [0.623, 0.093, 0.259, 6.56, 1114, 0.013, 0.127, 0.004]
+xUBD = [5.89, 0.5, 1.0, 90, 25000, 0.149, 0.889, 0.049];
+
+# Create a SourceCodeMcCormick generated function for the objective function
+Symbolics.@variables x[1:8]
+ann_function! 
= fgen(-(B2 + W2[1]*(2/(1+exp(-2*sum(W1[1,i]*x[i] for i=1:8)+B1[1]))) + + W2[2]*(2/(1+exp(-2*sum(W1[2,i]*x[i] for i=1:8)+B1[2]))) + + W2[3]*(2/(1+exp(-2*sum(W1[3,i]*x[i] for i=1:8)+B1[3])))), [:cv, :lo, :cvgrad], mutate=true) + +# Create the objective function +ann_cpu(p1::T, p2::T, p3::T, p4::T, p5::T, p6::T, p7::T, p8::T) where {T<:Real} = + ann_cpu(W1, W2, B1, B2, p1, p2, p3, p4, p5, p6, p7, p8) +function ann_cpu(W1::Matrix{Float64}, W2::Matrix{Float64}, B1::Matrix{Float64}, B2::Float64, + p1::T, p2::T, p3::T, p4::T, p5::T, p6::T, p7::T, p8::T) where {T<:Real} + y1 = W1[1,1]*p1 + W1[1,2]*p2 + W1[1,3]*p3 + W1[1,4]*p4 + W1[1,5]*p5 + W1[1,6]*p6 + W1[1,7]*p7 + W1[1,8]*p8 + y2 = W1[2,1]*p1 + W1[2,2]*p2 + W1[2,3]*p3 + W1[2,4]*p4 + W1[2,5]*p5 + W1[2,6]*p6 + W1[2,7]*p7 + W1[2,8]*p8 + y3 = W1[3,1]*p1 + W1[3,2]*p2 + W1[3,3]*p3 + W1[3,4]*p4 + W1[3,5]*p5 + W1[3,6]*p6 + W1[3,7]*p7 + W1[3,8]*p8 + + # Note: the objective is already minimized here, relative to the Jupyter notebook + return -(B2 + W2[1]*(2/(1+exp(-2*y1+B1[1]))) + W2[2]*(2/(1+exp(-2*y2+B1[2]))) + W2[3]*(2/(1+exp(-2*y3+B1[3])))) +end + +# Model construction +factory = () -> EAGO.Optimizer(SubSolvers(; t = SimplexGPU_ObjAndCons(ann_function!, # Mutating function returning [:cv, :lo, :cvgrad] for the objective function + 8, # Dimensionality of ann_function! + node_limit=8192, # Number of nodes to process per iteration + max_cuts=1))) # Number of points/subgradients to evaluate per node +model = Model(optimizer_with_attributes(factory, "enable_optimize_hook" => true, + "branch_variable" => Bool[true for i in 1:8], + "force_global_solve" => true, + "output_iterations" => 10)) +register(model,:ann_cpu,8,ann_cpu,autodiff=true) +@variable(model, -1.0 <= p[i=1:8] <= 1.0) +@NLobjective(model, Min, ann_cpu(p[1], p[2], p[3], p[4], p[5], p[6], p[7], p[8])) + +# Solve the model +optimize!(model) ``` -and generates a new ODE system (`new_syst`) with equations: +Running this code (after an initial compilation run) generates the following output: ``` -Differential(t)(x_1_lo(t)) ~ p_1_lo + x_1_lo(t) -Differential(t)(x_1_hi(t)) ~ p_1_hi + x_1_hi(t) -Differential(t)(x_1_cv(t)) ~ p_1_cv + x_1_cv(t) -Differential(t)(x_1_cc(t)) ~ p_1_cc + x_1_cc(t) -Differential(t)(x_2_lo(t)) ~ p_2_lo + x_2_lo(t) -Differential(t)(x_2_hi(t)) ~ p_2_hi + x_2_hi(t) -Differential(t)(x_2_cv(t)) ~ p_2_cv + x_2_cv(t) -Differential(t)(x_2_cc(t)) ~ p_2_cc + x_2_cc(t) +--------------------------------------------------------------------------------------------------------------------------------- +| Iteration # | Nodes | Lower Bound | Upper Bound | Gap | Ratio | Timer | Time Left | +--------------------------------------------------------------------------------------------------------------------------------- +| 10 | 16280 | -7.754E-01 | -7.048E-01 | 7.067E-02 | 9.113E-02 | 0.59 | 3599.41 | +| 20 | 59342 | -7.353E-01 | -7.048E-01 | 3.055E-02 | 4.154E-02 | 1.16 | 3598.84 | +| 30 | 46048 | -7.108E-01 | -7.048E-01 | 5.983E-03 | 8.418E-03 | 1.82 | 3598.18 | +| 40 | 2240 | -7.058E-01 | -7.048E-01 | 9.852E-04 | 1.396E-03 | 2.30 | 3597.70 | +| 41 | 2240 | -7.058E-01 | -7.048E-01 | 9.852E-04 | 1.396E-03 | 2.30 | 3597.70 | +--------------------------------------------------------------------------------------------------------------------------------- + +Absolute Tolerance Achieved +Optimal Solution Found at Node 1 +Lower Bound: -0.7057619894307734 +Upper Bound: -0.7047768067379733 +Solution: + p[1] = -0.9999999998903418 + p[2] = 0.9999999864976372 + p[3] = -0.9999999997346648 + p[4] = 
0.16680027211461484 + p[5] = -0.777292487172587 + p[6] = 0.9999999999319427 + p[7] = 0.9999999998522755 + p[8] = 0.9999999998842802 ``` -where `x_lo < x_cv < x < x_cc < x_hi`. Only addition is shown in this example, as other operations -can appear very expansive, but the same operations available for algebraic systems are available -for dynamic systems as well. As with the algebraic evaluation functions, equations created by -`SourceCodeMcCormick` are GPU-ready--multiple trajectories of the resulting ODE system at -different points and with different state/parameter bounds can be solved simultaneously using -an `EnsembleProblem` in the SciML ecosystem, and GPU hardware can be applied for these solves -using `DiffEqGPU.jl`. +As in the previous examples, these results were generated using an Intel i7-9850H processor and an NVIDIA +Quadro T2000 GPU. Effectively, ParBB is running precisely the same lower-bounding routine as the base +version of the global solver EAGO, except that the routine is being performed in parallel on a GPU +rather than serially on the CPU. This example converges in 40 iterations of at most 8192 nodes per iteration, +for a total of roughly 327,680 nodes explored, and reaches its solution in 2.30 seconds. As compared to the +base version of EAGO, this implementation of ParBB solves the problem roughly 22x faster. As in the previous +subsection, it should be noted that a GPU with better double-precision floating-point calculation throughput +would yield even faster results. + +It is also interesting to note that roughly twice as many nodes were explored in this example as compared +to the base version of EAGO, despite the same lower-bounding method being used. There are several factors +that could be affecting this result. First, it should be noted that not every iteration of ParBB may be +processing the full 8192 nodes. If fewer than 8192 nodes are available in the branch-and-bound stack, +ParBB will simply use every node in the stack. Thus, the estimate of 8192*40 nodes is conservatively high. +Second, because ParBB processes nodes in parallel, decisions about which node(s) to explore next and +which nodes should be fathomed will almost certainly differ from a serial implementation. Parallel +processing results in processing some nodes that would have been fathomed in a serial implementation, +thereby increasing the total node count for parallel branch-and-bound algorithms. The benefit, then, is +that ParBB is able to make use of the faster computing hardware of GPUs, and even with more nodes being +processed, can converge more quickly overall. + ## Limitations -`SCMC` has several limitations, some of which are described here. Ongoing research effort seeks -to address several of these. -- SCMC does not calculate subgradients, which are used in the lower bounding routines of many -global optimizers -- Complicated expressions may cause significant compilation time. This can be manually avoided -by combining results together in a user-defined function +(See "Citing SourceCodeMcCormick" for limitations on a pre-subgradient version of SourceCodeMcCormick.) + - `SCMC` is currently compatible with elementary arithmetic operations +, -, *, /, and the univariate intrinsic functions ^2 and exp. 
More diverse functions will be added in the future -- Functions created with `SCMC` may only accept 32 CUDA arrays as inputs, so functions with more -than 8 unique variables will need to be split/factored by the user to be accommodated - Due to the large number of floating point calculations required to calculate McCormick-based relaxations, it is highly recommended to use double-precision floating point numbers, including for operations on a GPU. Since most GPUs are designed for single-precision floating point operation, @@ -188,24 +719,21 @@ forcing double-precision will often result in a significant performance hit. GPU scientific computing, with a higher proportion of double-precision-capable cores, are recommended for optimal performance with `SCMC`. - Due to the high branching factor of McCormick-based relaxations and the possibility of warp -divergence, there will likely be a performance gap between optimizations with variables covering -positive-only domains and variables with mixed domains. Additionally, more complicated expressions -where the structure of a McCormick relaxation changes more frequently with respect to the bounds -on its domain will likely perform worse than problems where the structure of the relaxation is -more consistent. +divergence, there will likely be a slight performance gap between optimization problems with +variables covering positive-only domains and problems containing variables with mixed domains. +Additionally, more complicated expressions where the structure of a McCormick relaxation changes +more frequently with respect to the bounds on its domain may perform worse than expressions +where the structure of the relaxation is more consistent. -## Citing SourceCodeMcCormick +## Citing SourceCodeMcCormick Please cite the following paper when using SourceCodeMcCormick.jl. In plain text form this is: - ``` Gottlieb, R. X., Xu, P., and Stuber, M. D. Automatic source code generation for deterministic global optimization with parallel architectures. Optimization Methods and Software, 1–39 (2024). DOI: 10.1080/10556788.2024.2396297 ``` - A BibTeX entry is given below: - ```bibtex @Article{doi:10.1080/10556788.2024.2396297, author = {Robert X. Gottlieb, Pengfei Xu, and Matthew D. Stuber}, @@ -220,6 +748,7 @@ A BibTeX entry is given below: } ``` + ## References 1. M.E. Wilhelm, R.X. Gottlieb, and M.D. Stuber, PSORLab/McCormick.jl (2020), URL @@ -228,4 +757,4 @@ https://github.com/PSORLab/McCormick.jl. on GPUs, IEEE Transactions on Parallel and Distributed Systems (2018). 3. Y. Ma, S. Gowda, R. Anantharaman, C. Laughman, V. Shah, C. Rackauckas, ModelingToolkit: A composable graph transformation system for equation-based modeling. arXiv preprint -arXiv:2103.05244, 2021. doi: 10.48550/ARXIV.2103.05244. +arXiv:2103.05244, 2021. doi: 10.48550/ARXIV.2103.05244. \ No newline at end of file diff --git a/examples/ParBB/extension.jl b/examples/ParBB/extension.jl index de2aa2a..db1d6c9 100644 --- a/examples/ParBB/extension.jl +++ b/examples/ParBB/extension.jl @@ -48,14 +48,300 @@ Base.@kwdef mutable struct PointwiseGPU <: ExtendGPU "Flag for stack prepopulation. 
Good if the total number of nodes throughout the solve is expected to be large (default = true)" prepopulate::Bool = true + "Frequency of garbage collection (number of iterations)" + gc_freq::Int = 300 "(In development) Number of points to use for multistarting the NLP solver" multistart_points::Int = 1 end function PointwiseGPU(convex_func, var_count::Int; alpha::Float64 = 0.01, node_limit::Int = 50000, - prepopulate::Bool = true, multistart_points::Int = 1) + prepopulate::Bool = true, gc_freq::Int = 300, multistart_points::Int = 1) return PointwiseGPU(convex_func, var_count, node_limit, alpha, + Vector{Float64}(undef, node_limit), Vector{Float64}(undef, node_limit), Vector{NodeBB}(undef, node_limit), 0, + Matrix{Float64}(undef, node_limit, var_count), + Matrix{Float64}(undef, node_limit, var_count), prepopulate, gc_freq, multistart_points) +end + + +""" +$(TYPEDEF) + +The SubgradGPU integrator is meant to be paired with the SourceCodeMcCormick +package. SubgradGPU differs from PointwiseGPU in that SubgradGPU requires +the `convex_func_and_subgrad` term to return both evaluations of the convex +relaxation and evaluations of the subgradient of the convex relaxation. + +$(TYPEDFIELDS) +""" +Base.@kwdef mutable struct SubgradGPU <: ExtendGPU + "A user-defined function taking argument `p` and returning a vector + of convex evaluations of the objective function [outdated description, [cv, lo, subgrad]]" + convex_func_and_subgrad + "Number of decision variables" + np::Int + "The number of nodes to evaluate in parallel (default = 10000)" + node_limit::Int64 = 50000 + "A parameter that changes how far spread out points are. Should be + in the range (0.0, 1.0]" + α::Float64 = 0.5 + "Lower bound storage to hold calculated lower bounds for multiple nodes." + lower_bound_storage::Vector{Float64} = Vector{Float64}() + "Upper bound storage to hold calculated upper bounds for multiple nodes." + upper_bound_storage::Vector{Float64} = Vector{Float64}() + "Node storage to hold individual nodes outside of the main stack" + node_storage::Vector{NodeBB} = Vector{NodeBB}() + "An internal tracker of nodes in internal storage" + node_len::Int = 0 + "Variable lower bounds to evaluate" + all_lvbs::Matrix{Float64} = Matrix{Float64}() + "Variable upper bounds to evaluate" + all_uvbs::Matrix{Float64} = Matrix{Float64}() + "Internal tracker for the count in the main stack" + # node_count::Int = 0 + "Flag for stack prepopulation. Good if the total number + of nodes throughout the solve is expected to be large (default = true)" + prepopulate::Bool = true + "(In development) Number of points to use for multistarting the NLP solver" + multistart_points::Int = 1 +end + +function SubgradGPU(convex_func_and_subgrad, var_count::Int; alpha::Float64 = 0.01, node_limit::Int = 50000, + prepopulate::Bool = true, multistart_points::Int = 1) + return SubgradGPU(convex_func_and_subgrad, var_count, node_limit, alpha, Vector{Float64}(undef, node_limit), Vector{Float64}(undef, node_limit), Vector{NodeBB}(undef, node_limit), 0, Matrix{Float64}(undef, node_limit, var_count), Matrix{Float64}(undef, node_limit, var_count), prepopulate, multistart_points) +end + + +""" +$(TYPEDEF) + +The SimplexGPU integrator is meant to be paired with the SourceCodeMcCormick +package. SimplexGPU differs from SubgradGPU in that SimplexGPU can handle +inequality constraints, and that relaxations are made tighter by solving +linear programs within the lower bounding routine to make better use of +subgradient information. 
Like SubgradGPU, SimplexGPU requires the +`convex_func_and_subgrad` term to return both evaluations of the convex +relaxation and evaluations of the subgradient of the convex relaxation. + +$(TYPEDFIELDS) +""" +Base.@kwdef mutable struct SimplexGPU_OnlyObj <: ExtendGPU + "A user-defined function taking argument `p` and returning a vector + of convex evaluations of the objective function [outdated description, [cv, lo, subgrad]]" + convex_func_and_subgrad + "Number of decision variables" + np::Int + "The number of nodes to evaluate in parallel (default = 1024)" + node_limit::Int64 = 1024 + "Lower bound storage to hold calculated lower bounds for multiple nodes." + lower_bound_storage::Vector{Float64} = Vector{Float64}() + "Upper bound storage to hold calculated upper bounds for multiple nodes." + upper_bound_storage::Vector{Float64} = Vector{Float64}() + "Node storage to hold individual nodes outside of the main stack" + node_storage::Vector{NodeBB} = Vector{NodeBB}() + "An internal tracker of nodes in internal storage" + node_len::Int = 0 + "Variable lower bounds to evaluate" + all_lvbs::Matrix{Float64} = Matrix{Float64}() + "Variable upper bounds to evaluate" + all_uvbs::Matrix{Float64} = Matrix{Float64}() + "Flag for stack prepopulation. Good if the total number + of nodes throughout the solve is expected to be large (default = true)" + prepopulate::Bool = true + "Total number of cuts to do on each node" + max_cuts::Int = 3 + "Frequency of garbage collection (number of iterations)" + gc_freq::Int = 15 + "(In development) Number of points to use for multistarting the NLP solver" + multistart_points::Int = 1 + relax_time::Float64 = 0.0 + opt_time::Float64 = 0.0 + lower_counter::Int = 0 + node_counter::Int = 0 +end + +function SimplexGPU_OnlyObj(convex_func_and_subgrad, var_count::Int; node_limit::Int = 1024, + prepopulate::Bool = true, max_cuts::Int = 3, gc_freq::Int = 15, + multistart_points::Int = 1) + return SimplexGPU_OnlyObj(convex_func_and_subgrad, var_count, node_limit, + Vector{Float64}(undef, node_limit), Vector{Float64}(undef, node_limit), Vector{NodeBB}(undef, node_limit), 0, + Matrix{Float64}(undef, node_limit, var_count), + Matrix{Float64}(undef, node_limit, var_count), prepopulate, max_cuts, gc_freq, multistart_points, 0.0, 0.0, 0, 0) +end + +""" +$(TYPEDEF) + +The SimplexGPU_ObjAndCons structure is meant to handle optimization problems +with nontrivial constraints as well as a potentially nonlinear objective +function. Note that this struct requires the functions representing the +objective function and constraints to mutate arguments, rather than return +a tuple of results. SimplexGPU_ObjAndCons is not designed to handle mixed-integer +problems; NLPs only. 
+
+$(TYPEDFIELDS)
+"""
+Base.@kwdef mutable struct SimplexGPU_ObjAndCons <: ExtendGPU
+    "A SCMC-generated or user-defined function taking arguments [cv, lo, [cv_subgrad]..., p...],
+    which modifies `cv` to hold the convex relaxation of the objective function, `lo` to hold
+    the lower bound of the inclusion monotonic interval extension of the objective function,
+    and n instances of `cv_subgrad` that will hold the n subgradients of the convex relaxation
+    of the objective function (where n is the dimensionality of the problem), all evaluated at
+    points `p`"
+    obj_fun
+    "A vector of SCMC-generated or user-defined functions, each with the same form as `obj_fun`,
+    but with arguments [cv, [cv_subgrad]..., p...], representing all of the LEQ inequality constraints"
+    leq_cons
+    "A vector of SCMC-generated or user-defined functions, taking arguments [cc, [cc_subgrad]..., p...],
+    defined similarly to the objective function and LEQ constraints, representing all of the
+    GEQ inequality constraints"
+    geq_cons
+    "A vector of SCMC-generated or user-defined functions, taking arguments
+    [cv, cc, [cv_subgrad]..., [cc_subgrad]..., p...], with terms defined similarly to
+    the objective function and inequality constraints, representing all of the equality constraints"
+    eq_cons
+    "Number of decision variables"
+    np::Int
+    "The number of nodes to evaluate in parallel (default = 1024)"
+    node_limit::Int64 = 1024
+    "Lower bound storage to hold calculated lower bounds for multiple nodes."
+    lower_bound_storage::Vector{Float64} = Vector{Float64}()
+    "Upper bound storage to hold calculated upper bounds for multiple nodes."
+    upper_bound_storage::Vector{Float64} = Vector{Float64}()
+    "Node storage to hold individual nodes outside of the main stack"
+    node_storage::Vector{NodeBB} = Vector{NodeBB}()
+    "An internal tracker of nodes in internal storage"
+    node_len::Int = 0
+    "Variable lower bounds to evaluate"
+    all_lvbs::Matrix{Float64} = Matrix{Float64}()
+    "Variable upper bounds to evaluate"
+    all_uvbs::Matrix{Float64} = Matrix{Float64}()
+    "Flag for stack prepopulation. 
+    prepopulate::Bool = true
+    "Total number of cuts to perform on each node"
+    max_cuts::Int = 3
+    "Frequency of garbage collection (number of iterations)"
+    gc_freq::Int = 15
+    "(In development) Number of points to use for multistarting the NLP solver"
+    multistart_points::Int = 1
+    relax_time::Float64 = 0.0
+    opt_time::Float64 = 0.0
+    lower_counter::Int = 0
+    node_counter::Int = 0
+end
+
+function SimplexGPU_ObjAndCons(obj_fun, var_count::Int; geq_cons=[], leq_cons=[], eq_cons=[], node_limit::Int = 1024,
+                               prepopulate::Bool = true, max_cuts::Int = 3, gc_freq::Int = 15, multistart_points::Int = 1)
+    return SimplexGPU_ObjAndCons(obj_fun, leq_cons, geq_cons, eq_cons, var_count, node_limit,
+                                 Vector{Float64}(undef, node_limit), Vector{Float64}(undef, node_limit), Vector{NodeBB}(undef, node_limit), 0,
+                                 Matrix{Float64}(undef, node_limit, var_count),
+                                 Matrix{Float64}(undef, node_limit, var_count), prepopulate, max_cuts, gc_freq, multistart_points, 0.0, 0.0, 0, 0)
+end
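+
+# Illustrative sketch of the mutating layout described in the docstrings above
+# (not part of the package). `example_obj!` is hypothetical, and the trailing
+# arguments are assumed to carry the evaluation point and its bounds for a
+# single variable. For f(p) = p^2, which is already convex:
+#
+#   function example_obj!(cv, lo, cv_grad, pcv, plo, phi)
+#       @. cv = pcv^2                            # convex relaxation value
+#       @. lo = ifelse(plo > 0.0, plo^2,         # interval extension lower bound
+#                      ifelse(phi < 0.0, phi^2, 0.0))
+#       @. cv_grad = 2.0 * pcv                   # subgradient of the relaxation
+#       return nothing
+#   end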
+
+"""
+$(TYPEDEF)
+
+A variant of `SimplexGPU_OnlyObj` that handles only a (potentially nonlinear)
+objective function and, like `SimplexGPU_ObjAndCons`, requires `obj_fun` to
+mutate its arguments rather than return a tuple of results.
+
+$(TYPEDFIELDS)
+"""
+Base.@kwdef mutable struct SimplexGPU_ObjOnly_Mat <: ExtendGPU
+    "A SCMC-generated or user-defined function taking arguments [cv, lo, [cv_subgrad]..., p...],
+    which modifies `cv` to hold the convex relaxation of the objective function, `lo` to hold
+    the lower bound of the inclusion monotonic interval extension of the objective function,
+    and n instances of `cv_subgrad` that will hold the n subgradients of the convex relaxation
+    of the objective function (where n is the dimensionality of the problem), all evaluated at
+    points `p`"
+    obj_fun
+    "Number of decision variables"
+    np::Int
+    "The number of nodes to evaluate in parallel (default = 1024)"
+    node_limit::Int64 = 1024
+    "Lower bound storage to hold calculated lower bounds for multiple nodes."
+    lower_bound_storage::Vector{Float64} = Vector{Float64}()
+    "Upper bound storage to hold calculated upper bounds for multiple nodes."
+    upper_bound_storage::Vector{Float64} = Vector{Float64}()
+    "Node storage to hold individual nodes outside of the main stack"
+    node_storage::Vector{NodeBB} = Vector{NodeBB}()
+    "An internal tracker of nodes in internal storage"
+    node_len::Int = 0
+    "Variable lower bounds to evaluate"
+    all_lvbs::Matrix{Float64} = Matrix{Float64}(undef, 0, 0)
+    "Variable upper bounds to evaluate"
+    all_uvbs::Matrix{Float64} = Matrix{Float64}(undef, 0, 0)
+    "Flag for stack prepopulation. Good if the total number
+    of nodes throughout the solve is expected to be large (default = true)"
+    prepopulate::Bool = true
+    "Total number of cuts to perform on each node"
+    max_cuts::Int = 3
+    "Frequency of garbage collection (number of iterations)"
+    gc_freq::Int = 15
+    "(In development) Number of points to use for multistarting the NLP solver"
+    multistart_points::Int = 1
+    relax_time::Float64 = 0.0
+    opt_time::Float64 = 0.0
+    lower_counter::Int = 0
+    node_counter::Int = 0
+end
+
+function SimplexGPU_ObjOnly_Mat(obj_fun, var_count::Int; node_limit::Int = 1024,
+                                prepopulate::Bool = true, max_cuts::Int = 3, gc_freq::Int = 15, multistart_points::Int = 1)
+    return SimplexGPU_ObjOnly_Mat(obj_fun, var_count, node_limit,
+                                  Vector{Float64}(undef, node_limit), Vector{Float64}(undef, node_limit), Vector{NodeBB}(undef, node_limit), 0,
+                                  Matrix{Float64}(undef, node_limit, var_count),
+                                  Matrix{Float64}(undef, node_limit, var_count), prepopulate, max_cuts, gc_freq, multistart_points, 0.0, 0.0, 0, 0)
+end
+
+"""
+$(TYPEDEF)
+
+An experimental struct used to test whether fewer points can be checked per
+node when constructing the LPs, while retaining the same benefits. The normal
+SimplexGPU method uses 2n+1 points, where n is the problem dimensionality;
+this method uses only a single point at the center of each node, so each LP
+is significantly smaller and more LPs can be solved simultaneously.
+
+$(TYPEDFIELDS)
+"""
+Base.@kwdef mutable struct SimplexGPU_Single <: ExtendGPU
+    "A user-defined function taking argument `p` and returning vectors of
+    convex relaxation values, interval extension lower bounds, and convex
+    relaxation subgradients ([cv, lo, subgrad]) of the objective function"
+    convex_func_and_subgrad
+    "Number of decision variables"
+    np::Int
+    "The number of nodes to evaluate in parallel (default = 2500)"
+    node_limit::Int64 = 2500
+    "A parameter controlling how far apart evaluation points are spread;
+    should be in the range (0.0, 1.0]"
+    α::Float64 = 0.5
+    "Lower bound storage to hold calculated lower bounds for multiple nodes."
+    lower_bound_storage::Vector{Float64} = Vector{Float64}()
+    "Upper bound storage to hold calculated upper bounds for multiple nodes."
+    upper_bound_storage::Vector{Float64} = Vector{Float64}()
+    "Node storage to hold individual nodes outside of the main stack"
+    node_storage::Vector{NodeBB} = Vector{NodeBB}()
+    "An internal tracker of nodes in internal storage"
+    node_len::Int = 0
+    "Variable lower bounds to evaluate"
+    all_lvbs::Matrix{Float64} = Matrix{Float64}(undef, 0, 0)
+    "Variable upper bounds to evaluate"
+    all_uvbs::Matrix{Float64} = Matrix{Float64}(undef, 0, 0)
+    "Flag for stack prepopulation. 
Good if the total number + of nodes throughout the solve is expected to be large (default = true)" + prepopulate::Bool = true + "Total number of cuts to do on each node" + max_cuts::Int = 3 + "(In development) Number of points to use for multistarting the NLP solver" + multistart_points::Int = 1 +end + +function SimplexGPU_Single(convex_func_and_subgrad, var_count::Int; alpha::Float64 = 0.01, node_limit::Int = 2500, + prepopulate::Bool = true, max_cuts::Int = 3, multistart_points::Int = 1) + return SimplexGPU_Single(convex_func_and_subgrad, var_count, node_limit, alpha, + Vector{Float64}(undef, node_limit), Vector{Float64}(undef, node_limit), Vector{NodeBB}(undef, node_limit), 0, + Matrix{Float64}(undef, node_limit, var_count), + Matrix{Float64}(undef, node_limit, var_count), prepopulate, max_cuts, multistart_points) end \ No newline at end of file diff --git a/examples/ParBB/kernels.jl b/examples/ParBB/kernels.jl new file mode 100644 index 0000000..2960685 --- /dev/null +++ b/examples/ParBB/kernels.jl @@ -0,0 +1,876 @@ + +# First one, we have a matrix of booleans. We want to identify the first element +# in each row that's true (1), and return that as an array. +function first_true_kernel(bool_matrix, first_true_indices) + idx = threadIdx().x + (blockIdx().x - Int32(1)) * blockDim().x + stride = blockDim().x * gridDim().x + matsize = Int32(size(bool_matrix, 2)) + while idx <= Int32(length(first_true_indices)) + first_true_indices[idx] = Int32(1) + while first_true_indices[idx] <= matsize + if bool_matrix[idx, first_true_indices[idx]] + first_true_indices[idx] += matsize+Int32(1) + else + first_true_indices[idx] += Int32(1) + end + end + first_true_indices[idx] -= (matsize+Int32(1)) + idx += stride + end + return nothing +end + + +# This one accesses elements in the matrix +function access_kernel(matrix, n_rows, pivot_cols, pivot_col_vals) + idx = threadIdx().x + (blockIdx().x - Int32(1)) * blockDim().x + stride = blockDim().x * gridDim().x + while idx <= Int32(length(pivot_col_vals)) + col_index = cld(idx,n_rows) + if pivot_cols[col_index] != Int32(0) + pivot_col_vals[idx] = matrix[idx, pivot_cols[cld(idx,n_rows)]] + else + pivot_col_vals[idx] = -Inf + end + idx += stride + end + return nothing +end + +# These set specific values to Inf, zero, or one, respectively +function set_inf_kernel(vec, inds) + idx = threadIdx().x + (blockIdx().x - Int32(1)) * blockDim().x + stride = blockDim().x * gridDim().x + while idx <= Int32(length(inds)) + if inds[idx] + vec[idx] = Inf + end + idx += stride + end + return nothing +end + + +function set_zero_kernel(vec, inds) + idx = threadIdx().x + (blockIdx().x - Int32(1)) * blockDim().x + stride = blockDim().x * gridDim().x + while idx <= Int32(length(inds)) + if inds[idx] + vec[idx] = 0.0 + end + idx += stride + end + return nothing +end +function set_one_kernel(vec, inds) + idx = threadIdx().x + (blockIdx().x - Int32(1)) * blockDim().x + stride = blockDim().x * gridDim().x + while idx <= Int32(length(inds)) + if inds[idx] + vec[idx] = 1.0 + end + idx += stride + end + return nothing +end + +# # Sometimes multiplication and division ends up with a value +# # close to 0, but not exactly 0. Sometimes, these can +# # mess up pivots by causing some values to get multiplied +# # by 1/[very low value], so there are some 1E16's floating +# # around, and this can sometimes cause pivots to fail. 
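+# # (For example, a residual like 1e-16 left in a pivot column can later be
+# # scaled by its reciprocal, injecting factors on the order of 1e16 into
+# # rows of the tableau that should have stayed zero.)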
+# function filter_kernel(mat)
+#     idx = threadIdx().x + (blockIdx().x - Int32(1)) * blockDim().x
+#     stride = blockDim().x * gridDim().x
+#     len = Int32(size(mat, 1))
+#     wid = Int32(size(mat, 2))
+#     tot = len*wid
+#     while idx <= tot
+#         if abs(mat[idx])<=1E-15
+#             mat[idx] = 0.0
+#         end
+#         idx += stride
+#     end
+#     return nothing
+# end

# This one finds the index of the minimum value of each block of a
# ratio vector (one block per LP)
function find_min_kernel(ratios, n_rows, rows)
    row = blockIdx().x
    col = threadIdx().x

    # Need to use parallel reduction... Each block handles one row...
    shared_min_vals = @cuDynamicSharedMem(Float64, blockDim().x)
    shared_min_idxs = @cuDynamicSharedMem(Int32, blockDim().x, offset=Int32(8)*blockDim().x)

    # Note: "row" and "col" are a little weird here, because
    # "ratios" is actually a vector.

    # Initialize the shared memory
    shared_min_vals[col] = ratios[(row-1)*n_rows + col]
    shared_min_idxs[col] = col

    # Parallel reduction to find the index of the minimum value
    # (This could probably be made more efficient. If n_rows is
    # not cleanly divisible by 2 there may be repeated checks
    # of some elements, plus at smaller than 32 elements, not all
    # threads in each warp will be used.)
    stride = cld(n_rows,Int32(2))
    while (stride != Int32(0))
        sync_threads()
        if (col <= stride) && (col+stride <= n_rows)
            if shared_min_vals[col] == shared_min_vals[col + stride]
                if shared_min_idxs[col] > shared_min_idxs[col + stride]
                    shared_min_idxs[col] = shared_min_idxs[col + stride]
                end
            elseif shared_min_vals[col] > shared_min_vals[col + stride]
                shared_min_vals[col] = shared_min_vals[col + stride]
                shared_min_idxs[col] = shared_min_idxs[col + stride]
            end
        end
        if stride==1
            stride = Int32(0)
        else
            stride = cld(stride,Int32(2))
        end
    end
    if col==1
        rows[row] = shared_min_idxs[1]
    end
    return nothing
end

# This kernel function performs the pivoting operation based on the
# previously determined pivot rows and columns. Each block takes on a
# single LP to pivot, so the number of blocks should match n_probs;
# threads can be anything, but 32 should be fine; shmem should be
# 8*(size(tableau, 2)+n_rows).
function pivot_kernel(tableau, pivot_rows, pivot_cols, n_rows, width, entries_per_LP)
    # Since each block has its own problem to address, we
    # need to figure out the correct row/col addresses based
    # on block.
    thread = threadIdx().x
    LP = (blockIdx().x)-Int32(1) #Base 0 is easier in some cases

    # Set up shared memory: we only need the pivot row and the
    # per-row multiplication factors
    pivot_row = @cuDynamicSharedMem(Float64, width)
    factors = @cuDynamicSharedMem(Float64, n_rows, offset=Int32(8*width))

    # Before we do anything at all, skip the entire pivot operation
    # if pivot_cols[LP] is 0.
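    # The update applied below is a standard Gauss-Jordan pivot written as a
    # rank-1 row correction, so each row only needs its saved factor:
    #     factors[pr] = 1/T[pr, pc] - 1
    #     factors[r]  = -T[r, pc] / T[pr, pc]     (for r != pr)
    #     T[r, :]    += factors[r] .* pivot_row   (for every row r)
    # where pr = pivot_rows[blockIdx().x], pc = pivot_cols[blockIdx().x], and
    # T is this block's LP within the stacked tableau.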
+ if pivot_cols[blockIdx().x] != 0 + + # Fill in the pivot row with the correct pivot rows + stride = blockDim().x + while thread <= width + pivot_row[thread] = tableau[LP*n_rows + pivot_rows[blockIdx().x], thread] + thread += stride + end + sync_threads() + thread = threadIdx().x + + # Fill in the multiplication factors for each row by comparing against + # the pivot element in the pivot column + while thread <= n_rows + if thread==pivot_rows[blockIdx().x] + factors[thread] = 1/tableau[LP*n_rows + thread, pivot_cols[blockIdx().x]] - 1 + else + factors[thread] = -tableau[LP*n_rows + thread, pivot_cols[blockIdx().x]] / pivot_row[pivot_cols[blockIdx().x]] + end + thread += stride + end + sync_threads() + thread = threadIdx().x + + # Now we've stored the pivot row and multiplicative factors separately + # from the main tableau, the pivot can happen by referencing these saved + # values. + while thread <= entries_per_LP + LP_row = cld(thread,width) #We need both division and modulo, but we can calculate one from the other + # Note: We could also have each thread take one column in its LP to not use division, + # but that might not be very efficient if the number of columns isn't neatly divisible + # by 32. + row = Int32(LP*n_rows) + Int32(LP_row) + col = thread - Int32(((LP_row-Int32(1))*width)) + + # Now that we know what row and column we're looking at, we can + # perform pivots. + # Deal with the pivot column specially, but for all others, we're fine. + if col==pivot_cols[blockIdx().x] + if LP_row==pivot_rows[blockIdx().x] + tableau[row,col] = 1.0 + else + tableau[row,col] = 0.0 + end + else + tableau[row,col] += factors[LP_row]*pivot_row[col] + end + thread += stride + end + end + sync_threads() + return nothing +end + + +# This kernel checks a Boolean matrix "mat" and sets output to "true" if +# every element of mat is "false". If any element of mat is "true", +# then output is "false". +function not_any_kernel(output, mat) + idx = threadIdx().x + (blockIdx().x - Int32(1)) * blockDim().x + stride = blockDim().x * gridDim().x + total_size = Int32(size(mat, 1))*Int32(size(mat,2)) + + while output[1] && idx <= total_size + if mat[idx] + output[1] = false + end + idx += stride + end + + return nothing +end + +# This kernel checks a Boolean matrix "mat" and sets the output to "true" +# if every element of mat is "Inf". +function all_inf_kernel(output, mat) + idx = threadIdx().x + (blockIdx().x - Int32(1)) * blockDim().x + stride = blockDim().x * gridDim().x + len = Int32(length(mat)) + + while output[1]==true && idx <= len + if ~isinf(mat[idx]) + output[1] = false + end + idx += stride + end + return nothing +end + +# This kernel checks if rows of the two matrices are equal. +# If they're completely equal, we mark the flag as true +# for that row. +function degeneracy_check_kernel(flag, mat1, mat2) + row = threadIdx().x + (blockIdx().x - Int32(1)) * blockDim().x + stride = blockDim().x * gridDim().x + len = Int32(length(flag)) + width = Int32(size(mat1, 2)) + + # Check each row of the matrix to see if the flag should be + # true or false + while row <= len + col = Int32(1) + while col <= width + if abs(mat1[row,col]-mat2[row,col]) <= 1E-10 + if col==width + @inbounds flag[row] = true + end + else + @inbounds flag[row] = false + break + end + col += Int32(1) + end + row += stride + end + return nothing +end + +# This kernel creates the basic tableau skeleton, which involves placing +# 1's and -1's in the correct spaces based on the number of variables. 
+# An example tableau (with labels) might look like the following: +#= +OLD TABLEAU: +9×19 CuArray{Float64, 2, CUDA.DeviceMemory}: + Z' X_1 X_2 X_3 S_1 S_2 S_3 S_4 S_5 S_6 S_7 A_1 A_2 A_3 A_4 A_5 A_6 A_7 B + 1.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.668527 + 0.0 1.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 + 0.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 + 0.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 + 1.0 -0.317476 -0.449098 -0.330095 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 + 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 + 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 + -1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 + 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 + +Description: (OLD) + Z is the epigraph variable that replaces the objective function. From the + lower bound of the relaxation of the objective function, we calculate + that Z >= -0.668527. Adding in a slack variable, we get Z - S_1 = -0.668527. + But, this is inconvenient since the slack variable is negative, so we can + negate the expression above and instead calculate Z', where Z' = -Z. + Now we have Z' <= 0.668527, or Z' + S_1 = 0.668527. This is the first row + of the tableau. +Since we originally wanted to minimize Z, but have replaced Z by Z', we are + now minimizing -Z'. This is the second to last row of the tableau. +X_1 through X_3 are the problem variables, scaled to be on the domain [0,1]. + Since the tableau method assumes all variables are >= 0, we only need to + include rows for the upper bounds. I.e., X_1 <= 1.0. Adding in slack variables, + we get X_1 + S_2 = 1.0 (and similar for X_2 and X_3). This is lines 2-4 + of the tableau. +Line 5 of the tableau would not be created in this step, but shows an example + of what the line will eventually contain. Here, it is a convex relaxation + of the objective function, hence the 1 in the Z' column, and the subgradient + information is stored in the columns for the X variables. This line can be + obtained in the following way: + Z >= -0.317476*X_1 + -0.449098*X_2 + -0.330095*X_3 + -0.317476*X_1 + -0.449098*X_2 + -0.330095*X_3 <= Z + -0.317476*X_1 + -0.449098*X_2 + -0.330095*X_3 - Z <= 0 + -0.317476*X_1 + -0.449098*X_2 + -0.330095*X_3 + Z' <= 0 + -0.317476*X_1 + -0.449098*X_2 + -0.330095*X_3 + Z' + S_5 = 0 +Lines 6 and 7 are reserved for future cuts. There may be greater or fewer lines + dedicated to cuts for the objective function and/or constraints. In the pivoting + step, these unused rows will be ignored until they contain useful information. +In this case, none of the slack variables were negative, so there was no need + to use the artificial variable columns (A_1 through A_7). This information + will be passed to the pivoting kernel, which will ignore these columns entirely. +The final row is for the Phase I objective, which is to minimize the sum of the + artificial variables. This is unnecessary in this case, as no Phase I solution + is necessary and we already have a BFS comprising S_1 through S_5 basic. 
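+
+The new layout below differs mainly in the epigraph handling: instead of the
+negated variable Z', the free epigraph variable Z is split as Z = Z_+ - Z_-,
+with both columns nonnegative, so Z itself may still take either sign.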
+ +NEW TABLEAU: +9×17 CuArray{Float64, 2, CUDA.DeviceMemory}: + Z_+ Z_- X_1 X_2 X_3 S_1 S_2 S_3 S_4 S_5 S_6 S_7 A_1 A_2 A_3 A_4 B + 0.0 0.0 1.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 # Variable 1 row + 0.0 0.0 0.0 1.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 # Variable 2 row + 0.0 0.0 0.0 0.0 1.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 # Variable 3 row + -1.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.668527 # Lower bound cut + 1.0 -1.0 0.317476 0.449098 0.330095 0.0 0.0 0.0 0.0 -1.0 0.0 0.0 0.0 1.0 0.0 0.0 0.2 # First cut + 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 # Empty for second cut + 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 # Empty for third cut + 1.0 -1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 # Phase II objective + -1.0 1.0 -0.317476 -0.449098 -0.330095 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 -0.2 # Phase I objective + +Note that the lower bound cut comes from something like the following: + [objective] >= -0.668527 + Z_+ - Z_- >= -0.668527 + Z_+ - Z_- - S_4 == -0.668527 +-Z_+ + Z_- + S_4 == 0.668527 + +And for objective function cuts, we have something like the following: + -0.317476*X_1 - 0.449098*X_2 - 0.330095*X_3 + 0.2 <= (Z_+ - Z_-) +-Z_+ + Z_- - 0.317476*X_1 - 0.449098*X_2 - 0.330095*X_3 <= -0.2 +-Z_+ + Z_- - 0.317476*X_1 - 0.449098*X_2 - 0.330095*X_3 + S_5 == -0.2 + Z_+ - Z_- + 0.317476*X_1 + 0.449098*X_2 + 0.330095*X_3 - S_5 + A_2 == 0.2 + + With the negative of the whole row, not including the artificial variable, being added to the + phase I objective row. +=# +function tableau_skeleton_kernel(tableau, n_vars, slack_count) + # Each block handles one LP + thread = threadIdx().x + LP = blockIdx().x #Base 1 is easier in this kernel + stride = blockDim().x + width = n_vars*Int32(2) + Int32(3) # Skipping unnecessary slack variables and artificial variables + entries_per_LP = width*(n_vars+Int32(1)) + + while thread <= entries_per_LP + LP_row = cld(thread,width) #We need both division and modulo, but we can calculate one from the other + row = (LP-Int32(1))*(slack_count+Int32(2)) + Int32(LP_row) # This is the real row in the tableau + col = thread - Int32(((LP_row-Int32(1))*width)) + + # Fill in variable columns (leaving first 2 columns for epigraph variables) + if (LP_row == col) && (LP_row <= n_vars) + tableau[row, col + Int32(2)] = 1.0 + + # Fill in slack variables (leaving first 2 columns for epigraph variables) + elseif ((LP_row + n_vars) == col) && (LP_row <= n_vars) + tableau[row, col + Int32(2)] = 1.0 + + # Fill in the b column with variable upper bounds (all scaled to 1.0) + elseif (col == width) && (LP_row <= n_vars) + tableau[row, end] = 1.0 + + # Fill in the phase II objective row + elseif (LP_row == (n_vars + Int32(1))) && (col == Int32(1)) + tableau[LP*(slack_count+Int32(2)) - Int32(1), col] = 1.0 + elseif (LP_row == (n_vars + Int32(1))) && (col == Int32(2)) + tableau[LP*(slack_count+Int32(2)) - Int32(1), col] = -1.0 + end + + thread += stride + end + sync_threads() + return nothing +end + + +# Make sure to call this with the number of threads being +# the largest value of 2^n that's less than the number of columns +function accumulate_mul_kernel(output, mat1, mat2) + row = blockIdx().x + col = threadIdx().x + n_cols = Int32(size(mat1, 2)) + + # Each block handles one row + shared_mults = @cuDynamicSharedMem(Float64, n_cols) + + # Initialize the shared memory to be the element-wise multiplication + # of mat1 and mat2 + stride = 
blockDim().x # e.g., 32, where there are 40 columns + while col <= n_cols + shared_mults[col] = mat1[row,col] * mat2[row,col] + col += stride + end + sync_threads() + + # Now perform a parallel reduction to sum up each row + col = threadIdx().x + while stride > 0 + if (col <= stride) && (col+stride <= n_cols) + shared_mults[col] += shared_mults[col+stride] + end + stride >>= 1 # Bitshift by 1 to divide by 2 + sync_threads() + end + + if col==1 + output[row] = shared_mults[1] + end + sync_threads() + return nothing +end + +# This kernel adds information for a cut to the tableau. It modifies +# only the row where the cut is being placed, and the final row, which +# contains the Phase I objective. "obj_flag" should be "true" if the +# cut is for the problem objective function, and "false" for a standard +# constraint. "geq_flag" should be true if the constraint is a GEQ +# constraint. +function add_cut_kernel(tableau, subgradients, final_col, active_row, n_vars, + n_rows, n_probs, slack_count, obj_flag, geq_flag) #, deg_flag) + # For each LP, we modify (active_row) and (n_rows), being the line + # where the cut is added and the phase I objective row. Each thread + # modifies a single LP, since it's just filling in values in one line, + # the number of additions is small, and the points to fill in are sporadic. + thread = threadIdx().x + (blockIdx().x - Int32(1)) * blockDim().x + stride = blockDim().x * gridDim().x + + while thread <= n_probs + # if deg_flag[thread] + # thread += stride + # continue + # end + row = (thread - Int32(1))*n_rows + active_row + if final_col[thread] < 0 + if geq_flag==true + # This is a GEQ constraint, so the slack variable would be negative, + # but the final column is also negative. This means we negate the + # whole row, so both the slack and final column are positive. No + # need for an artificial variable. + + # Add in the epigraph variables for the objective (NEGATED) + if obj_flag + tableau[row, Int32(1)] = 1.0 # Negative of the normal -1 + tableau[row, Int32(2)] = -1.0 # Negative of the normal +1 + end + + # Add the subgradients for the cut (NEGATED) + col = Int32(3) + while col <= n_vars + Int32(2) + tableau[row, col] = -subgradients[thread, col-Int32(2)] + col += Int32(1) + end + + # Add the slack variable (DOUBLE NEGATED) + tableau[row, Int32(2) + n_vars + active_row] = 1.0 + + # Add the final column value (NEGATED) + tableau[row, end] = -final_col[thread] + else + # This is a LEQ constraint, so the slack variable would be positive, + # but the final column is negative. 
This means we negate the whole + # row, add an artificial variable, and add the original row to the + # phase I objective + art_row = thread*n_rows + + # Add in the epigraph variables (NEGATED), and for the artificial objective (DOUBLE NEGATED) + if obj_flag + tableau[row, Int32(1)] = 1.0 # Negative of the normal -1 + tableau[row, Int32(2)] = -1.0 # Negative of the normal +1 + tableau[art_row, Int32(1)] += -1.0 + tableau[art_row, Int32(2)] += 1.0 + end + + # Add the subgradients for the cut (NEGATED), and for the artificial objective (DOUBLE NEGATED) + col = Int32(3) + while col <= n_vars + Int32(2) + tableau[row, col] = -subgradients[thread, col-Int32(2)] + tableau[art_row, col] += subgradients[thread, col-Int32(2)] + col += Int32(1) + end + + # Add the slack variable (NEGATED) and artificial variable + tableau[row, Int32(2) + n_vars + active_row] = -1.0 + tableau[art_row, Int32(2) + n_vars + active_row] = 1.0 + tableau[row, Int32(2) + slack_count + active_row] = 1.0 + + # Add the final column value (NEGATED), and for the artificial objective (DOUBLE NEGATED) + tableau[row, end] = -final_col[thread] + tableau[art_row, end] += final_col[thread] + end + else # Final column would be positive + if geq_flag==true + # This is a GEQ constraint, so the slack variable would be negative, + # and the final column is positive. This means we keep the row as-is, + # but we add an artificial variable + art_row = thread*n_rows + + # Add in the epigraph variables for the objective + if obj_flag + tableau[row, Int32(1)] = -1.0 + tableau[row, Int32(2)] = 1.0 + tableau[art_row, Int32(1)] += 1.0 + tableau[art_row, Int32(2)] += -1.0 + end + + # Add the subgradients for the cut, and for the artificial objective (NEGATED) + col = Int32(3) + while col <= n_vars + Int32(2) + tableau[row, col] = subgradients[thread, col-Int32(2)] + tableau[art_row, col] += -subgradients[thread, col-Int32(2)] + col += Int32(1) + end + + # Add the slack variable (NEGATED) and artificial variable + tableau[row, Int32(2) + n_vars + active_row] = -1.0 + tableau[art_row, Int32(2) + n_vars + active_row] = 1.0 + tableau[row, Int32(2) + slack_count + active_row] = 1.0 + + # Add the final column value, and for the artificial objective (NEGATED) + tableau[row, end] = final_col[thread] + tableau[art_row, end] += -final_col[thread] + else + # This is a LEQ constraint, and the final column is positive. + # We can add the subgradients as-is, and we don't need an artificial + # variable. + + # Add in the epigraph variables for the objective + if obj_flag + tableau[row, Int32(1)] = -1.0 + tableau[row, Int32(2)] = 1.0 + end + + # Add the subgradients for the cut + col = Int32(3) + while col <= n_vars + Int32(2) + tableau[row, col] = subgradients[thread, col-Int32(2)] + col += Int32(1) + end + + # Add the slack variable + tableau[row, Int32(2) + n_vars + active_row] = 1.0 + + # Add the final column value + tableau[row, end] = final_col[thread] + end + end + thread += stride + end + sync_threads() + return nothing +end + +# A simplified version of add_cut_kernel that only adds the objective +# function lower bound to the tableau. Subgradients are not required, +# because an epigraph reformulation is used and the subgradients on +# this line are always 0. +function add_lower_bound_kernel(tableau, final_col, active_row, n_rows, n_probs) + # For each LP, we modify (active_row) and (n_rows), being the line + # where the "cut" is added and the phase I objective row. 
Each thread + # modifies a single LP, since it's just filling in values in one line, + # the number of additions is small, and the points to fill in are sporadic. + thread = threadIdx().x + (blockIdx().x - Int32(1)) * blockDim().x + stride = blockDim().x * gridDim().x + + while thread <= n_probs + row = (thread - Int32(1))*n_rows + active_row + # The lower bound constraint is a GEQ constraint, so a positive value + # in the final column means the slack variable will have a negative + # coefficient. This means we'll need an artificial variable, and must + # add the negative of the row to the phase I objective row. + if final_col[thread] > 0 + art_row = thread*n_rows + tableau[row, Int32(1)] = 1.0 # Z_+, negated from -1 + tableau[row, Int32(2)] = -1.0 # Z_-, negated from 1 + tableau[row, Int32(2)*active_row + Int32(1)] = -1.0 # Slack variable + tableau[row, active_row + n_rows] = 1.0 # Artificial variable + tableau[row, end] = final_col[thread] # b + + tableau[art_row, Int32(1)] = -1.0 + tableau[art_row, Int32(2)] = 1.0 + tableau[art_row, Int32(2)*active_row + Int32(1)] = 1.0 + tableau[art_row, end] = -final_col[thread] + + # In the other case, the lower bound is nonpositive. Since it's a GEQ + # constraint, the slack variable starts as negative (or 0), but we can + # negate the whole row to make the slack variable and final column + # nonnegative. + else + tableau[row, Int32(1)] = -1.0 + tableau[row, Int32(2)] = 1.0 + tableau[row, Int32(2)*active_row + Int32(1)] = 1.0 # Slack variable + tableau[row, end] = -final_col[thread] + end + thread += stride + end + sync_threads() + return nothing +end + +# This kernel transitions from Phase I to Phase II. It checks the +# status of the Phase I objective and marks the Phase II solution +# as -Inf if Phase I did not find a feasible solution. +function feasibility_check_kernel(tableau, n_rows, n_probs) + thread = threadIdx().x + (blockIdx().x - Int32(1)) * blockDim().x + stride = blockDim().x * gridDim().x + width = Int32(size(tableau, 2)) + + # Each thread will check one LP and then move on + while thread <= n_probs + # Check if a BFS was NOT found. + if (tableau[thread*n_rows, end] < -1E-14) || (tableau[thread*n_rows, end] > 1E-14) + # The thread will adjust all columns of the preceding row. + # This isn't very efficient, but if the width of the tableau + # is much less than the number of problems, this may be + # more efficient than setting each block to check one LP. + # Also, this will be very quick if every LP found a BFS. 
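+            # Zeroing the Phase II objective row and forcing its b entry to
+            # -Inf marks this LP as infeasible: the node's lower bound is
+            # recovered as the negated b entry, so it becomes +Inf downstream.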
+ col = Int32(1) + while col < width + tableau[thread*n_rows-Int32(1), col] = 0.0 + col += Int32(1) + end + tableau[thread*n_rows-Int32(1), end] = -Inf + end + + # Regardless of whether a BFS was found or not, set the phase I + # objective row to all 0s + col = Int32(1) + while col <= width + tableau[thread*n_rows, col] = 0.0 + col += Int32(1) + end + thread += stride + end + sync_threads() + return nothing +end +#= +tableau = [1.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 -1.0; + 0.0 1.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0; + 0.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0; + 0.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0; + 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; + 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; + 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; + -1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; + 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; + 1.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 -2.0; + 0.0 1.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0; + 0.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0; + 0.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0; + 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; + 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; + 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; + -1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; + 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0]; +subgradients = CuArray([-0.3176476 -0.449098 -0.330095; 0.32 0.45 0.33]) +final_col = CuArray([5.0 -4.9]) + +@device_code_warntype @cuda threads=768 add_cut_kernel(tableau, subgradients, final_col, Int32(7), Int32(3), Int32(9), Int32(2), Int32(7)) + + + + +function test_kernel(storage, rand_mat1, rand_mat2) + @cuda blocks=1000 threads=1024 shmem=8*120 accumulate_mul_kernel(storage, rand_mat1, rand_mat2) + return storage +end + +rand_mat1 = CUDA.rand(Float64, 1000, 100); +rand_mat2 = CUDA.rand(Float64, 1000, 100); +storage = CuArray{Float64}(undef, 1000); +@device_code_warntype @cuda blocks=1000 threads=1024 shmem=8*120 accumulate_mul_kernel(storage, rand_mat1, rand_mat2) +tempstore = similar(rand_mat1); +CUDA.@time test_other(storage, rand_mat1, rand_mat2, tempstore); +function test_other(storage, rand_mat1, rand_mat2, tempstore) + tempstore .= rand_mat1 .* rand_mat2 + storage .= sum(tempstore, dims=2) + return storage +end + + +function SCRAP_find_min_kernel(ratios, n_rows, rows) + row = blockIdx().x + col = threadIdx().x + + # Need to use parallel reduction... Each block handles one row... + shared_min_vals = @cuDynamicSharedMem(Float64, blockDim().x) + shared_min_idxs = @cuDynamicSharedMem(Int32, blockDim().x, offset=Int32(8)*blockDim().x) + + # Note: "row" and "col" are a little weird here, because + # "ratios" is actually a vector. + + # Initialize the shared memory + shared_min_vals[col] = ratios[(row-1)*n_rows + col] + shared_min_idxs[col] = col + + # Parallel reduction to find the index of the minimum value + # (This could probably be made more efficient. 
If n_rows is + # not cleanly divisble by 2 there may be repeated checks + # of some elements, plus at smaller than 32 elements, not all + # threads in each warp will be used.) + stride = cld(n_rows,Int32(2)) + while (stride != Int32(0)) + sync_threads() + if (col <= stride) && (col+stride <= n_rows) + if shared_min_vals[col] == shared_min_vals[col + stride] + if shared_min_idxs[col] > shared_min_idxs[col + stride] + shared_min_idxs[col] = shared_min_idxs[col + stride] + end + elseif shared_min_vals[col] > shared_min_vals[col + stride] + shared_min_vals[col] = shared_min_vals[col + stride] + shared_min_idxs[col] = shared_min_idxs[col + stride] + end + end + if stride==1 + stride = Int32(0) + else + stride = cld(stride,Int32(2)) + end + end + if col==1 + rows[row] = shared_min_idxs[1] + end + return nothing +end + + +function DEPRECATED_create_tableau_kernel(tableau, n_vars, n_rows, n_cols, entries_per_LP, + cuts, max_cuts, slack_count, corrected_subgradients, + lower_bounds, b_vals) + # We'll make each block handle a single LP. + thread = threadIdx().x + LP = (blockIdx().x)-Int32(1) #Base 0 is easier in some cases + stride = blockDim().x + + while thread <= entries_per_LP + LP_row = cld(thread,width) #We need both division and modulo, but we can calculate one from the other + # Note: We could also have each thread take one column in its LP to not use division, + # but that might not be very efficient if the number of columns isn't neatly divisible + # by 32. + row = Int32(LP*n_rows) + Int32(LP_row) + col = thread - Int32(((LP_row-Int32(1))*width)) + + # We now know what row and column we're looking at. We can fill in values as needed. + + # Fill in the first column + if col==1 # Lots of things happen for column 1 + if row <= cuts+1 #Lower bound constraint and each cut + tableau[row, col] = 1.0 + elseif row == slack_count + Int32(1) # Objective row + tableau[row, col] = -1.0 + end + + # Now fill in the row for the most recent cut + elseif (row == cuts+1) && (col > 1) && (col <= n_vars + Int32(1)) + tableau[row, col] = corrected_subgradients[LP, col-Int32(1)] + + # Now fill in the 1's for slack variables + elseif (row <= slack_count) && (row==(col + n_vars + Int32(1))) + # Should add a skip for rows belonging to future cuts... 
+ tableau[row, col] = 1.0 + + # Now fill in the lower bound for the epigraph variable + elseif col==n_cols + if row==1 + tableau[row, col] = -lower_bounds[LP] + + # Now the value for the first cut, from the b-value + elseif row==2 + tableau[row, col] = b_vals[LP] + + # And finally, the upper bounds for all the variables + elseif (row > max_cuts + Int32(1)) && (row <= max_cuts + Int32(1) + n_vars) + tableau[row, col] = 1.0 + end + + # And now we change the variable upper bounds to be 1's + elseif (row > max_cuts + Int32(1)) && (row <= max_cuts + Int32(1) + n_vars) && (row - max_cuts == col) + tableau[row, col] = 1.0 + end + + thread += stride + end + sync_threads() + return nothing +end + +function DEPRECATED_tableau_skeleton_kernel(tableau, n_vars, slack_count, lower_bounds) + # Each block handles one LP + thread = threadIdx().x + LP = (blockIdx().x)-Int32(1) #Base 0 is easier in some cases + stride = blockDim().x + width = ((n_vars + Int32(1))*2 + Int32(1)) + height = (n_vars + Int32(2)) + entries_per_LP = width*height + + while thread <= entries_per_LP + LP_row = cld(thread,width) #We need both division and modulo, but we can calculate one from the other + row = Int32(LP*(slack_count+Int32(2))) + Int32(LP_row) # This is the real row in the tableau + col = thread - Int32(((LP_row-Int32(1))*width)) + + # NOTE: No longer going to do this. We aren't doing an epigraph reformulation + # anymore, so it's probably better to put in this row as a GEQ [lower bound] + # "cut". + # Fill in the objective lower bound constraint. Mark it as negative if + # the associated lower bound is positive. + if (col==1) && (LP_row==1) + if lower_bounds[LP+Int32(1)] > 0.0 + tableau[row, col] = -1.0 + else + tableau[row, col] = 1.0 + end + + # Fill in the variable upper bound rows (note that because this is "elseif", + # we don't overwrite [1,1]) + elseif (col < height) && (LP_row==col) + tableau[row, col] = 1.0 + + # Fill in the slack variables for the lower bound row + elseif (col == height) && (LP_row == Int32(1)) + if lower_bounds[LP+Int32(1)] > 0.0 + tableau[row, col] = -1.0 + else + tableau[row, col] = 1.0 + end + + # Fill in slack variables for other rows + elseif (col > height) && (col < width) && ((LP_row + n_vars + Int32(1)) == col) + tableau[row, col] = 1.0 + + # Fill in the lower bound value + elseif (col == width) && (LP_row == Int32(1)) + if lower_bounds[LP+Int32(1)] > 0.0 + tableau[row, end] = lower_bounds[LP+Int32(1)] + else + tableau[row, end] = -lower_bounds[LP+Int32(1)] + end + + # Fill in the upper bound values for the variables + elseif (col == width) && (LP_row > Int32(1)) && (LP_row < height) + tableau[row, end] = 1.0 + + # Fill in the objective function row (one row before the end, because of the Phase I objective row) + elseif (col == Int32(1)) && (LP_row == height) + tableau[row + slack_count - height + Int32(1), 1] = -1.0 + end + + thread += stride + end + sync_threads() + return nothing +end +=# \ No newline at end of file diff --git a/examples/ParBB/subroutines.jl b/examples/ParBB/subroutines.jl index 9789f25..83569c5 100644 --- a/examples/ParBB/subroutines.jl +++ b/examples/ParBB/subroutines.jl @@ -25,6 +25,9 @@ Upper problems are still solved using an ODERelaxProb and following the same tec as in the "normal" DynamicExt extension. 
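+Garbage collection is disabled while the branch-and-bound loop runs and is
+instead triggered manually every `gc_freq` iterations, so that implicit GC
+pauses do not interrupt the GPU-parallel solve.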
""" function solve_gpu!(m::EAGO.GlobalOptimizer) + # Turn off garbage collection + GC.enable(false) + # Identify the extension ext = EAGO._ext(m) @@ -37,7 +40,7 @@ function solve_gpu!(m::EAGO.GlobalOptimizer) EAGO.presolve_global!(m) EAGO.print_preamble!(m) - # Run the NLP solver to get a start-point upper bound with multi-starting + # Run the NLP solver to get a start-point upper bound with multi-starting (In development) multistart_upper!(m) # Fill the stack with multiple nodes for the GPU to parallelize @@ -47,12 +50,13 @@ function solve_gpu!(m::EAGO.GlobalOptimizer) ext.node_storage = Vector{EAGO.NodeBB}(undef, ext.node_limit) ext.lower_bound_storage = Vector{Float64}(undef, ext.node_limit) + # Run branch and bound; terminate when the stack is empty or when some # tolerance or limit is hit while !EAGO.termination_check(m) - # Garbage collect every 1000 iterations - if mod(m._iteration_count, 1000)==0 + # Garbage collect every gc_freq iterations + if mod(m._iteration_count, EAGO._ext(m).gc_freq)==0 GC.enable(true) GC.gc(false) GC.enable(false) @@ -64,6 +68,7 @@ function solve_gpu!(m::EAGO.GlobalOptimizer) # Extract up to `node_limit` nodes from the main problem stack count = min(ext.node_limit, m._node_count) ext.node_storage[1:count] .= EAGO.popmin!(m._stack, count) + for i = 1:count ext.all_lvbs[i,:] .= ext.node_storage[i].lower_variable_bounds ext.all_uvbs[i,:] .= ext.node_storage[i].upper_variable_bounds @@ -72,7 +77,8 @@ function solve_gpu!(m::EAGO.GlobalOptimizer) m._node_count -= count # Solve all the nodes in parallel - m._last_lower_problem_time += @elapsed lower_and_upper_problem!(m) + lower_and_upper_problem!(m) + EAGO.print_results!(m, true) for i in 1:ext.node_len @@ -84,31 +90,38 @@ function solve_gpu!(m::EAGO.GlobalOptimizer) EAGO.store_candidate_solution!(m) # Perform post processing on each node I'm keeping track of - m._last_postprocessing_time += @elapsed EAGO.postprocess!(m) + # postprocess_total += @elapsed EAGO.postprocess!(m) + # EAGO.postprocess!(m) + # m._last_postprocessing_time += @elapsed EAGO.postprocess!(m) # Branch the nodes if they're feasible - if m._postprocess_feasibility - EAGO.branch_node!(m) - end + # if m._postprocess_feasibility + EAGO.branch_node!(m) + # end end end EAGO.set_global_lower_bound!(m) m._run_time = time() - m._start_time m._time_left = m._parameters.time_limit - m._run_time EAGO.log_iteration!(m) - EAGO.print_iteration!(m) + EAGO.print_iteration!(m, false) m._iteration_count += 1 end + EAGO.print_iteration!(m, true) EAGO.set_termination_status!(m) EAGO.set_result_status!(m) EAGO.print_solution!(m) + + # Turn back on garbage collection + GC.enable(true) + GC.gc() end # Helper functions here function prepopulate!(t::ExtendGPU, m::EAGO.GlobalOptimizer) if t.prepopulate == true - println("Prepopulating with $(t.node_limit) total nodes") + # println("Prepopulating with $(t.node_limit) total nodes") # Calculate the base number of splits we need for each parameter splits = floor(t.node_limit^(1/t.np)) @@ -259,92 +272,2270 @@ function lower_and_upper_problem!(t::PointwiseGPU, m::EAGO.GlobalOptimizer) t.lower_bound_storage .= Array(results_d) t.upper_bound_storage .= Array(evals_d[np:np:end]) + less_val = t.upper_bound_storage .< m._global_upper_bound + if any(==(true), less_val) + println("Should be lower") + @show t.upper_bound_storage[less_val] + end + return nothing end lower_and_upper_problem!(m::EAGO.GlobalOptimizer{R,S,Q}) where {R,S,Q<:EAGO.ExtensionType} = lower_and_upper_problem!(EAGO._ext(m), m) -# Utility function to pick out a 
node from the subvector storage and make it the "current_node". -# This is used for branching, since currently we use the EAGO branch function that uses the -# "current_node" to make branching decisions. -function make_current_node!(t::ExtendGPU, m::EAGO.GlobalOptimizer) - prev = copy(t.node_storage[t.node_len]) - new_lower = t.lower_bound_storage[t.node_len] - new_upper = t.upper_bound_storage[t.node_len] - t.node_len -= 1 - m._current_node = NodeBB(prev.lower_variable_bounds, prev.upper_variable_bounds, - prev.is_integer, prev.continuous, new_lower, new_upper, - prev.depth, prev.cont_depth, prev.id, prev.branch_direction, - prev.last_branch, prev.branch_extent) +# A separate version of the lower_and_upper_problem! function that uses subgradients +function lower_and_upper_problem!(t::SubgradGPU, m::EAGO.GlobalOptimizer) + # Step 1) Bring the bounds into the GPU + lvbs_d = CuArray(t.all_lvbs) + uvbs_d = CuArray(t.all_uvbs) # [points x num_vars] + + # Step 2) Preallocate points to evaluate + l, w = size(t.all_lvbs) #points, num_vars + np = 2*w+2 #Adding an extra for upper bound calculations + eval_points = Vector{CuArray{Float64}}(undef, 3*w) #Only 3x because one is repeated + for i = 1:w + eval_points[3i-2] = CuArray{Float64}(undef, l*np) + eval_points[3i-1] = repeat(lvbs_d[:,i], inner=np) + eval_points[3i] = repeat(uvbs_d[:,i], inner=np) + end + bounds_d = CuArray{Float64}(undef, l*np) + + # Step 3) Fill in each of these points + for i = 1:w #1-3 + eval_points[3i-2][1:np:end] .= (lvbs_d[:,i].+uvbs_d[:,i])./2 + for j = 2:np-1 #2-7 + if j==2i + eval_points[3i-2][j:np:end] .= (lvbs_d[:,i].+uvbs_d[:,i])./2 .+ t.α.*(uvbs_d[:,i].-lvbs_d[:,i])./2 + elseif j==2i+1 + eval_points[3i-2][j:np:end] .= (lvbs_d[:,i].+uvbs_d[:,i])./2 .- t.α.*(uvbs_d[:,i].-lvbs_d[:,i])./2 + else + eval_points[3i-2][j:np:end] .= (lvbs_d[:,i].+uvbs_d[:,i])./2 + end + end + # Now we do np:np:end. Each one is set to the center of the variable bounds, + # creating a degenerate interval. This gives us the upper bound. + eval_points[3i-2][np:np:end] .= (lvbs_d[:,i].+uvbs_d[:,i])./2 + eval_points[3i-1][np:np:end] .= (lvbs_d[:,i].+uvbs_d[:,i])./2 + eval_points[3i][np:np:end] .= (lvbs_d[:,i].+uvbs_d[:,i])./2 + end + + # Step 4) Prepare the input vector for the convex function + input = Vector{CuArray{Float64}}(undef, 0) + for i = 1:w + push!(input, [eval_points[3i-2], eval_points[3i-2], eval_points[3i-1], eval_points[3i]]...) + end + + # Step 5) Perform the calculations + func_output = t.convex_func_and_subgrad(input...) 
# n+2-dimensional + + # Step 6) Use values and subgradients to calculate lower bounds + bounds_d .= func_output[1] + for i = 1:w + bounds_d .+= -(func_output[i+2] .>= 0.0).*func_output[i+2].*(eval_points[3i-2][:] .- eval_points[3i-1][:]) .- + (func_output[i+2] .<= 0.0).*func_output[i+2].*(eval_points[3i-2][:] .- eval_points[3i][:]) + end + + # Add to lower and upper bound storage + t.lower_bound_storage .= max.(Array(func_output[2][1:np:end]), [maximum(bounds_d[i:i+np-2]) for i in 1:np:l*np]) + t.upper_bound_storage .= Array(bounds_d[np:np:end]) + + return nothing end -make_current_node!(m::EAGO.GlobalOptimizer{R,S,Q}) where {R,S,Q<:EAGO.ExtensionType} = make_current_node!(EAGO._ext(m), m) -# (In development) A multi-start function to enable multiple runs of a solver such as IPOPT, -# before the main B&B algorithm begins -function multistart_upper!(m::EAGO.GlobalOptimizer{R,S,Q}) where {R,S,Q<:EAGO.ExtensionType} - m._current_node = EAGO.popmin!(m._stack) - t = EAGO._ext(m) +# A third version of lower_and_upper_problem! that uses the new GPU Simplex algorithm +function lower_and_upper_problem_old!(t::SimplexGPU_OnlyObj, m::EAGO.GlobalOptimizer) + # Step 1) Bring the bounds into the GPU + lvbs_d = CuArray(t.all_lvbs) + uvbs_d = CuArray(t.all_uvbs) # [points x num_vars] - if t.multistart_points > 1 - @warn "Multistart points above 1 not currently supported." + # Step 2) Preallocate points to evaluate + l, w = size(t.all_lvbs) #points, num_vars + np = 2*w+2 #Number of points; Adding an extra for upper bound calculations + eval_points = Vector{CuArray{Float64}}(undef, 3*w) #Only 3x because one is repeated + for i = 1:w + eval_points[3i-2] = CuArray{Float64}(undef, l*np) + eval_points[3i-1] = repeat(lvbs_d[:,i], inner=np) + eval_points[3i] = repeat(uvbs_d[:,i], inner=np) end - for n = 1:t.multistart_points - upper_optimizer = EAGO._upper_optimizer(m) - MOI.empty!(upper_optimizer) + bounds_d = CuArray{Float64}(undef, l) - for i = 1:m._working_problem._variable_count - m._upper_variables[i] = MOI.add_variable(upper_optimizer) + # Step 3) Fill in each of these points + for i = 1:w #1-3 + eval_points[3i-2][1:np:end] .= (lvbs_d[:,i].+uvbs_d[:,i])./2 + for j = 2:np-1 #2-7 + if j==2i + eval_points[3i-2][j:np:end] .= (lvbs_d[:,i].+uvbs_d[:,i])./2 .- t.α.*(uvbs_d[:,i].-lvbs_d[:,i])./2 + elseif j==2i+1 + eval_points[3i-2][j:np:end] .= (lvbs_d[:,i].+uvbs_d[:,i])./2 .+ t.α.*(uvbs_d[:,i].-lvbs_d[:,i])./2 + else + eval_points[3i-2][j:np:end] .= (lvbs_d[:,i].+uvbs_d[:,i])./2 + end end - EAGO._update_upper_variables!(upper_optimizer, m) + # Now we do np:np:end. Each one is set to the center of the variable bounds, + # creating a degenerate interval. This gives us the upper bound. + eval_points[3i-2][np:np:end] .= (lvbs_d[:,i].+uvbs_d[:,i])./2 + eval_points[3i-1][np:np:end] .= (lvbs_d[:,i].+uvbs_d[:,i])./2 + eval_points[3i][np:np:end] .= (lvbs_d[:,i].+uvbs_d[:,i])./2 + end - for i = 1:EAGO._variable_num(EAGO.FullVar(), m) - l = EAGO._lower_bound(EAGO.FullVar(), m, i) - u = EAGO._upper_bound(EAGO.FullVar(), m, i) - v = m._upper_variables[i] - MOI.set(upper_optimizer, MOI.VariablePrimalStart(), v, EAGO._finite_mid(l, u)) #THIS IS WHAT I WOULD CHANGE TO MAKE IT MULTI + # Step 4) Prepare the input vector for the convex function + input = Vector{CuArray{Float64}}(undef, 0) + for i = 1:w + push!(input, [eval_points[3i-2], eval_points[3i-2], eval_points[3i-1], eval_points[3i]]...) + end + + # Step 5) Perform the calculations + func_output = t.convex_func_and_subgrad(input...) 
# n+2-dimensional + # Also need whatever constraints!! + + # Step 6) Use values and subgradients to prepare the stacked Simplex tableau + + # First things first, we can prepare the "b" vector and see if we need any auxiliary systems. + # This step calculates the intercept of b at x=x_lo, which is equivalent to calculating + # the intercept at x=0 and then later shifting x_lo to 0, but without the extra re-calculation + # steps + b_start = func_output[1] + for i = 1:w + b_start -= func_output[i+2].*(eval_points[3i-2] .- eval_points[3i-1]) + + # func_output[i+2] is the subgradient of the convex relaxation in the i'th dimension + # eval_points[3i-2] is the cv/cc point used to obtain the relaxation + # eval_points[3i-1] is the lower bound for this relaxation + # eval_points[3i] is the upper bound (which isn't used here) + + # Note that <= [upper bound] will change to <= [upper bound] - [lower bound] + # for each variable, later + end + + if all(<=(0.0), b_start) + #If b_start is all nonpositive, we don't need any auxiliary systems + + # Start making the tableau as normal, since we have a basic feasible solution at the start. + # Create an extended b_array + b_array = vcat([vcat(-b_start[np*(j-1)+1:np*j-1], # First "1:(np-1)" points for each node + uvbs_d[j,:].-lvbs_d[j,:], # Upper bound minus lower bound + 0.0) # 0.0 for the objective function row + for j=1:l]...) # Repeat for every node + + # Prepare the epigraph variable columns. We're minimizing "Z", but since "Z" is unbounded, we + # convert it to Z = Z_pos - Z_neg, where Z_pos, Z_neg >= 0.0. The first column will be Z_pos, + # and the second column will be Z_neg. The upper bound rows and auxiliary system objective + # function row will have these as 0; the objective function row will be [1, -1] (minimizing + # Z_pos - Z_neg); and the constraints associated with Z will be [-1, 1] (-Z = -Z_pos + Z_neg) + epigraph = hcat(-CUDA.ones(Float64, length(b_array)), CUDA.ones(Float64, length(b_array))) + for i = 1:w + # Starting at the first upper bound, repeat for every tableau + epigraph[np+i-1 : np+w : end, :] .= 0.0 end + epigraph[np+w : np+w : end, :] .*= -1.0 # The main objective function is opposite the other rows (minimizing Z) + epigraph[np+w+1 : np+w : end, :] .= 0.0 # The epigraph column is 0 in the auxiliary objective row - # add constraints - ip = m._input_problem - EAGO._add_constraint_store_ci_linear!(upper_optimizer, ip) - EAGO._add_constraint_store_ci_quadratic!(upper_optimizer, ip) - #add_soc_constraints!(m, upper_optimizer) - - # Add nonlinear evaluation block - MOI.set(upper_optimizer, MOI.NLPBlock(), m._working_problem._nlp_data) - MOI.set(upper_optimizer, MOI.ObjectiveSense(), MOI.MIN_SENSE) - MOI.set(upper_optimizer, MOI.ObjectiveFunction{EAGO.SAF}(), m._working_problem._objective_saf) - # Optimize the object - MOI.optimize!(upper_optimizer) - EAGO._unpack_local_nlp_solve!(m, upper_optimizer) - EAGO.store_candidate_solution!(m) + # Combine the epigraph columns, "A" matrix, slack variable columns, and "b" array into the stacked tableaus + tableaus = hcat(epigraph, # Epigraph variable columns + [vcat([vcat(func_output[i+2][np*(j-1)+1:np*j-1], # >>Subgradient values for the i'th variable for the j'th node + CUDA.zeros(Float64, w), # >>Zeros for upper bound constraints (will fill in later with 1.0s) + 0.0) # >>0.0 for the objective function row + for j = 1:l]...) 
# Repeat for every j'th node vertically + for i = 1:w]..., # Add a column for every i'th variable + [CUDA.zeros(Float64, length(b_array)) for _ = 1:(np-1)+w]..., # Slack variables (will fill in later with 1.0s) + b_array) # The array of b's + + # Fill in the upper bound constraint indices and the slack variables + for i = 1:w + tableaus[np+i-1 : np+w : end, i+2] .= 1.0 + end + for i = 1:(np-1)+w + tableaus[i:np+w:end, (w+2)+i] .= 1.0 + end + + tableaus .= parallel_simplex(tableaus, np+w) + + else + # It was detected that some slack variable coefficient would be negative, so we need to make auxiliary systems. + # Note: It's probably worth it to only do auxiliary systems for the tableaus that will need it. At least check + # to see how common this is, and whether it'll be necessary... + + # Create the extended b_array + b_array = vcat([vcat(-b_start[np*(j-1)+1:np*j-1], # First "1:(np-1)" points for each node + uvbs_d[j,:].-lvbs_d[j,:], # Upper bound minus lower bound + 0.0, # 0.0 for the objective function row + 0.0) # 0.0 for the auxiliary system objective function row + for j=1:l]...) # Repeat for every node + + # (NOTE: Should we be scaling all the variables/subgradients so that the variables are bounded on [0, 1]?) + + # Prepare the epigraph variable columns. We're minimizing "Z", but since "Z" is unbounded, we + # convert it to Z = Z_pos - Z_neg, where Z_pos, Z_neg >= 0.0. The first column will be Z_pos, + # and the second column will be Z_neg. The upper bound rows and auxiliary system objective + # function row will have these as 0; the objective function row will be [1, -1] (minimizing + # Z_pos - Z_neg); and the constraints associated with Z will be [-1, 1] (-Z = -Z_pos + Z_neg) + epigraph = hcat(-CUDA.ones(Float64, length(b_array)), CUDA.ones(Float64, length(b_array))) + for i = 1:w + # Starting at the first upper bound, repeat for every tableau + # (which has np+w+1 rows thanks to the auxiliary row) + epigraph[np+i-1 : np+w+1 : end, :] .= 0.0 + end + epigraph[np+w : np+w+1 : end, :] .*= -1.0 # The main objective function is opposite the other rows (minimizing Z) + epigraph[np+w+1 : np+w+1 : end, :] .= 0.0 # The epigraph column is 0 in the auxiliary objective row + + # Combine the epigraph columns, "A" matrix, slack variable columns, and "b" array into the stacked tableaus + tableaus = hcat(epigraph, # Epigraph variable columns + [vcat([vcat(func_output[i+2][np*(j-1)+1:np*j-1], # >>Subgradient values for the i'th variable for the j'th node + CUDA.zeros(Float64, w), # >>Zeros for upper bound constraints (will fill in later with 1.0s) + 0.0, # >>0.0 for the objective function row + 0.0) # >>0.0 for the auxiliary objective function row + for j = 1:l]...) 
# Repeat for every j'th node vertically + for i = 1:w]..., # Add a column for every i'th variable + [CUDA.zeros(Float64, length(b_array)) for _ = 1:2*((np-1)+w)]..., # Slack and auxiliary variables (will fill in later with 1.0s) + b_array) # The array of b's + + # Fill in the upper bound constraint indices + for i = 1:w + tableaus[np+i-1 : np+w+1 : end, i+2] .= 1.0 #np+w+1 length now, because of the auxiliary row + end + + # Fill in the slack variables like normal, and then add auxiliary variables as needed + signs = sign.(tableaus[:,end]) + signs[signs.==0] .= 1.0 + for i = 1:np+w-1 + tableaus[i:np+w+1:end, (w+2)+i] .= 1.0 #np+w+1 length now, because of the auxiliary row + + # If the "b" row is negative, do the following: + # 1) Flip the row so that "b" is positive + # 2) Subtract the entire row FROM the auxiliary objective row + # 3) Add an auxiliary variable for this row + tableaus[i:np+w+1:end, :] .*= signs[i:np+w+1:end] #Flipped the row if b was negative + tableaus[np+w+1 : np+w+1 : end, :] .-= (signs[i:np+w+1:end].<0.0).*tableaus[i:np+w+1:end, :] #Row subtracted from auxiliary objective row + tableaus[i:np+w+1:end, (w+2)+np+w-1+i] .+= (signs[i:np+w+1:end].<0.0).*1.0 + end + + # Send the tableaus to the parallel_simplex algorithm, with the "aux" flag set to "true" + tableaus .= parallel_simplex(tableaus, np+w+1, aux=true) + + if all(abs.(tableaus[np+w+1:np+w+1:end,end]).<=1E-10) + # Delete the [np+w+1 : np+w+1 : end] rows and the [w+1+(np+w-1) + 1 : end-1] columns + # Note: is it faster to NOT remove the rows/columns and just have an adjusted simplex + # algorithm that ignores them? Maybe, maybe not. I'll test later. + tableaus = tableaus[setdiff(1:end, np+w+1:np+w+1:end), setdiff(1:end, w+2+(np+w-1):end-1)] + tableaus .= parallel_simplex(tableaus, np+w) + else + warn = true + end end - push!(m._stack, m._current_node) + + # display(Array(func_output[2])) + # display(Array(tableaus)) + # display(Array(-tableaus[np+w:np+w:end,end])) + # display(Array(func_output[2][1:np:end])) + # display(Array(max.(func_output[2][1:np:end], -tableaus[np+w:np+w:end,end]))) + + # Step 8) Add results to lower and upper bound storage + t.lower_bound_storage .= Array(max.(func_output[2][1:np:end], -tableaus[np+w:np+w:end,end])) + t.upper_bound_storage .= Array(func_output[1][np:np:end]) + + return nothing end +# An even newer Simplex +function lower_and_upper_problem_slightly_old!(t::SimplexGPU_OnlyObj, m::EAGO.GlobalOptimizer) + # Step 1) Bring the bounds into the GPU + lvbs_d = CuArray(t.all_lvbs) + uvbs_d = CuArray(t.all_uvbs) # [points x num_vars] -# Set the upper problem heuristic to only evaluate at depth 1, for now -import EAGO: default_upper_heuristic -function default_upper_heuristic(m::EAGO.GlobalOptimizer) - bool = false - if EAGO._current_node(m).depth==1 - bool = true + # Step 2) Set up points to evaluate, which are the centers of every node + l, w = size(t.all_lvbs) #points, num_vars + np = 1 #Number of evaluations per node; Adding an extra for upper bound calculations + eval_points = Vector{CuArray{Float64}}(undef, 3*w) #Only 3x because cv is the same as cc + temp_lvbs = CuArray{Float64}(undef, l) #Pre-allocate slices of lvbs + temp_uvbs = CuArray{Float64}(undef, l) #Pre-allocate slices of uvbs + for i = 1:w + # Temporarily hold slices of variable bounds + temp_lvbs .= lvbs_d[:,i] + temp_uvbs .= uvbs_d[:,i] + + # Set up bounds to evaluate + eval_points[3i-1] = repeat(temp_lvbs, inner=np) + eval_points[3i] = repeat(temp_uvbs, inner=np) + + # Calculate midpoints of the bounds + 
eval_points[3i-2] = (eval_points[3i-1].+eval_points[3i])./2 + + # Correct the bounds for the upper bound calculation (every 2 evaluations) + # eval_points[3i-1][np:np:end] .= eval_points[3i-2][np:np:end] + # eval_points[3i][np:np:end] .= eval_points[3i-2][np:np:end] end - return bool -end + # println("After initial setup:") + # CUDA.memory_status();println("") -# Add a custom branching function that branches at the midpoint -import EAGO: select_branch_point -function select_branch_point(t::ExtendGPU, m::EAGO.GlobalOptimizer, i) - return EAGO._mid(EAGO.BranchVar(), m, i) -end + # Step 3) Perform the calculations (Note: also need to add in constraint handling. Perhaps + # that will be a different function, so that this one can stay as-is? + func_output = t.convex_func_and_subgrad( + ([[eval_points[3i-2],eval_points[3i-2],eval_points[3i-1],eval_points[3i]] for i=1:w]...)...) # n+2-dimensional -# Disable epigraph reformation, preprocessing, and postprocessing -import EAGO: reform_epigraph_min! -function reform_epigraph_min!(m::EAGO.GlobalOptimizer) - nothing + # println("After calling the function:") + # CUDA.memory_status();println("") + # Might as well save the upper bound right now, since we have it + t.upper_bound_storage .= Array(func_output[1][np:np:end]) + # println("Saving upper bound results:") + # CUDA.memory_status();println("") + + # Step 4) Use values and subgradients to prepare the stacked Simplex tableau. + # Based on this procedure, we should never need an auxiliary system? Check on that to + # be sure, because if we don't need an auxiliary system, that's much easier + + # Preallocate subgradient matrices and the b vector + subgradients = CuArray{Float64}(undef, l, w) + corrected_subgradients = CuArray{Float64}(undef, l, w) + b_val = CuArray{Float64}(undef, l, 1) + # println("More preallocations:") + # CUDA.memory_status();println("") + + # Extract subgradients and apply a correction for shifting variables to [0,1] + subgradients .= hcat([func_output[i+2][1:np:end] for i in 1:w]...) #Only for the lower bound, not upper bound + corrected_subgradients .= (subgradients).*(uvbs_d .- lvbs_d) + + # Calculate corrected "b" values based on the intercept at the evaluation point + # and the corrections for shifted variables + b_val .= sum(hcat([eval_points[3i-2][1:np:end] for i=1:w]...).*subgradients, dims=2) .- func_output[1][1:np:end] .- sum(lvbs_d.*subgradients, dims=2) + + # If there are any negative values in b, we can simply change the epigraph + # variable to be that much higher to make the minimum 0. Note that this + # won't work for constraints, it only works because it's for the objective + # function and we have "z" in the tableau. + add_val = 0.0 + if any(<(0.0), b_val) + add_val = -minimum(b_val) + b_val .+= add_val + end + + # Free up eval_points since we no longer need it + CUDA.unsafe_free!.(eval_points) + + # Preemptively determine how many slack variables we'll need. This is going to be + # the total number of cuts (an input to this function), plus one for the lower bound + # value, plus the number of variables (since each has an upper bound of 1) + slack_count = t.max_cuts+1+w # n_cuts, lower bound, w [upper bounds] + + # Create the stacked tableau as a big array of 0's, and then we fill it in as necessary. 
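+    # As a concrete (hypothetical) illustration of the layout: with w = 2 variables and
+    # t.max_cuts = 3, slack_count = 3 + 1 + 2 = 6, so each node's block has
+    # slack_count + 1 = 7 rows (the lower-bound row, cut 1, reserved rows for cuts 2-3,
+    # upper-bound rows for the two variables, and the objective row) and
+    # 1 + w + slack_count + 1 = 10 columns (the epigraph variable z, two variable
+    # columns, six slack columns, and the right-hand side b). Stacking all l nodes
+    # gives the l*(slack_count+1)-row array allocated below.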
+    tableau = CUDA.zeros(Float64, l*(slack_count+1), 1+w+slack_count+1)
+    solution_tableau = similar(tableau)
+    # println("Tableau prepared:")
+    # CUDA.memory_status();println("")
+
+    # Fill in the first column, corresponding to the epigraph variable
+    tableau[1:(slack_count+1):end,1] .= 1.0 # Lower bound constraint
+    tableau[2:(slack_count+1):end,1] .= 1.0 # Only one cut to consider for now
+    tableau[(slack_count+1):(slack_count+1):end,1] .= -1.0 # Objective row
+
+    # Fill in the corrected subgradients in their respective columns
+    tableau[2:(slack_count+1):end,2:1+w] .= corrected_subgradients
+
+    # Fill in the slack variables (besides the rows for cuts we haven't done yet)
+    for i = 1:slack_count
+        if (i<=t.max_cuts+1) && (t.max_cuts>1) && (i>2) # Reserving rows 3 to max_cuts+1 for future cuts
+            continue
+        end
+        tableau[i:(slack_count+1):end,1+w+i] .= 1.0
+    end
+
+    # Fill in the lower bound for the epigraph variable
+    tableau[1:(slack_count+1):end,end] .= -func_output[2][1:np:end] .+ add_val
+
+    # Fill in the value for the first cut (b)
+    tableau[2:(slack_count+1):end,end] .= b_val
+
+    # Fill in the upper bounds for all the variables, which are all 1's
+    for i = 1:w
+        tableau[t.max_cuts+1+i:(slack_count+1):end,1+i] .= 1.0 # The variable itself
+        tableau[t.max_cuts+1+i:(slack_count+1):end,end] .= 1.0 # The variable's upper bound (always 1 because we shifted it)
+    end
+
+    # Make sure all the rightmost column values are positive
+    if any(<(0.0), tableau[:,end])
+        # display(tableau[:,end])
+        error("Check b_val, might need an auxiliary system (or more creativity)")
+    end
+    # println("Tableau filled:")
+    # CUDA.memory_status();println("")
+
+    # Free up the func outputs since we no longer need them
+    CUDA.unsafe_free!.(func_output)
+
+    # Pass the tableau through the simplex algorithm and see what we get out of it
+    # (Note that the solution values will be negated. That's fine, we don't actually
+    # care what they are just yet.)
+    solution_tableau .= tableau
+    # println("Right before simplex:")
+    # CUDA.memory_status();println("")
+    parallel_simplex(solution_tableau, slack_count+1)
+
+    # println("Right after simplex:")
+    # CUDA.memory_status();println("")
+
+    # Now we need to add another cut, which means we have to extract out the solution
+    # from the tableau, convert back into un-shifted variables, pass it back through
+    # the convex evaluator, and add rows to the tableau.
+
+    # Preallocate some arrays and masks we'll be using
+    tableau_vals = CuArray{Float64}(undef, l,(slack_count+1))
+    variable_vals = CuArray{Float64}(undef, l,(slack_count+1))
+    bool_check = CuArray{Bool}(undef, l,(slack_count+1))
+    zero_check = CuArray{Bool}(undef, l,(slack_count+1))
+    short_eval_points = Vector{CuArray{Float64}}(undef, w) # Only need pointwise evaluations
+    # println("Preallocations for next cuts:")
+    # CUDA.memory_status();println("")
+
+    for cut = 1:t.max_cuts-1 # The remaining max_cuts-1 cuts
+        # Extract solution values from the tableau to decide where the next evaluations
+        # should be. Search the variable columns [2:w+1] within each node's block of size
+        # (slack_count+1, w): a column that is all 0's with a single 1.0 marks a basic
+        # variable, whose value is read from the rightmost column of the row holding the
+        # 1.0; every other variable is nonbasic and sits at 0. A CPU sketch of this
+        # unit-column check follows.
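+        # Illustration (not executed): a CPU analogue of the unit-column check used
+        # below, for a single node's block `B` of variable columns (size
+        # (slack_count+1) × w) and right-hand side `rhs`; the names here are hypothetical:
+        #
+        #     function basic_solution(B, rhs)
+        #         x = zeros(size(B, 2))
+        #         for j in 1:size(B, 2)
+        #             ones_idx = findall(==(1.0), B[:, j])
+        #             # A variable is basic if its column is a unit vector
+        #             if length(ones_idx) == 1 && count(==(0.0), B[:, j]) == size(B, 1) - 1
+        #                 x[j] = rhs[only(ones_idx)]  # read its value from that row
+        #             end                             # nonbasic variables stay at 0.0
+        #         end
+        #         return x
+        #     end
+        #
+        # The masked broadcasts below perform this same test for all nodes at once.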
+ # Figure out which ones are "correct" for each variable? + for i = 1:w + temp_lvbs .= lvbs_d[:,i] + temp_uvbs .= uvbs_d[:,i] + tableau_vals .= reshape(solution_tableau[:,end], l,(slack_count+1)) + variable_vals .= reshape(solution_tableau[:,1+i], l,(slack_count+1)) + bool_check .= (variable_vals .== 1.0) + zero_check .= (variable_vals .== 0.0) + bool_check .&= (count(bool_check, dims=2).==1) + bool_check .&= (count(zero_check, dims=2).==slack_count) + tableau_vals .*= bool_check + short_eval_points[i] = min.(0.95, max.(0.05, sum(tableau_vals, dims=2))).*(temp_uvbs .- temp_lvbs).+temp_lvbs + end + # println("Cut $cut, found short eval points:") + # CUDA.memory_status();println("") + + # Okay, so now we have the points to evaluate, we need to call the function again + func_output = t.convex_func_and_subgrad( + ([[short_eval_points[i],short_eval_points[i],lvbs_d[:,i],uvbs_d[:,i]] for i=1:w]...)...) # n+2-dimensional + + # println("Function called again:") + # CUDA.memory_status();println("") + # As before, calculate corrected subgradients and the new b_values + subgradients .= hcat([func_output[i+2] for i in 1:w]...) #Only for the lower bound, not upper bound + corrected_subgradients .= (subgradients).*(uvbs_d .- lvbs_d) + b_val .= sum(hcat([short_eval_points[i] for i=1:w]...).*subgradients, dims=2) .- func_output[1] .- sum(lvbs_d.*subgradients, dims=2) + + # Add in the extra factor + b_val .+= add_val + + # If any of b_val is [still] negative, update add_val and the rest of the tableau + if any(<(0.0), b_val) + update = -minimum(b_val) + b_val .+= update + tableau[1:slack_count+1:end, end] .+= update + for i = 2:(2+cut-1) + tableau[i:slack_count+1:end, end] .+= update + end + add_val += update + end + + # Clear the short_eval_points from memory + CUDA.unsafe_free!.(short_eval_points) + + # We can now place in these values into the tableau in the spots we left open earlier + tableau[2+cut:slack_count+1:end, 1] .= 1.0 + tableau[2+cut:slack_count+1:end, 2:1+w] .= corrected_subgradients + tableau[2+cut:slack_count+1:end, 1+w+2+cut] .= 1.0 + tableau[2+cut:slack_count+1:end, end] .= b_val + + # Adjust the final line of each problem to be the original minimization problem + tableau[(slack_count+1):(slack_count+1):end,:] .= hcat(-CUDA.one(Float64), CUDA.zeros(Float64, 1, w+slack_count+1)) + + # Run the simplex algorithm again + solution_tableau .= tableau + + # println("Everything until simplex again") + # CUDA.memory_status();println("") + parallel_simplex(solution_tableau, slack_count+1) + # println("Right after simplex again:") + # CUDA.memory_status();println("") + end + + # println("End of simplexing:") + # CUDA.memory_status();println("") + + # Save the lower bounds + t.lower_bound_storage .= Array(-(solution_tableau[slack_count+1:slack_count+1:end,end] .- add_val)) + + # println("After lower bounds saved:") + # CUDA.memory_status();println("") + + # Free variables we're finally done with + for i in [lvbs_d, uvbs_d, temp_lvbs, temp_uvbs, subgradients, corrected_subgradients, b_val, + tableau, solution_tableau, tableau_vals, variable_vals, bool_check, zero_check] + CUDA.unsafe_free!(i) + end + # println("Freed up storage, and done.:") + # CUDA.memory_status();println("") + # error() + return nothing end + +function lower_and_upper_problem_split!(t::SimplexGPU_OnlyObj, m::EAGO.GlobalOptimizer) + # Step 1) Bring the bounds into the GPU + lvbs_d = CuArray(t.all_lvbs) + uvbs_d = CuArray(t.all_uvbs) # [points x num_vars] + + # Step 2) Set up points to evaluate, which are the centers of every node 
+    l, w = size(t.all_lvbs) # points, num_vars
+    eval_points = (lvbs_d .+ uvbs_d)./2
+
+    # Step 3) Perform the calculations. (Note: constraint handling still needs to be
+    # added; perhaps that will be a different function, so that this one can stay as-is.)
+
+    # Upper bound calculations first: cv/cc and the lower/upper bounds are all set to the
+    # node midpoints, so each interval is degenerate and the relaxation collapses to the
+    # objective value at the midpoint, which is a valid upper bound for the node.
+    # NOTE: Speed can be improved by 75% for this call if you switch it out with a separate
+    # function that only calculates the lower bound, for example. Or maybe more if you can
+    # make a normal GPU-compatible version of the objective function.
+    func_output = @views t.convex_func_and_subgrad(
+        ([[eval_points[:,i],eval_points[:,i],eval_points[:,i],eval_points[:,i]] for i=1:w]...)...) # n+2-dimensional
+
+    t.upper_bound_storage .= Array(func_output[1])
+
+    # Free up the func outputs
+    CUDA.unsafe_free!.(func_output)
+
+    # Now lower bound calculations. It's the same as for the upper bounds, but we use lvbs and uvbs.
+    func_output = @views t.convex_func_and_subgrad(
+        ([[eval_points[:,i],eval_points[:,i],lvbs_d[:,i],uvbs_d[:,i]] for i=1:w]...)...) # n+2-dimensional
+
+    # Step 4) Use values and subgradients to prepare the stacked Simplex tableau.
+    # Based on this procedure, we should never need an auxiliary system, but check to
+    # be sure; avoiding an auxiliary system makes the setup much simpler.
+
+    # Preallocate subgradient matrices and the b vector
+    subgradients = CuArray{Float64}(undef, l, w)
+    corrected_subgradients = CuArray{Float64}(undef, l, w)
+    mid_times_sub = CuArray{Float64}(undef, l, w)
+    low_times_sub = CuArray{Float64}(undef, l, w)
+    b_val = CuArray{Float64}(undef, l, 1)
+
+    # Extract subgradients and apply a correction for shifting variables to [0,1]
+    subgradients .= hcat([func_output[i+2] for i in 1:w]...)
+    corrected_subgradients .= (subgradients).*(uvbs_d .- lvbs_d)
+
+    # Calculate corrected "b" values based on the intercept at the evaluation point
+    # and the corrections for shifted variables
+    mid_times_sub .= eval_points.*subgradients
+    low_times_sub .= lvbs_d.*subgradients
+    b_val .= sum(mid_times_sub, dims=2) .- func_output[1] .- sum(low_times_sub, dims=2)
+
+    # If there are any negative values in b, we can simply shift the epigraph
+    # variable to be that much higher, making the minimum 0. Note that this
+    # won't work for constraints; it only works here because it's the objective
+    # function and we have "z" in the tableau.
+    add_val = 0.0
+    if any(<(0.0), b_val)
+        add_val = -minimum(b_val)
+        b_val .+= add_val
+    end
+
+    # Preemptively determine how many slack variables we'll need. This is the
+    # total number of cuts (an input to this function), plus one for the lower bound
+    # value, plus the number of variables (since each has an upper bound of 1)
+    slack_count = t.max_cuts+1+w # n_cuts, lower bound, w [upper bounds]
+
+    # Create the stacked tableau as a big array of 0's, and then fill it in as necessary.
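+    # (As a concrete illustration of the add_val shift above, with hypothetical numbers:
+    # if b_val were [0.3, -0.2, 0.1] across three nodes, add_val would become 0.2 and
+    # b_val would become [0.5, 0.0, 0.3]. The tableau then effectively minimizes
+    # z' = z + add_val, and the true bound is recovered when the lower bounds are
+    # saved at the end by subtracting add_val from the negated solution values.)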
+ tableau = CUDA.zeros(Float64, l*(slack_count+1), 1+w+slack_count+1) + solution_tableau = similar(tableau) + + # Fill in the first column, corresponding to the epigraph variable + tableau[1:(slack_count+1):end,1] .= 1.0 # Lower bound constraint + tableau[2:(slack_count+1):end,1] .= 1.0 # Only one cut to consider for now + tableau[(slack_count+1):(slack_count+1):end,1] .= -1.0 # Objective row + + # Fill in the corrected subgradients in their respective columns + tableau[2:(slack_count+1):end,2:1+w] .= corrected_subgradients + + # Fill in the slack variables (besides the rows for cuts we haven't done yet) + for i = 1:slack_count + if (i<=t.max_cuts+1) && (t.max_cuts>1) && (i>2) #Reserving rows 3 to max_cuts+1 for future cuts + continue + end + tableau[i:(slack_count+1):end,1+w+i] .= 1.0 + end + + # Fill in the lower bound for the epigraph variable + tableau[1:(slack_count+1):end,end] .= -func_output[2] .+ add_val + + # Fill in the value for the first cut (b) + tableau[2:(slack_count+1):end,end] .= b_val + + # Fill in the upper bounds for all the variables, which are all 1's + for i = 1:w + tableau[t.max_cuts+1+i:(slack_count+1):end,1+i] .= 1.0 # The variable itself + tableau[t.max_cuts+1+i:(slack_count+1):end,end] .= 1.0 # The variable's upper bound (always 1 because we shifted it) + end + + # Make sure all the rightmost column values are positive (not necessary, and for some + # reason this eats up GPU memory?) + # if any((@view tableau[:,end]) .< 0.0) + # error("Check b_val, might need an auxiliary system (or more creativity)") + # end + + # Free up the func outputs since we no longer need them + CUDA.unsafe_free!.(func_output) + + + # Pass the tableau through the simplex algorithm and see what we get out of it + # (Note that the solution values will be negated. That's fine, we don't actually + # care what they are just yet.) + solution_tableau .= tableau + parallel_simplex(solution_tableau, slack_count+1) + + # Now we need to add another cut, which means we have to extract out the solution + # from the tableau, convert back into un-shifted variables, pass it back through + # the convex evaluator, and add rows to the tableau. + + # Preallocate some arrays checks we'll be using + tableau_vals = CuArray{Float64}(undef, l,(slack_count+1)) + variable_vals = CuArray{Float64}(undef, l,(slack_count+1)) + bool_check = CuArray{Bool}(undef, l,(slack_count+1)) + zero_check = CuArray{Bool}(undef, l,(slack_count+1)) + for cut = 1:t.max_cuts-1 # Two additional cuts. + # Extract solution values from the tableau to decide where the evaluations are. + # Figure out which ones are "correct" for each variable? + for i = 1:w + tableau_vals .= reshape((@view solution_tableau[:,end]), l,(slack_count+1)) + variable_vals .= reshape((@view solution_tableau[:,1+i]), l,(slack_count+1)) + bool_check .= (variable_vals .== 1.0) + zero_check .= (variable_vals .== 0.0) + bool_check .&= (count(bool_check, dims=2).==1) + bool_check .&= (count(zero_check, dims=2).==slack_count) + tableau_vals .*= bool_check + eval_points[:,i] .= @views min.(0.95, max.(0.05, sum(tableau_vals, dims=2))).*(uvbs_d[:,i] .- lvbs_d[:,i]).+lvbs_d[:,i] + end + + # Okay, so now we have the points to evaluate, we need to call the function again + func_output = @views t.convex_func_and_subgrad( + ([[eval_points[:,i],eval_points[:,i],lvbs_d[:,i],uvbs_d[:,i]] for i=1:w]...)...) # n+2-dimensional + + # As before, calculate corrected subgradients and the new b_values + subgradients .= hcat([func_output[i+2] for i in 1:w]...) 
# (Only for the lower bound, not the upper bound)
+        corrected_subgradients .= (subgradients).*(uvbs_d .- lvbs_d)
+        mid_times_sub .= eval_points.*subgradients
+        low_times_sub .= lvbs_d.*subgradients
+        b_val .= sum(mid_times_sub, dims=2) .- func_output[1] .- sum(low_times_sub, dims=2)
+
+        # Add in the extra factor
+        b_val .+= add_val
+
+        # If any of b_val is [still] negative, update add_val and the rest of the tableau
+        if any(<(0.0), b_val)
+            update = -minimum(b_val)
+            b_val .+= update
+            tableau[1:slack_count+1:end, end] .+= update
+            for i = 2:(2+cut-1)
+                tableau[i:slack_count+1:end, end] .+= update
+            end
+            add_val += update
+        end
+
+        # Free up the func outputs since we no longer need them
+        CUDA.unsafe_free!.(func_output)
+
+        # We can now place these values into the tableau in the spots we left open earlier
+        tableau[2+cut:slack_count+1:end, 1] .= 1.0
+        tableau[2+cut:slack_count+1:end, 2:1+w] .= corrected_subgradients
+        tableau[2+cut:slack_count+1:end, 1+w+2+cut] .= 1.0
+        tableau[2+cut:slack_count+1:end, end] .= b_val
+
+        # Adjust the final line of each problem to be the original minimization problem
+        tableau[(slack_count+1):(slack_count+1):end,:] .= hcat(-CUDA.one(Float64), CUDA.zeros(Float64, 1, w+slack_count+1))
+
+        # Run the simplex algorithm again
+        solution_tableau .= tableau
+        parallel_simplex(solution_tableau, slack_count+1)
+    end
+
+    # Save the lower bounds (remembering to negate the values)
+    t.lower_bound_storage .= @views Array(-(solution_tableau[slack_count+1:slack_count+1:end,end] .- add_val))
+
+    for i in [lvbs_d, uvbs_d, eval_points, subgradients, corrected_subgradients, b_val,
+              tableau, solution_tableau, tableau_vals, variable_vals, bool_check, zero_check]
+        CUDA.unsafe_free!(i)
+    end
+    return nothing
+end
+
+function lower_and_upper_problem!(t::SimplexGPU_OnlyObj, m::EAGO.GlobalOptimizer)
+    t.lower_counter += 1
+    # Step 1) Bring the bounds into the GPU
+    lvbs_d = CuArray(t.all_lvbs[1:t.node_len,:])
+    uvbs_d = CuArray(t.all_uvbs[1:t.node_len,:]) # [points x num_vars]
+
+    # Step 2) Set up points to evaluate, which are the centers of every node
+    # l, w = size(t.all_lvbs) # points, num_vars
+    w = t.np
+    l = t.node_len
+    eval_points = (lvbs_d .+ uvbs_d)./2
+
+    # Step 3) Perform the calculations. (Note: constraint handling still needs to be
+    # added; perhaps that will be a different function, so that this one can stay as-is.)
+
+    # Perform both lower and upper bound calculations, stacked on top of one another. This
+    # is faster than splitting the lower and upper bounding problems and calling the convex
+    # function twice. (The second half of each stacked input uses degenerate bounds
+    # [midpoint, midpoint], so its relaxation collapses to the objective value at the
+    # midpoint, which serves as the node's upper bound.)
+    t.relax_time += @elapsed CUDA.@sync func_output = @views t.convex_func_and_subgrad(
+        ([[[eval_points[:,i];eval_points[:,i]],[eval_points[:,i];eval_points[:,i]],[lvbs_d[:,i];eval_points[:,i]],[uvbs_d[:,i];eval_points[:,i]]] for i=1:w]...)...) # n+2-dimensional
+    t.upper_bound_storage[1:t.node_len] .= @views Array(func_output[1][l+1:end])
+
+
+    # Step 4) Use values and subgradients to prepare the stacked Simplex tableau.
+    # Based on this procedure, we should never need an auxiliary system?
Check on that to + # be sure, because if we don't need an auxiliary system, that's much easier + + # Preallocate subgradient matrices and the b vector + subgradients = CuArray{Float64}(undef, l, w) + corrected_subgradients = CuArray{Float64}(undef, l, w) + mid_times_sub = CuArray{Float64}(undef, l, w) + low_times_sub = CuArray{Float64}(undef, l, w) + b_val = CuArray{Float64}(undef, l, 1) + + # Extract subgradients and apply a correction for shifting variables to [0,1] + subgradients .= @views hcat([func_output[i+2][1:l] for i in 1:w]...) + corrected_subgradients .= (subgradients).*(uvbs_d .- lvbs_d) + + # Calculate corrected "b" values based on the intercept at the evaluation point + # and the corrections for shifted variables + mid_times_sub .= eval_points.*subgradients + low_times_sub .= lvbs_d.*subgradients + b_val .= sum(mid_times_sub, dims=2) .- (@view func_output[1][1:l]) .- sum(low_times_sub, dims=2) + + # If there are any negative values in b, we can simply change the epigraph + # variable to be that much higher to make the minimum 0. Note that this + # won't work for constraints, it only works because it's for the objective + # function and we have "z" in the tableau. + # println("Step 1:") + # display(Array(b_val)) + add_val = 0.0 + if any(<(0.0), b_val) + add_val = -minimum(b_val) + b_val .+= add_val + end + + # Preemptively determine how many slack variables we'll need. This is going to be + # the total number of cuts (an input to this function), plus one for the lower bound + # value, plus the number of variables (since each has an upper bound of 1) + slack_count = t.max_cuts+1+w # n_cuts, lower bound, w [upper bounds] + + # Create the stacked tableau as a big array of 0's, and then we fill it in as necessary. + tableau = CUDA.zeros(Float64, l*(slack_count+1), 1+w+slack_count+1) + solution_tableau = similar(tableau) + + # Fill in the first column, corresponding to the epigraph variable + tableau[1:(slack_count+1):end,1] .= 1.0 # Lower bound constraint + tableau[2:(slack_count+1):end,1] .= 1.0 # Only one cut to consider for now + tableau[(slack_count+1):(slack_count+1):end,1] .= -1.0 # Objective row + + # Fill in the corrected subgradients in their respective columns + tableau[2:(slack_count+1):end,2:1+w] .= corrected_subgradients + + # Fill in the slack variables (besides the rows for cuts we haven't done yet) + for i = 1:slack_count + if (i<=t.max_cuts+1) && (t.max_cuts>1) && (i>2) #Reserving rows 3 to max_cuts+1 for future cuts + continue + end + tableau[i:(slack_count+1):end,1+w+i] .= 1.0 + end + + # Fill in the lower bound for the epigraph variable + # (z >= lower_bound --> -z <= -lower_bound --> -z + v1 <= -lower_bound, and z is flipped to be -z + tableau[1:(slack_count+1):end,end] .= @views -func_output[2][1:l] .+ add_val + + # Fill in the value for the first cut (b) + tableau[2:(slack_count+1):end,end] .= b_val + + # Fill in the upper bounds for all the variables, which are all 1's + for i = 1:w + tableau[t.max_cuts+1+i:(slack_count+1):end,1+i] .= 1.0 # The variable itself + tableau[t.max_cuts+1+i:(slack_count+1):end,end] .= 1.0 # The variable's upper bound (always 1 because we shifted it) + end + + # Make sure all the rightmost column values are positive (not necessary, and for some + # reason this eats up GPU memory?) 
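+    # (A plausible explanation, though not verified here: the broadcast `.< 0.0` below
+    # materializes a temporary Bool CuArray before `any` reduces it, whereas a
+    # predicate-style reduction such as `any(<(0.0), @view tableau[:,end])` performs
+    # the check as a single fused mapreduce without the temporary.)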
+ # if any((@view tableau[:,end]) .< 0.0) + # error("Check b_val, might need an auxiliary system (or more creativity)") + # end + + # Free up the func outputs since we no longer need them + CUDA.unsafe_free!.(func_output) + + + # Pass the tableau through the simplex algorithm and see what we get out of it + # (Note that the solution values will be negated. That's fine, we don't actually + # care what they are just yet.) + solution_tableau .= tableau + t.opt_time += @elapsed CUDA.@sync parallel_simplex(solution_tableau, slack_count+1) + + # Now we need to add another cut, which means we have to extract out the solution + # from the tableau, convert back into un-shifted variables, pass it back through + # the convex evaluator, and add rows to the tableau. + + # Preallocate some arrays checks we'll be using + tableau_vals = CuArray{Float64}(undef, l,(slack_count+1)) + variable_vals = CuArray{Float64}(undef, l,(slack_count+1)) + bool_check = CuArray{Bool}(undef, l,(slack_count+1)) + zero_check = CuArray{Bool}(undef, l,(slack_count+1)) + for cut = 1:t.max_cuts-1 # Two additional cuts. + # Extract solution values from the tableau to decide where the evaluations are. + # Figure out which ones are "correct" for each variable? + for i = 1:w + tableau_vals .= reshape((@view solution_tableau[:,end]), (slack_count+1),l)' + variable_vals .= reshape((@view solution_tableau[:,1+i]), (slack_count+1),l)' + bool_check .= (variable_vals .== 1.0) + zero_check .= (variable_vals .== 0.0) + bool_check .&= (count(bool_check, dims=2).==1) + bool_check .&= (count(zero_check, dims=2).==slack_count) + tableau_vals .*= bool_check + eval_points[:,i] .= @views min.(0.95, max.(0.05, sum(tableau_vals, dims=2))).*(uvbs_d[:,i] .- lvbs_d[:,i]).+lvbs_d[:,i] + end + + # Okay, so now we have the points to evaluate, we need to call the function again + t.relax_time += @elapsed CUDA.@sync func_output = @views t.convex_func_and_subgrad( + ([[eval_points[:,i],eval_points[:,i],lvbs_d[:,i],uvbs_d[:,i]] for i=1:w]...)...) # n+2-dimensional + + # As before, calculate corrected subgradients and the new b_values + subgradients .= hcat([func_output[i+2] for i in 1:w]...) 
#Only for the lower bound, not upper bound + corrected_subgradients .= (subgradients).*(uvbs_d .- lvbs_d) + mid_times_sub .= eval_points.*subgradients + low_times_sub .= lvbs_d.*subgradients + b_val .= sum(mid_times_sub, dims=2) .- func_output[1] .- sum(low_times_sub, dims=2) + + # Add in the extra factor + b_val .+= add_val + + # If any of b_val is [still] negative, update add_val and the rest of the tableau + if any(<(0.0), b_val) + update = -minimum(b_val) + b_val .+= update + tableau[1:slack_count+1:end, end] .+= update + for i = 2:(2+cut-1) + tableau[i:slack_count+1:end, end] .+= update + end + add_val += update + end + + # Free up the func outputs since we no longer need them + CUDA.unsafe_free!.(func_output) + + # We can now place in these values into the tableau in the spots we left open earlier + tableau[2+cut:slack_count+1:end, 1] .= 1.0 + tableau[2+cut:slack_count+1:end, 2:1+w] .= corrected_subgradients + tableau[2+cut:slack_count+1:end, 1+w+2+cut] .= 1.0 + tableau[2+cut:slack_count+1:end, end] .= b_val + + # Adjust the final line of each problem to be the original minimization problem + tableau[(slack_count+1):(slack_count+1):end,:] .= hcat(-CUDA.one(Float64), CUDA.zeros(Float64, 1, w+slack_count+1)) + + # Run the simplex algorithm again + solution_tableau .= tableau + t.opt_time += @elapsed CUDA.@sync parallel_simplex(solution_tableau, slack_count+1) + end + + # Save the lower bounds (remembering to negate the values) + t.lower_bound_storage[1:t.node_len] .= @views Array(-(solution_tableau[slack_count+1:slack_count+1:end,end] .- add_val)) + + for i in [lvbs_d, uvbs_d, eval_points, subgradients, corrected_subgradients, b_val, + tableau, solution_tableau, tableau_vals, variable_vals, bool_check, zero_check] + CUDA.unsafe_free!(i) + end + return nothing +end + +function lower_and_upper_problem!(t::SimplexGPU_ObjAndCons, m::EAGO.GlobalOptimizer) + ################################################################################ + ########## Step 1) Determine problem parameters + ################################################################################ + l, w = size(@view t.all_lvbs[1:t.node_len,:]) # points, num_vars + geq_len = length(t.geq_cons) + leq_len = length(t.leq_cons) + eq_len = length(t.eq_cons) + + ################################################################################ + ########## Step 2) Bring the bounds into the GPU + ################################################################################ + lvbs_d = @views CuArray(t.all_lvbs[1:l,:]) + uvbs_d = @views CuArray(t.all_uvbs[1:l,:]) # [points x num_vars] + + + ################################################################################ + ########## Step 3) Preallocate space for mutating arguments of the objective + ########## function and constraints + ################################################################################ + # Objective function storage (2*l because we include upper bound calculations) + obj_cv = CuArray{Float64}(undef, 2*l) + obj_lo = CuArray{Float64}(undef, 2*l) + obj_cvgrad =[CuArray{Float64}(undef, 2*l) for _ in 1:w] + + # GEQ constraint storage + geq_cc = [CuArray{Float64}(undef, l) for _ in 1:geq_len] + geq_ccgrad = [[CuArray{Float64}(undef, l) for _ in 1:w] for _ in 1:geq_len] + + # LEQ constraint storage + leq_cv = [CuArray{Float64}(undef, l) for _ in 1:leq_len] + leq_cvgrad = [[CuArray{Float64}(undef, l) for _ in 1:w] for _ in 1:leq_len] + + # EQ constraint storage + eq_cv = [CuArray{Float64}(undef, l) for _ in 1:eq_len] + eq_cc = [CuArray{Float64}(undef, l) 
for _ in 1:eq_len] + eq_cvgrad = [[CuArray{Float64}(undef, l) for _ in 1:w] for _ in 1:eq_len] + eq_ccgrad = [[CuArray{Float64}(undef, l) for _ in 1:w] for _ in 1:eq_len] + + + ################################################################################ + ########## Step 4) Set up points to evaluate (i.e. the centers of every node) + ################################################################################ + eval_points = (lvbs_d .+ uvbs_d)./2 + + + ################################################################################ + ########## Step 5) Calculate all required relaxations + ################################################################################ + # Objective function (first 1:l are for the lower bound, l+1:end are for upper bound. This + # is faster than calling the objective function twice due to GPU allocations in the function) + @views t.obj_fun(obj_cv, obj_lo, obj_cvgrad..., + ([[[eval_points[:,i];eval_points[:,i]],[eval_points[:,i];eval_points[:,i]], + [lvbs_d[:,i];eval_points[:,i]],[uvbs_d[:,i];eval_points[:,i]]] for i=1:w]...)...) + + # LEQ constraints + for i in 1:leq_len + @views t.leq_cons[i](leq_cv[i], leq_cvgrad[i]..., + ([[eval_points[:,j], eval_points[:,j], lvbs_d[:,j], uvbs_d[:,j]] for j=1:w]...)...) + end + + # GEQ constraints + for i in 1:geq_len + @views t.geq_cons[i](geq_cc[i], geq_ccgrad[i]..., + ([[eval_points[:,j], eval_points[:,j], lvbs_d[:,j], uvbs_d[:,j]] for j=1:w]...)...) + end + + # EQ constraints + for i in 1:eq_len + @views t.eq_cons[i](eq_cv[i], eq_cc[i], eq_cvgrad[i]..., eq_ccgrad[i]..., + ([[eval_points[:,j], eval_points[:,j], lvbs_d[:,j], uvbs_d[:,j]] for j=1:w]...)...) + end + + # Store the upper bounds + t.upper_bound_storage[1:l] .= @views Array(obj_cv[l+1:end]) + + + ################################################################################ + ########## Step 6) Create the stacked Simplex tableau + ################################################################################ + # We can start by creating the basic tableau skeleton, which doesn't require + # any of the calculated information. First, determine how many extra columns + # are needed for slack variables. We need one for each variable's upper bound + # (which will be scaled to 1), one for the lower bound of the objective function, + # and then for each cut we'll need one for the objective function and one for + # each constraint + slack_count = Int32(w + 1 + (t.max_cuts * (1 + geq_len + leq_len + 2*eq_len))) + + # In addition to slack variables, there are artificial variables. Artificial + # variables get used if the rightmost column (b) is negative, or if the + # constraint is a GEQ constraint (but not both of these, or the negatives + # cancel out). However, because all variables are scaled to [0, 1], we will + # never need an artificial variable for the "w" variable rows. Hence, there + # are (slack_count - w) artificial variable rows. This makes the total width + # of the tableau equal to 2 for the epigraph variables, plus the number of + # variables "w", plus the slack variables (slack_count), plus the number of + # artificial variables (slack_count - w), plus 1 for the final column. 
+ # The w's cancel, and we get (2*slack_count + 3) + tableau_width = Int32(2*slack_count + 3) + + # Allocate space for the tableau and working tableau, and create the basic + # stacked Simplex tableau skeleton + tableau = CUDA.zeros(Float64, l*(slack_count+2), tableau_width) + working_tableau = similar(tableau) + CUDA.@cuda blocks=l threads=768 tableau_skeleton_kernel(tableau, w, slack_count) + + # Add in a "cut" that is the lower bound of the objective function + CUDA.@cuda blocks=l threads=640 add_lower_bound_kernel(tableau, (@view obj_lo[1:l]), w+Int32(1), slack_count+Int32(2), l) + + + ################################################################################ + ########## Step 7) Add cut information + ################################################################################ + + # Now that the skeleton is in place, we can add individual rows for each of + # the calculated subgradients, for the objective and constraints + subgradients = CuArray{Float64}(undef, l, w) + scaled_subgradients = CuArray{Float64}(undef, l, w) + hyperplane_mid = CuArray{Float64}(undef, l) + hyperplane_low = CuArray{Float64}(undef, l) + b_val = CuArray{Float64}(undef, l) + thread_count = Int32(2^floor(Int, log2(w-1))) # Active threads in the rowsum operation + + # Start with the objective. Note that we must scale the subtangent hyperplanes + # to be on [0, 1] instead of the original domains. Given a hyperplane of: + # m*x + n*y = c, on [xL, xU], [yL, yU] + # We can scale to [0, 1], [0, 1] by shifting the intercept from (0, 0) to (xL, yL), + # and then multiplying the subgradient terms by (xU - xL) and (yU - yL), respectively. + # The subgradients m and n come directly from subgradient calculations, and c + # can be calculated since we know that m*(eval_x) + n*(eval_y) - c = (cv or cc) + # based on the point of evaluation (eval_x, eval_y) and the value of the relaxation + # (cv or cc), depending on which type of relaxation was calculated. This value + # can be scaled to (0,0) from (xL, yL) by subtracting the slopes times the lower + # bounds. + # That is, we can calculate: + # m' = m*(xU - xL) + # n' = n*(yU - yL) + # c' = m*(eval_x) + n*(eval_y) - (m*(xL) + n*(yL)) - (cv or cc) + # To get the final hyperplane: + # m'*x + n'*y = c' + subgradients .= @views hcat([obj_cvgrad[i][1:l] for i in 1:w]...) + scaled_subgradients .= subgradients .* (uvbs_d .- lvbs_d) + CUDA.@cuda blocks=l threads=thread_count shmem=8*w accumulate_mul_kernel(hyperplane_mid, subgradients, eval_points) + CUDA.@cuda blocks=l threads=thread_count shmem=8*w accumulate_mul_kernel(hyperplane_low, subgradients, lvbs_d) + b_val .= hyperplane_mid .- hyperplane_low .- (@view obj_cv[1:l]) + + # Now we can add in the objective cut for the l problems. + CUDA.@cuda threads=512 add_cut_kernel(tableau, scaled_subgradients, b_val, Int32(w+2), w, slack_count+Int32(2), l, slack_count, true, false) + + # Now we can do a similar process for the constraints + for i = 1:leq_len + subgradients .= @views hcat([leq_cvgrad[i][j] for j = 1:w]...) 
+ scaled_subgradients .= subgradients .* (uvbs_d .- lvbs_d) + CUDA.@cuda blocks=l threads=thread_count shmem=8*w accumulate_mul_kernel(hyperplane_mid, subgradients, eval_points) + CUDA.@cuda blocks=l threads=thread_count shmem=8*w accumulate_mul_kernel(hyperplane_low, subgradients, lvbs_d) + b_val .= hyperplane_mid .- hyperplane_low .- leq_cv[i] + CUDA.@cuda threads=512 add_cut_kernel(tableau, scaled_subgradients, b_val, Int32(w+2+i), w, slack_count+Int32(2), l, slack_count, false, false) + end + for i = 1:geq_len + subgradients .= @views hcat([geq_ccgrad[i][j] for j = 1:w]...) + scaled_subgradients .= subgradients .* (uvbs_d .- lvbs_d) + CUDA.@cuda blocks=l threads=thread_count shmem=8*w accumulate_mul_kernel(hyperplane_mid, subgradients, eval_points) + CUDA.@cuda blocks=l threads=thread_count shmem=8*w accumulate_mul_kernel(hyperplane_low, subgradients, lvbs_d) + b_val .= hyperplane_mid .- hyperplane_low .- geq_cc[i] + CUDA.@cuda threads=512 add_cut_kernel(tableau, scaled_subgradients, b_val, Int32(w+2+leq_len+i), w, slack_count+Int32(2), l, slack_count, false, true) + end + for i = 1:eq_len + # Repeat what happened for LEQ and GEQ constraints, but EQ constraints have both. + subgradients .= @views hcat([eq_cvgrad[i][j] for j = 1:w]...) + scaled_subgradients .= subgradients .* (uvbs_d .- lvbs_d) + CUDA.@cuda blocks=l threads=thread_count shmem=8*w accumulate_mul_kernel(hyperplane_mid, subgradients, eval_points) + CUDA.@cuda blocks=l threads=thread_count shmem=8*w accumulate_mul_kernel(hyperplane_low, subgradients, lvbs_d) + b_val .= hyperplane_mid .- hyperplane_low .- eq_cv[i] + CUDA.@cuda threads=512 add_cut_kernel(tableau, scaled_subgradients, b_val, Int32(w+2+leq_len+geq_len+(2*i-1)), w, slack_count+Int32(2), l, slack_count, false, false) + subgradients .= @views hcat([eq_ccgrad[i][j] for j = 1:w]...) + scaled_subgradients .= subgradients .* (uvbs_d .- lvbs_d) + CUDA.@cuda blocks=l threads=thread_count shmem=8*w accumulate_mul_kernel(hyperplane_mid, subgradients, eval_points) + CUDA.@cuda blocks=l threads=thread_count shmem=8*w accumulate_mul_kernel(hyperplane_low, subgradients, lvbs_d) + b_val .= hyperplane_mid .- hyperplane_low .- eq_cc[i] + CUDA.@cuda threads=512 add_cut_kernel(tableau, scaled_subgradients, b_val, Int32(w+2+leq_len+geq_len+(2*i)), w, slack_count+Int32(2), l, slack_count, false, true) + end + + ################################################################################ + ########## Step 8) Run the Simplex algorithm + ################################################################################ + + # Pass the tableau through the simplex algorithm and see what we get out of it + # (Note that the solution values will be negated. That's fine, we don't actually + # care what they are just yet.) + # device_synchronize() + working_tableau .= tableau + # display(Array(tableau)) + twophase_parallel_simplex(working_tableau, w, slack_count+2) + # display(Array(working_tableau)) + # Note that the solutions are on lines [slack_count+1 : slack_count+2 : end, end] + + ################################################################################ + ########## Step 9) Add additional cuts if necessary + ################################################################################ + + # Now we need to add another cut, which means we have to extract out the solution + # from the tableau, convert back into un-shifted variables, pass it back through + # the convex evaluator, and add rows to the tableau. + + # If we're doing any more cuts... 
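+    # (t.max_cuts is treated as the total number of cuts per node: the first cut was
+    # added above, and up to t.max_cuts - 1 refinement rounds follow.)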
+ if t.max_cuts > 1 + # Remake storage for the objective function, now that we don't need upper bound + # calculations anymore + CUDA.unsafe_free!(obj_cv) + CUDA.unsafe_free!(obj_lo) + CUDA.unsafe_free!.(obj_cvgrad) + obj_cv = CuArray{Float64}(undef, l) + obj_lo = CuArray{Float64}(undef, l) + obj_cvgrad =[CuArray{Float64}(undef, l) for _ in 1:w] + + # Preallocate some arrays checks we'll be using + tableau_vals = CuArray{Float64}(undef, l,(slack_count+2)) + variable_vals = CuArray{Float64}(undef, l,(slack_count+2)) + bool_check = CuArray{Bool}(undef, l,(slack_count+2)) + zero_check = CuArray{Bool}(undef, l,(slack_count+2)) + for cut = 1:t.max_cuts-1 + # Extract solution values from the tableau to decide where the evaluations are. + # Figure out which ones are "correct" for each variable? + for i = 1:w + tableau_vals .= reshape((@view working_tableau[:,end]), (slack_count+2),l)' + variable_vals .= reshape((@view working_tableau[:,2+i]), (slack_count+2),l)' + bool_check .= (variable_vals .== 1.0) + zero_check .= (variable_vals .== 0.0) + bool_check .&= (count(bool_check, dims=2).==1) + bool_check .&= (count(zero_check, dims=2).==slack_count+1) + tableau_vals .*= bool_check + eval_points[:,i] .= @views min.(0.95, max.(0.05, sum(tableau_vals, dims=2))).*(uvbs_d[:,i] .- lvbs_d[:,i]).+lvbs_d[:,i] + end + + # Okay, so now we have the points to evaluate, we need to call all the functions again + # Objective function + @views t.obj_fun(obj_cv, obj_lo, obj_cvgrad..., + ([[eval_points[:,i],eval_points[:,i],lvbs_d[:,i],uvbs_d[:,i]] for i=1:w]...)...) + + # LEQ constraints + for i in 1:leq_len + @views t.leq_cons[i](leq_cv[i], leq_cvgrad[i]..., + ([[eval_points[:,j], eval_points[:,j], lvbs_d[:,j], uvbs_d[:,j]] for j=1:w]...)...) + end + + # GEQ constraints + for i in 1:geq_len + @views t.geq_cons[i](geq_cc[i], geq_ccgrad[i]..., + ([[eval_points[:,j], eval_points[:,j], lvbs_d[:,j], uvbs_d[:,j]] for j=1:w]...)...) + end + + # EQ constraints + for i in 1:eq_len + @views t.eq_cons[i](eq_cv[i], eq_cc[i], eq_cvgrad[i]..., eq_ccgrad[i]..., + ([[eval_points[:,j], eval_points[:,j], lvbs_d[:,j], uvbs_d[:,j]] for j=1:w]...)...) + end + + # And, as before, we can add the cuts to the tableau one by one + subgradients .= @views hcat([obj_cvgrad[i] for i in 1:w]...) + scaled_subgradients .= subgradients .* (uvbs_d .- lvbs_d) + CUDA.@cuda blocks=l threads=thread_count shmem=8*w accumulate_mul_kernel(hyperplane_mid, subgradients, eval_points) + CUDA.@cuda blocks=l threads=thread_count shmem=8*w accumulate_mul_kernel(hyperplane_low, subgradients, lvbs_d) + b_val .= hyperplane_mid .- hyperplane_low .- obj_cv + + # Now we can add in the objective cut for the l problems. + # Note that the row we're adding to is moved forward depending on which + # cut we're on. + shift = Int32(cut * (1 + geq_len + leq_len + 2*eq_len)) + CUDA.@cuda threads=512 add_cut_kernel(tableau, scaled_subgradients, b_val, Int32(shift+w+2), w, slack_count+Int32(2), l, slack_count, true, false) + + # Now we can do a similar process for the constraints + for i = 1:leq_len + subgradients .= @views hcat([leq_cvgrad[i][j] for j = 1:w]...) 
+ scaled_subgradients .= subgradients .* (uvbs_d .- lvbs_d) + CUDA.@cuda blocks=l threads=thread_count shmem=8*w accumulate_mul_kernel(hyperplane_mid, subgradients, eval_points) + CUDA.@cuda blocks=l threads=thread_count shmem=8*w accumulate_mul_kernel(hyperplane_low, subgradients, lvbs_d) + b_val .= hyperplane_mid .- hyperplane_low .- leq_cv[i] + CUDA.@cuda threads=512 add_cut_kernel(tableau, scaled_subgradients, b_val, Int32(shift+w+2+i), w, slack_count+Int32(2), l, slack_count, false, false) + end + for i = 1:geq_len + subgradients .= @views hcat([geq_ccgrad[i][j] for j = 1:w]...) + scaled_subgradients .= subgradients .* (uvbs_d .- lvbs_d) + CUDA.@cuda blocks=l threads=thread_count shmem=8*w accumulate_mul_kernel(hyperplane_mid, subgradients, eval_points) + CUDA.@cuda blocks=l threads=thread_count shmem=8*w accumulate_mul_kernel(hyperplane_low, subgradients, lvbs_d) + b_val .= hyperplane_mid .- hyperplane_low .- geq_cc[i] + CUDA.@cuda threads=512 add_cut_kernel(tableau, scaled_subgradients, b_val, Int32(shift+w+2+leq_len+i), w, slack_count+Int32(2), l, slack_count, false, true) + end + for i = 1:eq_len + # Repeat what happened for LEQ and GEQ constraints, but EQ constraints have both. + subgradients .= @views hcat([eq_cvgrad[i][j] for j = 1:w]...) + scaled_subgradients .= subgradients .* (uvbs_d .- lvbs_d) + CUDA.@cuda blocks=l threads=thread_count shmem=8*w accumulate_mul_kernel(hyperplane_mid, subgradients, eval_points) + CUDA.@cuda blocks=l threads=thread_count shmem=8*w accumulate_mul_kernel(hyperplane_low, subgradients, lvbs_d) + b_val .= hyperplane_mid .- hyperplane_low .- eq_cv[i] + CUDA.@cuda threads=512 add_cut_kernel(tableau, scaled_subgradients, b_val, Int32(shift+w+2+leq_len+geq_len+(2*i-1)), w, slack_count+Int32(2), l, slack_count, false, false) + subgradients .= @views hcat([eq_ccgrad[i][j] for j = 1:w]...) 
+ scaled_subgradients .= subgradients .* (uvbs_d .- lvbs_d) + CUDA.@cuda blocks=l threads=thread_count shmem=8*w accumulate_mul_kernel(hyperplane_mid, subgradients, eval_points) + CUDA.@cuda blocks=l threads=thread_count shmem=8*w accumulate_mul_kernel(hyperplane_low, subgradients, lvbs_d) + b_val .= hyperplane_mid .- hyperplane_low .- eq_cc[i] + CUDA.@cuda threads=512 add_cut_kernel(tableau, scaled_subgradients, b_val, Int32(shift+w+2+leq_len+geq_len+(2*i)), w, slack_count+Int32(2), l, slack_count, false, true) + end + + # Run the simplex algorithm again + working_tableau .= tableau + twophase_parallel_simplex(working_tableau, w, slack_count+2) + end + for i in [tableau_vals, variable_vals, bool_check, zero_check] + CUDA.unsafe_free!(i) + end + end + + # Save the lower bounds (Note that it's the second from the bottom row in each LP, + # and we're remembering to negate the values) + t.lower_bound_storage[1:l] .= @views Array(-(working_tableau[slack_count+1:slack_count+2:end,end])) + + for i in [lvbs_d, uvbs_d, eval_points, obj_cv, obj_lo, obj_cvgrad, geq_cc, geq_ccgrad, + leq_cv, leq_cvgrad, eq_cv, eq_cc, eq_cvgrad, eq_ccgrad, subgradients, + scaled_subgradients, hyperplane_mid, hyperplane_low, b_val, tableau, + working_tableau] + if typeof(i) <: Vector + if !isempty(i) + if typeof(i[1]) <: Vector + for j in i + CUDA.unsafe_free!.(j) + end + else + CUDA.unsafe_free!.(i) + end + end + else + CUDA.unsafe_free!(i) + end + end + return nothing +end + +function lower_and_upper_problem!(t::SimplexGPU_ObjOnly_Mat, m::EAGO.GlobalOptimizer) + ################################################################################ + ########## Step 1) Determine problem parameters + ################################################################################ + l, w = size(@view t.all_lvbs[1:t.node_len,:]) # points, num_vars + t.lower_counter += 1 + t.node_counter += t.node_len + + ################################################################################ + ########## Step 2) Bring the bounds into the GPU + ################################################################################ + lvbs_d = @views CuArray(t.all_lvbs[1:l,:]) + uvbs_d = @views CuArray(t.all_uvbs[1:l,:]) # [points x num_vars] + + + ################################################################################ + ########## Step 3) Preallocate space for mutating arguments of the objective + ########## function and constraints + ################################################################################ + # Objective function storage (2*l because we include upper bound calculations) + obj_cv = CuArray{Float64}(undef, 2*l) + obj_lo = CuArray{Float64}(undef, 2*l) + obj_cvgrad = CuArray{Float64}(undef, 2*l, w) + + + ################################################################################ + ########## Step 4) Set up points to evaluate (i.e. the centers of every node) + ################################################################################ + eval_points = (lvbs_d .+ uvbs_d)./2 + + + ################################################################################ + ########## Step 5) Calculate all required relaxations + ################################################################################ + # Objective function (first 1:l are for the lower bound, l+1:end are for upper bound. 
This
+    # is faster than calling the objective function twice due to GPU allocations in the function.)
+    t.relax_time += @elapsed @views t.obj_fun(obj_cv, obj_lo, obj_cvgrad,
+        ([[[eval_points[:,i];eval_points[:,i]],[eval_points[:,i];eval_points[:,i]],
+        [lvbs_d[:,i];eval_points[:,i]],[uvbs_d[:,i];eval_points[:,i]]] for i=1:w]...)...)
+
+    # Store the upper bounds
+    t.upper_bound_storage[1:l] .= @views Array(obj_cv[l+1:end])
+
+
+    ################################################################################
+    ##########  Step 6) Create the stacked Simplex tableau
+    ################################################################################
+    # We can start by creating the basic tableau skeleton, which doesn't require
+    # any of the calculated information. First, determine how many extra columns
+    # are needed for slack variables. We need one for each variable's upper bound
+    # (which will be scaled to 1), one for the lower bound of the objective function,
+    # and one for the objective function at each cut (this version has no constraint rows)
+    slack_count = Int32(w + 1 + t.max_cuts)
+
+    # In addition to slack variables, there are artificial variables. Artificial
+    # variables get used if the rightmost column (b) is negative, or if the
+    # constraint is a GEQ constraint (but not both of these, or the negatives
+    # cancel out). However, because all variables are scaled to [0, 1], we will
+    # never need an artificial variable for the "w" variable rows. Hence, there
+    # are (slack_count - w) artificial variable rows. This makes the total width
+    # of the tableau equal to 2 for the epigraph variables, plus the number of
+    # variables "w", plus the slack variables (slack_count), plus the number of
+    # artificial variables (slack_count - w), plus 1 for the final column.
+    # The w's cancel, and we get (2*slack_count + 3)
+    tableau_width = Int32(2*slack_count + 3)
+
+    # Allocate space for the tableau and working tableau, and create the basic
+    # stacked Simplex tableau skeleton
+    tableau = CUDA.zeros(Float64, l*(slack_count+2), tableau_width)
+    working_tableau = similar(tableau)
+    CUDA.@cuda blocks=l threads=768 tableau_skeleton_kernel(tableau, w, slack_count)
+
+    # Add in a "cut" that is the lower bound of the objective function
+    CUDA.@cuda blocks=l threads=640 add_lower_bound_kernel(tableau, (@view obj_lo[1:l]), w+Int32(1), slack_count+Int32(2), l)
+
+
+    ################################################################################
+    ##########  Step 7) Add cut information
+    ################################################################################
+
+    # Now that the skeleton is in place, we can add individual rows for each of
+    # the calculated subgradients of the objective
+    subgradients = CuArray{Float64}(undef, l, w)
+    scaled_subgradients = CuArray{Float64}(undef, l, w)
+    hyperplane_mid = CuArray{Float64}(undef, l)
+    hyperplane_low = CuArray{Float64}(undef, l)
+    b_val = CuArray{Float64}(undef, l)
+    thread_count = Int32(2^floor(Int, log2(w-1))) # Active threads in the rowsum operation
+    degeneracy_flag = CUDA.zeros(Bool, l)
+    blocks = Int32(min(cld(l,512),1024))
+
+    # Start with the objective. Note that we must scale the subtangent hyperplanes
+    # to be on [0, 1] instead of the original domains.
Given a hyperplane of: + # m*x + n*y = c, on [xL, xU], [yL, yU] + # We can scale to [0, 1], [0, 1] by shifting the intercept from (0, 0) to (xL, yL), + # and then multiplying the subgradient terms by (xU - xL) and (yU - yL), respectively. + # The subgradients m and n come directly from subgradient calculations, and c + # can be calculated since we know that m*(eval_x) + n*(eval_y) - c = (cv or cc) + # based on the point of evaluation (eval_x, eval_y) and the value of the relaxation + # (cv or cc), depending on which type of relaxation was calculated. This value + # can be scaled to (0,0) from (xL, yL) by subtracting the slopes times the lower + # bounds. + # That is, we can calculate: + # m' = m*(xU - xL) + # n' = n*(yU - yL) + # c' = m*(eval_x) + n*(eval_y) - (m*(xL) + n*(yL)) - (cv or cc) + # To get the final hyperplane: + # m'*x + n'*y = c' + subgradients .= @view obj_cvgrad[1:l,:] + scaled_subgradients .= subgradients .* (uvbs_d .- lvbs_d) + CUDA.@cuda blocks=l threads=thread_count shmem=8*w accumulate_mul_kernel(hyperplane_mid, subgradients, eval_points) + CUDA.@cuda blocks=l threads=thread_count shmem=8*w accumulate_mul_kernel(hyperplane_low, subgradients, lvbs_d) + b_val .= hyperplane_mid .- hyperplane_low .- (@view obj_cv[1:l]) + + # Now we can add in the objective cut for the l problems. + CUDA.@cuda blocks=blocks threads=512 add_cut_kernel(tableau, scaled_subgradients, b_val, Int32(w+2), w, slack_count+Int32(2), l, slack_count, true, false, degeneracy_flag) + + # println("1024 and 1025 for cut: 1") + # display(Array(tableau)[1023*(slack_count+2)+1 : 1025*(slack_count+2),:]) + + ################################################################################ + ########## Step 8) Run the Simplex algorithm + ################################################################################ + + # Pass the tableau through the simplex algorithm and see what we get out of it + # (Note that the solution values will be negated. That's fine, we don't actually + # care what they are just yet.) + # device_synchronize() + working_tableau .= tableau + t.opt_time += @elapsed twophase_parallel_simplex(working_tableau, w, slack_count+2) + # twophase_parallel_simplex(working_tableau, w, slack_count+2) + # Note that the solutions are on lines [slack_count+1 : slack_count+2 : end, end] + # error() + + ################################################################################ + ########## Step 9) Add additional cuts if necessary + ################################################################################ + + # Now we need to add another cut, which means we have to extract out the solution + # from the tableau, convert back into un-shifted variables, pass it back through + # the convex evaluator, and add rows to the tableau. + + # If we're doing any more cuts... 
+ if t.max_cuts > 1 + # Remake storage for the objective function, now that we don't need upper bound + # calculations anymore + CUDA.unsafe_free!(obj_cv) + CUDA.unsafe_free!(obj_lo) + CUDA.unsafe_free!(obj_cvgrad) + obj_cv = CuArray{Float64}(undef, l) + obj_lo = CuArray{Float64}(undef, l) + obj_cvgrad = CuArray{Float64}(undef, l, w) + previous_sol = CuArray{Float64}(undef, l, w) + + # Preallocate some arrays checks we'll be using + tableau_vals = CuArray{Float64}(undef, l,(slack_count+2)) + variable_vals = CuArray{Float64}(undef, l,(slack_count+2)) + bool_check = CuArray{Bool}(undef, l,(slack_count+2)) + zero_check = CuArray{Bool}(undef, l,(slack_count+2)) + for cut = 1:t.max_cuts-1 + # Extract solution values from the tableau to decide where the evaluations are. + # Figure out which ones are "correct" for each variable + for i = 1:w + tableau_vals .= reshape((@view working_tableau[:,end]), (slack_count+2),l)' + variable_vals .= reshape((@view working_tableau[:,2+i]), (slack_count+2),l)' + bool_check .= (variable_vals .== 1.0) + zero_check .= (variable_vals .== 0.0) + bool_check .&= (count(bool_check, dims=2).==1) + bool_check .&= (count(zero_check, dims=2).==slack_count+1) + tableau_vals .*= bool_check + eval_points[:,i] .= @views min.(0.95, max.(0.05, sum(tableau_vals, dims=2))).*(uvbs_d[:,i] .- lvbs_d[:,i]).+lvbs_d[:,i] + end + + # Run a degeneracy check to compare eval_points against previous_sol. + # If there's a row where they're exactly equal, we don't add that cut. + CUDA.@cuda blocks=blocks threads=1024 degeneracy_check_kernel(degeneracy_flag, previous_sol, eval_points) + + # Set the previous solution matrix to be the current solutions + previous_sol .= eval_points + + # Okay, so now we have the points to evaluate, we need to call all the functions again + # Objective function + t.relax_time += @elapsed @views t.obj_fun(obj_cv, obj_lo, obj_cvgrad, + # @views t.obj_fun(obj_cv, obj_lo, obj_cvgrad, + ([[eval_points[:,i],eval_points[:,i],lvbs_d[:,i],uvbs_d[:,i]] for i=1:w]...)...) + + # And, as before, we can add the cuts to the tableau one by one + subgradients .= obj_cvgrad + scaled_subgradients .= subgradients .* (uvbs_d .- lvbs_d) + CUDA.@cuda blocks=l threads=thread_count shmem=8*w accumulate_mul_kernel(hyperplane_mid, subgradients, eval_points) + CUDA.@cuda blocks=l threads=thread_count shmem=8*w accumulate_mul_kernel(hyperplane_low, subgradients, lvbs_d) + b_val .= hyperplane_mid .- hyperplane_low .- obj_cv + + # Now we can add in the objective cut for the l problems. + # Note that the row we're adding to is moved forward depending on which + # cut we're on. 
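+            # (For this objective-only tableau, each refinement cut occupies exactly one
+            # additional row per block, so the offset is simply the cut index: the first
+            # cut used row w+2, cut 1 of this loop writes to row w+3, cut 2 to row w+4,
+            # and so on.)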
+ shift = Int32(cut) + CUDA.@cuda blocks=blocks threads=512 add_cut_kernel(tableau, scaled_subgradients, b_val, Int32(shift+w+2), w, slack_count+Int32(2), l, slack_count, true, false, degeneracy_flag) + + # Run the simplex algorithm again + working_tableau .= tableau + t.opt_time += @elapsed twophase_parallel_simplex(working_tableau, w, slack_count+2) + # twophase_parallel_simplex(working_tableau, w, slack_count+2) + end + for i in [tableau_vals, variable_vals, bool_check, zero_check] + CUDA.unsafe_free!(i) + end + end + + # Save the lower bounds (Note that it's the second from the bottom row in each LP, + # and we're remembering to negate the values) + t.lower_bound_storage[1:l] .= @views Array(-(working_tableau[slack_count+1:slack_count+2:end,end])) + + for i in [lvbs_d, uvbs_d, eval_points, obj_cv, obj_lo, obj_cvgrad, subgradients, + scaled_subgradients, hyperplane_mid, hyperplane_low, b_val, tableau, + working_tableau] + if typeof(i) <: Vector + if !isempty(i) + if typeof(i[1]) <: Vector + for j in i + CUDA.unsafe_free!.(j) + end + else + CUDA.unsafe_free!.(i) + end + end + else + CUDA.unsafe_free!(i) + end + end + return nothing +end + +# A fourth version of lower_and_upper_problem! that uses the new GPU Simplex algorithm +# but that only checks the midpoint of the node to get subgradients +function lower_and_upper_problem!(t::SimplexGPU_Single, m::EAGO.GlobalOptimizer) + # Step 1) Bring the bounds into the GPU + lvbs_d = CuArray(t.all_lvbs) + uvbs_d = CuArray(t.all_uvbs) # [points x num_vars] + + # Step 2) Preallocate points to evaluate + l, w = size(t.all_lvbs) #points, num_vars + np = 2 #Number of points; Center point, and one extra for upper bound calculations + eval_points = Vector{CuArray{Float64}}(undef, 3*w) #Only 3x because one is repeated (cv or cc, lo, hi) + for i = 1:w + eval_points[3i-2] = CuArray{Float64}(undef, l*np) + eval_points[3i-1] = repeat(lvbs_d[:,i], inner=np) + eval_points[3i] = repeat(uvbs_d[:,i], inner=np) + end + bounds_d = CuArray{Float64}(undef, l) + + # Step 3) Fill in the variable midpoints for each node + for i = 1:w + # Each variable's cv (or cc) value is the midpoint of the node + # (lower and upper bounds specified previously) + eval_points[3i-2][1:np:end] .= (lvbs_d[:,i].+uvbs_d[:,i])./2 + + # Now we do np:np:end. Each one is set to the center of the variable bounds, + # creating a degenerate interval. This gives us the upper bound for the node. + eval_points[3i-2][np:np:end] .= (lvbs_d[:,i].+uvbs_d[:,i])./2 + eval_points[3i-1][np:np:end] .= (lvbs_d[:,i].+uvbs_d[:,i])./2 + eval_points[3i][np:np:end] .= (lvbs_d[:,i].+uvbs_d[:,i])./2 + end + + # Step 4) Prepare the input vector for the convex function + input = Vector{CuArray{Float64}}(undef, 0) + for i = 1:w + push!(input, [eval_points[3i-2], eval_points[3i-2], eval_points[3i-1], eval_points[3i]]...) + end + + # Step 5) Perform the calculations + func_output = t.convex_func_and_subgrad(input...) # n+2-dimensional + # Also need whatever constraints!! + + # Step 6) Use values and subgradients to prepare the stacked Simplex tableau + + # First things first, we can prepare the "b" vector and see if we need any auxiliary systems. 
+ # This step calculates the intercept of b at x=x_lo, which is equivalent to calculating
+ # the intercept at x=0 and then later shifting x_lo to 0, but without the extra re-calculation
+ # steps
+ b_start = func_output[1]
+ for i = 1:w
+ b_start -= func_output[i+2].*(eval_points[3i-2] .- eval_points[3i-1])
+
+ # func_output[i+2] is the subgradient of the convex relaxation in the i'th dimension
+ # eval_points[3i-2] is the cv/cc point used to obtain the relaxation
+ # eval_points[3i-1] is the lower bound for this relaxation
+ # eval_points[3i] is the upper bound (which isn't used here)
+
+ # Note that <= [upper bound] will change to <= [upper bound] - [lower bound]
+ # for each variable, later
+ end
+
+ if all(<=(0.0), b_start)
+ # If b_start is all nonpositive, we don't need any auxiliary systems
+
+ # Start making the tableau as normal, since we have a basic feasible solution at the start.
+ # Create an extended b_array
+ b_array = vcat([vcat(-b_start[np*(j-1)+1:np*j-1], # First "1:(np-1)" points for each node
+ uvbs_d[j,:].-lvbs_d[j,:], # Upper bound minus lower bound
+ 0.0) # 0.0 for the objective function row
+ for j=1:l]...) # Repeat for every node
+
+ # Prepare the epigraph variable columns. We're minimizing "Z", but since "Z" is unbounded, we
+ # convert it to Z = Z_pos - Z_neg, where Z_pos, Z_neg >= 0.0. The first column will be Z_pos,
+ # and the second column will be Z_neg. The upper bound rows will have these as 0; the
+ # objective function row will be [1, -1] (minimizing Z_pos - Z_neg); and the constraints
+ # associated with Z will be [-1, 1] (-Z = -Z_pos + Z_neg)
+ epigraph = hcat(-CUDA.ones(Float64, length(b_array)), CUDA.ones(Float64, length(b_array)))
+ for i = 1:w
+ # Starting at the first upper bound, repeat for every tableau
+ epigraph[np+i-1 : np+w : end, :] .= 0.0
+ end
+ epigraph[np+w : np+w : end, :] .*= -1.0 # The main objective function is opposite the other rows (minimizing Z)
+
+ # Combine the epigraph columns, "A" matrix, slack variable columns, and "b" array into the stacked tableaus
+ tableaus = hcat(epigraph, # Epigraph variable columns
+ [vcat([vcat(func_output[i+2][np*(j-1)+1:np*j-1], # >>Subgradient values for the i'th variable for the j'th node
+ CUDA.zeros(Float64, w), # >>Zeros for upper bound constraints (will fill in later with 1.0s)
+ 0.0) # >>0.0 for the objective function row
+ for j = 1:l]...) # Repeat for every j'th node vertically
+ for i = 1:w]..., # Add a column for every i'th variable
+ [CUDA.zeros(Float64, length(b_array)) for _ = 1:(np-1)+w]..., # Slack variables (will fill in later with 1.0s)
+ b_array) # The array of b's
+
+ # Fill in the upper bound constraint indices and the slack variables
+ for i = 1:w
+ tableaus[np+i-1 : np+w : end, i+2] .= 1.0
+ end
+ for i = 1:(np-1)+w
+ tableaus[i:np+w:end, (w+2)+i] .= 1.0
+ end
+
+ tableaus .= parallel_simplex(tableaus, np+w)
+
+ else
+ # Some entries of b would be negative here, meaning the all-slack starting basis
+ # is infeasible, so we need to make auxiliary systems.
+ # Note: It's probably worth it to only do auxiliary systems for the tableaus that will need it. At least check
+ # to see how common this is, and whether it'll be necessary...
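+ # Sketch of the Phase-I setup built below (illustrative): each block gains one
+ # extra auxiliary objective row. Any constraint row whose b entry would be
+ # negative is sign-flipped, subtracted from the auxiliary objective row, and
+ # given an artificial variable, so Phase I starts from a feasible artificial
+ # basis and tries to drive the artificial variables to zero.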
+ + # Create the extended b_array + b_array = vcat([vcat(-b_start[np*(j-1)+1:np*j-1], # First "1:(np-1)" points for each node + uvbs_d[j,:].-lvbs_d[j,:], # Upper bound minus lower bound + 0.0, # 0.0 for the objective function row + 0.0) # 0.0 for the auxiliary system objective function row + for j=1:l]...) # Repeat for every node + + # (NOTE: Should we be scaling all the variables/subgradients so that the variables are bounded on [0, 1]?) + + # Prepare the epigraph variable columns. We're minimizing "Z", but since "Z" is unbounded, we + # convert it to Z = Z_pos - Z_neg, where Z_pos, Z_neg >= 0.0. The first column will be Z_pos, + # and the second column will be Z_neg. The upper bound rows and auxiliary system objective + # function row will have these as 0; the objective function row will be [1, -1] (minimizing + # Z_pos - Z_neg); and the constraints associated with Z will be [-1, 1] (-Z = -Z_pos + Z_neg) + epigraph = hcat(-CUDA.ones(Float64, length(b_array)), CUDA.ones(Float64, length(b_array))) + for i = 1:w + # Starting at the first upper bound, repeat for every tableau + # (which has np+w+1 rows thanks to the auxiliary row) + epigraph[np+i-1 : np+w+1 : end, :] .= 0.0 + end + epigraph[np+w : np+w+1 : end, :] .*= -1.0 # The main objective function is opposite the other rows (minimizing Z) + epigraph[np+w+1 : np+w+1 : end, :] .= 0.0 # The epigraph column is 0 in the auxiliary objective row + + # Combine the epigraph columns, "A" matrix, slack variable columns, and "b" array into the stacked tableaus + tableaus = hcat(epigraph, # Epigraph variable columns + [vcat([vcat(func_output[i+2][np*(j-1)+1:np*j-1], # >>Subgradient values for the i'th variable for the j'th node + CUDA.zeros(Float64, w), # >>Zeros for upper bound constraints (will fill in later with 1.0s) + 0.0, # >>0.0 for the objective function row + 0.0) # >>0.0 for the auxiliary objective function row + for j = 1:l]...) # Repeat for every j'th node vertically + for i = 1:w]..., # Add a column for every i'th variable + [CUDA.zeros(Float64, length(b_array)) for _ = 1:2*((np-1)+w)]..., # Slack and auxiliary variables (will fill in later with 1.0s) + b_array) # The array of b's + + # Fill in the upper bound constraint indices + for i = 1:w + tableaus[np+i-1 : np+w+1 : end, i+2] .= 1.0 #np+w+1 length now, because of the auxiliary row + end + + # Fill in the slack variables like normal, and then add auxiliary variables as needed + signs = sign.(tableaus[:,end]) + signs[signs.==0] .= 1.0 + for i = 1:np+w-1 + tableaus[i:np+w+1:end, (w+2)+i] .= 1.0 #np+w+1 length now, because of the auxiliary row + + # If the "b" row is negative, do the following: + # 1) Flip the row so that "b" is positive + # 2) Subtract the entire row FROM the auxiliary objective row + # 3) Add an auxiliary variable for this row + tableaus[i:np+w+1:end, :] .*= signs[i:np+w+1:end] #Flipped the row if b was negative + tableaus[np+w+1 : np+w+1 : end, :] .-= (signs[i:np+w+1:end].<0.0).*tableaus[i:np+w+1:end, :] #Row subtracted from auxiliary objective row + tableaus[i:np+w+1:end, (w+2)+np+w-1+i] .+= (signs[i:np+w+1:end].<0.0).*1.0 + end + + # Send the tableaus to the parallel_simplex algorithm, with the "aux" flag set to "true" + tableaus .= parallel_simplex(tableaus, np+w+1, aux=true) + + if all(abs.(tableaus[np+w+1:np+w+1:end,end]).<=1E-10) + # Delete the [np+w+1 : np+w+1 : end] rows and the [w+1+(np+w-1) + 1 : end-1] columns + # Note: is it faster to NOT remove the rows/columns and just have an adjusted simplex + # algorithm that ignores them? Maybe, maybe not. 
I'll test later.
+ tableaus = tableaus[setdiff(1:end, np+w+1:np+w+1:end), setdiff(1:end, w+2+(np+w-1):end-1)]
+ tableaus .= parallel_simplex(tableaus, np+w)
+ else
+ warn = true
+ end
+ end
+
+ # Step 7) Add results to lower and upper bound storage
+ t.lower_bound_storage .= Array(max.(func_output[2][1:np:end], -tableaus[np+w:np+w:end,end]))
+ t.upper_bound_storage .= Array(func_output[1][np:np:end])
+
+ return nothing
+end
+
+
+
+
+# The GPU Simplex algorithm (an older, superseded version kept for reference)
+function parallel_simplex_older(tableau, n_rows; step_limit=100)
+ # This version takes in a 2D matrix of stacked tableaus and performs the simplex
+ # algorithm on the entire stacked tableau. Preallocate everything that we can.
+ row_count = size(tableau, 1)
+ n_probs = Int(row_count/n_rows)
+ count_vector = CuArray{Int64}(collect(1:row_count))
+ pivot_vals = CuArray{Float64}(undef, row_count)
+ vals = CuArray{Int64}(undef, n_probs)
+ cols = CuArray{CartesianIndex{2}}(undef, n_probs)
+ col_inds = CuArray{CartesianIndex{2}}(undef, row_count)
+ ratios = CuArray{Float64}(undef, row_count)
+ rows = CuArray{CartesianIndex{2}}(undef, n_probs)
+ pivot_rows = CuArray{Int32}(undef, row_count)
+ row_inds = CuArray{CartesianIndex{2}}(undef, row_count)
+ bland_set = CuArray{Float64}(undef, n_probs, size(tableau, 2)-1)
+ bland_set_bool = CuArray{Bool}(undef, n_probs, size(tableau, 2)-1)
+ col_ind_vec = CuArray{Float64}(undef, row_count)
+ row_ind_vec = CuArray{Float64}(undef, row_count)
+ pivot_tableau = CuArray{Float64}(undef, row_count, size(tableau,2))
+
+ reached = 0
+ for _ = 1:step_limit
+ reached += 1
+ # Step 1: Identify the pivot columns following Bland's rule (picking
+ # the lowest variable index that has a negative value)
+ bland_set .= tableau[n_rows:n_rows:end,1:end-1] # 2 MiB
+ bland_set_bool .= (bland_set.<-1E-10) # None
+ mins = findmax(bland_set_bool, dims=2) # 0 MiB
+ vals .= -mins[1]
+ cols .= mins[2]
+
+ if sum(vals)==0 # I.e., if we don't need to pivot on any tableau
+ break
+ end
+
+ # Use the columns to generate cartesian indices we can refer to for each row
+ col_inds .= CartesianIndex.(count_vector, repeat(getfield.(getfield.(cols, 1), 2), inner=(n_rows,1)))
+ col_ind_vec .= tableau[col_inds]
+
+ # Step 2: Using the identified columns, calculate ratios and pick pivot rows
+ ratios .= tableau[:,end] ./ col_ind_vec
+ ratios[col_ind_vec .<= 0] .= Inf
+ ratios[ratios.==-Inf] .= Inf
+
+ if all(isinf.(ratios))
+ error("Pivot failed; at least one column is all negatives or 0s. 
Submit an issue if you get this error.")
+ break
+ end
+
+ # Apply the ratio test to each pivot column separately
+ # (Note: if this is an auxiliary system with auxiliary variables,
+ # "aux" should be set to "true", and this step will disallow the main
+ # objective function from being selected as the pivot row)
+ rows .= reshape(findmin(reshape(ratios, (n_rows,:))[1:end-1,:], dims=1)[2], (:,1))
+
+ pivot_rows .= Int.(ceil.(count_vector./n_rows).-1).*n_rows .+ repeat(getfield.(getfield.(rows, 1), 1), inner=(n_rows, 1))
+
+ row_inds .= CartesianIndex.(pivot_rows, #Rows come from the pivot rows
+ getfield.(getfield.(col_inds, 1), 2)) #Cols are from col_inds
+ row_ind_vec .= tableau[row_inds]
+
+ # Step 3: Pivot! Pivot!!!
+ # Find multiplication factors for each row, but set the pivot row's factor separately
+ pivot_vals .= -(col_ind_vec./row_ind_vec)
+ pivot_vals[getfield.(getfield.(row_inds, 1), 1)] .= (1 ./ row_ind_vec) .- 1.0
+
+ # Use the original "vals" to set pivot_vals to 0 if the val was nonnegative
+ # (i.e., don't change anything if there's no need to pivot)
+ pivot_vals[repeat(vals, inner=(n_rows,1)).>=0.0] .= 0.0
+ pivot_tableau .= tableau[pivot_rows,:]
+
+ # Adjust the tableau
+ tableau .+= pivot_vals .* pivot_tableau
+
+ # Fix values to 0.0 and 1.0
+ tableau[col_inds] .= 0.0
+ tableau[row_inds] .= 1.0
+ end
+
+ if reached==step_limit
+ error("Cycle of some sort detected; solution not guaranteed! Submit an issue if you get this error.")
+ end
+ for i in [count_vector, pivot_vals, vals, cols, col_inds,
+ ratios, rows, pivot_rows, row_inds, bland_set, bland_set_bool,
+ col_ind_vec, row_ind_vec, pivot_tableau]
+ CUDA.unsafe_free!(i)
+ end
+
+ return tableau #, tableau[n_rows:n_rows:end,end] would be the [negative] solutions only
+end
+
+function parallel_simplex_old(tableau, n_rows; step_limit=100)
+ # This version takes in a 2D matrix of stacked tableaus and performs the simplex
+ # algorithm on the entire stacked tableau. Preallocate everything that we can.
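+ # (Layout reminder: LP k occupies rows (k-1)*n_rows+1 : k*n_rows of the stacked
+ # tableau; the last row of each block is that LP's objective row, which is what
+ # Bland's rule scans below, and the last column holds the "b" values.)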
+ row_count = size(tableau, 1) + n_probs = Int(row_count/n_rows) + count_vector = CuArray{Int64}(collect(1:row_count)) + row_adds = CuArray{Int64}(collect(0:n_rows:row_count-1)) + pivot_vals = CuArray{Float64}(undef, row_count) + vals = CuArray{Int64}(undef, n_probs) + cols = CuArray{CartesianIndex{2}}(undef, n_probs) + col_inds = CuArray{CartesianIndex{2}}(undef, row_count) + ratios = CuArray{Float64}(undef, row_count) + rows = CuArray{CartesianIndex{2}}(undef, n_probs) + pivot_rows = CuArray{Int32}(undef, n_probs) + row_inds = CuArray{CartesianIndex{2}}(undef, n_probs) + bland_set = CuArray{Float64}(undef, n_probs, size(tableau, 2)-1) + bland_set_bool = CuArray{Bool}(undef, n_probs, size(tableau, 2)-1) + col_ind_vec = CuArray{Float64}(undef, row_count) + row_ind_vec = CuArray{Float64}(undef, row_count) + pivot_tableau = CuArray{Float64}(undef, row_count, size(tableau,2)) + + reached = 0 + for _ = 1:step_limit + reached += 1 + # Step 1: Identify the pivot columns following Bland's rule (picking + # the lowest variable index that has a negative value) + bland_set .= tableau[n_rows:n_rows:end,1:end-1] # 2 MiB + bland_set_bool .= (bland_set.<-1E-10) # + mins = findmax(bland_set_bool, dims=2) # + vals .= -mins[1] + cols .= mins[2] + + if sum(vals)==0 # I.e., if we don't need to pivot on any tableau + break + end + + # Use the columns to generate cartesian indices we can refer to for each row + col_inds .= CartesianIndex.(count_vector, repeat(getfield.(getfield.(cols, 1), 2), inner=(n_rows,1))) + col_ind_vec .= tableau[col_inds] + + # Step 2: Using the identified columns, calculate ratios and pick pivot rows + ratios .= tableau[:,end] ./ col_ind_vec + ratios[col_ind_vec .<= 0] .= Inf + ratios[ratios.==-Inf] .= Inf + + if all(isinf.(ratios)) + error("Pivot failed; at least one column is all negatives or 0s. Submit an issue if you get this error.") + break + end + + # Apply the ratio test to each pivot column separately + # (Note: if this is an auxiliary system with auxiliary variables, + # "aux" should be set to "true", and this step will disallow the main + # objective function from being selected as the pivot row) + rows .= reshape(findmin(reshape(ratios, (n_rows,:))[1:end-1,:], dims=1)[2], (:,1)) + + pivot_rows .= row_adds .+ getfield.(getfield.(rows, 1), 1) + + row_inds .= CartesianIndex.(pivot_rows, + getfield.(getfield.(cols, 1), 2)) + + + row_ind_vec.= repeat(tableau[row_inds], inner=(n_rows,)) + + # Step 3: Pivot! Pivot!!! + # Find multiplication factors for each row, but set the pivot row's factor separately + pivot_vals .= -(col_ind_vec./row_ind_vec) + pivot_vals[repeat(pivot_rows, inner=(n_rows,))] .= (1 ./ row_ind_vec) .- 1.0 + + # Use the original "vals" to set pivot_vals to 0 if the val was nonnegative + # (i.e., don't change anything if there's no need to pivot) + pivot_vals[repeat(vals, inner=(n_rows,1)).>=0.0] .= 0.0 + pivot_tableau .= repeat(tableau[pivot_rows,:], inner=(n_rows,1)) + + # Adjust the tableau + tableau .+= pivot_vals .* pivot_tableau + + # Fix values to 0.0 and 1.0 + tableau[col_inds] .= 0.0 + tableau[row_inds] .= 1.0 + end + + if reached==step_limit + # display(Array(tableau)) + error("Cycle of some sort detected; solution not guaranteed! 
Submit an issue if you get this error.")
+ end
+ for i in [count_vector, row_adds, pivot_vals, vals, cols, col_inds,
+ ratios, rows, pivot_rows, row_inds, bland_set, bland_set_bool,
+ col_ind_vec, row_ind_vec, pivot_tableau]
+ CUDA.unsafe_free!(i)
+ end
+
+ return tableau #, tableau[n_rows:n_rows:end,end] would be the [negative] solutions only
+end
+
+function parallel_simplex(tableau, n_rows; step_limit=100)
+ # This version takes in a 2D matrix of stacked tableaus and performs the simplex
+ # algorithm on the entire stacked tableau. Preallocate everything that we can.
+ total_n_rows = Int32(size(tableau, 1))
+ n_cols = Int32(size(tableau, 2))
+ n_rows = Int32(n_rows) #32-bit is fine for all integers
+ n_probs = Int32(total_n_rows/n_rows)
+ bland_set = CuArray{Float64}(undef, n_probs, n_cols-1)
+ bland_set_bool = CuArray{Bool}(undef, n_probs, n_cols-1)
+ pivot_cols = CuArray{Int32}(undef, n_probs)
+ pivot_col_vals = CuArray{Float64}(undef, total_n_rows)
+ final_col = CuArray{Float64}(undef, total_n_rows)
+ ratios = CuArray{Float64}(undef, total_n_rows)
+ negatives = CuArray{Bool}(undef, total_n_rows)
+ neginfs = CuArray{Bool}(undef, total_n_rows)
+ ratio_bool = CuArray{Bool}(undef, total_n_rows)
+ pivot_rows = CuArray{Int32}(undef, n_probs)
+ blocks = Int32(cld(total_n_rows, 1024))
+
+ reached = 0
+ for _ = 1:step_limit
+ reached += 1
+ # Step 1: Identify the pivot columns following Bland's rule (picking
+ # the lowest variable index that has a negative value)
+ bland_set .= @view tableau[n_rows:n_rows:end,1:end-1] # No GPU allocations
+ bland_set_bool .= (bland_set.<-1E-10) # No GPU allocations
+
+ if !(any(bland_set_bool)) # I.e., everything's 0 and we don't need to pivot anymore
+ break
+ end
+
+ # Call the first-true finder kernel. This sets "pivot_cols" to the column number
+ # of the first "true" in each row, or 0 if there aren't any "true"
+ CUDA.@cuda blocks=n_probs threads=n_cols-1 first_true_kernel(bland_set_bool, pivot_cols)
+
+ # Fill pivot_col_vals with the correct entries in tableau
+ CUDA.@cuda blocks=blocks threads=1024 access_kernel(tableau, n_rows, pivot_cols, pivot_col_vals)
+ # pivot_col_vals is the values of tableau in the pivot columns
+
+ # Step 2: Using the identified columns, calculate ratios and pick pivot rows
+ final_col .= @view tableau[:,end]
+ ratios .= final_col ./ pivot_col_vals
+ negatives .= (pivot_col_vals .<= 0.0)
+ neginfs .= isinf.(ratios)
+
+ CUDA.@cuda blocks=blocks threads=1024 set_inf_kernel(ratios, negatives)
+ CUDA.@cuda blocks=blocks threads=1024 set_inf_kernel(ratios, neginfs)
+
+ ratio_bool .= isinf.(ratios)
+ if all(ratio_bool)
+ error("Pivot failed; at least one column is all negatives or 0s. Submit an issue if you get this error.")
+ break
+ end
+
+ # Apply the ratio test to each pivot column separately... essentially,
+ # find the index of the minimum value of each column
+ CUDA.@cuda blocks=n_probs threads=n_rows shmem=12*n_rows find_min_kernel(ratios, n_rows, pivot_rows)
+ # "pivot_rows" is the pivot rows
+
+ # Now that we have the pivot columns and pivot rows for each LP,
+ # we can call the pivot kernel to perform the pivot. We need at least
+ # as many threads as there are columns in the tableau, but ideally
+ # we'd have as many threads as there are entries in the LP. And, 
+ # ideally, that'd be divisible by 32, but that's incredibly rare. 
+
+ # Best we can do is make sure it doesn't go above 1024.
+ CUDA.@cuda blocks=n_probs threads=min(Int32(896),n_rows*n_cols) shmem=8*(n_cols+n_rows) pivot_kernel(tableau, pivot_rows, pivot_cols, n_rows, n_cols, n_rows*n_cols)
+ end
+
+ if reached==step_limit
+ error("Cycle of some sort detected; solution not guaranteed! Submit an issue if you get this error.")
+ end
+ for i in [bland_set, bland_set_bool, pivot_cols, pivot_col_vals, final_col,
+ ratios, negatives, neginfs, ratio_bool, pivot_rows]
+ CUDA.unsafe_free!(i)
+ end
+ CUDA.synchronize()
+
+ return tableau # Modified in place; tableau[n_rows:n_rows:end,end] would be the [negative] solutions only
+end
+
+# A 2-phase Simplex version that works with artificial variables.
+function twophase_parallel_simplex(tableau, n_vars, n_rows; step_limit=100)
+ # This version takes in a 2D matrix of stacked tableaus and performs the simplex
+ # algorithm on the entire stacked tableau. Preallocate everything that we can.
+ total_n_rows = Int32(size(tableau, 1))
+ n_cols = Int32(size(tableau, 2))
+ n_rows = Int32(n_rows) #32-bit is fine for all integers
+ art_start = Int32(n_rows + n_vars + 1) # Since n_rows is slack_vars+2, slack+2+w+1 is the start of the artificial columns
+ n_probs = Int32(total_n_rows/n_rows)
+ bland_set = CuArray{Float64}(undef, n_probs, n_cols-1)
+ bland_set_bool = CuArray{Bool}(undef, n_probs, n_cols-1)
+ pivot_cols = CuArray{Int32}(undef, n_probs)
+ pivot_col_vals = CuArray{Float64}(undef, total_n_rows)
+ final_col = CuArray{Float64}(undef, total_n_rows)
+ ratios = CuArray{Float64}(undef, total_n_rows)
+ negatives = CuArray{Bool}(undef, total_n_rows)
+ neginfs = CuArray{Bool}(undef, total_n_rows)
+ ratio_bool = CuArray{Bool}(undef, total_n_rows)
+ pivot_rows = CuArray{Int32}(undef, n_probs)
+ blocks = Int32(cld(total_n_rows, 1024))
+ flag = CuArray{Bool}(undef, 1)
+
+ ################### PHASE 1 ###################
+
+ # In phase I, we solve a larger system to get an initial basic feasible solution (BFS)
+ # for the problem we actually want to solve. This function assumes that the input
+ # system is already in a form with artificial variables that start at "art_start".
+
+ # Solve the system normally
+ reached = 0
+ for _ = 1:step_limit
+ reached += 1
+ # Step 1: Identify the pivot columns following Bland's rule (picking
+ # the lowest variable index that has a negative value)
+ bland_set .= @view tableau[n_rows:n_rows:end,1:end-1] # No GPU allocations
+ bland_set_bool .= (bland_set.<-1E-10) # No GPU allocations
+
+ # Check if we are done pivoting, by checking if every element of bland_set_bool is "false"
+ flag .= true
+ CUDA.@cuda blocks=blocks threads=1024 not_any_kernel(flag, bland_set_bool)
+ if Array(flag)[1] # I.e., everything's 0 and we don't need to pivot anymore
+ break
+ end
+
+ # Call the first-true finder kernel. 
This sets "pivot_cols" to the column number
+ # of the first "true" in each row, or 0 if there aren't any "true"
+ CUDA.@cuda blocks=n_probs threads=n_cols-1 first_true_kernel(bland_set_bool, pivot_cols)
+
+ # Fill pivot_col_vals with the correct entries in tableau
+ CUDA.@cuda blocks=blocks threads=1024 access_kernel(tableau, n_rows, pivot_cols, pivot_col_vals)
+ # pivot_col_vals is the values of tableau in the pivot columns
+
+ # Step 2: Using the identified columns, calculate ratios and pick pivot rows
+ final_col .= @view tableau[:,end]
+ ratios .= final_col ./ pivot_col_vals
+ negatives .= (pivot_col_vals .<= 0.0)
+ neginfs .= isinf.(ratios)
+ ratios[n_rows-Int32(1):n_rows:end] .= Inf # Don't allow pivoting on the Phase II objective row
+ # Note: No need for a special "don't pivot on the Phase I objective" rule,
+ # because the pivot column value in the Phase I objective is negative,
+ # by definition of it being a pivot column. The Phase I objective ratio
+ # therefore gets set to Inf by the "negatives" check.
+
+ CUDA.@cuda blocks=blocks threads=1024 set_inf_kernel(ratios, negatives)
+ CUDA.@cuda blocks=blocks threads=1024 set_inf_kernel(ratios, neginfs)
+
+ # Check to see if all the values in ratios are "Inf" (if so, something went wrong with the pivot)
+ flag .= true
+ CUDA.@cuda blocks=blocks threads=1024 all_inf_kernel(flag, ratios)
+ if Array(flag)[1]
+ error("Pivot failed; at least one column is all negatives or 0s. Submit an issue if you get this error.")
+ break
+ end
+
+ # Apply the ratio test to each pivot column separately... essentially,
+ # find the index of the minimum value of each column
+ CUDA.@cuda blocks=n_probs threads=n_rows shmem=12*n_rows find_min_kernel(ratios, n_rows, pivot_rows)
+ # "pivot_rows" is the pivot rows
+
+ # Now that we have the pivot columns and pivot rows for each LP,
+ # we can call the pivot kernel to perform the pivot. We need at least
+ # as many threads as there are columns in the tableau, but ideally
+ # we'd have as many threads as there are entries in the LP. And,
+ # ideally, that'd be divisible by 32, but that's incredibly rare.
+ # Best we can do is make sure it doesn't go above 1024.
+ CUDA.@cuda blocks=n_probs threads=min(Int32(896),n_rows*n_cols) shmem=8*(n_cols+n_rows) pivot_kernel(tableau, pivot_rows, pivot_cols, n_rows, n_cols, n_rows*n_cols)
+ end
+
+ if reached==step_limit
+ error("Cycle of some sort detected; solution not guaranteed! Submit an issue if you get this error.")
+ end
+
+ ################### PHASE 2 ###################
+
+ # In phase II, we start from the initial BFS identified from phase I, assuming
+ # phase I ended with a BFS. Start by checking if a BFS could be found, and if
+ # not, set the phase II objective row to be all 0's with a final lower bound
+ # of +Inf (i.e., set the value to -Inf).
+ CUDA.@cuda blocks=blocks threads=1024 feasibility_check_kernel(tableau, n_rows, n_probs)
+
+ # Remake the boolean check matrices, since we no longer care about artificial columns.
+ CUDA.unsafe_free!(bland_set)
+ CUDA.unsafe_free!(bland_set_bool)
+ bland_set = CuArray{Float64}(undef, n_probs, art_start-1)
+ bland_set_bool = CuArray{Bool}(undef, n_probs, art_start-1)
+
+ # Now we can solve the smaller system. We can fully ignore the artificial
+ # variable columns and the phase I objective row. 
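+ # (Row layout per block, for reference: rows 1 through n_rows-2 hold constraints,
+ # row n_rows-1 is the Phase II objective, and row n_rows is the Phase I objective,
+ # which is ignored from here on along with the artificial columns art_start:end-1.)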
+ reached = 0
+ for _ = 1:step_limit
+ reached += 1
+ # Step 1: Identify the pivot columns following Bland's rule (picking
+ # the lowest variable index that has a negative value). Note
+ # that we're using n_rows-1, which is the Phase II objective row,
+ # and we're not including the artificial columns.
+ bland_set .= @view tableau[n_rows-1:n_rows:end,1:art_start-1] # No GPU allocations
+ bland_set_bool .= (bland_set.<-1E-10) # No GPU allocations
+
+ # Check if we are done pivoting, by checking if every element of bland_set_bool is "false"
+ flag .= true
+ CUDA.@cuda blocks=blocks threads=1024 not_any_kernel(flag, bland_set_bool)
+ if Array(flag)[1] # I.e., everything's 0 and we don't need to pivot anymore
+ break
+ end
+
+ # Call the first-true finder kernel. This sets "pivot_cols" to the column number
+ # of the first "true" in each row, or 0 if there aren't any "true"
+ CUDA.@cuda blocks=n_probs threads=art_start-1 first_true_kernel(bland_set_bool, pivot_cols)
+
+ # Fill pivot_col_vals with the correct entries in tableau
+ CUDA.@cuda blocks=blocks threads=1024 access_kernel(tableau, n_rows, pivot_cols, pivot_col_vals)
+ # pivot_col_vals is the values of tableau in the pivot columns
+
+ # Step 2: Using the identified columns, calculate ratios and pick pivot rows
+ final_col .= @view tableau[:,end]
+ ratios .= final_col ./ pivot_col_vals
+ negatives .= (pivot_col_vals .<= 0.0)
+ neginfs .= isinf.(ratios)
+ ratios[n_rows:n_rows:end] .= Inf # Don't allow pivoting on the Phase I objective row
+ # Note: No need for a special "don't pivot on the Phase II objective" rule,
+ # because the pivot column value in the Phase II objective is negative,
+ # by definition of it being a pivot column. The Phase II objective ratio
+ # therefore gets set to Inf by the "negatives" check.
+
+ CUDA.@cuda blocks=blocks threads=1024 set_inf_kernel(ratios, negatives)
+ CUDA.@cuda blocks=blocks threads=1024 set_inf_kernel(ratios, neginfs)
+
+ # Check to see if all the values in ratios are "Inf" (if so, something went wrong with the pivot)
+ flag .= true
+ CUDA.@cuda blocks=blocks threads=1024 all_inf_kernel(flag, ratios)
+ if Array(flag)[1]
+ error("Pivot failed; at least one column is all negatives or 0s. Submit an issue if you get this error.")
+ break
+ end
+
+ # Apply the ratio test to each pivot column separately... essentially,
+ # find the index of the minimum value of each column
+ CUDA.@cuda blocks=n_probs threads=n_rows shmem=12*n_rows find_min_kernel(ratios, n_rows, pivot_rows)
+ # "pivot_rows" is the pivot rows
+
+ # Now that we have the pivot columns and pivot rows for each LP,
+ # we can call the pivot kernel to perform the pivot. We need at least
+ # as many threads as there are columns in the tableau, but ideally
+ # we'd have as many threads as there are entries in the LP. And,
+ # ideally, that'd be divisible by 32, but that's incredibly rare.
+ # Best we can do is make sure it doesn't go above 1024. 
+ CUDA.@cuda blocks=n_probs threads=min(Int32(896),n_rows*n_cols) shmem=8*(n_cols+n_rows) pivot_kernel(tableau, pivot_rows, pivot_cols, n_rows, n_cols, n_rows*n_cols) + + end + + for i in [bland_set, bland_set_bool, pivot_cols, pivot_col_vals, final_col, + ratios, negatives, neginfs, ratio_bool, pivot_rows] + CUDA.unsafe_free!(i) + end + # CUDA.synchronize() + + return nothing #, tableau[n_rows:n_rows:end,end] would be the [negative] solutions only +end + +# Utility function to pick out a node from the subvector storage and make it the "current_node". +# This is used for branching, since currently we use the EAGO branch function that uses the +# "current_node" to make branching decisions. +function make_current_node!(t::ExtendGPU, m::EAGO.GlobalOptimizer) + prev = copy(t.node_storage[t.node_len]) + new_lower = t.lower_bound_storage[t.node_len] + new_upper = t.upper_bound_storage[t.node_len] + # m._upper_objective_value = copy(new_upper) + # if m._upper_objective_value < m._global_upper_bound + # m._upper_solution = (prev.lower_variable_bounds .+ prev.upper_variable_bounds)./2 + # end + t.node_len -= 1 + + m._current_node = NodeBB(prev.lower_variable_bounds, prev.upper_variable_bounds, + prev.is_integer, prev.continuous, new_lower, new_upper, + prev.depth, prev.cont_depth, prev.id, prev.branch_direction, + prev.last_branch, prev.branch_extent) +end +make_current_node!(m::EAGO.GlobalOptimizer{R,S,Q}) where {R,S,Q<:EAGO.ExtensionType} = make_current_node!(EAGO._ext(m), m) + +# (In development) A multi-start function to enable multiple runs of a solver such as IPOPT, +# before the main B&B algorithm begins +function multistart_upper!(m::EAGO.GlobalOptimizer{R,S,Q}) where {R,S,Q<:EAGO.ExtensionType} + m._current_node = EAGO.popmin!(m._stack) + t = EAGO._ext(m) + + if t.multistart_points > 1 + @warn "Multistart points above 1 not currently supported." 
+ end
+ for n = 1:t.multistart_points
+ upper_optimizer = EAGO._upper_optimizer(m)
+ MOI.empty!(upper_optimizer)
+
+ for i = 1:m._working_problem._variable_count
+ m._upper_variables[i] = MOI.add_variable(upper_optimizer)
+ end
+ EAGO._update_upper_variables!(upper_optimizer, m)
+
+ for i = 1:EAGO._variable_num(EAGO.FullVar(), m)
+ l = EAGO._lower_bound(EAGO.FullVar(), m, i)
+ u = EAGO._upper_bound(EAGO.FullVar(), m, i)
+ v = m._upper_variables[i]
+ MOI.set(upper_optimizer, MOI.VariablePrimalStart(), v, EAGO._finite_mid(l, u)) # NOTE: vary this start point per iteration n to get true multi-start behavior
+ end
+
+ # add constraints
+ ip = m._input_problem
+ EAGO._add_constraint_store_ci_linear!(upper_optimizer, ip)
+ EAGO._add_constraint_store_ci_quadratic!(upper_optimizer, ip)
+ #add_soc_constraints!(m, upper_optimizer)
+
+ # Add nonlinear evaluation block
+ MOI.set(upper_optimizer, MOI.NLPBlock(), m._working_problem._nlp_data)
+ MOI.set(upper_optimizer, MOI.ObjectiveSense(), MOI.MIN_SENSE)
+ MOI.set(upper_optimizer, MOI.ObjectiveFunction{EAGO.SAF}(), m._working_problem._objective_saf)
+
+ # Optimize the upper problem and store any candidate solution
+ MOI.optimize!(upper_optimizer)
+ EAGO._unpack_local_nlp_solve!(m, upper_optimizer)
+ EAGO.store_candidate_solution!(m)
+ end
+ push!(m._stack, m._current_node)
+end
+
+
+# Set the upper problem heuristic to only evaluate at depth 1, for now
+import EAGO: default_upper_heuristic
+function default_upper_heuristic(m::EAGO.GlobalOptimizer)
+ bool = false
+ if EAGO._current_node(m).depth==1
+ bool = true
+ end
+ return bool
+end
+
+# Add a custom branching function that branches at the midpoint
+import EAGO: select_branch_point
+function select_branch_point(t::ExtendGPU, m::EAGO.GlobalOptimizer, i)
+ return EAGO._mid(EAGO.BranchVar(), m, i)
+end
+
+# Disable epigraph reformulation, preprocessing, and postprocessing
+import EAGO: reform_epigraph_min!
+function reform_epigraph_min!(m::EAGO.GlobalOptimizer)
+ nothing
+end
+
+ import EAGO: preprocess! 
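+# Make preprocessing a no-op that always reports feasibility, so nodes pass
+# straight to the GPU lower/upper bounding routines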
function EAGO.preprocess!(t::ExtendGPU, x::EAGO.GlobalOptimizer) x._preprocess_feasibility = true diff --git a/examples/explicit_kinetic_problem.jl b/examples/explicit_kinetic_problem.jl index 489149d..284209c 100644 --- a/examples/explicit_kinetic_problem.jl +++ b/examples/explicit_kinetic_problem.jl @@ -217,13 +217,13 @@ end # Compilation run -factory = () -> EAGO.Optimizer(SubSolvers(; t = ExtendGPU(explicit_euler_gpu64, 3))) +factory = () -> EAGO.Optimizer(SubSolvers(; t = PointwiseGPU(explicit_euler_gpu64, 3, alpha=0.00002, node_limit=8192))) opt = optimizer_with_attributes(factory, "enable_optimize_hook" => true, "branch_variable" => Bool[true for i in 1:3], "force_global_solve" => true, "node_limit" => Int(3e8), "time_limit" => 10.0, - "output_iterations" => 10000) + "output_iterations" => 1) m = Model(opt) pL = [10.0, 10.0, 0.001] pU = [1200.0, 1200.0, 40.0] @@ -237,7 +237,7 @@ optimize!(m) # Run for non-compilation timing -factory = () -> EAGO.Optimizer(SubSolvers(; t = ExtendGPU(explicit_euler_gpu64, 3, alpha=0.00002))) +factory = () -> EAGO.Optimizer(SubSolvers(; t = PointwiseGPU(explicit_euler_gpu64, 3, alpha=0.00002, node_limit=8192))) opt = optimizer_with_attributes(factory, "enable_optimize_hook" => true, "branch_variable" => Bool[true for i in 1:3], "force_global_solve" => true, diff --git a/examples/polynomial_pressure_problem_methods_testing.jl b/examples/polynomial_pressure_problem_methods_testing.jl new file mode 100644 index 0000000..1a70b7e --- /dev/null +++ b/examples/polynomial_pressure_problem_methods_testing.jl @@ -0,0 +1,331 @@ + +############################################################################# +# This code uses the GPU to speed up the solve time for the polynomial +# pressure optimization example described in Alvarez2011. +############################################################################# + +# Import the necessary packages +using JuMP, EAGO +using Symbolics, SourceCodeMcCormick, CUDA, CSV, DataFrames + +# Import the ParBB algorithm +include(joinpath(@__DIR__, "ParBB", "extension.jl")) +include(joinpath(@__DIR__, "ParBB", "subroutines.jl")) + +# Import the pressure data for the polynomial equation +pressure_data = CSV.read(joinpath(@__DIR__, "polynomial_pressure_data.csv"), DataFrame) + +# Adjust CUDA settings to make sure no scalar calculations occur +CUDA.allowscalar(false) + +# Set up symbolic variables for the problem +Symbolics.@variables var_a0, var_a1, var_a2, var_b0, var_b1, var_b2, awval, aTval +func = fgen(exp(var_a0 + var_a1*awval + var_a2*awval^2 + (1/aTval)*(var_b0 + var_b1*awval + var_b2*awval^2)), constants=[awval, aTval]) + +Symbolics.@variables calc, adata +SSE_func = fgen(((calc - adata)/adata)^2, [var_a0, var_a1, var_a2, var_b0, var_b1, var_b2], [:cv, :lo, :cvgrad], constants=[adata]) + +# Set up simpler versions for the pointwise GPU method +func_simple = fgen(exp(var_a0 + var_a1*awval + var_a2*awval^2 + (1/aTval)*(var_b0 + var_b1*awval + var_b2*awval^2)), [:MC], constants=[awval, aTval]) +SSE_func_simple = fgen(((calc - adata)/adata)^2, [var_a0, var_a1, var_a2, var_b0, var_b1, var_b2], [:cv], constants=[adata]) + +@variables a0, a1, a2, b0, b1, b2, data, W, T +expr = exp(a0 + a1*W + a2*W^2 + (1/T)*(b0 + b1*W + b2*W^2)) + +new_func = fgen(((expr-data)/data)^2, constants=[data, W, T]) + +# Set up the GPU-compatible L2norm calculation +L2norm(p...) = L2norm(pressure_data, p...) +function L2norm(pressure_data::DataFrame, p...) 
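+ # Sum the convex relaxation of the squared relative error over every data
+ # point; p holds the McCormick input arrays for the six fitted parameters
+ # (in the {cv, cc, lo, hi} ordering used by fgen), as CuArrays spanning all nodes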
+ pset = pressure_data.pressure + wset = pressure_data.w + Tset = pressure_data.T + + SSE_cv = CUDA.zeros(Float64, length(p[1])) + for i = 1:length(pset) + SSE_cv .+= SSE_func_simple(pset[i], func_simple(Tset[i], wset[i], p...)...) + end + return SSE_cv +end + +# Set up the more complicated version that includes subgradients +L2norm_grad(p...) = L2norm_grad(pressure_data, p...) +function L2norm_grad(pressure_data::DataFrame, p...) + pset = pressure_data.pressure + wset = pressure_data.w + Tset = pressure_data.T + + diff_SSE_cv = similar(p[1]) + diff_SSE_lo = similar(p[1]) + diff_SSE_cvgrad_a0 = similar(p[1]) + diff_SSE_cvgrad_a1 = similar(p[1]) + diff_SSE_cvgrad_a2 = similar(p[1]) + diff_SSE_cvgrad_b0 = similar(p[1]) + diff_SSE_cvgrad_b1 = similar(p[1]) + diff_SSE_cvgrad_b2 = similar(p[1]) + + SSE_cv = CUDA.zeros(Float64, length(p[1])) + SSE_lo = CUDA.zeros(Float64, length(p[1])) + SSE_cvgrad_a0 = CUDA.zeros(Float64, length(p[1])) + SSE_cvgrad_a1 = CUDA.zeros(Float64, length(p[1])) + SSE_cvgrad_a2 = CUDA.zeros(Float64, length(p[1])) + SSE_cvgrad_b0 = CUDA.zeros(Float64, length(p[1])) + SSE_cvgrad_b1 = CUDA.zeros(Float64, length(p[1])) + SSE_cvgrad_b2 = CUDA.zeros(Float64, length(p[1])) + + + for i = 1:length(pset) + diff_SSE_cv, diff_SSE_lo, diff_SSE_cvgrad_a0, diff_SSE_cvgrad_a1, + diff_SSE_cvgrad_a2, diff_SSE_cvgrad_b0, diff_SSE_cvgrad_b1, + diff_SSE_cvgrad_b2 = SSE_func(pset[i], func(Tset[i], wset[i], p...)...) + + SSE_cv .+= diff_SSE_cv + SSE_lo .+= diff_SSE_lo + SSE_cvgrad_a0 .+= diff_SSE_cvgrad_a0 + SSE_cvgrad_a1 .+= diff_SSE_cvgrad_a1 + SSE_cvgrad_a2 .+= diff_SSE_cvgrad_a2 + SSE_cvgrad_b0 .+= diff_SSE_cvgrad_b0 + SSE_cvgrad_b1 .+= diff_SSE_cvgrad_b1 + SSE_cvgrad_b2 .+= diff_SSE_cvgrad_b2 + end + return SSE_cv, SSE_lo, SSE_cvgrad_a0, SSE_cvgrad_a1, SSE_cvgrad_a2, SSE_cvgrad_b0, SSE_cvgrad_b1, SSE_cvgrad_b2 +end + +# CPU version of the function, to be used for IPOPT +L2norm_cpu(p1::T, p2::T, p3::T, p4::T, p5::T, p6::T) where {T<:Real} = L2norm_cpu(pressure_data, p1, p2, p3, p4, p5, p6) +function L2norm_cpu(pressure_data::DataFrame, p1::T, p2::T, p3::T, p4::T, p5::T, p6::T) where {T<:Real} + SSE = zero(T) + pset = pressure_data.pressure + wset = pressure_data.w + Tset = pressure_data.T + + for i = 1:length(pset) + temp = exp(p1 + p2*wset[i] + p3*wset[i]^2 + (1/Tset[i])*(p4 + p5*wset[i] + p6*wset[i]^2)) + + SSE += ((temp - pset[i])/pset[i])^2 + end + return SSE +end + + +# Run the problem once with a short time_limit to compile all necessary functions +factory = () -> EAGO.Optimizer(SubSolvers(; t = PointwiseGPU(L2norm, 6))) +opt = optimizer_with_attributes(factory, "enable_optimize_hook" => true, + "branch_variable" => Bool[true for i in 1:6], + "force_global_solve" => true, + "node_limit" => Int(3e8), + "time_limit" => 10.0, + "output_iterations" => 10000) +model = Model(opt) +register(model,:L2norm_cpu,6,L2norm_cpu,autodiff=true) + +# Set bounds to be +/- 0.1 in every dimension, from reported results +reported = [8.7369, 27.0375, -21.4172, -2432.1378, -6955.3785, 4525.9568] +lb = reported .- 0.1 +ub = reported .+ 0.1 +@variable(model, lb[i]<= x[i=1:6] <= ub[i] ) +@NLobjective(model, Min, L2norm_cpu(x[1], x[2], x[3], x[4], x[5], x[6])) +optimize!(model) + +# Real run to get times without compilation +factory = () -> EAGO.Optimizer(SubSolvers(; t = PointwiseGPU(L2norm, 6))) +opt = optimizer_with_attributes(factory, "enable_optimize_hook" => true, + "branch_variable" => Bool[true for i in 1:6], + "force_global_solve" => true, + "node_limit" => Int(3e8), + "time_limit" => 60.0, + 
"output_iterations" => 10, + "log_on" => true) +model = Model(opt) +register(model,:L2norm_cpu,6,L2norm_cpu,autodiff=true) + +# Set bounds to be +/- 0.1 in every dimension, from reported results +reported = [8.7369, 27.0375, -21.4172, -2432.1378, -6955.3785, 4525.9568] +lb = reported .- 0.1 +ub = reported .+ 0.1 +@variable(model, lb[i]<= x[i=1:6] <= ub[i] ) +@NLobjective(model, Min, L2norm_cpu(x[1], x[2], x[3], x[4], x[5], x[6])) +optimize!(model) + + +######################################################################################### +factory = () -> EAGO.Optimizer(SubSolvers(; t = SubgradGPU(L2norm_grad, 6))) +opt = optimizer_with_attributes(factory, "enable_optimize_hook" => true, + "branch_variable" => Bool[true for i in 1:6], + "force_global_solve" => true, + "node_limit" => Int(3e8), + "time_limit" => 10.0, + "output_iterations" => 10000) +model = Model(opt) +register(model,:L2norm_cpu,6,L2norm_cpu,autodiff=true) + +# Set bounds to be +/- 0.1 in every dimension, from reported results +reported = [8.7369, 27.0375, -21.4172, -2432.1378, -6955.3785, 4525.9568] +lb = reported .- 0.1 +ub = reported .+ 0.1 +@variable(model, lb[i]<= x[i=1:6] <= ub[i] ) +@NLobjective(model, Min, L2norm_cpu(x[1], x[2], x[3], x[4], x[5], x[6])) +optimize!(model) + +# Real run to get times without compilation +factory = () -> EAGO.Optimizer(SubSolvers(; t = SubgradGPU(L2norm_grad, 6, node_limit=1000))) +opt = optimizer_with_attributes(factory, "enable_optimize_hook" => true, + "branch_variable" => Bool[true for i in 1:6], + "force_global_solve" => true, + "node_limit" => Int(3e8), + "time_limit" => 60.0, + "output_iterations" => 10, + "log_on" => true) +model = Model(opt) +register(model,:L2norm_cpu,6,L2norm_cpu,autodiff=true) + +# Set bounds to be +/- 0.1 in every dimension, from reported results +reported = [8.7369, 27.0375, -21.4172, -2432.1378, -6955.3785, 4525.9568] +lb = reported .- 0.1 +ub = reported .+ 0.1 +@variable(model, lb[i]<= x[i=1:6] <= ub[i] ) +@NLobjective(model, Min, L2norm_cpu(x[1], x[2], x[3], x[4], x[5], x[6])) +optimize!(model) + +######################################################################################### +factory = () -> EAGO.Optimizer(SubSolvers(; t = SimplexGPU(L2norm_grad, 6))) +opt = optimizer_with_attributes(factory, "enable_optimize_hook" => true, + "branch_variable" => Bool[true for i in 1:6], + "force_global_solve" => true, + "node_limit" => Int(3e8), + "time_limit" => 10.0, + "output_iterations" => 10000) +model = Model(opt) +register(model,:L2norm_cpu,6,L2norm_cpu,autodiff=true) + +# Set bounds to be +/- 0.1 in every dimension, from reported results +reported = [8.7369, 27.0375, -21.4172, -2432.1378, -6955.3785, 4525.9568] +lb = reported .- 0.1 +ub = reported .+ 0.1 +@variable(model, lb[i]<= x[i=1:6] <= ub[i] ) +@NLobjective(model, Min, L2norm_cpu(x[1], x[2], x[3], x[4], x[5], x[6])) +optimize!(model) + +# Real run to get times without compilation +factory = () -> EAGO.Optimizer(SubSolvers(; t = SimplexGPU(L2norm_grad, 6, node_limit=1000))) +opt = optimizer_with_attributes(factory, "enable_optimize_hook" => true, + "branch_variable" => Bool[true for i in 1:6], + "force_global_solve" => true, + "node_limit" => Int(3e8), + "time_limit" => 60.0, + "output_iterations" => 10, + "log_on" => true) +model = Model(opt) +register(model,:L2norm_cpu,6,L2norm_cpu,autodiff=true) + +# Set bounds to be +/- 0.1 in every dimension, from reported results +reported = [8.7369, 27.0375, -21.4172, -2432.1378, -6955.3785, 4525.9568] +lb = reported .- 0.1 +ub = reported .+ 0.1 
+@variable(model, lb[i]<= x[i=1:6] <= ub[i] ) +@NLobjective(model, Min, L2norm_cpu(x[1], x[2], x[3], x[4], x[5], x[6])) +optimize!(model) + +######################################################################################### +factory = () -> EAGO.Optimizer(SubSolvers(; t = SimplexGPU_Single(L2norm_grad, 6, node_limit=2500))) +opt = optimizer_with_attributes(factory, "enable_optimize_hook" => true, + "branch_variable" => Bool[true for i in 1:6], + "force_global_solve" => true, + "node_limit" => Int(3e8), + "time_limit" => 10.0, + "output_iterations" => 10000) +model = Model(opt) +register(model,:L2norm_cpu,6,L2norm_cpu,autodiff=true) + +# Set bounds to be +/- 0.1 in every dimension, from reported results +reported = [8.7369, 27.0375, -21.4172, -2432.1378, -6955.3785, 4525.9568] +lb = reported .- 0.1 +ub = reported .+ 0.1 +@variable(model, lb[i]<= x[i=1:6] <= ub[i] ) +@NLobjective(model, Min, L2norm_cpu(x[1], x[2], x[3], x[4], x[5], x[6])) +optimize!(model) + +# Real run to get times without compilation +factory = () -> EAGO.Optimizer(SubSolvers(; t = SimplexGPU_Single(L2norm_grad, 6, node_limit=2500))) +opt = optimizer_with_attributes(factory, "enable_optimize_hook" => true, + "branch_variable" => Bool[true for i in 1:6], + "force_global_solve" => true, + "node_limit" => Int(3e8), + "time_limit" => 60.0, + "output_iterations" => 10, + "log_on" => true) +model = Model(opt) +register(model,:L2norm_cpu,6,L2norm_cpu,autodiff=true) + +# Set bounds to be +/- 0.1 in every dimension, from reported results +reported = [8.7369, 27.0375, -21.4172, -2432.1378, -6955.3785, 4525.9568] +lb = reported .- 0.1 +ub = reported .+ 0.1 +@variable(model, lb[i]<= x[i=1:6] <= ub[i] ) +@NLobjective(model, Min, L2norm_cpu(x[1], x[2], x[3], x[4], x[5], x[6])) +optimize!(model) + +# L2norm_grad_test(8.7369, 8.7369, 8.6369, 8.8369, +# 27.0375, 27.0375, 26.9375, 27.137500000000003, +# -21.4172, -21.4172, -21.517200000000003, -21.3172, +# -2432.1378, -2432.1378, -2432.2378, -2432.0378, +# -6955.3785, -6955.3785, -6955.4785, -6955.278499999999, +# 4525.9568, 4525.9568, 4525.8568, 4526.0568) + +# L2norm_grad_test(8.7369, 8.7369, 8.7369, 8.7369, +# 27.0375, 27.0375, 27.0375, 27.0375, +# -21.4172, -21.4172, -21.4172, -21.4172, +# -2432.1378, -2432.1378, -2432.1378, -2432.1378, +# -6955.3785, -6955.3785, -6955.3785, -6955.3785, +# 4525.9568, 4525.9568, 4525.9568, 4525.9568) + +# L2norm_grad_test(p...) = L2norm_grad_test(pressure_data, p...) +# function L2norm_grad_test(pressure_data::DataFrame, p...) 
+# pset = pressure_data.pressure +# wset = pressure_data.w +# Tset = pressure_data.T + +# # diff_SSE_cv = similar(p[1]) +# # diff_SSE_lo = similar(p[1]) +# # diff_SSE_cvgrad_a0 = similar(p[1]) +# # diff_SSE_cvgrad_a1 = similar(p[1]) +# # diff_SSE_cvgrad_a2 = similar(p[1]) +# # diff_SSE_cvgrad_b0 = similar(p[1]) +# # diff_SSE_cvgrad_b1 = similar(p[1]) +# # diff_SSE_cvgrad_b2 = similar(p[1]) + +# SSE_cv = CUDA.zeros(Float64, length(p[1])) +# SSE_lo = CUDA.zeros(Float64, length(p[1])) +# SSE_cvgrad_a0 = CUDA.zeros(Float64, length(p[1])) +# SSE_cvgrad_a1 = CUDA.zeros(Float64, length(p[1])) +# SSE_cvgrad_a2 = CUDA.zeros(Float64, length(p[1])) +# SSE_cvgrad_b0 = CUDA.zeros(Float64, length(p[1])) +# SSE_cvgrad_b1 = CUDA.zeros(Float64, length(p[1])) +# SSE_cvgrad_b2 = CUDA.zeros(Float64, length(p[1])) + + +# for i = 1:length(pset) +# diff_SSE_cv, diff_SSE_lo, diff_SSE_cvgrad_a0, diff_SSE_cvgrad_a1, +# diff_SSE_cvgrad_a2, diff_SSE_cvgrad_b0, diff_SSE_cvgrad_b1, +# diff_SSE_cvgrad_b2 = SSE_func(pset[i], func(Tset[i], wset[i], p...)...) + +# # func_out = func(Tset[i], wset[i], p...) +# # @show func_out +# # @show pset[i] +# # @show SSE_func(pset[i], func_out...) +# # @show nothing +# # error() + +# # @show diff_SSE_lo + +# SSE_cv .+= diff_SSE_cv +# SSE_lo .+= diff_SSE_lo +# SSE_cvgrad_a0 .+= diff_SSE_cvgrad_a0 +# SSE_cvgrad_a1 .+= diff_SSE_cvgrad_a1 +# SSE_cvgrad_a2 .+= diff_SSE_cvgrad_a2 +# SSE_cvgrad_b0 .+= diff_SSE_cvgrad_b0 +# SSE_cvgrad_b1 .+= diff_SSE_cvgrad_b1 +# SSE_cvgrad_b2 .+= diff_SSE_cvgrad_b2 +# end +# return SSE_cv, SSE_lo, SSE_cvgrad_a0, SSE_cvgrad_a1, SSE_cvgrad_a2, SSE_cvgrad_b0, SSE_cvgrad_b1, SSE_cvgrad_b2 +# end \ No newline at end of file diff --git a/src/SourceCodeMcCormick.jl b/src/SourceCodeMcCormick.jl index f2e0706..0f68330 100644 --- a/src/SourceCodeMcCormick.jl +++ b/src/SourceCodeMcCormick.jl @@ -44,10 +44,15 @@ function transform_rule end include(joinpath(@__DIR__, "interval", "interval.jl")) include(joinpath(@__DIR__, "relaxation", "relaxation.jl")) include(joinpath(@__DIR__, "transform", "transform.jl")) +include(joinpath(@__DIR__, "grad", "grad.jl")) export McCormickIntervalTransform, IntervalTransform export apply_transform, all_evaluators, convex_evaluator, extract_terms, - genvar, genparam, get_name, factor, binarize!, pull_vars, shrink_eqs + genvar, genparam, get_name, factor, binarize!, pull_vars, shrink_eqs, + grad, shrink_grad!, convex_subgradient, all_subgradients, grad_transform!, + levels, eqn_edges, eval_generator, grad_eval_generator, fgen_cv, fgen_cvgrad, + fgen, fgen2 +export @variables, Num end \ No newline at end of file diff --git a/src/grad/grad.jl b/src/grad/grad.jl new file mode 100644 index 0000000..632153b --- /dev/null +++ b/src/grad/grad.jl @@ -0,0 +1,321 @@ + +# Can remove the import statement once this is fully incorporated into SCMC +# import SourceCodeMcCormick: xstr, ystr, zstr, var_names, arity, op, transform_rule + + +""" + grad(::Num; force::Bool) + grad(::Num, ::Vector{Num}; base_level::Bool, force::Bool) + +Given a symbolic expression, return vectors of expressions +representing the subgradients of the convex and concave +relaxations. Inputs are the expression that subgradients are +requested for and, optionally, the dimensions that are needed. +If no `Vector{Num}` is given, subgradients will be produced +with respect to all variables found in the expression. + +By default, `grad` will assume that the input is a subexpression +which is part of a larger expression that subgradients are desired +for. 
Therefore, values for the gradients will be required as inputs
+to the created functions. Alternatively, if `base_level` is set
+to `true`, gradients will be constructed of `0`s and a `1` based
+on the order of variables given in the `Vector{Num}`.
+
+This function includes a check to make sure excessively large
+substitutions are not being made, which can stall Julia for
+a potentially long period of time. If large substitutions are detected,
+`grad` will back out of calculations and warn the user. This
+functionality can be suppressed by setting `force=true`.
+
+# Example
+
+```
+cvgrad, ccgrad = grad(x*y, [x,y,z], base_level=true);
+```
+
+Here, subgradients are requested for the expression `x*y`. The user
+has indicated that the full expression being considered is 3-dimensional,
+with dimensions `[x,y,z]`, so the resulting gradient expressions will
+also be 3-dimensional. E.g., `cvgrad` will be a 3-element Vector{Num}
+with elements `cvgrad[1]` being the x-component of the subgradient of
+the convex relaxation of `x*y`, `cvgrad[2]` being the y-component, and
+`cvgrad[3]` being the z-component. Because `base_level` has been set to
+`true`, the gradients of `x`, `y`, and `z` are internally set to
+`[1,0,0]`, `[0,1,0]`, and `[0,0,1]`, respectively, prior to creating
+the subgradient expressions for the convex relaxation of `x*y`. Note
+that all of the above also applies to `ccgrad`, which contains the
+subgradients of the concave relaxation of the input expression.
+
+If `base_level` were not set to `true` (and instead retained its default
+value of `false`), the resulting subgradient expressions would be functions
+of `[dx/dx, dx/dy, dx/dz]`, `[dy/dx, dy/dy, dy/dz]`, and `[dz/dx, dz/dy, dz/dz]`,
+respectively. This may be important if expressions are broken into subexpressions
+that are individually fed to `grad`, e.g.:
+
+```
+cvgrad, ccgrad = grad(a*b, [x,y,z])
+```
+
+In this case, `a` and `b` may be composite, intermediate terms, which contain
+the base-level variables `x`, `y`, and `z`. The resulting expressions from this
+call of `grad` would then require inputs of the McCormick tuples of `a` and `b`,
+as well as values for `[da/dx, da/dy, da/dz]` and `[db/dx, db/dy, db/dz]`.
+
+If `grad` is called with only the first argument, `base_level` will be
+assumed to be `true` (i.e., it is assumed that a user who writes an
+expression and wants only the variables present in that expression to be
+used in creating the subgradient expressions is likely working with
+base-level variables only). 
+""" +grad(num::Num; force::Bool=false, expand::Bool=false, constants::Vector{Num}=Num[]) = grad(num, pull_vars(num), force=force, expand=expand, constants=constants) +function grad(num::Num, varlist_in::Vector{Num}; force::Bool=false, expand::Bool=false, constants::Vector{Num}=Num[]) + + # Create a new varlist to keep track of which gradients are being tracked + orig_len = length(varlist_in) + varlist_in_string = string.(get_name.(varlist_in)) + varlist = copy(varlist_in_string) + + # Add in variables that weren't included in the varlist + all_vars = string.(get_name.(pull_vars(num))) + for var in all_vars + if !(var in varlist) + push!(varlist, var) + end + end + base_vars = copy(varlist) + base_len = length(varlist) + + # Factorize the equation to generate a new set of equations + @variables result + eqn = result ~ num + equations = Equation[] + factor(eqn.rhs, eqs=equations) + if length(equations) > 0 + push!(equations, Equation(eqn.lhs, equations[end].rhs)) + deleteat!(equations, length(equations)-1) + else + index = findall(x -> isequal(x.rhs, eqn.rhs), equations) + push!(equations, Equation(eqn.lhs, equations[index[1]].lhs)) + end + + # Add in auxiliary variables that appeared during factorization + varlist = [varlist; string.(get_name.(getfield.(equations, :lhs)))] + + # Now we need to create vectors of type Num, of size (orig_len, length(varlist)) + cv_gradlist = Num.(zeros(orig_len, length(varlist))) + cc_gradlist = Num.(zeros(orig_len, length(varlist))) + for i in 1:base_len + if !expand && (base_vars[i] in varlist_in_string) + # If the variable is in the input list, we know it's a base-level variable and we know its gradient + cv_gradlist[i,i] = Num(1.0) + cc_gradlist[i,i] = Num(1.0) + elseif string(base_vars[i]) in string.(constants) + # If the variable is a constant input, the gradients are always zero + nothing + else + for j = 1:orig_len + # If the variable isn't in the list, assume it's some function of the base-level variables + cv_gradlist[j,i] = genvar(Symbol("∂"*varlist[i]*"∂"*varlist_in_string[j]*"_cv")) + cc_gradlist[j,i] = genvar(Symbol("∂"*varlist[i]*"∂"*varlist_in_string[j]*"_cc")) + end + end + end + + # Apply transform rules to the factored equations to make the final equation set. + # Within each equation, we also want to update the gradlist. + new_equations = Equation[] + for a in equations + zn = var_names(McCormickIntervalTransform(), zstr(a)) + if string(xstr(a)) in string.(constants) + xn = (xstr(a), xstr(a), xstr(a), xstr(a)) + else + xn = var_names(McCormickIntervalTransform(), xstr(a)) + end + if isone(arity(a)) + targs = (McCormickIntervalTransform(), op(a), zn..., xn...) + else + if string(ystr(a)) in string.(constants) + yn = (ystr(a), ystr(a), ystr(a), ystr(a)) + else + yn = var_names(McCormickIntervalTransform(), ystr(a)) + end + targs = (McCormickIntervalTransform(), op(a), zn..., xn..., yn...) + end + new = transform_rule(targs...) 
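+ # transform_rule returns the set of McCormick/interval relaxation equations
+ # for this factor; append them to the running equation list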
+ for i in new
+ push!(new_equations, i)
+ end
+
+ # Apply the appropriate transform rule to propagate the subgradients
+ # in cv_gradlist and cc_gradlist
+ grad_transform!(targs..., varlist, cv_gradlist, cc_gradlist)
+ end
+
+ # Shrink the equations in new_equations and use the substitution process
+ # to simultaneously eliminate auxiliary variables in the subgradient
+ # vectors for the final expression
+ shrink_grad!(cv_gradlist, cc_gradlist, new_equations, force=force)
+
+ # Return only the subgradients of the final term
+ return cv_gradlist[:,end], cc_gradlist[:,end]
+end
+grad(a::SymbolicUtils.BasicSymbolic; force::Bool=false, expand::Bool=false, constants::Vector{Num}=Num[]) = grad(Num(a), force=force, expand=expand, constants=constants)
+grad(a::SymbolicUtils.BasicSymbolic, b::Vector{Num}; force::Bool=false, expand::Bool=false, constants::Vector{Num}=Num[]) = grad(Num(a), b, force=force, expand=expand, constants=constants)
+
+# The subgradients are constructed in sequence, but they are composed of terms of the McCormick
+# tuples of the original variables and new auxiliary variables. Therefore we only need to
+# substitute out auxiliary variables in the right-most column of each gradlist, which contains
+# the subgradient of the original input `Num` in `grad`.
+function shrink_grad!(cv_gradlist::Matrix{Num}, cc_gradlist::Matrix{Num}, eqs::Vector{Equation}; force::Bool=false)
+ new_eqs = copy(eqs)
+ for _ in 1:length(eqs)-4
+ # Perform the same substitution as in shrink_eqs
+ lhs = string(new_eqs[1].lhs)
+ replace = [false; in.(lhs, [string.(x) for x in pull_vars.(new_eqs[2:end])])]
+ replacecount = sum(length.(collect.(eachmatch.(Regex("$(lhs)"), string.(new_eqs[2:end])))))
+ if !force && length(string(new_eqs[1].rhs))*replacecount > 10000000
+ @warn """Your expression may be too complicated for SourceCodeMcCormick to handle
+ without using substantial CPU memory. Consider breaking your expression
+ into smaller components using `all_evaluators` and user-defined code.
+ (or use the option `force=true` to force this operation to continue)"""
+ return
+ end
+ new_eqs[replace] = substitute(new_eqs[replace], Dict(new_eqs[1].lhs => new_eqs[1].rhs))
+
+ # But also substitute into the right-most column of the gradlists
+ replace = in.(lhs, [string.(x) for x in pull_vars.(cv_gradlist[:,end])])
+ replacecount = sum(length.(collect.(eachmatch.(Regex("$(lhs)"), string.(cv_gradlist[:,end])))))
+ if !force && length(string(new_eqs[1].rhs))*replacecount > 10000000
+ @warn """Your expression may be too complicated for SourceCodeMcCormick to handle
+ without using substantial CPU memory. Consider breaking your expression
+ into smaller components using `all_evaluators` and user-defined code.
+ (or use the option `force=true` to force this operation to continue)"""
+ return
+ end
+ cv_gradlist[replace,end] = substitute(cv_gradlist[replace,end], Dict(new_eqs[1].lhs => new_eqs[1].rhs))
+
+ replace = in.(lhs, [string.(x) for x in pull_vars.(cc_gradlist[:,end])])
+ replacecount = sum(length.(collect.(eachmatch.(Regex("$(lhs)"), string.(cc_gradlist[:,end])))))
+ if !force && length(string(new_eqs[1].rhs))*replacecount > 10000000
+ @warn """Your expression may be too complicated for SourceCodeMcCormick to handle
+ without using substantial CPU memory. 
Consider breaking your expression
+                     into smaller components using `all_evaluators` and user-defined code.
+                     (or use the option `force=true` to force this operation to continue)"""
+            return
+        end
+        cc_gradlist[replace,end] = substitute(cc_gradlist[replace,end], Dict(new_eqs[1].lhs => new_eqs[1].rhs))
+        new_eqs = new_eqs[2:end]
+    end
+
+    # The McCormick object of the final expression may also be needed for gradient
+    # calculations, so we also substitute the remaining equations into the gradlists.
+    for i in eachindex(new_eqs)
+        # Don't shrink new_eqs anymore, only perform substitutions into the gradlists
+        lhs = string(new_eqs[i].lhs)
+        replace = in.(lhs, [string.(x) for x in pull_vars.(cv_gradlist[:,end])])
+        replacecount = sum(length.(collect.(eachmatch.(Regex("$(lhs)"), string.(cv_gradlist[:,end])))))
+        if !force && length(string(new_eqs[i].rhs))*replacecount > 10000000
+            @warn """Your expression may be too complicated for SourceCodeMcCormick to handle
+                     without using substantial CPU memory. Consider breaking your expression
+                     into smaller components using `all_evaluators` and user-defined code.
+                     (or use the option `force=true` to force this operation to continue)"""
+            return
+        end
+        cv_gradlist[replace,end] = substitute(cv_gradlist[replace,end], Dict(new_eqs[i].lhs => new_eqs[i].rhs))
+
+        replace = in.(lhs, [string.(x) for x in pull_vars.(cc_gradlist[:,end])])
+        replacecount = sum(length.(collect.(eachmatch.(Regex("$(lhs)"), string.(cc_gradlist[:,end])))))
+        if !force && length(string(new_eqs[i].rhs))*replacecount > 10000000
+            @warn """Your expression may be too complicated for SourceCodeMcCormick to handle
+                     without using substantial CPU memory. Consider breaking your expression
+                     into smaller components using `all_evaluators` and user-defined code.
+                     (or use the option `force=true` to force this operation to continue)"""
+            return
+        end
+        cc_gradlist[replace,end] = substitute(cc_gradlist[replace,end], Dict(new_eqs[i].lhs => new_eqs[i].rhs))
+    end
+
+    # Signal successful completion (an early abort above returns `nothing`)
+    return true
+end
+
+
+# Need a function like convex_evaluator, but for subgradients.
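+# Illustrative usage sketch for the functions above and below (the expression and
+# the variables `x` and `y` are hypothetical, chosen only for demonstration):
+#
+#   using Symbolics
+#   @variables x, y
+#   cv_subgrad, cc_subgrad = grad(x*y + y)
+#   # `cv_subgrad[1]` is a symbolic expression for the first component of a
+#   # convex-relaxation subgradient, in terms of {x_cv, x_cc, x_lo, x_hi, ...}
+#   cv_funcs, cc_funcs, ordered_vars = all_subgradients(x*y + y, expand=true)
+#   # `ordered_vars` lists the McCormick-tuple inputs the compiled functions expect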
+convex_subgradient(term::BasicSymbolic; force::Bool=false, constants::Vector{Num}=Num[]) = convex_subgradient(Num(term), force=force, constants=constants)
+convex_subgradient(term::Num; force::Bool=false, constants::Vector{Num}=Num[]) = convex_subgradient(term, pull_vars(term), force=force, constants=constants)
+function convex_subgradient(term::Num, varlist::Vector{Num}; force::Bool=false, constants::Vector{Num}=Num[])
+    if exprtype(term.val) == ADD
+        cvgrad = zeros(Num, length(varlist))
+        for (key,val) in term.val.dict
+            new_cvgrad, _ = grad(val*key, varlist, force=force, constants=constants)
+            if isnothing(new_cvgrad)
+                return
+            end
+            cvgrad .+= new_cvgrad
+        end
+
+        ordered_vars = pull_vars(cvgrad)
+        func_list = []
+        for i = eachindex(cvgrad)
+            @eval new_func = $(build_function(cvgrad[i], ordered_vars..., expression=Val{true}))
+            push!(func_list, new_func)
+        end
+    else
+        cvgrad, _ = grad(term, varlist, force=force, constants=constants)
+        if isnothing(cvgrad)
+            return
+        end
+        ordered_vars = pull_vars(cvgrad)
+        func_list = []
+        for i = eachindex(cvgrad)
+            @eval new_func = $(build_function(cvgrad[i], ordered_vars..., expression=Val{true}))
+            push!(func_list, new_func)
+        end
+    end
+    return func_list, ordered_vars
+end
+
+all_subgradients(term::BasicSymbolic; force::Bool=false, expand::Bool=false, constants::Vector{Num}=Num[]) = all_subgradients(Num(term), force=force, expand=expand, constants=constants)
+all_subgradients(term::Num; force::Bool=false, expand::Bool=false, constants::Vector{Num}=Num[]) = all_subgradients(term, pull_vars(term), force=force, expand=expand, constants=constants)
+function all_subgradients(term::Num, varlist::Vector{Num}; force::Bool=false, expand::Bool=false, constants::Vector{Num}=Num[])
+    if exprtype(term.val) == ADD
+        cvgrad = zeros(Num, length(varlist))
+        ccgrad = zeros(Num, length(varlist))
+        for (key,val) in term.val.dict
+            new_cvgrad, new_ccgrad = grad(val*key, varlist, force=force, expand=expand, constants=constants)
+            if isnothing(new_cvgrad)
+                return
+            end
+            cvgrad .+= new_cvgrad
+            ccgrad .+= new_ccgrad
+        end
+
+        ordered_vars = pull_vars(cvgrad + ccgrad)
+        cv_func_list = []
+        cc_func_list = []
+        for i = eachindex(cvgrad)
+            @eval new_func = $(build_function(cvgrad[i], ordered_vars..., expression=Val{true}))
+            push!(cv_func_list, new_func)
+            @eval new_func = $(build_function(ccgrad[i], ordered_vars..., expression=Val{true}))
+            push!(cc_func_list, new_func)
+        end
+    else
+        cvgrad, ccgrad = grad(term, varlist, force=force, expand=expand, constants=constants)
+        if isnothing(cvgrad)
+            return
+        end
+        ordered_vars = pull_vars(cvgrad + ccgrad)
+        cv_func_list = []
+        cc_func_list = []
+        for i = eachindex(cvgrad)
+            @eval new_func = $(build_function(cvgrad[i], ordered_vars..., expression=Val{true}))
+            push!(cv_func_list, new_func)
+            @eval new_func = $(build_function(ccgrad[i], ordered_vars..., expression=Val{true}))
+            push!(cc_func_list, new_func)
+        end
+    end
+    return cv_func_list, cc_func_list, ordered_vars
+end
+
+include(joinpath(@__DIR__, "rules.jl")) diff --git a/src/grad/rules.jl b/src/grad/rules.jl new file mode 100644 index 0000000..e34bd2b --- /dev/null +++ b/src/grad/rules.jl @@ -0,0 +1,621 @@ +
+# This file is specific to gradient calculations and assumes the presence of
+# a matrix of gradients::Matrix{Num}
+
+
+#=
+Unary Rules
+=#
+function grad_transform!(::McCormickIntervalTransform, ::Nothing, zL, zU, zcv, zcc, xL, xU,
+                         xcv, xcc, varlist::Vector{String}, cv_gradlist::Matrix{Num}, cc_gradlist::Matrix{Num})
+    z = findfirst(x -> x==string(zL)[1:end-3], 
varlist) + x = findfirst(x -> x==string(xL)[1:end-3], varlist) + + # Separate variable check, in case x is a constant + if isnothing(x) + x = findfirst(x -> x==string(xL), varlist) + end + + cv_gradlist[:,z] .= cv_gradlist[:,x] + cc_gradlist[:,z] .= cc_gradlist[:,x] + return +end +function grad_transform!(::McCormickIntervalTransform, ::typeof(getindex), zL, zU, zcv, zcc, xL, xU, + xcv, xcc, varlist::Vector{String}, cv_gradlist::Matrix{Num}, cc_gradlist::Matrix{Num}) + z = findfirst(x -> x==string(zL)[1:end-3], varlist) + x = findfirst(x -> x==string(xL)[1:end-3], varlist) + + # Separate variable check, in case x is a constant + if isnothing(x) + x = findfirst(x -> x==string(xL), varlist) + end + + cv_gradlist[:,z] .= cv_gradlist[:,x] + cc_gradlist[:,z] .= cc_gradlist[:,x] + return +end +function grad_transform!(::McCormickIntervalTransform, ::typeof(exp), zL, zU, zcv, zcc, xL, xU, + xcv, xcc, varlist::Vector{String}, cv_gradlist::Matrix{Num}, cc_gradlist::Matrix{Num}) + z = findfirst(x -> x==string(zL)[1:end-3], varlist) + x = findfirst(x -> x==string(xL)[1:end-3], varlist) + + # Separate variable check, in case x is a constant + if isnothing(x) + x = findfirst(x -> x==string(xL), varlist) + end + + # For cv_grad, we do a mid operation using {xcc, xcv, xL}, and use {exp(xcc), exp(xcv), 0.0} accordingly + @. cv_gradlist[:,z] = mid_grad(xcc, xcv, xL, cc_gradlist[:,x]*exp(xcc), cv_gradlist[:,x]*exp(xcv), 0.0) + + # For cc_grad, we do the same as above, but we need to check that the interval is non-degenerate first + @. cc_gradlist[:,z] = IfElse.ifelse(xU - xL == 0.0, 0.0, mid_grad(xcc, xcv, xU, cc_gradlist[:,x]*((exp(xU)-exp(xL))/(xU-xL)), cv_gradlist[:,x]*((exp(xU)-exp(xL))/(xU-xL)), 0.0)) + return +end + +#= +Binary Rules +=# +function grad_transform!(::McCormickIntervalTransform, ::typeof(+), zL, zU, zcv, zcc, xL, xU, + xcv, xcc, yL, yU, ycv, ycc, varlist::Vector{String}, cv_gradlist::Matrix{Num}, cc_gradlist::Matrix{Num}) + # Identify which variables are being used + z = findfirst(x -> x==string(zL)[1:end-3], varlist) + x = findfirst(x -> x==string(xL)[1:end-3], varlist) + y = findfirst(x -> x==string(yL)[1:end-3], varlist) + + # Separate variable check, in case x or y are constants + if isnothing(x) + x = findfirst(x -> x==string(xL), varlist) + end + if isnothing(y) + y = findfirst(x -> x==string(yL), varlist) + end + + # Modify the gradlist accordingly + cv_gradlist[:, z] .= cv_gradlist[:, x] .+ cv_gradlist[:, y] + cc_gradlist[:, z] .= cc_gradlist[:, x] .+ cc_gradlist[:, y] + return +end +function grad_transform!(::McCormickIntervalTransform, ::typeof(+), zL, zU, zcv, zcc, xL, xU, + xcv, xcc, yL::Real, yU::Real, ycv::Real, ycc::Real, varlist::Vector{String}, cv_gradlist::Matrix{Num}, cc_gradlist::Matrix{Num}) + # Identify which variables are being used + z = findfirst(x -> x==string(zL)[1:end-3], varlist) + x = findfirst(x -> x==string(xL)[1:end-3], varlist) + + # Separate variable check, in case x is a constant + if isnothing(x) + x = findfirst(x -> x==string(xL), varlist) + end + + # Modify the gradlist accordingly + cv_gradlist[:, z] .= cv_gradlist[:, x] + cc_gradlist[:, z] .= cc_gradlist[:, x] +end +function grad_transform!(::McCormickIntervalTransform, ::typeof(*), zL, zU, zcv, zcc, xL, xU, + xcv, xcc, yL, yU, ycv, ycc, varlist::Vector{String}, cv_gradlist::Matrix{Num}, cc_gradlist::Matrix{Num}) + # Identify which variables are being used + z = findfirst(x -> x==string(zL)[1:end-3], varlist) + x = findfirst(x -> x==string(xL)[1:end-3], varlist) + y = findfirst(x -> 
x==string(yL)[1:end-3], varlist) + + # Separate variable check, in case x or y are constants + if isnothing(x) + x = findfirst(x -> x==string(xL), varlist) + end + if isnothing(y) + y = findfirst(x -> x==string(yL), varlist) + end + + # Need to include math for {L, U, cv, cc} without the cut operator, so the cut() operator + # can be used here. + rL = IfElse.ifelse(yL >= 0.0, + IfElse.ifelse(xL >= 0.0, xL*yL, + IfElse.ifelse(xU <= 0.0, xL*yU, xL*yU)), + IfElse.ifelse(yU <= 0.0, + IfElse.ifelse(xL >= 0.0, xU*yL, + IfElse.ifelse(xU <= 0.0, xU*yU, xU*yL)), + IfElse.ifelse(xL > 0.0, xU*yL, + IfElse.ifelse(xU < 0.0, xL*yU, min(xL*yU, xU*yL))))) + rU = IfElse.ifelse(yL >= 0.0, + IfElse.ifelse(xL >= 0.0, xU*yU, + IfElse.ifelse(xU <= 0.0, xU*yL, xU*yU)), + IfElse.ifelse(yU <= 0.0, + IfElse.ifelse(xL >= 0.0, xL*yU, + IfElse.ifelse(xU <= 0.0, xL*yL, xL*yL)), + IfElse.ifelse(xL > 0.0, xU*yU, + IfElse.ifelse(xU < 0.0, xL*yL, max(xL*yL, xU*yU))))) + rcv = IfElse.ifelse(xL >= 0.0, + IfElse.ifelse(yL >= 0.0, max(yU*xcv + xU*ycv - xU*yU, yL*xcv + xL*ycv - xL*yL), + IfElse.ifelse(yU <= 0.0, -min((-yU)*xcc + xU*(-ycv) - xU*(-yU), (-yL)*xcc + xL*(-ycv) - xL*(-yL)), + max(yU*xcv + xU*ycv - xU*yU, yL*xcc + xL*ycv - xL*yL))), + IfElse.ifelse(xU <= 0.0, + IfElse.ifelse(yL >= 0.0, -min(yL*(-xcv) + (-xL)*ycc - (-xL)*yL, yU*(-xcv) + (-xU)*ycc - (-xU)*yU), + IfElse.ifelse(yU <= 0.0, max(yL*xcc + xL*ycc - xL*yL, yU*xcc + xU*ycc - xU*yU), + -min(yL*(-xcc) + (-xL)*ycc - (-xL)*yL, yU*(-xcv) + (-xU)*ycc - (-xU)*yU))), + IfElse.ifelse(yL >= 0.0, max(xU*ycv + yU*xcv - yU*xU, xL*ycc + yL*xcv - yL*xL), + IfElse.ifelse(yU <= 0.0, -min(xL*(-ycc) + (-yL)*xcc - (-yL)*xL, xU*(-ycv) + (-yU)*xcc - (-yU)*xU), + max(yU*xcv + xU*ycv - xU*yU, yL*xcc + xL*ycc - xL*yL))))) + rcc = IfElse.ifelse(xL >= 0.0, + IfElse.ifelse(yL >= 0.0, min(yL*xcc + xU*ycc - xU*yL, yU*xcc + xL*ycc - xL*yU), + IfElse.ifelse(yU <= 0.0, -max((-yL)*xcv + xU*(-ycc) - xU*(-yL), (-yU)*xcv + xL*(-ycc) - xL*(-yU)), + min(yL*xcv + xU*ycc - xU*yL, yU*xcc + xL*ycc - xL*yU))), + IfElse.ifelse(xU <= 0.0, + IfElse.ifelse(yL >= 0.0, -max(yU*(-xcc) + (-xL)*ycv - (-xL)*yU, yL*(-xcc) + (-xU)*ycv - (-xU)*yL), + IfElse.ifelse(yU <= 0.0, min(yU*xcv + xL*ycv - xL*yU, yL*xcv + xU*ycv - xU*yL), + -max(yU*(-xcc) + (-xL)*ycv - (-xL)*yU, yL*(-xcv) + (-xU)*ycv - (-xU)*yL))), + IfElse.ifelse(yL >= 0.0, min(xL*ycv + yU*xcc - yU*xL, xU*ycc + yL*xcc - yL*xU), + IfElse.ifelse(yU <= 0.0, -max(xU*(-ycc) + (-yL)*xcv - (-yL)*xU, xL*(-ycv) + (-yU)*xcv - (-yU)*xL), + min(yL*xcv + xU*ycc - xU*yL, yU*xcc + xL*ycv - xL*yU))))) + + # Include the cut definition in the gradlist, setting the gradient to the zero vector + # if cv would have been cut + zero_vec = Num.(zeros(size(cv_gradlist[:,y]))) + @. 
cv_gradlist[:, z] = IfElse.ifelse(rcv < rL, zero_vec, IfElse.ifelse(xL >= 0.0, + IfElse.ifelse(yL >= 0.0, IfElse.ifelse(yU*xcv + xU*ycv - xU*yU > yL*xcv + xL*ycv - xL*yL, + yU*cv_gradlist[:,x] + xU*cv_gradlist[:,y], + yL*cv_gradlist[:,x] + xL*cv_gradlist[:,y]), + IfElse.ifelse(yU <= 0.0, -IfElse.ifelse((-yU)*xcc + xU*(-ycv) - xU*(-yU) < (-yL)*xcc + xL*(-ycv) - xL*(-yL), + (-yU)*cc_gradlist[:,x] + xU*(-cv_gradlist[:,y]), + (-yL)*cc_gradlist[:,x] + xL*(-cv_gradlist[:,y])), + IfElse.ifelse(yU*xcv + xU*ycv - xU*yU > yL*xcc + xL*ycv - xL*yL, + yU*cv_gradlist[:,x] + xU*cv_gradlist[:,y], + yL*cc_gradlist[:,x] + xL*cv_gradlist[:,y]))), + IfElse.ifelse(xU <= 0.0, + IfElse.ifelse(yL >= 0.0, -IfElse.ifelse(yL*(-xcv) + (-xL)*ycc - (-xL)*yL < yU*(-xcv) + (-xU)*ycc - (-xU)*yU, + yL*(-cv_gradlist[:,x]) + (-xL)*cc_gradlist[:,y], + yU*(-cv_gradlist[:,x]) + (-xU)*cc_gradlist[:,y]), + IfElse.ifelse(yU <= 0.0, IfElse.ifelse((-yL)*(-xcc) + (-xL)*(-ycc) - (-xL)*(-yL) > (-yU)*(-xcc) + (-xU)*(-ycc) - (-xU)*(-yU), + (-yL)*(-cc_gradlist[:,x]) + (-xL)*(-cc_gradlist[:,y]), + (-yU)*(-cc_gradlist[:,x]) + (-xU)*(-cc_gradlist[:,y])), + -IfElse.ifelse(yL*(-xcc) + (-xL)*ycc - (-xL)*yL < yU*(-xcv) + (-xU)*ycc - (-xU)*yU, + yL*(-cc_gradlist[:,x]) + (-xL)*cc_gradlist[:,y], + yU*(-cv_gradlist[:,x]) + (-xU)*cc_gradlist[:,y]))), + IfElse.ifelse(yL >= 0.0, IfElse.ifelse((xU)*(ycv) + (yU)*(xcv) - (yU)*(xU) > (xL)*(ycc) + (yL)*(xcv) - (yL)*(xL), + (xU)*(cv_gradlist[:,y]) + (yU)*(cv_gradlist[:,x]), + (xL)*(cc_gradlist[:,y]) + (yL)*(cv_gradlist[:,x])), + IfElse.ifelse(yU <= 0.0, -IfElse.ifelse((xL)*(-ycc) + (-yL)*(xcc) - (-yL)*(xL) < (xU)*(-ycv) + (-yU)*(xcc) - (-yU)*(xU), + (xL)*(-cc_gradlist[:,y]) + (-yL)*(cc_gradlist[:,x]), + (xU)*(-cv_gradlist[:,y]) + (-yU)*(cc_gradlist[:,x])), + IfElse.ifelse(yU*xcv + xU*ycv - xU*yU > yL*xcc + xL*ycc - xL*yL, + yU*cv_gradlist[:,x] + xU*cv_gradlist[:,y], + yL*cc_gradlist[:,x] + xL*cc_gradlist[:,y])))))) + @. 
cc_gradlist[:, z] = IfElse.ifelse(rcc > rU, zero_vec, IfElse.ifelse(xL >= 0.0, + IfElse.ifelse(yL >= 0.0, IfElse.ifelse(yL*xcc + xU*ycc - xU*yL < yU*xcc + xL*ycc - xL*yU, + yL*cc_gradlist[:,x] + xU*cc_gradlist[:,y], + yU*cc_gradlist[:,x] + xL*cc_gradlist[:,y]), + IfElse.ifelse(yU <= 0.0, -IfElse.ifelse((-yL)*xcv + xU*(-ycc) - xU*(-yL) > (-yU)*xcv + xL*(-ycc) - xL*(-yU), + (-yL)*cv_gradlist[:,x] + xU*(-cc_gradlist[:,y]), + (-yU)*cv_gradlist[:,x] + xL*(-cc_gradlist[:,y])), + IfElse.ifelse(yL*xcv + xU*ycc - xU*yL < yU*xcc + xL*ycc - xL*yU, + yL*cv_gradlist[:,x] + xU*cc_gradlist[:,y], + yU*cc_gradlist[:,x] + xL*cc_gradlist[:,y]))), + IfElse.ifelse(xU <= 0.0, + IfElse.ifelse(yL >= 0.0, -IfElse.ifelse(yU*(-xcc) + (-xL)*ycv - (-xL)*yU > yL*(-xcc) + (-xU)*ycv - (-xU)*yL, + yU*(-cc_gradlist[:,x]) + (-xL)*cv_gradlist[:,y], + yL*(-cc_gradlist[:,x]) + (-xU)*cv_gradlist[:,y]), + IfElse.ifelse(yU <= 0.0, IfElse.ifelse((-yU)*(-xcv) + (-xL)*(-ycv) - (-xL)*(-yU) < (-yL)*(-xcv) + (-xU)*(-ycv) - (-xU)*(-yL), + (-yU)*(-cv_gradlist[:,x]) + (-xL)*(-cv_gradlist[:,y]), + (-yL)*(-cv_gradlist[:,x]) + (-xU)*(-cv_gradlist[:,y])), + -IfElse.ifelse((-xL)*(ycv) + (yU)*(-xcc) - (yU)*(-xL) > (-xU)*(ycv) + (yL)*(-xcv) - (yL)*(-xU), + (-xL)*(cv_gradlist[:,y]) + (yU)*(-cc_gradlist[:,x]), + (-xU)*(cv_gradlist[:,y]) + (yL)*(-cv_gradlist[:,x])))), + IfElse.ifelse(yL >= 0.0, IfElse.ifelse((xL)*(ycv) + (yU)*(xcc) - (yU)*(xL) < (xU)*(ycc) + (yL)*(xcc) - (yL)*(xU), + (xL)*(cv_gradlist[:,y]) + (yU)*(cc_gradlist[:,x]), + (xU)*(cc_gradlist[:,y]) + (yL)*(cc_gradlist[:,x])), + IfElse.ifelse(yU <= 0.0, -IfElse.ifelse((xU)*(-ycc) + (-yL)*(xcv) - (-yL)*(xU) > (xL)*(-ycv) + (-yU)*(xcv) - (-yU)*(xL), + (xU)*(-cc_gradlist[:,y]) + (-yL)*(cv_gradlist[:,x]), + (xL)*(-cv_gradlist[:,y]) + (-yU)*(cv_gradlist[:,x])), + IfElse.ifelse(yL*xcv + xU*ycc - xU*yL < yU*xcc + xL*ycv - xL*yU, + yL*cv_gradlist[:,x] + xU*cc_gradlist[:,y], + yU*cc_gradlist[:,x] + xL*cv_gradlist[:,y])))))) + return +end +function grad_transform!(::McCormickIntervalTransform, ::typeof(*), zL, zU, zcv, zcc, xL, xU, + xcv, xcc, yL::Real, yU::Real, ycv::Real, ycc::Real, varlist::Vector{String}, cv_gradlist::Matrix{Num}, cc_gradlist::Matrix{Num}) + # Identify which variables are being used + z = findfirst(x -> x==string(zL)[1:end-3], varlist) + x = findfirst(x -> x==string(xL)[1:end-3], varlist) + + # Separate variable check, in case x is a constant + if isnothing(x) + x = findfirst(x -> x==string(xL), varlist) + end + + @. cv_gradlist[:, z] = IfElse.ifelse(xL >= 0.0, + IfElse.ifelse(yL >= 0.0, yU*cv_gradlist[:,x], + IfElse.ifelse(yU <= 0.0, yU*cc_gradlist[:,x], + IfElse.ifelse(xcv > xcc, yU*cv_gradlist[:,x], yL*cc_gradlist[:,x]))), + IfElse.ifelse(xU <= 0.0, + IfElse.ifelse(yL >= 0.0, yL*cv_gradlist[:,x], + IfElse.ifelse(yU <= 0.0, yL*cc_gradlist[:,x], yL*cv_gradlist[:,x])), + IfElse.ifelse(yL >= 0.0, yU*cv_gradlist[:,x], + IfElse.ifelse(yU <= 0.0, yL*cc_gradlist[:,x], + IfElse.ifelse(xcv > xcc, yU*cv_gradlist[:,x], yL*cc_gradlist[:,x]))))) + @. 
cc_gradlist[:, z] = IfElse.ifelse(xL >= 0.0, + IfElse.ifelse(yL >= 0.0, yL*cc_gradlist[:,x], + IfElse.ifelse(yU <= 0.0, yU*cv_gradlist[:,x], + IfElse.ifelse(xcv < xcc, yL*cv_gradlist[:,x], yU*cc_gradlist[:,x]))), + IfElse.ifelse(xU <= 0.0, + IfElse.ifelse(yL >= 0.0, yL*cc_gradlist[:,x], + IfElse.ifelse(yU <= 0.0, yU*cv_gradlist[:,x], yU*cc_gradlist[:,x])), + IfElse.ifelse(yL >= 0.0, yU*cc_gradlist[:,x], + IfElse.ifelse(yU <= 0.0, yL*cv_gradlist[:,x], + IfElse.ifelse(xcv < xcc, yL*cv_gradlist[:,x], yU*cc_gradlist[:,x]))))) + return +end +function grad_transform!(::McCormickIntervalTransform, ::typeof(/), zL, zU, zcv, zcc, xL, xU, + xcv, xcc, yL, yU, ycv, ycc, varlist::Vector{String}, cv_gradlist::Matrix{Num}, cc_gradlist::Matrix{Num}) + # Identify which variables are being used + z = findfirst(x -> x==string(zL)[1:end-3], varlist) + x = findfirst(x -> x==string(xL)[1:end-3], varlist) + y = findfirst(x -> x==string(yL)[1:end-3], varlist) + + # Separate variable check, in case x or y are constants + if isnothing(x) + x = findfirst(x -> x==string(xL), varlist) + end + if isnothing(y) + y = findfirst(x -> x==string(yL), varlist) + end + + # For division, we do x*(y^-1). Note that if yL < 0 < yU, the inverses of ycv, ycc, and their subgradients will + # be NaN, which will set zcv, zcc, and their subgradients to NaN in every case. First, we define the vector + # of zeros for the subgradient + zero_vec = Num.(zeros(size(cv_gradlist[:,y]))) + + # Next we calculate the inverse of y + yL_inv = inv(yU) + yU_inv = inv(yL) + ycv_inv = IfElse.ifelse(yL > 0.0, 1.0 ./ (mid_expr(ycc, ycv, yU)), + IfElse.ifelse(yU < 0.0, IfElse.ifelse(yL == yU, mid_expr(ycc, ycv, yL).^(-1), (yL.^(-1).*(yU - mid_expr(ycc, ycv, yL)) + yU.^(-1).*(mid_expr(ycc, ycv, yL) - yL))./(yU - yL)), + NaN)) + ycc_inv = IfElse.ifelse(yL > 0.0, (yU + yL - mid_expr(ycc, ycv, yL))./(yL*yU), + IfElse.ifelse(yU < 0.0, mid_expr(ycc, ycv, yU).^(-1), + NaN)) + y_cv_gradlist_inv = similar(cv_gradlist[:,y]) + @. y_cv_gradlist_inv = IfElse.ifelse(yU < 0.0, IfElse.ifelse(yU == yL, -1/(mid_expr(ycc, ycv, yL)*mid_expr(ycc, ycv, yL)), (yU^-1 - yL^-1)/(yU - yL)) * + mid_grad(ycc, ycv, yL, cc_gradlist[:,y], cv_gradlist[:,y], zero_vec), + IfElse.ifelse(yL > 0.0, -1.0/(mid_expr(ycc, ycv, yU)*mid_expr(ycc, ycv, yU)) * + mid_grad(ycc, ycv, yU, cc_gradlist[:,y], cv_gradlist[:,y], zero_vec), + NaN * zero_vec)) + y_cc_gradlist_inv = similar(cc_gradlist[:,y]) + @. y_cc_gradlist_inv = IfElse.ifelse(yU < 0.0, -1/(mid_expr(ycc, ycv, yU)*mid_expr(ycc, ycv, yU)) * + mid_grad(ycc, ycv, yU, cc_gradlist[:,y], cv_gradlist[:,y], zero_vec), + IfElse.ifelse(yL > 0.0, -1.0/(yL*yU) * + mid_grad(ycc, ycv, yL, cc_gradlist[:,y], cv_gradlist[:,y], zero_vec), + NaN * zero_vec)) + + # Now we use the multiplication rules, but replace each instance of + # y with its inverse. + + + # Include the cut definition in the gradlist, setting the gradient to the zero vector + # if cv would have been cut + # zero_vec = Num.(zeros(size(cv_gradlist[:,y]))) + + @. 
cv_gradlist[:, z] = IfElse.ifelse(xL >= 0.0, + IfElse.ifelse(yL_inv >= 0.0, IfElse.ifelse(yU_inv*xcv + xU*ycv_inv - xU*yU_inv > yL_inv*xcv + xL*ycv_inv - xL*yL_inv, + yU_inv*cv_gradlist[:,x] + xU*(y_cv_gradlist_inv), + yL_inv*cv_gradlist[:,x] + xL*(y_cv_gradlist_inv)), + IfElse.ifelse(yU_inv <= 0.0, -IfElse.ifelse((-yU_inv)*xcc + xU*(-ycv_inv) - xU*(-yU_inv) < (-yL_inv)*xcc + xL*(-ycv_inv) - xL*(-yL_inv), + (-yU_inv)*cc_gradlist[:,x] + xU*(-(y_cv_gradlist_inv)), + (-yL_inv)*cc_gradlist[:,x] + xL*(-(y_cv_gradlist_inv))), + IfElse.ifelse(yU_inv*xcv + xU*ycv_inv - xU*yU_inv > yL_inv*xcc + xL*ycv_inv - xL*yL_inv, + yU_inv*cv_gradlist[:,x] + xU*(y_cv_gradlist_inv), + yL_inv*cc_gradlist[:,x] + xL*(y_cv_gradlist_inv)))), + IfElse.ifelse(xU <= 0.0, + IfElse.ifelse(yL_inv >= 0.0, -IfElse.ifelse(yL_inv*(-xcv) + (-xL)*ycc_inv - (-xL)*yL_inv < yU_inv*(-xcv) + (-xU)*ycc_inv - (-xU)*yU_inv, + yL_inv*(-cv_gradlist[:,x]) + (-xL)*(y_cc_gradlist_inv), + yU_inv*(-cv_gradlist[:,x]) + (-xU)*(y_cc_gradlist_inv)), + IfElse.ifelse(yU_inv <= 0.0, IfElse.ifelse((-yL_inv)*(-xcc) + (-xL)*(-ycc_inv) - (-xL)*(-yL_inv) > (-yU_inv)*(-xcc) + (-xU)*(-ycc_inv) - (-xU)*(-yU_inv), + (-yL_inv)*(-cc_gradlist[:,x]) + (-xL)*(-(y_cc_gradlist_inv)), + (-yU_inv)*(-cc_gradlist[:,x]) + (-xU)*(-(y_cc_gradlist_inv))), + -IfElse.ifelse((-xU)*(ycc_inv) + (yU_inv)*(-xcv) - (yU_inv)*(-xU) < (-xL)*(ycc_inv) + (yL_inv)*(-xcv) - (yL_inv)*(-xL), + (-xU)*((y_cc_gradlist_inv)) + (yU_inv)*(-cv_gradlist[:,x]), + (-xL)*((y_cc_gradlist_inv)) + (yL_inv)*(-cv_gradlist[:,x])))), + IfElse.ifelse(yL_inv >= 0.0, IfElse.ifelse((xU)*(ycv_inv) + (yU_inv)*(xcv) - (yU_inv)*(xU) > (xL)*(ycc_inv) + (yL_inv)*(xcv) - (yL_inv)*(xL), + (xU)*((y_cv_gradlist_inv)) + (yU_inv)*(cv_gradlist[:,x]), + (xL)*((y_cc_gradlist_inv)) + (yL_inv)*(cv_gradlist[:,x])), + IfElse.ifelse(yU_inv <= 0.0, -IfElse.ifelse((xL)*(-ycc_inv) + (-yL_inv)*(xcc) - (-yL_inv)*(xL) < (xU)*(-ycv_inv) + (-yU_inv)*(xcc) - (-yU_inv)*(xU), + (xL)*(-(y_cc_gradlist_inv)) + (-yL_inv)*(cc_gradlist[:,x]), + (xU)*(-(y_cv_gradlist_inv)) + (-yU_inv)*(cc_gradlist[:,x])), + IfElse.ifelse(yU_inv*xcv + xU*ycv_inv - xU*yU_inv > yL_inv*xcc + xL*ycc_inv - xL*yL_inv, + yU_inv*cv_gradlist[:,x] + xU*(y_cv_gradlist_inv), + yL_inv*cc_gradlist[:,x] + xL*(y_cc_gradlist_inv)))))) + + @. 
cc_gradlist[:, z] = IfElse.ifelse(xL >= 0.0,
+        IfElse.ifelse(yL_inv >= 0.0, IfElse.ifelse(yL_inv*xcc + xU*ycc_inv - xU*yL_inv < yU_inv*xcc + xL*ycc_inv - xL*yU_inv,
+                yL_inv*cc_gradlist[:,x] + xU*(y_cc_gradlist_inv),
+                yU_inv*cc_gradlist[:,x] + xL*(y_cc_gradlist_inv)),
+            IfElse.ifelse(yU_inv <= 0.0, -IfElse.ifelse((-yL_inv)*xcv + xU*(-ycc_inv) - xU*(-yL_inv) > (-yU_inv)*xcv + xL*(-ycc_inv) - xL*(-yU_inv),
+                (-yL_inv)*cv_gradlist[:,x] + xU*(-(y_cc_gradlist_inv)),
+                (-yU_inv)*cv_gradlist[:,x] + xL*(-(y_cc_gradlist_inv))),
+            IfElse.ifelse(yL_inv*xcv + xU*ycc_inv - xU*yL_inv < yU_inv*xcc + xL*ycc_inv - xL*yU_inv,
+                yL_inv*cv_gradlist[:,x] + xU*(y_cc_gradlist_inv),
+                yU_inv*cc_gradlist[:,x] + xL*(y_cc_gradlist_inv)))),
+        IfElse.ifelse(xU <= 0.0,
+            IfElse.ifelse(yL_inv >= 0.0, -IfElse.ifelse(yU_inv*(-xcc) + (-xL)*ycv_inv - (-xL)*yU_inv > yL_inv*(-xcc) + (-xU)*ycv_inv - (-xU)*yL_inv,
+                yU_inv*(-cc_gradlist[:,x]) + (-xL)*(y_cv_gradlist_inv),
+                yL_inv*(-cc_gradlist[:,x]) + (-xU)*(y_cv_gradlist_inv)),
+            IfElse.ifelse(yU_inv <= 0.0, IfElse.ifelse((-yU_inv)*(-xcv) + (-xL)*(-ycv_inv) - (-xL)*(-yU_inv) < (-yL_inv)*(-xcv) + (-xU)*(-ycv_inv) - (-xU)*(-yL_inv),
+                (-yU_inv)*(-cv_gradlist[:,x]) + (-xL)*(-(y_cv_gradlist_inv)),
+                (-yL_inv)*(-cv_gradlist[:,x]) + (-xU)*(-(y_cv_gradlist_inv))),
+            -IfElse.ifelse((-xL)*(ycv_inv) + (yU_inv)*(-xcc) - (yU_inv)*(-xL) > (-xU)*(ycv_inv) + (yL_inv)*(-xcc) - (yL_inv)*(-xU),
+                (-xL)*((y_cv_gradlist_inv)) + (yU_inv)*(-cc_gradlist[:,x]),
+                (-xU)*((y_cv_gradlist_inv)) + (yL_inv)*(-cc_gradlist[:,x])))),
+        IfElse.ifelse(yL_inv >= 0.0, IfElse.ifelse((xL)*(ycv_inv) + (yU_inv)*(xcc) - (yU_inv)*(xL) < (xU)*(ycc_inv) + (yL_inv)*(xcc) - (yL_inv)*(xU),
+            (xL)*((y_cv_gradlist_inv)) + (yU_inv)*(cc_gradlist[:,x]),
+            (xU)*((y_cc_gradlist_inv)) + (yL_inv)*(cc_gradlist[:,x])),
+        IfElse.ifelse(yU_inv <= 0.0, -IfElse.ifelse((xU)*(-ycc_inv) + (-yL_inv)*(xcv) - (-yL_inv)*(xU) > (xL)*(-ycv_inv) + (-yU_inv)*(xcv) - (-yU_inv)*(xL),
+            (xU)*(-(y_cc_gradlist_inv)) + (-yL_inv)*(cv_gradlist[:,x]),
+            (xL)*(-(y_cv_gradlist_inv)) + (-yU_inv)*(cv_gradlist[:,x])),
+        IfElse.ifelse(yL_inv*xcv + xU*ycc_inv - xU*yL_inv < yU_inv*xcc + xL*ycv_inv - xL*yU_inv,
+            yL_inv*cv_gradlist[:,x] + xU*(y_cc_gradlist_inv),
+            yU_inv*cc_gradlist[:,x] + xL*(y_cv_gradlist_inv))))))
+    return
+end
+function grad_transform!(::McCormickIntervalTransform, ::typeof(/), zL, zU, zcv, zcc, xL::Real, xU::Real,
+                         xcv::Real, xcc::Real, yL, yU, ycv, ycc, varlist::Vector{String}, cv_gradlist::Matrix{Num}, cc_gradlist::Matrix{Num})
+    # Identify which variables are being used
+    z = findfirst(x -> x==string(zL)[1:end-3], varlist)
+    y = findfirst(x -> x==string(yL)[1:end-3], varlist)
+
+    # Separate variable check, in case y is a constant
+    if isnothing(y)
+        y = findfirst(x -> x==string(yL), varlist)
+    end
+
+    # For division, we do x*(y^-1). Note that if yL < 0 < yU, the inverses of ycv, ycc, and their subgradients will
+    # be NaN, which will set zcv, zcc, and their subgradients to NaN in every case. 
First, we define the vector + # of zeros for the subgradient + zero_vec = Num.(zeros(size(cv_gradlist[:,y]))) + + # Next we calculate the inverse of y + yL_inv = inv(yU) + yU_inv = inv(yL) + ycv_inv = IfElse.ifelse(yL > 0.0, 1.0 ./ (mid_expr(ycc, ycv, yU)), + IfElse.ifelse(yU < 0.0, IfElse.ifelse(yL == yU, mid_expr(ycc, ycv, yL).^(-1), (yL.^(-1).*(yU - mid_expr(ycc, ycv, yL)) + yU.^(-1).*(mid_expr(ycc, ycv, yL) - yL))./(yU - yL)), + NaN)) + ycc_inv = IfElse.ifelse(yL > 0.0, (yU + yL - mid_expr(ycc, ycv, yL))./(yL*yU), + IfElse.ifelse(yU < 0.0, mid_expr(ycc, ycv, yU).^(-1), + NaN)) + y_cv_gradlist_inv = similar(cv_gradlist[:,y]) + @. y_cv_gradlist_inv = IfElse.ifelse(yU < 0.0, IfElse.ifelse(yU == yL, -1/(mid_expr(ycc, ycv, yL)*mid_expr(ycc, ycv, yL)), (yU^-1 - yL^-1)/(yU - yL)) * + mid_grad(ycc, ycv, yL, cc_gradlist[:,y], cv_gradlist[:,y], zero_vec), + IfElse.ifelse(yL > 0.0, -1.0/(mid_expr(ycc, ycv, yU)*mid_expr(ycc, ycv, yU)) * + mid_grad(ycc, ycv, yU, cc_gradlist[:,y], cv_gradlist[:,y], zero_vec), + NaN * zero_vec)) + y_cc_gradlist_inv = similar(cc_gradlist[:,y]) + @. y_cc_gradlist_inv = IfElse.ifelse(yU < 0.0, -1/(mid_expr(ycc, ycv, yU)*mid_expr(ycc, ycv, yU)) * + mid_grad(ycc, ycv, yU, cc_gradlist[:,y], cv_gradlist[:,y], zero_vec), + IfElse.ifelse(yL > 0.0, -1.0/(yL*yU) * + mid_grad(ycc, ycv, yL, cc_gradlist[:,y], cv_gradlist[:,y], zero_vec), + NaN * zero_vec)) + + # Now we use the multiplication rules, but replace each instance of + # y with its inverse. + @. cv_gradlist[:, z] = IfElse.ifelse(xL >= 0.0, + IfElse.ifelse(yL_inv >= 0.0, IfElse.ifelse(yU_inv*xcv + xU*ycv_inv - xU*yU_inv > yL_inv*xcv + xL*ycv_inv - xL*yL_inv, + xU*(y_cv_gradlist_inv), + xL*(y_cv_gradlist_inv)), + IfElse.ifelse(yU_inv <= 0.0, -IfElse.ifelse((-yU_inv)*xcc + xU*(-ycv_inv) - xU*(-yU_inv) < (-yL_inv)*xcc + xL*(-ycv_inv) - xL*(-yL_inv), + xU*(-(y_cv_gradlist_inv)), + xL*(-(y_cv_gradlist_inv))), + IfElse.ifelse(yU_inv*xcv + xU*ycv_inv - xU*yU_inv > yL_inv*xcc + xL*ycv_inv - xL*yL_inv, + xU*(y_cv_gradlist_inv), + xL*(y_cv_gradlist_inv)))), + IfElse.ifelse(xU <= 0.0, + IfElse.ifelse(yL_inv >= 0.0, -IfElse.ifelse(yL_inv*(-xcv) + (-xL)*ycc_inv - (-xL)*yL_inv < yU_inv*(-xcv) + (-xU)*ycc_inv - (-xU)*yU_inv, + (-xL)*(y_cc_gradlist_inv), + (-xU)*(y_cc_gradlist_inv)), + IfElse.ifelse(yU_inv <= 0.0, IfElse.ifelse((-yL_inv)*(-xcc) + (-xL)*(-ycc_inv) - (-xL)*(-yL_inv) > (-yU_inv)*(-xcc) + (-xU)*(-ycc_inv) - (-xU)*(-yU_inv), + (-xL)*(-(y_cc_gradlist_inv)), + (-xU)*(-(y_cc_gradlist_inv))), + -IfElse.ifelse((-xU)*(ycc_inv) + (yU_inv)*(-xcv) - (yU_inv)*(-xU) < (-xL)*(ycc_inv) + (yL_inv)*(-xcv) - (yL_inv)*(-xL), + (-xU)*((y_cc_gradlist_inv)), + (-xL)*((y_cc_gradlist_inv))))), + IfElse.ifelse(yL_inv >= 0.0, IfElse.ifelse((xU)*(ycv_inv) + (yU_inv)*(xcv) - (yU_inv)*(xU) > (xL)*(ycc_inv) + (yL_inv)*(xcv) - (yL_inv)*(xL), + (xU)*((y_cv_gradlist_inv)), + (xL)*((y_cc_gradlist_inv))), + IfElse.ifelse(yU_inv <= 0.0, -IfElse.ifelse((xL)*(-ycc_inv) + (-yL_inv)*(xcc) - (-yL_inv)*(xL) < (xU)*(-ycv_inv) + (-yU_inv)*(xcc) - (-yU_inv)*(xU), + (xL)*(-(y_cc_gradlist_inv)), + (xU)*(-(y_cv_gradlist_inv))), + IfElse.ifelse(yU_inv*xcv + xU*ycv_inv - xU*yU_inv > yL_inv*xcc + xL*ycc_inv - xL*yL_inv, + xU*(y_cv_gradlist_inv), + xL*(y_cc_gradlist_inv)))))) + + @. 
cc_gradlist[:, z] = IfElse.ifelse(xL >= 0.0,
+        IfElse.ifelse(yL_inv >= 0.0, IfElse.ifelse(yL_inv*xcc + xU*ycc_inv - xU*yL_inv < yU_inv*xcc + xL*ycc_inv - xL*yU_inv,
+                xU*(y_cc_gradlist_inv),
+                xL*(y_cc_gradlist_inv)),
+            IfElse.ifelse(yU_inv <= 0.0, -IfElse.ifelse((-yL_inv)*xcv + xU*(-ycc_inv) - xU*(-yL_inv) > (-yU_inv)*xcv + xL*(-ycc_inv) - xL*(-yU_inv),
+                xU*(-(y_cc_gradlist_inv)),
+                xL*(-(y_cc_gradlist_inv))),
+            IfElse.ifelse(yL_inv*xcv + xU*ycc_inv - xU*yL_inv < yU_inv*xcc + xL*ycc_inv - xL*yU_inv,
+                xU*(y_cc_gradlist_inv),
+                xL*(y_cc_gradlist_inv)))),
+        IfElse.ifelse(xU <= 0.0,
+            IfElse.ifelse(yL_inv >= 0.0, -IfElse.ifelse(yU_inv*(-xcc) + (-xL)*ycv_inv - (-xL)*yU_inv > yL_inv*(-xcc) + (-xU)*ycv_inv - (-xU)*yL_inv,
+                (-xL)*(y_cv_gradlist_inv),
+                (-xU)*(y_cv_gradlist_inv)),
+            IfElse.ifelse(yU_inv <= 0.0, IfElse.ifelse((-yU_inv)*(-xcv) + (-xL)*(-ycv_inv) - (-xL)*(-yU_inv) < (-yL_inv)*(-xcv) + (-xU)*(-ycv_inv) - (-xU)*(-yL_inv),
+                (-xL)*(-(y_cv_gradlist_inv)),
+                (-xU)*(-(y_cv_gradlist_inv))),
+            -IfElse.ifelse((-xL)*(ycv_inv) + (yU_inv)*(-xcc) - (yU_inv)*(-xL) > (-xU)*(ycv_inv) + (yL_inv)*(-xcc) - (yL_inv)*(-xU),
+                (-xL)*((y_cv_gradlist_inv)),
+                (-xU)*((y_cv_gradlist_inv))))),
+        IfElse.ifelse(yL_inv >= 0.0, IfElse.ifelse((xL)*(ycv_inv) + (yU_inv)*(xcc) - (yU_inv)*(xL) < (xU)*(ycc_inv) + (yL_inv)*(xcc) - (yL_inv)*(xU),
+            (xL)*((y_cv_gradlist_inv)),
+            (xU)*((y_cc_gradlist_inv))),
+        IfElse.ifelse(yU_inv <= 0.0, -IfElse.ifelse((xU)*(-ycc_inv) + (-yL_inv)*(xcv) - (-yL_inv)*(xU) > (xL)*(-ycv_inv) + (-yU_inv)*(xcv) - (-yU_inv)*(xL),
+            (xU)*(-(y_cc_gradlist_inv)),
+            (xL)*(-(y_cv_gradlist_inv))),
+        IfElse.ifelse(yL_inv*xcv + xU*ycc_inv - xU*yL_inv < yU_inv*xcc + xL*ycv_inv - xL*yU_inv,
+            xU*(y_cc_gradlist_inv),
+            xL*(y_cv_gradlist_inv))))))
+    return
+end
+function grad_transform!(::McCormickIntervalTransform, ::typeof(/), zL, zU, zcv, zcc, xL, xU,
+                         xcv, xcc, yL::Real, yU::Real, ycv::Real, ycc::Real, varlist::Vector{String}, cv_gradlist::Matrix{Num}, cc_gradlist::Matrix{Num})
+    # Identify which variables are being used
+    z = findfirst(x -> x==string(zL)[1:end-3], varlist)
+    x = findfirst(x -> x==string(xL)[1:end-3], varlist)
+
+    # Separate variable check, in case x is a constant
+    if isnothing(x)
+        x = findfirst(x -> x==string(xL), varlist)
+    end
+
+    # For division, we do x*(y^-1). Here y is a constant, so its "subgradient" is
+    # simply the zero vector (sized to match the subgradients of x)
+    zero_vec = Num.(zeros(size(cv_gradlist[:,x])))
+
+    # Next we calculate the inverse of y
+    yL_inv = inv(yU)
+    yU_inv = inv(yL)
+    ycv_inv = inv(ycc)
+    ycc_inv = inv(ycv)
+    y_cv_gradlist_inv = zero_vec
+    y_cc_gradlist_inv = zero_vec
+
+    # Now we use the multiplication rules, but replace each instance of
+    # y with its inverse.
+    @. 
cv_gradlist[:, z] = IfElse.ifelse(xL >= 0.0, + IfElse.ifelse(yL_inv >= 0.0, IfElse.ifelse(yU_inv*xcv + xU*ycv_inv - xU*yU_inv > yL_inv*xcv + xL*ycv_inv - xL*yL_inv, + yU_inv*cv_gradlist[:,x] + xU*(y_cv_gradlist_inv), + yL_inv*cv_gradlist[:,x] + xL*(y_cv_gradlist_inv)), + IfElse.ifelse(yU_inv <= 0.0, -IfElse.ifelse((-yU_inv)*xcc + xU*(-ycv_inv) - xU*(-yU_inv) < (-yL_inv)*xcc + xL*(-ycv_inv) - xL*(-yL_inv), + (-yU_inv)*cc_gradlist[:,x] + xU*(-(y_cv_gradlist_inv)), + (-yL_inv)*cc_gradlist[:,x] + xL*(-(y_cv_gradlist_inv))), + IfElse.ifelse(yU_inv*xcv + xU*ycv_inv - xU*yU_inv > yL_inv*xcc + xL*ycv_inv - xL*yL_inv, + yU_inv*cv_gradlist[:,x] + xU*(y_cv_gradlist_inv), + yL_inv*cc_gradlist[:,x] + xL*(y_cv_gradlist_inv)))), + IfElse.ifelse(xU <= 0.0, + IfElse.ifelse(yL_inv >= 0.0, -IfElse.ifelse(yL_inv*(-xcv) + (-xL)*ycc_inv - (-xL)*yL_inv < yU_inv*(-xcv) + (-xU)*ycc_inv - (-xU)*yU_inv, + yL_inv*(-cv_gradlist[:,x]) + (-xL)*(y_cc_gradlist_inv), + yU_inv*(-cv_gradlist[:,x]) + (-xU)*(y_cc_gradlist_inv)), + IfElse.ifelse(yU_inv <= 0.0, IfElse.ifelse((-yL_inv)*(-xcc) + (-xL)*(-ycc_inv) - (-xL)*(-yL_inv) > (-yU_inv)*(-xcc) + (-xU)*(-ycc_inv) - (-xU)*(-yU_inv), + (-yL_inv)*(-cc_gradlist[:,x]) + (-xL)*(-(y_cc_gradlist_inv)), + (-yU_inv)*(-cc_gradlist[:,x]) + (-xU)*(-(y_cc_gradlist_inv))), + -IfElse.ifelse((-xU)*(ycc_inv) + (yU_inv)*(-xcv) - (yU_inv)*(-xU) < (-xL)*(ycc_inv) + (yL_inv)*(-xcv) - (yL_inv)*(-xL), + (-xU)*((y_cc_gradlist_inv)) + (yU_inv)*(-cv_gradlist[:,x]), + (-xL)*((y_cc_gradlist_inv)) + (yL_inv)*(-cv_gradlist[:,x])))), + IfElse.ifelse(yL_inv >= 0.0, IfElse.ifelse((xU)*(ycv_inv) + (yU_inv)*(xcv) - (yU_inv)*(xU) > (xL)*(ycc_inv) + (yL_inv)*(xcv) - (yL_inv)*(xL), + (xU)*((y_cv_gradlist_inv)) + (yU_inv)*(cv_gradlist[:,x]), + (xL)*((y_cc_gradlist_inv)) + (yL_inv)*(cv_gradlist[:,x])), + IfElse.ifelse(yU_inv <= 0.0, -IfElse.ifelse((xL)*(-ycc_inv) + (-yL_inv)*(xcc) - (-yL_inv)*(xL) < (xU)*(-ycv_inv) + (-yU_inv)*(xcc) - (-yU_inv)*(xU), + (xL)*(-(y_cc_gradlist_inv)) + (-yL_inv)*(cc_gradlist[:,x]), + (xU)*(-(y_cv_gradlist_inv)) + (-yU_inv)*(cc_gradlist[:,x])), + IfElse.ifelse(yU_inv*xcv + xU*ycv_inv - xU*yU_inv > yL_inv*xcc + xL*ycc_inv - xL*yL_inv, + yU_inv*cv_gradlist[:,x] + xU*(y_cv_gradlist_inv), + yL_inv*cc_gradlist[:,x] + xL*(y_cc_gradlist_inv)))))) + + @. 
cc_gradlist[:, z] = IfElse.ifelse(xL >= 0.0, + IfElse.ifelse(yL_inv >= 0.0, IfElse.ifelse(yL_inv*xcc + xU*ycc_inv - xU*yL_inv < yU_inv*xcc + xL*ycc_inv - xL*yU_inv, + yL_inv*cc_gradlist[:,x] + xU*(y_cc_gradlist_inv), + yU_inv*cc_gradlist[:,x] + xL*(y_cc_gradlist_inv)), + IfElse.ifelse(yU_inv <= 0.0, -IfElse.ifelse((-yL_inv)*xcv + xU*(-ycc_inv) - xU*(-yL_inv) > (-yU_inv)*xcv + xL*(-ycc_inv) - xL*(-yU_inv), + (-yL_inv)*cv_gradlist[:,x] + xU*(-(y_cc_gradlist_inv)), + (-yU_inv)*cv_gradlist[:,x] + xL*(-(y_cc_gradlist_inv))), + IfElse.ifelse(yL_inv*xcv + xU*ycc_inv - xU*yL_inv < yU_inv*xcc + xL*ycc_inv - xL*yU_inv, + yL_inv*cv_gradlist[:,x] + xU*(y_cc_gradlist_inv), + yU_inv*cc_gradlist[:,x] + xL*(y_cc_gradlist_inv)))), + IfElse.ifelse(xU <= 0.0, + IfElse.ifelse(yL_inv >= 0.0, -IfElse.ifelse(yU_inv*(-xcc) + (-xL)*ycv_inv - (-xL)*yU_inv > yL_inv*(-xcc) + (-xU)*ycv_inv - (-xU)*yL_inv, + yU_inv*(-cc_gradlist[:,x]) + (-xL)*(y_cv_gradlist_inv), + yL_inv*(-cc_gradlist[:,x]) + (-xU)*(y_cv_gradlist_inv)), + IfElse.ifelse(yU_inv <= 0.0, IfElse.ifelse((-yU_inv)*(-xcv) + (-xL)*(-ycv_inv) - (-xL)*(-yU_inv) < (-yL_inv)*(-xcv) + (-xU)*(-ycv_inv) - (-xU)*(-yL_inv), + (-yU_inv)*(-cv_gradlist[:,x]) + (-xL)*(-(y_cv_gradlist_inv)), + (-yL_inv)*(-cv_gradlist[:,x]) + (-xU)*(-(y_cv_gradlist_inv))), + -IfElse.ifelse((-xL)*(ycv_inv) + (yU_inv)*(-xcc) - (yU_inv)*(-xL) > (-xU)*(ycv_inv) + (yL_inv)*(-xcc) - (yL_inv)*(-xU), + (-xL)*((y_cv_gradlist_inv)) + (yU_inv)*(-cc_gradlist[:,x]), + (-xU)*((y_cv_gradlist_inv)) + (yL_inv)*(-cc_gradlist[:,x])))), + IfElse.ifelse(yL_inv >= 0.0, IfElse.ifelse((xL)*(ycv_inv) + (yU_inv)*(xcc) - (yU_inv)*(xL) < (xU)*(ycc_inv) + (yL_inv)*(xcc) - (yL_inv)*(xU), + (xL)*((y_cv_gradlist_inv)) + (yU_inv)*(cc_gradlist[:,x]), + (xU)*((y_cc_gradlist_inv)) + (yL_inv)*(cc_gradlist[:,x])), + IfElse.ifelse(yU_inv <= 0.0, -IfElse.ifelse((xU)*(-ycc_inv) + (-yL_inv)*(xcv) - (-yL_inv)*(xU) > (xL)*(-ycv_inv) + (-yU_inv)*(xcv) - (-yU_inv)*(xL), + (xU)*(-(y_cc_gradlist_inv)) + (-yL_inv)*(cv_gradlist[:,x]), + (xL)*(-(y_cv_gradlist_inv)) + (-yU_inv)*(cv_gradlist[:,x])), + IfElse.ifelse(yL_inv*xcv + xU*ycc_inv - xU*yL_inv < yU_inv*xcc + xL*ycv_inv - xL*yU_inv, + yL_inv*cv_gradlist[:,x] + xU*(y_cc_gradlist_inv), + yU_inv*cc_gradlist[:,x] + xL*(y_cv_gradlist_inv)))))) + return +end +function grad_transform!(::McCormickIntervalTransform, ::typeof(^), zL, zU, zcv, zcc, xL, xU, + xcv, xcc, yL, yU, ycv, ycc, varlist::Vector{String}, cv_gradlist::Matrix{Num}, cc_gradlist::Matrix{Num}) + # Check that the exponent is supported + ~((typeof(yL) <: Int) || (typeof(yL) <: AbstractFloat)) && error("Symbolic exponents not currently supported.") + ~(yL == 2) && error("Exponents besides 2 not currently supported") + + # Identify which variables are being used + z = findfirst(x -> x==string(zL)[1:end-3], varlist) + x = findfirst(x -> x==string(xL)[1:end-3], varlist) + y = findfirst(x -> x==string(yL)[1:end-3], varlist) + + # Separate variable check, in case x or y are constants + if isnothing(x) + x = findfirst(x -> x==string(xL), varlist) + end + if isnothing(y) + y = findfirst(x -> x==string(yL), varlist) + end + + # Helper variables + eps_min = @. IfElse.ifelse(xU < 0.0, xU, IfElse.ifelse(xL > 0.0, xL, 0.0)) + eps_max = @. IfElse.ifelse(xU < 0.0, xL, IfElse.ifelse(xL > 0.0, xU, IfElse.ifelse(abs(xL) > abs(xU), xL, xU))) + + # Modify the gradlists accordingly + @. cv_gradlist[:,z] = mid_grad(xcc, xcv, eps_min, cc_gradlist[:,x]*2.0*xcc, cv_gradlist[:,x]*2.0*xcv, 0.0) + @. 
cc_gradlist[:,z] = mid_grad(xcc, xcv, eps_max, cc_gradlist[:,x]*IfElse.ifelse(xU > xL, xL+xU, 0.0), cv_gradlist[:,x]*IfElse.ifelse(xU > xL, xL+xU, 0.0), 0.0) + return +end + +# This one's annoying right now because of the abs(x-y), so I'm going to add min/max for grad later. +# function grad_transform!(::McCormickIntervalTransform, ::typeof(max), zL, zU, zcv, zcc, xL, xU, +# xcv, xcc, yL, yU, ycv, ycc, varlist::Vector{String}, cv_gradlist::Matrix{Num}, cc_gradlist::Matrix{Num}) +# # Identify which variables are being used +# z = findfirst(x -> x==string(zL)[1:end-3], varlist) +# x = findfirst(x -> x==string(xL)[1:end-3], varlist) +# y = findfirst(x -> x==string(yL)[1:end-3], varlist) + +# cc_gradlist[:,z] = IfElse.ifelse(xU <= yL, cc_gradlist[:,y], #NOTE: Regular McCormick checks the `cnst` flag, which SCMC doesn't have. +# IfElse.ifelse(xL >= yU, cc_gradlist[:,x], #NOTE: Regular McCormick checks the `cnst` flag, which SCMC doesn't have. +# 0.5*cc_gradlist[:,x]+cc_gradlist[:,y]+abs(x-y) + +# ) +# eps_min = IfElse.ifelse(xL >= 0.0, xL, IfElse.ifelse(xU <= 0.0, xU, 0.0)) +# eps_max = IfElse.ifelse(abs(xU) >= abs(xL), xU, xL) + +# xcc, xcv, eps_max +# midcc = IfElse.ifelse(xcc < xcv, IfElse.ifelse(xcv < c, b, IfElse.ifelse(eps_max < xcc, a, c)), +# IfElse.ifelse(eps_max < xcv, b, IfElse.ifelse(xcc < eps_max, a, c))) +# whatever result is, times dcc + + +# abs(x) = IfElse.ifelse(xcc < xcv, IfElse.ifelse(xcv < eps_min, IfElse.ifelse(xcv > 0.0, cv_grad[:,x], -cv_grad[:,x]), IfElse.ifelse(eps_min < xcc, IfElse.ifelse(xcc > 0.0, cc_grad[:,x], -cc_grad[:,x]), 0.0)), +# IfElse.ifelse(eps_min < xcv, IfElse.ifelse(xcv > 0.0, cv_grad[:,x], -cv_grad[:,x]), IfElse.ifelse(xcc < eps_min, IfElse.ifelse(xcc > 0.0, cc_grad[:,x], -cc_grad[:,x]), 0.0))) +# end + + + +# function cut(xL::Float64, xU::Float64, cv::Float64, cc::Float64, +# cv_grad::SVector{N,Float64}, cc_grad::SVector{N,Float64}) where N +# if cc > xU +# cco = xU +# cc_grado = zero(SVector{N,Float64}) +# else +# cco = cc +# cc_grado = cc_grad +# end +# if cv < xL +# cvo = xL +# cv_grado = zero(SVector{N,Float64}) +# else +# cvo = cv +# cv_grado = cv_grad +# end +# return cvo, cco, cv_grado, cc_grado +# end + + + diff --git a/src/relaxation/relaxation.jl b/src/relaxation/relaxation.jl index 9f85cb1..d42ec4e 100644 --- a/src/relaxation/relaxation.jl +++ b/src/relaxation/relaxation.jl @@ -87,4 +87,11 @@ line_expr(x, xL, xU, zL, zU) = IfElse.ifelse(zU > zL, (zL*(xU - x) + zU*(x - xL) mid_expr(x, y, z) = IfElse.ifelse(x >= y, IfElse.ifelse(y >= z, y, IfElse.ifelse(y == x, y, IfElse.ifelse(z >= x, x, z))), IfElse.ifelse(z >= y, y, IfElse.ifelse(x >= z, x, z))) +mid_grad(x, y, z, ccgrad, cvgrad, zerovec) = IfElse.ifelse(x >= y, IfElse.ifelse(y >= z, cvgrad, IfElse.ifelse(y == x, cvgrad, IfElse.ifelse(z >= x, ccgrad, zerovec))), + IfElse.ifelse(z >= y, cvgrad, IfElse.ifelse(x >= z, ccgrad, zerovec))) + +# # Equation (29) from Ye2023 +# psi_cv(a, xcv, xcc) = IfElse.ifelse(a >= 0.0, a*xcv, a*xcc) +# psi_cc(a, xcv, xcc) = IfElse.ifelse(a >= 0.0, a*xcc, a*xcv) + include(joinpath(@__DIR__, "rules.jl")) \ No newline at end of file diff --git a/src/relaxation/rules.jl b/src/relaxation/rules.jl index be88d03..8b39417 100644 --- a/src/relaxation/rules.jl +++ b/src/relaxation/rules.jl @@ -44,20 +44,39 @@ end #= Binary Rules =# +# Alternative multiplication rule from Ye2023. Note that the cut operator is applied at the beginning, +# to the inputs of the transform rule. 
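+# (The commented-out rule directly below is the addition analogue of this idea; the
+# Ye2023 multiplication variant appears further down. "Cut" here means clamping a
+# relaxation to its interval bounds, e.g. cut_xcv = max(xL, xcv).)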
+# function transform_rule(::McCormickTransform, ::typeof(+), zL, zU, zcv, zcc, xL, xU, xcv, xcc, yL, yU, ycv, ycc) +# cut_xcv = max(xL, xcv) +# cut_xcc = min(xU, xcc) +# cut_ycv = max(yL, ycv) +# cut_ycc = min(yU, ycc) +# rcv = Equation(zcv, cut_xcv + cut_ycv) +# rcc = Equation(zcc, cut_xcc + cut_ycc) +# return rcv, rcc +# end function transform_rule(::McCormickTransform, ::typeof(+), zL, zU, zcv, zcc, xL, xU, xcv, xcc, yL, yU, ycv, ycc) rcv = Equation(zcv, xcv + ycv) rcc = Equation(zcc, xcc + ycc) return rcv, rcc end - # Rules for multiplication adapted from: # https://github.com/PSORLab/McCormick.jl/blob/master/src/forward_operators/multiplication.jl +# TODO: Add in "cut" function from McCormick.jl function transform_rule(::McCormickTransform, ::typeof(*), zL, zU, zcv, zcc, xL, xU, xcv, xcc, yL::Real, yU::Real, ycv::Real, ycc::Real) rcv = Equation(zcv, IfElse.ifelse(yL >= 0.0, ycv*xcv, ycc*xcc)) rcc = Equation(zcc, IfElse.ifelse(yL >= 0.0, ycc*xcc, ycv*xcv)) return rcv, rcc end +# Alternative multiplication rule from Ye2023. Note that the cut operator is not applied at the end. +# function transform_rule(::McCormickTransform, ::typeof(*), zL, zU, zcv, zcc, xL, xU, xcv, xcc, yL, yU, ycv, ycc) +# rcv = Equation(zcv, max(psi_cv(yL, max(xL, xcv), min(xU, xcc)) + psi_cv(xL, max(yL, ycv), min(yU, ycc)) - xL*yL, +# psi_cv(yU, max(xL, xcv), min(xU, xcc)) + psi_cv(xU, max(yL, ycv), min(yU, ycc)) - xU*yU)) +# rcc = Equation(zcc, min(psi_cc(yL, max(xL, xcv), min(xU, xcc)) + psi_cc(xU, max(yL, ycv), min(yU, ycc)) - xU*yL, +# psi_cc(yU, max(xL, xcv), min(xU, xcc)) + psi_cc(xL, max(yL, ycv), min(yU, ycc)) - xL*yU)) +# return rcv, rcc +# end function transform_rule(::McCormickTransform, ::typeof(*), zL, zU, zcv, zcc, xL, xU, xcv, xcc, yL, yU, ycv, ycc) rcv = Equation(zcv, max(zL, IfElse.ifelse(xL >= 0.0, IfElse.ifelse(yL >= 0.0, max(yU*xcv + xU*ycv - xU*yU, yL*xcv + xL*ycv - xL*yL), @@ -108,6 +127,9 @@ function transform_rule(::McCormickTransform, ::typeof(*), zL, zU, zcv, zcc, xL, # cv = -min((-yU)*xcc + xU*(-ycv) - xU*(-yU), (-yL)*xcc + xU*(-ycv) - xL*(-yL)) # cc = -max((-yL)*xcv + xU*(-ycc) - xU*(-yL), (-yU)*xcv + xL*(-ycc) - xL*(-yU)) + + # cv = -min((-yU)*xcc + xU*(-ycv) - xU*(-yU), (-yL)*xcc + xL*(-ycv) - xL*(-yL)) #typo fixed + # # [x+,ym] # # Different from "normal case", note mix of cc's and cv's @@ -276,3 +298,4 @@ end TODO: Add other operators. It's probably helpful to break the McCormick overload and McCormick + Interval Outputs into separate transform_rules since the coupling for the ODEs are one directional and potentially useful. =# + diff --git a/src/transform/transform.jl b/src/transform/transform.jl index 7060ddd..77fa1b5 100644 --- a/src/transform/transform.jl +++ b/src/transform/transform.jl @@ -1,6 +1,7 @@ include(joinpath(@__DIR__, "utilities.jl")) include(joinpath(@__DIR__, "binarize.jl")) include(joinpath(@__DIR__, "factor.jl")) +include(joinpath(@__DIR__, "write.jl")) # function apply_transform(transform::T, prob::ODESystem; constants::Vector{Num}=Num[]) where T<:AbstractTransform diff --git a/src/transform/utilities.jl b/src/transform/utilities.jl index 3fadc08..2ae8cc8 100644 --- a/src/transform/utilities.jl +++ b/src/transform/utilities.jl @@ -220,7 +220,10 @@ end pull_vars(::Vector{Equation}) Pull out all variables/symbols from an expression or the RHS of an -equation (or RHSs of a set of equations), and sort them alphabetically. +equation (or RHSs of a set of equations), and sort them. 
Variables +are sorted alphabetically, then in the order [cv, cc, L, U], then +followed by the terms for the subgradient of the convex relaxation +and terms for the subgradient of the concave relaxation. # Example @@ -404,8 +407,8 @@ function _pull_vars(term::BasicSymbolic, vars::Vector{Num}, strings::Vector{Stri return vars, strings end if exprtype(term)==TERM && varterm(term) - if ~(string(term.f) in strings) || (term.f==getindex && ~(string(term) in string.(vars))) - push!(strings, string(term.f)) + if ~(string(term.f) in strings) && (term.f==getindex && ~(string(term) in string.(vars))) + push!(strings, string(term)) push!(vars, term) return vars, strings end @@ -430,6 +433,8 @@ function _pull_vars(term::BasicSymbolic, vars::Vector{Num}, strings::Vector{Stri end return vars, strings end +# _pull_vars(term, vars::Vector{Num}, strings::Vector{String}) = vars, strings + """ @@ -467,6 +472,7 @@ function shrink_eqs(eqs::Vector{Equation}, keep::Int64=4; force::Bool=false) new_eqs[replace] = substitute(new_eqs[replace], Dict(new_eqs[1].lhs => new_eqs[1].rhs)) new_eqs = new_eqs[2:end] end + # Need to add in the final shrinking for the cut. return new_eqs end diff --git a/src/transform/write.jl b/src/transform/write.jl new file mode 100644 index 0000000..9d29f67 --- /dev/null +++ b/src/transform/write.jl @@ -0,0 +1,1186 @@ + +# Given a vector of equations, calculate the "levels" of a computational graph, +# where -1 represents the base-level variables, and the highest number represents +# the original expression. +function levels(a::Vector{Equation}) + # Pull all the variables from the vector of equations + vars = [string.(pull_vars(a)); string(a[end].lhs)] + + # Create a levels dict + levels = Dict("" => -2.0) + for var in vars + levels[var] = 0.0 + end + + # Extract all variables in the LHS and RHS terms of equations + pre_LHS = [string.(x) for x in pull_vars.(Num.(getfield.(a, :lhs)))] + LHS = hcat(pre_LHS...) + + RHS = fill("", length(LHS), 2) + for i in eachindex(LHS) + RHS_vars = string.(pull_vars(Num(a[i].rhs))) + RHS[i,1] = RHS_vars[1] + try + RHS[i,2] = RHS_vars[2] + catch + end + end + + # For the variables, if they don't appear in the LHS, mark them as -1 + for var in vars + if ~(var in LHS) + levels[var] = -1.0 + end + end + + # Loop through and repeatedly update the dictionary, based on which variables + # appear where. + flag = true + while flag + flag = false #Continue flag. 
If it gets set to true, we continue + # Scan through RHS's to make a matrix of corresponding levels + RHS_vals = zeros(size(RHS)) + for i in eachindex(RHS) + RHS_vals[i] = levels[RHS[i]] + end + + # For each row, do -1's, then max + for i in 1:size(RHS_vals, 1) + if RHS_vals[i,1]==-1 && RHS_vals[i,2]==-2 #Function of non-aux var only + levels[LHS[i]] = 1.0 + elseif RHS_vals[i,1]==-1 && RHS_vals[i,2]==0 #Function of aux and non-aux, but aux unknown + flag = true + levels[LHS[i]] = 1.0 + elseif RHS_vals[i,1]==0 && RHS_vals[i,2]==-1 #Function of aux and non-aux, but aux unknown + flag = true + levels[LHS[i]] = 1.0 + else #Function of aux only + maxval = max(RHS_vals[i,1], RHS_vals[i,2]) + if levels[LHS[i]] != maxval + 1 + flag = true + levels[LHS[i]] = maxval + 1 + end + end + end + end + delete!(levels, "") + return levels +end + +# Translate a vector of equations into a vector of `Graphs.Edge`s, +# which can be used to construct a SimpleDiGraph +function eqn_edges(a::Vector{Equation}) + # Create the list of edges and pull all relevant variables + edgelist = Edge{Int}[] + vars = [string.(get_name.(pull_vars(a))); string(a[end].lhs)] + nums = collect(1:length(vars)) + varid = Dict() + for i in eachindex(vars) + varid[vars[i]] = nums[i] + end + + + # Extract all variables in the LHS and RHS terms of equations + pre_LHS = [string.(get_name.(x)) for x in pull_vars.(Num.(getfield.(a, :lhs)))] + LHS = hcat(pre_LHS...) + LHS_id = zeros(Int, size(LHS)) + for i in eachindex(LHS_id) + LHS_id[i] = varid[LHS[i]] + end + + RHS_vars = pull_vars.(a) + RHS = fill("", length(LHS), maximum(length.(RHS_vars))) + for i in eachindex(LHS) + for j in 1:length(RHS_vars[i]) + RHS[i,j] = string(get_name(RHS_vars[i][j])) + end + end + RHS_id = zeros(Int, size(RHS)) + for i in eachindex(RHS_id) + if RHS[i]=="" + RHS_id[i] = 0 + else + RHS_id[i] = varid[RHS[i]] + end + end + + # Create edges of RHS -> LHS + for i in eachindex(LHS_id) + for j in eachindex(RHS_id[i,:]) + if ~iszero(RHS_id[i,j]) + push!(edgelist, Edge(RHS_id[i,j], LHS_id[i])) + end + end + end + return edgelist, vars +end + +# A new topological sort that tries to minimize the number of temporary vectors +# that need to be preallocated +function topological_sort(g::SimpleDiGraph; order::Vector{Int64}=Int64[]) + for i in length(g.badjlist):-1:1 # Each i is a vector + # Always go in order of most to least complex + lengths = [length(g.badjlist[j]) for j in g.badjlist[i]] + for j in g.badjlist[i][sortperm(-lengths)] + recursive_add(g, j, order) + end + if ~in(i, order) + push!(order, i) + end + end + return order +end +# A recursive function. If given a graph, and a specific number to add, +# either add that number, or dig into the badjlist further. +function recursive_add(g::SimpleDiGraph, i::Int, order::Vector{Int64}) + # Given an integer, check to see if that number is already in order. + # If so, don't do anything. + if in(i, order) + return nothing + else + # If the integer i is not in order, check to see if it depends on + # anything + if isempty(g.badjlist[i]) + # If there are no dependencies, add i to order + push!(order, i) + else + # There are dependencies. Go through each dependency and add it. 
+ # Always go in order of most to least complex + lengths = [length(g.badjlist[j]) for j in g.badjlist[i]] + for j in g.badjlist[i][sortperm(-lengths)] + recursive_add(g, j, order) + end + # Now that all the dependencies have been added, we can safely add i + push!(order, i) + end + end + return nothing +end + +function combine_addition(orig_set::Vector{Equation}; maxterms::Int=8) + # We want to collapse add's together, if the resulting expression would + # still have fewer than 32 inputs. If no subgradients are required, + # expressions can have up to 8 unique variables. If subgradients are + # needed, the max allowable unique variables is: + # Subgradient dimensions -> max terms + # 1 -> 5 + # 2 -> 4 + # 3 -> 3 + # 4 -> 2 + # 5 -> 2 + # 6 -> 2 + set = copy(orig_set) + LHSs = getfield.(set, :lhs) + RHSs = pull_vars.(set) + + for i in eachindex(set) + if SourceCodeMcCormick.op(set[i]) == + + new_RHS = Num[] + for term in RHSs[i] + if string(term) in string.(LHSs) + ID = findfirst(==(string(term)), string.(LHSs)) + if length(union(RHSs[i], RHSs[ID])) <= maxterms #Limit from CUDA + set[i] = substitute(set[i], term => set[ID].rhs) + push!(new_RHS, RHSs[ID]...) + else + push!(new_RHS, term) + end + else + push!(new_RHS, term) + end + end + RHSs[i] = new_RHS + end + end + + # Remove equations that are no longer needed + rm_flag = fill(true, length(set)) + rm_flag[end] = false + for i in 1:(length(set)-1) + for j in eachindex(set) + if string(LHSs[i]) in string.(RHSs[j]) + rm_flag[i] = false + end + end + end + deleteat!(set, rm_flag) + + return set +end + +# Function to read and interpret a function/path name +function read_string(in_string::String) + if length(in_string)>4 && in_string[end-2:end]==".jl" + return in_string, in_string[1:end-3]*"_vector.jl", in_string[1:end-3]*"_kernel.jl", string(split(in_string, ('/', '\\'))[end][1:end-3]) + else + return in_string*".jl", in_string*"_vector.jl", in_string*"_kernel.jl", in_string + end +end + +# Function to generate a unique path if one was not provided +function generate_paths() + # Always use the current directory for simplicity + fileID = 1 + searching = true + while searching + stringID = string(fileID) + if ~isfile(joinpath(@__DIR__, "storage", "newfunc"*stringID*".jl")) && ~isfile(joinpath(@__DIR__, "storage", "newfunc"*stringID*"_vector.jl"))&& ~isfile(joinpath(@__DIR__, "storage", "newfunc"*stringID*"_kernel.jl")) + # if ~isfile("newfunc"*stringID*".jl") && ~isfile("newfunc"*stringID*"_vector.jl")&& ~isfile("newfunc"*stringID*"_kernel.jl") + println("Creating new Julia files:") + # println(joinpath(@__DIR__, "newfunc"*stringID*".jl")) + println(joinpath(@__DIR__, "storage", "newfunc"*stringID*".jl")) + println(joinpath(@__DIR__, "storage", "newfunc"*stringID*"_vector.jl")) + println(joinpath(@__DIR__, "storage", "newfunc"*stringID*"_kernel.jl")) + return joinpath(@__DIR__, "storage", "newfunc"*stringID*".jl"),joinpath(@__DIR__, "storage", "newfunc"*stringID*"_vector.jl"), joinpath(@__DIR__, "storage", "newfunc"*stringID*"_kernel.jl"), "newfunc"*stringID + end + fileID += 1 + end +end + +# Function to check that paths and function name won't cause problems +function validate_paths(path::String, path_vector::String, path_kernel::String, fname::String) + if path[end-2:end] != ".jl" + error("Path must end in `.jl`") + end + if path_vector[end-2:end] != ".jl" + error("Path must end in `.jl`") + end + if path_kernel[end-2:end] != ".jl" + error("Path must end in `.jl`") + end +end + +function factor_classifier(factors::Vector{Equation}) + # Set 
up the string tracker and ID vector
+    strings = fill("", length(factors))
+    number_vector = zeros(Int, length(factors))
+
+    # Convert symbolic terms into strings
+    for i in eachindex(factors)
+        strings[i] = sym_to_string(factors[i].rhs)
+    end
+
+    # Identify numbers based on strings
+    count = 1
+    for i in eachindex(number_vector)
+        if i==1
+            number_vector[i] = count
+            count += 1
+        elseif (strings[i] != "") && (strings[i] in strings[1:(i-1)])
+            number_vector[i] = number_vector[findfirst(x -> x==strings[i], strings[1:(i-1)])]
+        else
+            number_vector[i] = count
+            count += 1
+        end
+    end
+
+    return number_vector
+end
+
+sym_to_string(a::Num) = sym_to_string(a.val, "")
+sym_to_string(a::BasicSymbolic) = sym_to_string(a, "")
+sym_to_string(a::Num, str::String) = sym_to_string(a.val, str)
+sym_to_string(a::Float64, str::String) = str .* "c"
+function sym_to_string(a::BasicSymbolic, str::String)
+    if exprtype(a)==SYM || (exprtype(a)==TERM && a.f==getindex)
+        str *= "v"
+    elseif exprtype(a)==TERM
+        str *= string(a.f)
+    elseif exprtype(a)==ADD && arity(a)==2
+        valid_flag = true
+        for pair in a.dict
+            if ~isone(pair.second)
+                valid_flag = false
+                continue
+            elseif ~(exprtype(pair.first)==SYM) && ~(exprtype(pair.first)==TERM && pair.first.f==getindex)
+                valid_flag = false
+                continue
+            end
+        end
+        if valid_flag
+            if ~iszero(a.coeff)
+                str *= "+cv"
+            else
+                str *= "+vv"
+            end
+        end
+    elseif exprtype(a)==MUL && arity(a)==2
+        valid_flag = true
+        for pair in a.dict
+            if ~(isone(pair.second))
+                valid_flag = false
+                continue
+            elseif ~(exprtype(pair.first)==SYM) && ~(exprtype(pair.first)==TERM && pair.first.f==getindex)
+                valid_flag = false
+                continue
+            end
+        end
+        if valid_flag
+            if ~isone(a.coeff)
+                str *= "*cv"
+            else
+                str *= "*vv"
+            end
+        end
+    elseif exprtype(a)==POW
+        if typeof(a.base) <: Real
+            if (exprtype(a.exp)==SYM || (exprtype(a.exp)==TERM && a.exp.f==getindex))
+                str *= "^cv"
+            end
+        elseif (exprtype(a.base)==SYM || (exprtype(a.base)==TERM && a.base.f==getindex))
+            if typeof(a.exp) <: Real
+                str *= "^vc"
+            elseif (exprtype(a.exp)==SYM || (exprtype(a.exp)==TERM && a.exp.f==getindex))
+                str *= "^vv"
+            end
+        end
+    elseif exprtype(a)==DIV
+        if typeof(a.num) <: Real
+            if (exprtype(a.den)==SYM || (exprtype(a.den)==TERM && a.den.f==getindex))
+                str *= "/cv"
+            end
+        elseif (exprtype(a.num)==SYM || (exprtype(a.num)==TERM && a.num.f==getindex))
+            if typeof(a.den) <: Real
+                str *= "/vc"
+            elseif (exprtype(a.den)==SYM || (exprtype(a.den)==TERM && a.den.f==getindex))
+                str *= "/vv"
+            end
+        end
+    end
+    return str
+end
+
+
+# # This version works with more complicated structures, but then we'd need to keep
+# # track of what order the variables appear in, which is overly complicated
+# function sym_to_string(a::BasicSymbolic, str::String)
+#     if exprtype(a)==SYM || (exprtype(a)==TERM && a.f==getindex)
+#         str *= "v"
+#     elseif exprtype(a)==TERM
+#         str *= string(a.f)
+#         str = sym_to_string(a, str)
+#     elseif exprtype(a)==ADD
+#         str *= string(arity(a)) * "+"
+#         if ~iszero(a.coeff)
+#             str *= "C"
+#         end
+#         for pair in a.dict
+#             if ~isone(pair.second)
+#                 str *= "c"
+#             end
+#             str = sym_to_string(pair.first, str)
+#         end
+#     elseif exprtype(a)==MUL
+#         str *= string(arity(a)) * "*"
+#         if ~isone(a.coeff)
+#             str *= "C"
+#         end
+#         for pair in a.dict
+#             str = sym_to_string(pair.first^pair.second, str)
+#         end
+#     elseif exprtype(a)==POW
+#         str *= "^"
+#         str = sym_to_string(a.base, str)
+#         str = sym_to_string(a.exp, str)
+#     elseif exprtype(a)==DIV
+#         str *= "/"
+#         str = sym_to_string(a.num, str)
+#         str = sym_to_string(a.den, str)
+#     
end +# str *= "," +# return str +# end + + +generate_inputs(num::BasicSymbolic; constants::Vector{Num}=Num[]) = generate_inputs(Num(num), constants=constants) +function generate_inputs(num::Num; constants::Vector{Num}=Num[]) + equation = 0 ~ num + step_1 = apply_transform(McCormickIntervalTransform(), [equation], constants=constants) + step_2 = shrink_eqs(step_1) + input_list = pull_vars(step_2) + return input_list +end +generate_grad_inputs(num::BasicSymbolic, gradlist::Vector{Num}; constants::Vector{Num}=Num[]) = generate_grad_inputs(Num(num), gradlist, constants=constants) +function generate_grad_inputs(num::Num, gradlist::Vector{Num}; constants::Vector{Num}=Num[]) + cvgrad, ccgrad = grad(num, gradlist, constants=constants, expand=true) + input_list = pull_vars(cvgrad + ccgrad) + return input_list +end + +function constant_converter(input::BasicSymbolic, constants::Vector{Num}) + if exprtype(input)==ADD && arity(input)==2 && ~iszero(input.coeff) + new_expr = @variables(constant)[] + new_constants = copy(constants) + push!(new_constants, new_expr) + for pair in input.dict + new_expr += pair.first*pair.second + end + return new_expr.val, new_constants, input.coeff + elseif exprtype(input)==MUL && arity(input)==2 && ~isone(input.coeff) + new_expr = @variables(constant)[] + new_constants = copy(constants) + push!(new_constants, new_expr) + for pair in input.dict + new_expr *= pair.first^pair.second + end + return new_expr.val, new_constants, input.coeff + elseif exprtype(input)==DIV + if typeof(input.num)<:BasicSymbolic && typeof(input.den)<:Real + new_expr = @variables(constant)[] + new_constants = copy(constants) + push!(new_constants, new_expr) + new_expr = input.num/new_expr + return new_expr.val, new_constants, input.den + elseif typeof(input.num)<:Real && typeof(input.den)<:BasicSymbolic + new_expr = @variables(constant)[] + new_constants = copy(constants) + push!(new_constants, new_expr) + new_expr = new_expr/input.den + return new_expr.val, new_constants, input.num + else + return input, constants, nothing + end + else + return input, constants, nothing + end +end + + + +eval_generator(num::Num, title::String; constants::Vector{Num}=Num[]) = eval_generator(num, factor(num), title, constants) +function eval_generator(num::Num, factorized::Vector{Equation}, title::String, constants::Vector{Num}) + # To reduce the number of functions being created, we can start by classifying each factor + # based on the math. + eqn_reference = factor_classifier(factorized) + + # Now we want something that'll automatically figure out what functions to make + # and then make them. 
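+    # Factors whose right-hand sides fall in the same structural class (per
+    # `factor_classifier`) reuse the evaluators compiled for the first factor of
+    # that class; only the ordered input list is regenerated for them.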
+    funcs = []
+    normal_inputs = Vector{String}[]
+    for i in eachindex(factorized)
+        if (i==1) || (eqn_reference[i] > maximum(eqn_reference[1:(i-1)]))
+            # First occurrence of this classification: generate new evaluator functions
+            expr, new_constants, extra_value = constant_converter(factorized[i].rhs, constants)
+            f_cv = Symbol("$(title)_$(i)_cv")
+            f_cc = Symbol("$(title)_$(i)_cc")
+            f_lo = Symbol("$(title)_$(i)_lo")
+            f_hi = Symbol("$(title)_$(i)_hi")
+            out = @eval $f_cv, $f_cc, $f_lo, $f_hi, normal_order = all_evaluators(Num($(expr)), constants=$new_constants)
+            normal_order = out[5]
+            if ~isnothing(extra_value)
+                normal_order[1] = extra_value
+            end
+            push!(funcs, ["$(title)_$(i)_cv", "$(title)_$(i)_cc", "$(title)_$(i)_lo", "$(title)_$(i)_hi"])
+            push!(normal_inputs, string.(normal_order))
+        else
+            # Repeated classification: re-use the functions from its first occurrence
+            expr, new_constants, extra_value = constant_converter(factorized[i].rhs, constants)
+            push!(funcs, funcs[findfirst(x -> x==eqn_reference[i], eqn_reference)])
+            normal_order = generate_inputs(expr, constants=new_constants)
+            if ~isnothing(extra_value)
+                normal_order[1] = extra_value
+            end
+            push!(normal_inputs, string.(normal_order))
+        end
+    end
+    return funcs, normal_inputs
+end
+
+grad_eval_generator(num::Num, title::String; constants::Vector{Num}=Num[]) = grad_eval_generator(num, pull_vars(num), factor(num), title, constants)
+grad_eval_generator(num::Num, gradlist::Vector{Num}, title::String; constants::Vector{Num}=Num[]) = grad_eval_generator(num, gradlist, factor(num), title, constants)
+function grad_eval_generator(num::Num, gradlist::Vector{Num}, factorized::Vector{Equation}, title::String, constants::Vector{Num})
+    # To reduce the number of functions being created, we can start by classifying each factor
+    # based on the math.
+    eqn_reference = factor_classifier(factorized)
+
+    # Determine which functions need to be created and create them, re-using
+    # functions wherever factors share a classification.
+    funcs = []
+    normal_inputs = Vector{String}[]
+    grad_inputs = Vector{String}[]
+
+    # Check that the length of the gradlist is <=6. Note that since each term of the subgradient
+    # is independent of the other terms, if more than 6 dimensions are desired, this function can
+    # simply be called with the first 6 terms, and then future terms can be substituted in as
+    # needed.
+    if length(gradlist) > 6
+        error("Subgradients in more than 6 dimensions are currently not supported. Please submit an issue if you encounter this error.")
+    end
+    for i in eachindex(factorized)
+        if (i==1) || (eqn_reference[i] > maximum(eqn_reference[1:(i-1)]))
+            expr, new_constants, extra_value = constant_converter(factorized[i].rhs, constants)
+            f_cv = Symbol("$(title)_$(i)_cv")
+            f_cc = Symbol("$(title)_$(i)_cc")
+            f_lo = Symbol("$(title)_$(i)_lo")
+            f_hi = Symbol("$(title)_$(i)_hi")
+            out = @eval $f_cv, $f_cc, $f_lo, $f_hi, normal_order = all_evaluators(Num($(expr)), constants=$new_constants)
+            normal_order = out[5]
+
+            df_cv = Symbol("∂$(title)_$(i)_cv")
+            df_cc = Symbol("∂$(title)_$(i)_cc")
+            out = @eval $df_cv, $df_cc, grad_order = all_subgradients(Num($(expr)), $gradlist, expand=true, constants=$new_constants)
+            grad_order = out[3]
+
+            if ~isnothing(extra_value)
+                normal_order[1] = extra_value
+                if !(isempty(grad_order)) && string(grad_order[1])=="constant" # The constant doesn't appear in the derivative for addition, but does for multiplication
+                    grad_order[1] = extra_value
+                end
+            end
+            push!(funcs, ["$(title)_$(i)_cv", "$(title)_$(i)_cc", "$(title)_$(i)_lo", "$(title)_$(i)_hi", "∂$(title)_$(i)_cv", "∂$(title)_$(i)_cc"])
+            push!(normal_inputs, string.(normal_order))
+            push!(grad_inputs, string.(grad_order))
+        else
+            expr, new_constants, extra_value = constant_converter(factorized[i].rhs, constants)
+            push!(funcs, funcs[findfirst(x -> x==eqn_reference[i], eqn_reference)])
+            normal_order = generate_inputs(expr, constants=new_constants)
+            grad_order = generate_grad_inputs(expr, gradlist, constants=new_constants)
+            if ~isnothing(extra_value)
+                normal_order[1] = extra_value
+                if !isempty(grad_order) && string(grad_order[1])=="constant"
+                    grad_order[1] = extra_value
+                end
+            end
+            push!(normal_inputs, string.(normal_order))
+            push!(grad_inputs, string.(grad_order))
+        end
+    end
+    return funcs, normal_inputs, grad_inputs
+end
+
+
+fgen(num::Num; constants::Vector{Num}=Num[], mutate::Bool=false, all_inputs::Bool=false) = fgen(num, setdiff(pull_vars(num), constants), [:all], generate_paths()..., constants, mutate, all_inputs)
+fgen(num::Num, string::String; constants::Vector{Num}=Num[], mutate::Bool=false, all_inputs::Bool=false) = fgen(num, setdiff(pull_vars(num), constants), [:all], read_string(string)..., constants, mutate, all_inputs)
+fgen(num::Num, outputs::Vector{Symbol}; constants::Vector{Num}=Num[], mutate::Bool=false, all_inputs::Bool=false) = fgen(num, setdiff(pull_vars(num), constants), outputs, generate_paths()..., constants, mutate, all_inputs)
+fgen(num::Num, outputs::Vector{Symbol}, constants::Vector{Num}, string::String, mutate::Bool=false, all_inputs::Bool=false) = fgen(num, setdiff(pull_vars(num), constants), outputs, read_string(string)..., constants, mutate, all_inputs)
+fgen(num::Num, gradlist::Vector{Num}; constants::Vector{Num}=Num[], mutate::Bool=false, all_inputs::Bool=false) = fgen(num, gradlist, [:all], generate_paths()..., constants, mutate, all_inputs)
+fgen(num::Num, gradlist::Vector{Num}, outputs::Vector{Symbol}; constants::Vector{Num}=Num[], mutate::Bool=false, all_inputs::Bool=false) = fgen(num, gradlist, outputs, generate_paths()..., constants, mutate, all_inputs)
+fgen(num::Num, gradlist::Vector{Num}, outputs::Vector{Symbol}, constants::Vector{Num}, mutate::Bool=false, all_inputs::Bool=false) = fgen(num, gradlist, outputs, generate_paths()..., constants, mutate, all_inputs)
+fgen(num::Num, gradlist::Vector{Num}, outputs::Vector{Symbol}, string::String; constants::Vector{Num}=Num[], mutate::Bool=false, all_inputs::Bool=false) = fgen(num, gradlist, outputs, read_string(string)..., constants, mutate, all_inputs)
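+# A rough sketch of how the methods above dispatch (hypothetical expression "expr"
+# in variables x and y):
+#   fgen(expr)                       # all outputs; subgradients w.r.t. every variable in expr
+#   fgen(expr, [x, y])               # subgradients taken with respect to x and y only
+#   fgen(expr, [x, y], [:cv, :lo])   # only generate the convex relaxation and lower bound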
+
+function fgen(num::Num, gradlist::Vector{Num}, raw_outputs::Vector{Symbol}, path::String, path_vector::String, path_kernel::String, fname::String, constants::Vector{Num}, mutate::Bool, all_inputs::Bool)
+    # Ensure that paths are valid
+    validate_paths(path, path_vector, path_kernel, fname)
+
+    # Determine what objects will be returned, if mutate==false
+    outputs = Symbol[]
+    for output in raw_outputs
+        output == :cv && push!(outputs, :cv)
+        output == :cc && push!(outputs, :cc)
+        output == :lo && push!(outputs, :lo)
+        output == :hi && push!(outputs, :hi)
+        output == :MC && push!(outputs, :cv, :cc, :lo, :hi)
+        output == :cvgrad && push!(outputs, :cvgrad)
+        output == :ccgrad && push!(outputs, :ccgrad)
+        output == :grad && push!(outputs, :cvgrad, :ccgrad)
+        output == :all && push!(outputs, :cv, :cc, :lo, :hi, :cvgrad, :ccgrad)
+        if ~(output in [:cv, :cc, :lo, :hi, :MC, :cvgrad, :ccgrad, :grad, :all])
+            error("Output list contains an invalid output symbol: :$output. Acceptable symbols include [:cv, :cc, :lo, :hi, :MC, :cvgrad, :ccgrad, :grad, :all]")
+        end
+    end
+    if isempty(outputs)
+        error("No outputs specified.")
+    end
+
+    # Perform a factorization of the input. If subgradients are not required,
+    # combine addition terms together in the factorization since the math is simple.
+    # If subgradients are needed, we can still combine terms, but we are limited
+    # further in how many terms we can combine together.
+    factorized = factor(num)
+    if ~(:cvgrad in outputs) && ~(:ccgrad in outputs)
+        factorized = combine_addition(factorized)
+    else
+        factorized = combine_addition(factorized, maxterms = 5) # Would be 6-length(gradlist) for length(gradlist)<=3, but we only pass 1 grad term
+    end
+
+    # Generate functions from the factorization
+    if (:cvgrad in outputs) || (:ccgrad in outputs)
+        # Only use the first element of the gradlist, to create as few functions as possible
+        funcs, normal_inputs, grad_inputs = grad_eval_generator(num, [gradlist[1]], factorized, fname, constants)
+    else
+        funcs, normal_inputs = eval_generator(num, factorized, fname, constants)
+        grad_inputs = Vector{String}[]
+    end
+    # Collect LHS terms and extract original problem variables from the Num input
+    LHSs = string.(getfield.(factorized, :lhs))
+    if all_inputs
+        vars = get_name.(gradlist)
+    else
+        vars = get_name.(pull_vars(num))
+    end
+
+    # Put the factorized expression into a directed acyclic graph form
+    edgelist, varids = eqn_edges(factorized) # varids includes all aux variables also
+    g = SimpleDiGraph(edgelist)
+
+    # Perform a topological sort to get the order in which we should perform
+    # calculations (i.e., the final entry in "varorder" is the full original expression)
+    varorder = varids[topological_sort(g)]
+
+    # Open and begin writing information to the Julia files. Writing to files instead
+    # of constructing the functions purely internally helps with debugging, and also
+    # makes it easier to see how the new functions work. Note that the written files
+    # cannot be used or run independently of calling `fgen`, because the internal
+    # functions they compose only exist at the time the files are written.
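+    # As a loose sketch (assuming num = x*y, fname = "f", and all outputs requested),
+    # the scalar file written below would take a form like:
+    #   function f(x_cv, x_cc, x_lo, x_hi, y_cv, y_cc, y_lo, y_hi::Float64)
+    #       # ...calls to the generated factor functions f_1_cv/f_1_cc/f_1_lo/f_1_hi...
+    #       return temp1_cv, temp1_cc, temp1_lo, temp1_hi, ∂temp1∂x_cv, ∂temp1∂y_cv, ∂temp1∂x_cc, ∂temp1∂y_cc
+    #   end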
+ file = open(path, "w") + file_vector = open(path_vector, "w") + file_kernel = open(path_kernel, "w") + files = [file, file_vector, file_kernel] + for loc in files + write(loc, "# Generated at $(Dates.now())\n\n") + end + + # Determine the input list for the main function and begin writing the function + input_list = "" + mutate_input_list = "" + for out in [:cv, :cc, :lo, :hi] + if out in outputs + mutate_input_list *= "OUT_$(string(out)), " + end + end + if :cvgrad in outputs + for dvar in string.(get_name.(gradlist)) + mutate_input_list *= "OUT_∂$(dvar)_cv, " + end + end + if :ccgrad in outputs + for dvar in string.(get_name.(gradlist)) + mutate_input_list *= "OUT_∂$(dvar)_cc, " + end + end + + for var in vars + if string(var) in string.(get_name.(constants)) + input_list *= "$(var), " + mutate_input_list *= "$(var), " + end + end + for var in vars + if ~(string(var) in string.(get_name.(constants))) + input_list *= "$(var)_cv, $(var)_cc, $(var)_lo, $(var)_hi, " + mutate_input_list *= "$(var)_cv, $(var)_cc, $(var)_lo, $(var)_hi, " + if ((:cvgrad in outputs) || (:ccgrad in outputs)) && ~(string(var) in string.(get_name.(gradlist))) + for dvar in string.(get_name.(gradlist)) + input_list *= "∂$(var)∂$(dvar)_cv, " + mutate_input_list *= "∂$(var)∂$(dvar)_cv, " + end + for dvar in string.(get_name.(gradlist)) + input_list *= "∂$(var)∂$(dvar)_cc, " + mutate_input_list *= "∂$(var)∂$(dvar)_cc, " + end + end + end + end + + input_list = input_list[1:end-2] + mutate_input_list = mutate_input_list[1:end-2] + write(file, "function $(fname)($(input_list)::Float64)\n") + if mutate + write(file_vector, "function $(fname)($(mutate_input_list)::Vector{Float64})\n") + write(file_kernel, "function $(fname)($(mutate_input_list)::CuArray{Float64})\n") + else + write(file_vector, "function $(fname)($(input_list)::Vector{Float64})\n") + write(file_kernel, "function $(fname)($(input_list)::CuArray{Float64})\n") + end + representative = split(input_list, " ")[end] + println("Required inputs:") + if mutate + @show mutate_input_list + else + @show input_list + end + + # Add in comments that describe what expression is being calculated + for loc in files + if mutate + write(loc, " # Mutate $(outputs) for the following expression: \n") + else + write(loc, " # Return $(outputs) for the following expression: \n") + end + write(loc, " # $(num) \n\n") + write(loc, " # The expression is factored into the following subexpressions,\n") + write(loc, " # with the functions $(fname)_i_*() referring to the i'th factor:\n") + write(loc, " # FACTOR | EXPRESSION\n") + L = length(string(length(factorized))) + for i in eachindex(factorized) + Li = length(string(i)) + write(loc, " #"*" "^Int(5-floor(L/2)+(L-Li))*"$(i)"*" "^Int(5-floor((L+1)/2))*"| $(factorized[i].lhs) = $(factorized[i].rhs)\n") + end + end + + # Determine how many auxiliary variables are needed + temp_endlist = [] + maxtemp = 0 + final_only_flag = false + for i in eachindex(varorder) # Loop through every variable that appears in the problem + if (varorder[i] in string.(vars)) # Skip the variable if it's an input (i.e., we don't need to make an auxiliary variable for it) + continue + end + if mutate && (i==length(varorder)) # If we're mutating, don't make a temporary variable for the final entry in varorder + break + end + ID = findfirst(x -> occursin(varorder[i], x), varids) + tempID = 0 + if isempty(temp_endlist) + push!(temp_endlist, copy(g.fadjlist[ID])) + tempID = 1 + else + for j in eachindex(temp_endlist) + if isempty(temp_endlist[j]) # Then we can override 
this one + temp_endlist[j] = copy(g.fadjlist[ID]) + tempID = j + break + end + end + if tempID==0 #Then we haven't found one we can override + push!(temp_endlist, copy(g.fadjlist[ID])) + tempID = length(temp_endlist) + end + end + for j in eachindex(temp_endlist) + if ID in temp_endlist[j] + filter!(x -> x!=ID, temp_endlist[j]) + end + end + if tempID > maxtemp + maxtemp = tempID + if i==length(varorder) + # A flag to indicate if a temporary variable is only used for the final term. This + # allows us to only pre-allocate elements of the McCormick tuple we need to return, + # rather than including all of [cv, cc, lo, hi, cvgrad, ccgrad] and not using some. + final_only_flag = true + end + end + end + + # Pre-allocate space for vector and kernel versions of functions, + # and save the names for the CUDA version to free up memory later + write(file_vector, " # Pre-allocate arrays for each used temp variable, similar in size to $(representative)_cv\n") + write(file_kernel, " # Pre-allocate CuArrays for each used temp variable, similar in size to $(representative)_cv\n") + cuarray_list = String[] + for loc in [file_vector, file_kernel] + for i = 1:maxtemp-1 + write(loc, " temp$(i)_cv = similar($(representative))\n") + write(loc, " temp$(i)_cc = similar($(representative))\n") + write(loc, " temp$(i)_lo = similar($(representative))\n") + write(loc, " temp$(i)_hi = similar($(representative))\n") + if loc==file_kernel + push!(cuarray_list, ["temp$(i)_cv", "temp$(i)_cc", "temp$(i)_lo", "temp$(i)_hi"]...) + end + if (:cvgrad in outputs) || (:ccgrad in outputs) + for j in string.(get_name.(gradlist)) + write(loc, " ∂temp$(i)∂$(j)_cv = similar($(representative))\n") + if loc==file_kernel + push!(cuarray_list, "∂temp$(i)∂$(j)_cv") + end + end + for j in string.(get_name.(gradlist)) + write(loc, " ∂temp$(i)∂$(j)_cc = similar($(representative))\n") + if loc==file_kernel + push!(cuarray_list, "∂temp$(i)∂$(j)_cc") + end + end + end + end + if final_only_flag==false + write(loc, " temp$(maxtemp)_cv = similar($(representative))\n") + write(loc, " temp$(maxtemp)_cc = similar($(representative))\n") + write(loc, " temp$(maxtemp)_lo = similar($(representative))\n") + write(loc, " temp$(maxtemp)_hi = similar($(representative))\n") + if loc==file_kernel + push!(cuarray_list, ["temp$(maxtemp)_cv", "temp$(maxtemp)_cc", "temp$(maxtemp)_lo", "temp$(maxtemp)_hi"]...) 
+            end
+            if (:cvgrad in outputs) || (:ccgrad in outputs)
+                for j in string.(get_name.(gradlist))
+                    write(loc, "    ∂temp$(maxtemp)∂$(j)_cv = similar($(representative))\n")
+                    if loc==file_kernel
+                        push!(cuarray_list, "∂temp$(maxtemp)∂$(j)_cv")
+                    end
+                end
+                for j in string.(get_name.(gradlist))
+                    write(loc, "    ∂temp$(maxtemp)∂$(j)_cc = similar($(representative))\n")
+                    if loc==file_kernel
+                        push!(cuarray_list, "∂temp$(maxtemp)∂$(j)_cc")
+                    end
+                end
+            end
+        else
+            if :cv in outputs
+                write(loc, "    temp$(maxtemp)_cv = similar($(representative))\n")
+                if loc==file_kernel
+                    push!(cuarray_list, "temp$(maxtemp)_cv")
+                end
+            end
+            if :cc in outputs
+                write(loc, "    temp$(maxtemp)_cc = similar($(representative))\n")
+                if loc==file_kernel
+                    push!(cuarray_list, "temp$(maxtemp)_cc")
+                end
+            end
+            if :lo in outputs
+                write(loc, "    temp$(maxtemp)_lo = similar($(representative))\n")
+                if loc==file_kernel
+                    push!(cuarray_list, "temp$(maxtemp)_lo")
+                end
+            end
+            if :hi in outputs
+                write(loc, "    temp$(maxtemp)_hi = similar($(representative))\n")
+                if loc==file_kernel
+                    push!(cuarray_list, "temp$(maxtemp)_hi")
+                end
+            end
+            if :cvgrad in outputs
+                for j in string.(get_name.(gradlist))
+                    write(loc, "    ∂temp$(maxtemp)∂$(j)_cv = similar($(representative))\n")
+                    if loc==file_kernel
+                        push!(cuarray_list, "∂temp$(maxtemp)∂$(j)_cv")
+                    end
+                end
+            end
+            if :ccgrad in outputs
+                for j in string.(get_name.(gradlist))
+                    write(loc, "    ∂temp$(maxtemp)∂$(j)_cc = similar($(representative))\n")
+                    if loc==file_kernel
+                        push!(cuarray_list, "∂temp$(maxtemp)∂$(j)_cc")
+                    end
+                end
+            end
+        end
+    end
+
+    # Loop through the topological list to add calculations in order
+    temp_endlist = []
+    name_tracker = copy(varids)
+    for i in eachindex(varorder) # Order in which variables are calculated
+        # Skip calculation if the variable is one of the inputs
+        if (varorder[i] in string.(vars))
+            continue
+        end
+
+        # Determine the corresponding ID of the variable in varids
+        ID = findfirst(x -> occursin(varorder[i], x), varids)
+
+        # Figure out which tempID to use/override. temp_endlist keeps
+        # track of where variables will be used in the future (stored
+        # as g.fadjlist), with elements removed as they are used. If
+        # there is an empty row in temp_endlist, we can re-use that
+        # tempID. If there isn't an empty row, we add a new row.
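+        # For example, if factor "a" feeds factors "b" and "c", the temp slot holding
+        # "a" is not emptied until both "b" and "c" have been written, at which point
+        # its tempID becomes free for a later factor to re-use. (Illustrative example.)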
+ tempID = 0 + if isempty(temp_endlist) + push!(temp_endlist, copy(g.fadjlist[ID])) + tempID = 1 + else + for j in eachindex(temp_endlist) + if isempty(temp_endlist[j]) + # Then we can override this one + temp_endlist[j] = copy(g.fadjlist[ID]) + tempID = j + break + end + end + if tempID==0 #Then we haven't found one we can override + push!(temp_endlist, copy(g.fadjlist[ID])) + tempID = length(temp_endlist) + end + end + + # When we refer to this variable in the future, we need to know what tempID + # the variable is using + name_tracker[ID] = "temp$(tempID)" + + # Prepare variable names to use for printing + sym_cv = Symbol("$(name_tracker[ID])_cv") + sym_cc = Symbol("$(name_tracker[ID])_cc") + sym_lo = Symbol("$(name_tracker[ID])_lo") + sym_hi = Symbol("$(name_tracker[ID])_hi") + if (:cvgrad in outputs) || (:ccgrad in outputs) + dsym_cv = Symbol[] + dsym_cc = Symbol[] + for j in string.(get_name.(gradlist)) + push!(dsym_cv, Symbol("∂$(name_tracker[ID])∂$(j)_cv")) + push!(dsym_cc, Symbol("∂$(name_tracker[ID])∂$(j)_cc")) + end + end + + # Use inputs from the eval_generator to determine inputs for individual functions + func_num = (findfirst(x -> occursin(varorder[i], x), LHSs)) + normal_input = "" + for i in eachindex(normal_inputs[func_num]) + # Identify the name of the variable to be added + name = normal_inputs[func_num][i] + + # Replace names with their tempIDs, if necessary + for var in g.badjlist[ID] + name = replace(name, varids[var] => name_tracker[var], count=1) + end + + # Add to the input list + normal_input *= name*", " + end + normal_input = normal_input[1:end-2] + + # Use inputs from the grad_eval_generator to determine inputs for subgradient functions. + # Due to CUDA limitations, the max subgradient dimensionality is ~6~ 1, but we can bypass + # this limit by calling the same functions multiple times for different sets of + # subgradient dimensions + if (:cvgrad in outputs) || (:ccgrad in outputs) + grad_input = ["" for _ in 1:length(gradlist)] + for i in eachindex(grad_inputs[func_num]) #E.g.: Num[a_cv, a_cc, a_lo, a_hi, dada_cv, dada_cc, b_cv, b_cc, b_lo, b_hi, dbda_cv, dbda_cc]) + # Identify the name of the variable to be added + name = grad_inputs[func_num][i] + + # Replace names with their tempIDs, if necessary + for var in g.badjlist[ID] + if varids[var] != name_tracker[var] + name = replace(name, varids[var] => name_tracker[var], count=1) + end + end + + # Adjust derivative terms for the correct subgradient + gradlen = length(gradlist) + for j in eachindex(grad_input) + tempname = name + # @show tempname + + # if (j>1) && (gradlen > 6) + # for k in 1:6 + # @show k + # if gradlen >= 6*j + k + # println("We're inside, making this replacement:") + # println("replace(tempname, ∂"*string(gradlist[k])*"_ => ∂"*string(gradlist[6*j + k])*"_") + # elseif contains(tempname, "∂"*string(gradlist[k])*"_") + # # If we have a dimensionality not divisible by 6, some subgradients + # # will be passed as 0's + # println("Otherwise, we have the condition:") + # println("contains(tempname, ∂"*string(gradlist[k])*"_)") + # println("tempname = 0.0") + # end + # end + # end + # @show gradlist + # @show length(gradlist) + + if j>1 + tempname = replace(tempname, "∂"*string(get_name(gradlist[1]))*"_" => "∂"*string(get_name(gradlist[j]))*"_") + end + + # if (j>1) && (gradlen > 6) + # for k in 1:6 + # if gradlen >= 6*j + k + # tempname = replace(tempname, "∂"*string(gradlist[k])*"_" => "∂"*string(gradlist[6*j + k])*"_") + # elseif contains(tempname, "∂"*string(gradlist[k])*"_") + # # If we have 
a dimensionality not divisible by 6, some subgradients
+                    #             # will be passed as 0's
+                    #             tempname = "0.0"
+                    #         end
+                    #     end
+                    # end
+
+                    # Check if the derivative is "obvious" (e.g., dxdx = 1 and dxdy = 0, always)
+                    for m in eachindex(gradlist)
+                        for n in eachindex(gradlist)
+                            if m==n
+                                if (tempname=="∂"*string(get_name(gradlist[m]))*"∂"*string(get_name(gradlist[n]))*"_cc") || (tempname=="∂"*string(get_name(gradlist[m]))*"∂"*string(get_name(gradlist[n]))*"_cv")
+                                    tempname = "1.0"
+                                end
+                            else
+                                if (tempname=="∂"*string(get_name(gradlist[m]))*"∂"*string(get_name(gradlist[n]))*"_cc") || (tempname=="∂"*string(get_name(gradlist[m]))*"∂"*string(get_name(gradlist[n]))*"_cv")
+                                    tempname = "0.0"
+                                end
+                            end
+                        end
+                    end
+
+                    # Add the corrected name to the grad input
+                    grad_input[j] *= tempname*", "
+                end
+            end
+            # Remove the trailing ", " from each grad input list
+            for i in eachindex(grad_input)
+                grad_input[i] = grad_input[i][1:end-2]
+            end
+        end
+
+        # Write the function calls
+        if i==length(varorder) # The final set
+            for loc in files
+                # Use broadcast assignment for the vector/CuArray versions
+                if loc==file # The floating-point version, which doesn't allow broadcasting
+                    eq = "="
+                else
+                    eq = ".="
+                end
+                if (loc==file) || (mutate==false)
+                    write(loc, "\n    # Calculate and return $(outputs) for: \n")
+                    write(loc, "    # $(factorized[func_num].lhs) = ($(factorized[func_num].rhs))\n")
+                    if :cv in outputs
+                        write(loc, "    $(sym_cv) $(eq) $(funcs[func_num][1]).($(normal_input))\n")
+                    end
+                    if :cc in outputs
+                        write(loc, "    $(sym_cc) $(eq) $(funcs[func_num][2]).($(normal_input))\n")
+                    end
+                    if :lo in outputs
+                        write(loc, "    $(sym_lo) $(eq) $(funcs[func_num][3]).($(normal_input))\n")
+                    end
+                    if :hi in outputs
+                        write(loc, "    $(sym_hi) $(eq) $(funcs[func_num][4]).($(normal_input))\n")
+                    end
+                    if :cvgrad in outputs
+                        for j in eachindex(dsym_cv)
+                            write(loc, "    $(dsym_cv[j]) $(eq) $(funcs[func_num][5])[1].($(grad_input[j]))\n")
+                        end
+                    end
+                    if :ccgrad in outputs
+                        for j in eachindex(dsym_cc)
+                            write(loc, "    $(dsym_cc[j]) $(eq) $(funcs[func_num][6])[1].($(grad_input[j]))\n")
+                        end
+                    end
+                else # It's the vector/CuArray version and mutate==true
+                    write(loc, "\n    # Mutate $(outputs) for: \n")
+                    write(loc, "    # $(factorized[func_num].lhs) = ($(factorized[func_num].rhs))\n")
+                    if :cv in outputs
+                        write(loc, "    OUT_cv $(eq) $(funcs[func_num][1]).($(normal_input))\n")
+                    end
+                    if :cc in outputs
+                        write(loc, "    OUT_cc $(eq) $(funcs[func_num][2]).($(normal_input))\n")
+                    end
+                    if :lo in outputs
+                        write(loc, "    OUT_lo $(eq) $(funcs[func_num][3]).($(normal_input))\n")
+                    end
+                    if :hi in outputs
+                        write(loc, "    OUT_hi $(eq) $(funcs[func_num][4]).($(normal_input))\n")
+                    end
+                    if :cvgrad in outputs
+                        for j in eachindex(dsym_cv)
+                            outvar = string(get_name.(gradlist)[j])
+                            write(loc, "    OUT_∂$(outvar)_cv $(eq) $(funcs[func_num][5])[1].($(grad_input[j]))\n")
+                        end
+                    end
+                    if :ccgrad in outputs
+                        for j in eachindex(dsym_cc)
+                            outvar = string(get_name.(gradlist)[j])
+                            write(loc, "    OUT_∂$(outvar)_cc $(eq) $(funcs[func_num][6])[1].($(grad_input[j]))\n")
+                        end
+                    end
+                end
+
+                # Return the desired outputs
+                file_output = ""
+                if :cv in outputs
+                    file_output *= string(sym_cv)*", "
+                    if (loc==file_kernel) && (mutate==false)
+                        deleteat!(cuarray_list, findfirst(x->x==string(sym_cv), cuarray_list))
+                    end
+                end
+                if :cc in outputs
+                    file_output *= string(sym_cc)*", "
+
if (loc==file_kernel) && (mutate==false) + deleteat!(cuarray_list, findfirst(x->x==string(sym_cc), cuarray_list)) + end + end + if :lo in outputs + file_output *= string(sym_lo)*", " + if (loc==file_kernel) && (mutate==false) + deleteat!(cuarray_list, findfirst(x->x==string(sym_lo), cuarray_list)) + end + end + if :hi in outputs + file_output *= string(sym_hi)*", " + if (loc==file_kernel) && (mutate==false) + deleteat!(cuarray_list, findfirst(x->x==string(sym_hi), cuarray_list)) + end + end + if :cvgrad in outputs + for item in dsym_cv + file_output *= string(item)*", " + if (loc==file_kernel) && (mutate==false) + deleteat!(cuarray_list, findfirst(x->x==string(item), cuarray_list)) + end + end + end + if :ccgrad in outputs + for item in dsym_cc + file_output *= string(item)*", " + if (loc==file_kernel) && (mutate==false) + deleteat!(cuarray_list, findfirst(x->x==string(item), cuarray_list)) + end + end + end + file_output = file_output[1:end-2] + + # Clear up CUDA memory + if loc==file_kernel + if ~isempty(cuarray_list) + clear_list = "[" + for i in cuarray_list + clear_list *= i*", " + end + clear_list = clear_list[1:end-2]*"]" + write(loc, "\n # Clear CUDA objects from memory\n") + write(loc, " for i in $(clear_list)\n") + write(loc, " CUDA.unsafe_free!(i)\n") + write(loc, " end\n\n") + end + end + + if mutate + write(loc, " return nothing") + else + write(loc, " return $file_output") + end + end + else + for loc in files + if loc==file + eq = "=" + else + eq = ".=" + end + write(loc, "\n # Calculate the McCormick expansion of $(factorized[func_num].lhs) = ($(factorized[func_num].rhs)) where $(factorized[func_num].lhs)=temp$(tempID)\n") + write(loc, " $(sym_cv) $(eq) $(funcs[func_num][1]).($(normal_input))\n") + write(loc, " $(sym_cc) $(eq) $(funcs[func_num][2]).($(normal_input))\n") + write(loc, " $(sym_lo) $(eq) $(funcs[func_num][3]).($(normal_input))\n") + write(loc, " $(sym_hi) $(eq) $(funcs[func_num][4]).($(normal_input))\n") + if (:cvgrad in outputs) || (:ccgrad in outputs) + for j in eachindex(dsym_cv) + write(loc, " $(dsym_cv[j]) $(eq) $(funcs[func_num][5])[1].($(grad_input[j]))\n") + end + for j in eachindex(dsym_cc) + write(loc, " $(dsym_cc[j]) $(eq) $(funcs[func_num][6])[1].($(grad_input[j]))\n") + end + end + end + end + + # Remove instances of ID from the templist + for j in eachindex(temp_endlist) + if ID in temp_endlist[j] + filter!(x -> x!=ID, temp_endlist[j]) + end + end + end + + # Wrap up the files + for loc in files + write(loc, "\nend") + close(loc) + end + + # Include the new functions + new_func = include(path) + include(path_vector) + include(path_kernel) + return new_func +end \ No newline at end of file diff --git a/test/addition.jl b/test/addition.jl index 141f060..07d307a 100644 --- a/test/addition.jl +++ b/test/addition.jl @@ -1,15 +1,24 @@ - +function eval_check_grad_add(eval_func, MC1) + return eval_func(MC1.cv_grad[1], MC1.cc_grad[1]) +end +function eval_check_grad_add(eval_func, MC1, MC2) + return eval_func(MC1.cv_grad[1], MC1.cv_grad[2], MC1.cc_grad[1], MC1.cc_grad[2], + MC2.cv_grad[1], MC2.cv_grad[2], MC2.cc_grad[1], MC2.cc_grad[2]) +end @testset "Addition" begin @variables x, y to_compute = y+5 posreal_add_cv, posreal_add_cc, posreal_add_lo, posreal_add_hi, posreal_order = all_evaluators(to_compute) + posreal_add_cvgrad, posreal_add_ccgrad, posreal_order_grad = all_subgradients(to_compute, expand=true) to_compute = y-5 negreal_add_cv, negreal_add_cc, negreal_add_lo, negreal_add_hi, negreal_order = all_evaluators(to_compute) + negreal_add_cvgrad, 
negreal_add_ccgrad, negreal_order_grad = all_subgradients(to_compute, expand=true) to_compute = x+y add_cv, add_cc, add_lo, add_hi, add_order = all_evaluators(to_compute) + add_cvgrad, add_ccgrad, add_order_grad = all_subgradients(to_compute, expand=true) # Addition rules are very simple; each component of the McCormick expansion # is added separately. No need to test more than one type of McCormick object @@ -23,14 +32,22 @@ @test abs(eval_check(posreal_add_cc, y_1D) - (y_1D+5).cc) <= 1E-15 @test abs(eval_check(posreal_add_lo, y_1D) - (y_1D+5).Intv.lo) <= 1E-15 @test abs(eval_check(posreal_add_hi, y_1D) - (y_1D+5).Intv.hi) <= 1E-15 + @test abs(eval_check_grad_add(posreal_add_cvgrad[1], y_1D) - (y_1D+5).cv_grad[1]) <= 1E-15 + @test abs(eval_check_grad_add(posreal_add_ccgrad[1], y_1D) - (y_1D+5).cc_grad[1]) <= 1E-15 @test abs(eval_check(negreal_add_cv, y_1D) - (y_1D-5).cv) <= 1E-15 @test abs(eval_check(negreal_add_cc, y_1D) - (y_1D-5).cc) <= 1E-15 @test abs(eval_check(negreal_add_lo, y_1D) - (y_1D-5).Intv.lo) <= 1E-15 @test abs(eval_check(negreal_add_hi, y_1D) - (y_1D-5).Intv.hi) <= 1E-15 + @test abs(eval_check_grad_add(negreal_add_cvgrad[1], y_1D) - (y_1D-5).cv_grad[1]) <= 1E-15 + @test abs(eval_check_grad_add(negreal_add_ccgrad[1], y_1D) - (y_1D-5).cc_grad[1]) <= 1E-15 @test abs(eval_check(add_cv, xMC, yMC) - (xMC+yMC).cv) <= 1E-15 @test abs(eval_check(add_cc, xMC, yMC) - (xMC+yMC).cc) <= 1E-15 @test abs(eval_check(add_lo, xMC, yMC) - (xMC+yMC).Intv.lo) <= 1E-15 @test abs(eval_check(add_hi, xMC, yMC) - (xMC+yMC).Intv.hi) <= 1E-15 + @test abs(eval_check_grad_add(add_cvgrad[1], xMC, yMC) - (xMC+yMC).cv_grad[1]) <= 1E-15 + @test abs(eval_check_grad_add(add_cvgrad[2], xMC, yMC) - (xMC+yMC).cv_grad[2]) <= 1E-15 + @test abs(eval_check_grad_add(add_ccgrad[1], xMC, yMC) - (xMC+yMC).cc_grad[1]) <= 1E-15 + @test abs(eval_check_grad_add(add_ccgrad[2], xMC, yMC) - (xMC+yMC).cc_grad[2]) <= 1E-15 end diff --git a/test/division.jl b/test/division.jl index 148b432..77cd8d3 100644 --- a/test/division.jl +++ b/test/division.jl @@ -1,6 +1,8 @@ mid_expr(x, y, z) = IfElse.ifelse(x >= y, IfElse.ifelse(y >= z, y, IfElse.ifelse(y == x, y, IfElse.ifelse(z >= x, x, z))), IfElse.ifelse(z >= y, y, IfElse.ifelse(x >= z, x, z))) +mid_grad(x, y, z, ccgrad, cvgrad, zerovec) = IfElse.ifelse(x >= y, IfElse.ifelse(y >= z, cvgrad, IfElse.ifelse(y == x, cvgrad, IfElse.ifelse(z >= x, ccgrad, zerovec))), + IfElse.ifelse(z >= y, cvgrad, IfElse.ifelse(x >= z, ccgrad, zerovec))) function div_cv_case(xcv, xcc, xL, xU, ycv, ycc, yL, yU) yL_inv = inv(yU) yU_inv = inv(yL) @@ -139,15 +141,19 @@ div_cc_case(A::MC, B::MC) = div_cv_case(A.cv, A.cc, A.Intv.lo, A.Intv.hi, B.cv, to_compute = 5.0/y posreal_div_cv, posreal_div_cc, posreal_div_lo, posreal_div_hi, posreal_order = all_evaluators(to_compute) + posreal_div_cvgrad, posreal_div_ccgrad, posreal_order_grad = all_subgradients(to_compute, expand=true) to_compute = -5.0/y negreal_div_cv, negreal_div_cc, negreal_div_lo, negreal_div_hi, negreal_order = all_evaluators(to_compute) + negreal_div_cvgrad, negreal_div_ccgrad, negreal_order_grad = all_subgradients(to_compute, expand=true) # pos/pos @test abs(eval_check(posreal_div_cv, pos) - (5.0/pos).cv) <= 1E-15 @test abs(eval_check(posreal_div_cc, pos) - (5.0/pos).cc) <= 1E-15 @test abs(eval_check(posreal_div_lo, pos) - (5.0/pos).Intv.lo) <= 1E-15 @test abs(eval_check(posreal_div_hi, pos) - (5.0/pos).Intv.hi) <= 1E-15 + @test abs(eval_check_grad(posreal_div_cvgrad[1], pos) - (5.0/pos).cv_grad[1]) <= 1E-15 + @test 
abs(eval_check_grad(posreal_div_ccgrad[1], pos) - (5.0/pos).cc_grad[1]) <= 1E-15 # pos/mix @test isnan(eval_check(posreal_div_cv, mix)) @@ -158,18 +164,26 @@ div_cc_case(A::MC, B::MC) = div_cv_case(A.cv, A.cc, A.Intv.lo, A.Intv.hi, B.cv, @test isnan((5.0/mix).Intv.lo) @test isnan(eval_check(posreal_div_hi, mix)) @test isnan((5.0/mix).Intv.hi) + @test isnan(eval_check_grad(posreal_div_cvgrad[1], mix)) + @test isnan((5.0/mix).cv_grad[1]) + @test isnan(eval_check_grad(posreal_div_ccgrad[1], mix)) + @test isnan((5.0/mix).cc_grad[1]) # pos/neg @test abs(eval_check(posreal_div_cv, neg) - (5.0/neg).cv) <= 1E-15 @test abs(eval_check(posreal_div_cc, neg) - (5.0/neg).cc) <= 1E-15 @test abs(eval_check(posreal_div_lo, neg) - (5.0/neg).Intv.lo) <= 1E-15 @test abs(eval_check(posreal_div_hi, neg) - (5.0/neg).Intv.hi) <= 1E-15 + @test abs(eval_check_grad(posreal_div_cvgrad[1], neg) - (5.0/neg).cv_grad[1]) <= 1E-15 + @test abs(eval_check_grad(posreal_div_ccgrad[1], neg) - (5.0/neg).cc_grad[1]) <= 1E-15 # neg/pos @test abs(eval_check(negreal_div_cv, pos) - (-5.0/pos).cv) <= 1E-15 @test abs(eval_check(negreal_div_cc, pos) - (-5.0/pos).cc) <= 1E-15 @test abs(eval_check(negreal_div_lo, pos) - (-5.0/pos).Intv.lo) <= 1E-15 @test abs(eval_check(negreal_div_hi, pos) - (-5.0/pos).Intv.hi) <= 1E-15 + @test abs(eval_check_grad(negreal_div_cvgrad[1], pos) - (-5.0/pos).cv_grad[1]) <= 1E-15 + @test abs(eval_check_grad(negreal_div_ccgrad[1], pos) - (-5.0/pos).cc_grad[1]) <= 1E-15 # neg/mix @test isnan(eval_check(negreal_div_cv, mix)) @@ -180,12 +194,18 @@ div_cc_case(A::MC, B::MC) = div_cv_case(A.cv, A.cc, A.Intv.lo, A.Intv.hi, B.cv, @test isnan((-5.0/mix).Intv.lo) @test isnan(eval_check(negreal_div_hi, mix)) @test isnan((-5.0/mix).Intv.hi) + @test isnan(eval_check_grad(negreal_div_cvgrad[1], mix)) + @test isnan((-5.0/mix).cv_grad[1]) + @test isnan(eval_check_grad(negreal_div_ccgrad[1], mix)) + @test isnan((-5.0/mix).cc_grad[1]) # neg/neg @test abs(eval_check(negreal_div_cv, neg) - (-5.0/neg).cv) <= 1E-15 @test abs(eval_check(negreal_div_cc, neg) - (-5.0/neg).cc) <= 1E-15 @test abs(eval_check(negreal_div_lo, neg) - (-5.0/neg).Intv.lo) <= 1E-15 @test abs(eval_check(negreal_div_hi, neg) - (-5.0/neg).Intv.hi) <= 1E-15 + @test abs(eval_check_grad(negreal_div_cvgrad[1], neg) - (-5.0/neg).cv_grad[1]) <= 1E-15 + @test abs(eval_check_grad(negreal_div_ccgrad[1], neg) - (-5.0/neg).cc_grad[1]) <= 1E-15 # Note: McCormick object divided by a real automatically converts from (MC/real) to (MC*(real^-1)) # through Symbolics.jl, so this is simply multiplication. 
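+    # (e.g., x/5.0 is stored by Symbolics.jl as 0.2x, so the multiplication rules above apply)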
@@ -204,6 +224,7 @@ div_cc_case(A::MC, B::MC) = div_cv_case(A.cv, A.cc, A.Intv.lo, A.Intv.hi, B.cv, @variables x, y to_compute = x/y div_cv, div_cc, div_lo, div_hi, order = all_evaluators(to_compute) + div_cvgrad, div_ccgrad, order_grad = all_subgradients(to_compute, expand=true) @test div_cv_case(pos, pos_lo) == 1 @test div_cc_case(pos, pos_lo) == 1 @@ -211,6 +232,10 @@ div_cc_case(A::MC, B::MC) = div_cv_case(A.cv, A.cc, A.Intv.lo, A.Intv.hi, B.cv, @test abs(eval_check(div_cc, pos, pos_lo) - (pos/pos_lo).cc) <= 1E-15 @test abs(eval_check(div_lo, pos, pos_lo) - (pos/pos_lo).Intv.lo) <= 1E-15 @test abs(eval_check(div_hi, pos, pos_lo) - (pos/pos_lo).Intv.hi) <= 1E-15 + @test abs(eval_check_grad(div_cvgrad[1], pos, pos_lo) - (pos/pos_lo).cv_grad[1]) <= 1E-15 + @test abs(eval_check_grad(div_cvgrad[2], pos, pos_lo) - (pos/pos_lo).cv_grad[2]) <= 1E-15 + @test abs(eval_check_grad(div_ccgrad[1], pos, pos_lo) - (pos/pos_lo).cc_grad[1]) <= 1E-15 + @test abs(eval_check_grad(div_ccgrad[2], pos, pos_lo) - (pos/pos_lo).cc_grad[2]) <= 1E-15 @test div_cv_case(pos, pos_hi) == 2 @test div_cc_case(pos, pos_hi) == 2 @@ -218,6 +243,10 @@ div_cc_case(A::MC, B::MC) = div_cv_case(A.cv, A.cc, A.Intv.lo, A.Intv.hi, B.cv, @test abs(eval_check(div_cc, pos, pos_hi) - (pos/pos_hi).cc) <= 1E-15 @test abs(eval_check(div_lo, pos, pos_hi) - (pos/pos_hi).Intv.lo) <= 1E-15 @test abs(eval_check(div_hi, pos, pos_hi) - (pos/pos_hi).Intv.hi) <= 1E-15 + @test abs(eval_check_grad(div_cvgrad[1], pos, pos_hi) - (pos/pos_hi).cv_grad[1]) <= 1E-15 + @test abs(eval_check_grad(div_cvgrad[2], pos, pos_hi) - (pos/pos_hi).cv_grad[2]) <= 1E-15 + @test abs(eval_check_grad(div_ccgrad[1], pos, pos_hi) - (pos/pos_hi).cc_grad[1]) <= 1E-15 + @test abs(eval_check_grad(div_ccgrad[2], pos, pos_hi) - (pos/pos_hi).cc_grad[2]) <= 1E-15 @test div_cv_case(pos, neg_lo) == 3 @test div_cc_case(pos, neg_lo) == 3 @@ -225,6 +254,10 @@ div_cc_case(A::MC, B::MC) = div_cv_case(A.cv, A.cc, A.Intv.lo, A.Intv.hi, B.cv, @test abs(eval_check(div_cc, pos, neg_lo) - (pos/neg_lo).cc) <= 1E-15 @test abs(eval_check(div_lo, pos, neg_lo) - (pos/neg_lo).Intv.lo) <= 1E-15 @test abs(eval_check(div_hi, pos, neg_lo) - (pos/neg_lo).Intv.hi) <= 1E-15 + @test abs(eval_check_grad(div_cvgrad[1], pos, neg_lo) - (pos/neg_lo).cv_grad[1]) <= 1E-15 + @test abs(eval_check_grad(div_cvgrad[2], pos, neg_lo) - (pos/neg_lo).cv_grad[2]) <= 1E-15 + @test abs(eval_check_grad(div_ccgrad[1], pos, neg_lo) - (pos/neg_lo).cc_grad[1]) <= 1E-15 + @test abs(eval_check_grad(div_ccgrad[2], pos, neg_lo) - (pos/neg_lo).cc_grad[2]) <= 1E-15 @test div_cv_case(pos, neg) == 4 @test div_cc_case(pos, neg) == 4 @@ -232,6 +265,10 @@ div_cc_case(A::MC, B::MC) = div_cv_case(A.cv, A.cc, A.Intv.lo, A.Intv.hi, B.cv, @test abs(eval_check(div_cc, pos, neg) - (pos/neg).cc) <= 1E-15 @test abs(eval_check(div_lo, pos, neg) - (pos/neg).Intv.lo) <= 1E-15 @test abs(eval_check(div_hi, pos, neg) - (pos/neg).Intv.hi) <= 1E-15 + @test abs(eval_check_grad(div_cvgrad[1], pos, neg) - (pos/neg).cv_grad[1]) <= 1E-15 + @test abs(eval_check_grad(div_cvgrad[2], pos, neg) - (pos/neg).cv_grad[2]) <= 1E-15 + @test abs(eval_check_grad(div_ccgrad[1], pos, neg) - (pos/neg).cc_grad[1]) <= 1E-15 + @test abs(eval_check_grad(div_ccgrad[2], pos, neg) - (pos/neg).cc_grad[2]) <= 1E-15 @test div_cv_case(pos_hi, mix) == 5 @test div_cc_case(pos_hi, mix) == 5 @@ -243,6 +280,14 @@ div_cc_case(A::MC, B::MC) = div_cv_case(A.cv, A.cc, A.Intv.lo, A.Intv.hi, B.cv, @test isnan((pos_hi/mix).Intv.lo) @test isnan(eval_check(div_hi, pos_hi, mix)) @test 
isnan((pos_hi/mix).Intv.hi) + @test isnan(eval_check_grad(div_cvgrad[1], pos_hi, mix)) + @test isnan((pos_hi/mix).cv_grad[1]) + @test isnan(eval_check_grad(div_cvgrad[2], pos_hi, mix)) + @test isnan((pos_hi/mix).cv_grad[2]) + @test isnan(eval_check_grad(div_ccgrad[1], pos_hi, mix)) + @test isnan((pos_hi/mix).cc_grad[1]) + @test isnan(eval_check_grad(div_ccgrad[2], pos_hi, mix)) + @test isnan((pos_hi/mix).cc_grad[2]) @test div_cv_case(neg, pos_lo) == 6 @test div_cc_case(neg, pos_lo) == 6 @@ -250,6 +295,10 @@ div_cc_case(A::MC, B::MC) = div_cv_case(A.cv, A.cc, A.Intv.lo, A.Intv.hi, B.cv, @test abs(eval_check(div_cc, neg, pos_lo) - (neg/pos_lo).cc) <= 1E-15 @test abs(eval_check(div_lo, neg, pos_lo) - (neg/pos_lo).Intv.lo) <= 1E-15 @test abs(eval_check(div_hi, neg, pos_lo) - (neg/pos_lo).Intv.hi) <= 1E-15 + @test abs(eval_check_grad(div_cvgrad[1], neg, pos_lo) - (neg/pos_lo).cv_grad[1]) <= 1E-15 + @test abs(eval_check_grad(div_cvgrad[2], neg, pos_lo) - (neg/pos_lo).cv_grad[2]) <= 1E-15 + @test abs(eval_check_grad(div_ccgrad[1], neg, pos_lo) - (neg/pos_lo).cc_grad[1]) <= 1E-15 + @test abs(eval_check_grad(div_ccgrad[2], neg, pos_lo) - (neg/pos_lo).cc_grad[2]) <= 1E-15 @test div_cv_case(neg, pos_hi) == 7 @test div_cc_case(neg, pos_hi) == 7 @@ -257,6 +306,10 @@ div_cc_case(A::MC, B::MC) = div_cv_case(A.cv, A.cc, A.Intv.lo, A.Intv.hi, B.cv, @test abs(eval_check(div_cc, neg, pos_hi) - (neg/pos_hi).cc) <= 1E-15 @test abs(eval_check(div_lo, neg, pos_hi) - (neg/pos_hi).Intv.lo) <= 1E-15 @test abs(eval_check(div_hi, neg, pos_hi) - (neg/pos_hi).Intv.hi) <= 1E-15 + @test abs(eval_check_grad(div_cvgrad[1], neg, pos_hi) - (neg/pos_hi).cv_grad[1]) <= 1E-15 + @test abs(eval_check_grad(div_cvgrad[2], neg, pos_hi) - (neg/pos_hi).cv_grad[2]) <= 1E-15 + @test abs(eval_check_grad(div_ccgrad[1], neg, pos_hi) - (neg/pos_hi).cc_grad[1]) <= 1E-15 + @test abs(eval_check_grad(div_ccgrad[2], neg, pos_hi) - (neg/pos_hi).cc_grad[2]) <= 1E-15 @test div_cv_case(neg, neg_lo) == 8 @test div_cc_case(neg, neg_lo) == 8 @@ -264,6 +317,10 @@ div_cc_case(A::MC, B::MC) = div_cv_case(A.cv, A.cc, A.Intv.lo, A.Intv.hi, B.cv, @test abs(eval_check(div_cc, neg, neg_lo) - (neg/neg_lo).cc) <= 1E-15 @test abs(eval_check(div_lo, neg, neg_lo) - (neg/neg_lo).Intv.lo) <= 1E-15 @test abs(eval_check(div_hi, neg, neg_lo) - (neg/neg_lo).Intv.hi) <= 1E-15 + @test abs(eval_check_grad(div_cvgrad[1], neg, neg_lo) - (neg/neg_lo).cv_grad[1]) <= 1E-15 + @test abs(eval_check_grad(div_cvgrad[2], neg, neg_lo) - (neg/neg_lo).cv_grad[2]) <= 1E-15 + @test abs(eval_check_grad(div_ccgrad[1], neg, neg_lo) - (neg/neg_lo).cc_grad[1]) <= 1E-15 + @test abs(eval_check_grad(div_ccgrad[2], neg, neg_lo) - (neg/neg_lo).cc_grad[2]) <= 1E-15 @test div_cv_case(neg, neg_hi) == 9 @test div_cc_case(neg, neg_hi) == 9 @@ -271,6 +328,10 @@ div_cc_case(A::MC, B::MC) = div_cv_case(A.cv, A.cc, A.Intv.lo, A.Intv.hi, B.cv, @test abs(eval_check(div_cc, neg, neg_hi) - (neg/neg_hi).cc) <= 1E-15 @test abs(eval_check(div_lo, neg, neg_hi) - (neg/neg_hi).Intv.lo) <= 1E-15 @test abs(eval_check(div_hi, neg, neg_hi) - (neg/neg_hi).Intv.hi) <= 1E-15 + @test abs(eval_check_grad(div_cvgrad[1], neg, neg_hi) - (neg/neg_hi).cv_grad[1]) <= 1E-15 + @test abs(eval_check_grad(div_cvgrad[2], neg, neg_hi) - (neg/neg_hi).cv_grad[2]) <= 1E-15 + @test abs(eval_check_grad(div_ccgrad[1], neg, neg_hi) - (neg/neg_hi).cc_grad[1]) <= 1E-15 + @test abs(eval_check_grad(div_ccgrad[2], neg, neg_hi) - (neg/neg_hi).cc_grad[2]) <= 1E-15 @test div_cv_case(neg, mix) == 10 @test div_cc_case(neg, mix) == 10 @@ -282,6 +343,14 @@ 
div_cc_case(A::MC, B::MC) = div_cv_case(A.cv, A.cc, A.Intv.lo, A.Intv.hi, B.cv, @test isnan((neg/mix).Intv.lo) @test isnan(eval_check(div_hi, neg, mix)) @test isnan((neg/mix).Intv.hi) + @test isnan(eval_check_grad(div_cvgrad[1], neg, mix)) + @test isnan((neg/mix).cv_grad[1]) + @test isnan(eval_check_grad(div_cvgrad[2], neg, mix)) + @test isnan((neg/mix).cv_grad[2]) + @test isnan(eval_check_grad(div_ccgrad[1], neg, mix)) + @test isnan((neg/mix).cc_grad[1]) + @test isnan(eval_check_grad(div_ccgrad[2], neg, mix)) + @test isnan((neg/mix).cc_grad[2]) @test div_cv_case(mix, pos_lo) == 11 @test div_cc_case(mix, pos_lo) == 11 @@ -289,6 +358,10 @@ div_cc_case(A::MC, B::MC) = div_cv_case(A.cv, A.cc, A.Intv.lo, A.Intv.hi, B.cv, @test abs(eval_check(div_cc, mix, pos_lo) - (mix/pos_lo).cc) <= 1E-15 @test abs(eval_check(div_lo, mix, pos_lo) - (mix/pos_lo).Intv.lo) <= 1E-15 @test abs(eval_check(div_hi, mix, pos_lo) - (mix/pos_lo).Intv.hi) <= 1E-15 + @test abs(eval_check_grad(div_cvgrad[1], mix, pos_lo) - (mix/pos_lo).cv_grad[1]) <= 1E-15 + @test abs(eval_check_grad(div_cvgrad[2], mix, pos_lo) - (mix/pos_lo).cv_grad[2]) <= 1E-15 + @test abs(eval_check_grad(div_ccgrad[1], mix, pos_lo) - (mix/pos_lo).cc_grad[1]) <= 1E-15 + @test abs(eval_check_grad(div_ccgrad[2], mix, pos_lo) - (mix/pos_lo).cc_grad[2]) <= 1E-15 @test div_cv_case(mix, pos) == 12 @test div_cc_case(mix, pos) == 12 @@ -296,6 +369,10 @@ div_cc_case(A::MC, B::MC) = div_cv_case(A.cv, A.cc, A.Intv.lo, A.Intv.hi, B.cv, @test abs(eval_check(div_cc, mix, pos) - (mix/pos).cc) <= 1E-15 @test abs(eval_check(div_lo, mix, pos) - (mix/pos).Intv.lo) <= 1E-15 @test abs(eval_check(div_hi, mix, pos) - (mix/pos).Intv.hi) <= 1E-15 + @test abs(eval_check_grad(div_cvgrad[1], mix, pos) - (mix/pos).cv_grad[1]) <= 1E-15 + @test abs(eval_check_grad(div_cvgrad[2], mix, pos) - (mix/pos).cv_grad[2]) <= 1E-15 + @test abs(eval_check_grad(div_ccgrad[1], mix, pos) - (mix/pos).cc_grad[1]) <= 1E-15 + @test abs(eval_check_grad(div_ccgrad[2], mix, pos) - (mix/pos).cc_grad[2]) <= 1E-15 @test div_cv_case(mix, neg) == 13 @test div_cc_case(mix, neg) == 13 @@ -303,6 +380,10 @@ div_cc_case(A::MC, B::MC) = div_cv_case(A.cv, A.cc, A.Intv.lo, A.Intv.hi, B.cv, @test abs(eval_check(div_cc, mix, neg) - (mix/neg).cc) <= 1E-15 @test abs(eval_check(div_lo, mix, neg) - (mix/neg).Intv.lo) <= 1E-15 @test abs(eval_check(div_hi, mix, neg) - (mix/neg).Intv.hi) <= 1E-15 + @test abs(eval_check_grad(div_cvgrad[1], mix, neg) - (mix/neg).cv_grad[1]) <= 1E-15 + @test abs(eval_check_grad(div_cvgrad[2], mix, neg) - (mix/neg).cv_grad[2]) <= 1E-15 + @test abs(eval_check_grad(div_ccgrad[1], mix, neg) - (mix/neg).cc_grad[1]) <= 1E-15 + @test abs(eval_check_grad(div_ccgrad[2], mix, neg) - (mix/neg).cc_grad[2]) <= 1E-15 @test div_cv_case(mix, neg_lo) == 14 @test div_cc_case(mix, neg_lo) == 14 @@ -310,6 +391,10 @@ div_cc_case(A::MC, B::MC) = div_cv_case(A.cv, A.cc, A.Intv.lo, A.Intv.hi, B.cv, @test abs(eval_check(div_cc, mix, neg_lo) - (mix/neg_lo).cc) <= 1E-15 @test abs(eval_check(div_lo, mix, neg_lo) - (mix/neg_lo).Intv.lo) <= 1E-15 @test abs(eval_check(div_hi, mix, neg_lo) - (mix/neg_lo).Intv.hi) <= 1E-15 + @test abs(eval_check_grad(div_cvgrad[1], mix, neg_lo) - (mix/neg_lo).cv_grad[1]) <= 1E-15 + @test abs(eval_check_grad(div_cvgrad[2], mix, neg_lo) - (mix/neg_lo).cv_grad[2]) <= 1E-15 + @test abs(eval_check_grad(div_ccgrad[1], mix, neg_lo) - (mix/neg_lo).cc_grad[1]) <= 1E-15 + @test abs(eval_check_grad(div_ccgrad[2], mix, neg_lo) - (mix/neg_lo).cc_grad[2]) <= 1E-15 @test div_cv_case(mix, mix) == 15 @test 
div_cc_case(mix, mix) == 15 @@ -321,4 +406,12 @@ div_cc_case(A::MC, B::MC) = div_cv_case(A.cv, A.cc, A.Intv.lo, A.Intv.hi, B.cv, @test isnan((mix/mix).Intv.lo) @test isnan(eval_check(div_hi, mix, mix)) @test isnan((mix/mix).Intv.hi) + @test isnan(eval_check_grad(div_cvgrad[1], mix, mix)) + @test isnan((mix/mix).cv_grad[1]) + @test isnan(eval_check_grad(div_cvgrad[2], mix, mix)) + @test isnan((mix/mix).cv_grad[2]) + @test isnan(eval_check_grad(div_ccgrad[1], mix, mix)) + @test isnan((mix/mix).cc_grad[1]) + @test isnan(eval_check_grad(div_ccgrad[2], mix, mix)) + @test isnan((mix/mix).cc_grad[2]) end \ No newline at end of file diff --git a/test/exp.jl b/test/exp.jl index 2915208..cfa3b5a 100644 --- a/test/exp.jl +++ b/test/exp.jl @@ -4,6 +4,7 @@ to_compute = exp(x) exp_cv, exp_cc, exp_lo, exp_hi, exp_order = all_evaluators(to_compute) + exp_cvgrad, exp_ccgrad, exp_order_grad = all_subgradients(to_compute, expand=true) # Check positive/negative/mixed cases, as well as some unique cases where values are the same pos = MC{1,NS}(1.0, 1.5, Interval(0.5, 2.0), SVector{1, Float64}(1.0), SVector{1, Float64}(3.0), false) @@ -17,29 +18,41 @@ @test abs(eval_check(exp_cc, pos) - exp(pos).cc) <= 1E-15 @test abs(eval_check(exp_lo, pos) - exp(pos).Intv.lo) <= 1E-15 @test abs(eval_check(exp_hi, pos) - exp(pos).Intv.hi) <= 1E-15 + @test abs(eval_check_grad(exp_cvgrad[1], pos) - exp(pos).cv_grad[1]) <= 1E-15 + @test abs(eval_check_grad(exp_ccgrad[1], pos) - exp(pos).cc_grad[1]) <= 1E-15 @test abs(eval_check(exp_cv, mix) - exp(mix).cv) <= 1E-15 @test abs(eval_check(exp_cc, mix) - exp(mix).cc) <= 1E-15 @test abs(eval_check(exp_lo, mix) - exp(mix).Intv.lo) <= 1E-15 @test abs(eval_check(exp_hi, mix) - exp(mix).Intv.hi) <= 1E-15 - + @test abs(eval_check_grad(exp_cvgrad[1], mix) - exp(mix).cv_grad[1]) <= 1E-15 + @test abs(eval_check_grad(exp_ccgrad[1], mix) - exp(mix).cc_grad[1]) <= 1E-15 + @test abs(eval_check(exp_cv, neg) - exp(neg).cv) <= 1E-15 @test abs(eval_check(exp_cc, neg) - exp(neg).cc) <= 1E-15 @test abs(eval_check(exp_lo, neg) - exp(neg).Intv.lo) <= 1E-15 @test abs(eval_check(exp_hi, neg) - exp(neg).Intv.hi) <= 1E-15 + @test abs(eval_check_grad(exp_cvgrad[1], neg) - exp(neg).cv_grad[1]) <= 1E-15 + @test abs(eval_check_grad(exp_ccgrad[1], neg) - exp(neg).cc_grad[1]) <= 1E-15 @test abs(eval_check(exp_cv, pos_same1) - exp(pos_same1).cv) <= 1E-15 @test abs(eval_check(exp_cc, pos_same1) - exp(pos_same1).cc) <= 1E-15 @test abs(eval_check(exp_lo, pos_same1) - exp(pos_same1).Intv.lo) <= 1E-15 @test abs(eval_check(exp_hi, pos_same1) - exp(pos_same1).Intv.hi) <= 1E-15 + @test abs(eval_check_grad(exp_cvgrad[1], pos_same1) - exp(pos_same1).cv_grad[1]) <= 1E-15 + @test abs(eval_check_grad(exp_ccgrad[1], pos_same1) - exp(pos_same1).cc_grad[1]) <= 1E-15 @test abs(eval_check(exp_cv, pos_same2) - exp(pos_same2).cv) <= 1E-15 @test abs(eval_check(exp_cc, pos_same2) - exp(pos_same2).cc) <= 1E-15 @test abs(eval_check(exp_lo, pos_same2) - exp(pos_same2).Intv.lo) <= 1E-15 @test abs(eval_check(exp_hi, pos_same2) - exp(pos_same2).Intv.hi) <= 1E-15 + @test abs(eval_check_grad(exp_cvgrad[1], pos_same2) - exp(pos_same2).cv_grad[1]) <= 1E-15 + @test abs(eval_check_grad(exp_ccgrad[1], pos_same2) - exp(pos_same2).cc_grad[1]) <= 1E-15 @test abs(eval_check(exp_cv, pos_same3) - exp(pos_same3).cv) <= 1E-15 @test abs(eval_check(exp_cc, pos_same3) - exp(pos_same3).cc) <= 1E-15 @test abs(eval_check(exp_lo, pos_same3) - exp(pos_same3).Intv.lo) <= 1E-15 @test abs(eval_check(exp_hi, pos_same3) - exp(pos_same3).Intv.hi) <= 1E-15 + @test 
abs(eval_check_grad(exp_cvgrad[1], pos_same3) - exp(pos_same3).cv_grad[1]) <= 1E-15 + @test abs(eval_check_grad(exp_ccgrad[1], pos_same3) - exp(pos_same3).cc_grad[1]) <= 1E-15 end diff --git a/test/multiplication.jl b/test/multiplication.jl index 344a1d5..b40a878 100644 --- a/test/multiplication.jl +++ b/test/multiplication.jl @@ -128,6 +128,14 @@ end mult_cv_case(A::MC, B::MC) = mult_cv_case(A.cv, A.cc, A.Intv.lo, A.Intv.hi, B.cv, B.cc, B.Intv.lo, B.Intv.hi) mult_cc_case(A::MC, B::MC) = mult_cv_case(A.cv, A.cc, A.Intv.lo, A.Intv.hi, B.cv, B.cc, B.Intv.lo, B.Intv.hi) +function eval_check_grad_mult(eval_func, MC1, MC2) + return eval_func(MC1.cv, MC1.cc, MC1.Intv.lo, MC1.Intv.hi, MC1.cv_grad[1], MC1.cv_grad[2], MC1.cc_grad[1], MC1.cc_grad[2], + MC2.cv, MC2.cc, MC2.Intv.lo, MC2.Intv.hi, MC2.cv_grad[1], MC2.cv_grad[2], MC2.cc_grad[1], MC2.cc_grad[2]) +end +function eval_check_grad_mult(eval_func, MC1) + return eval_func(MC1.Intv.lo, MC1.Intv.hi, MC1.cv_grad[1], MC1.cc_grad[1]) +end + # For multiplication, need to test all cases with MC*MC, then several cases with Real types @testset "Multiplication" begin @variables x, y @@ -139,45 +147,59 @@ mult_cc_case(A::MC, B::MC) = mult_cv_case(A.cv, A.cc, A.Intv.lo, A.Intv.hi, B.cv to_compute = 5.0*y posreal_mult_cv, posreal_mult_cc, posreal_mult_lo, posreal_mult_hi, posreal_order = all_evaluators(to_compute) + posreal_mult_cvgrad, posreal_mult_ccgrad, posreal_order_grad = all_subgradients(to_compute, expand=true) to_compute = -5.0*y negreal_mult_cv, negreal_mult_cc, negreal_mult_lo, negreal_mult_hi, negreal_order = all_evaluators(to_compute) + negreal_mult_cvgrad, negreal_mult_ccgrad, negreal_order_grad = all_subgradients(to_compute, expand=true) # pos*pos @test abs(eval_check(posreal_mult_cv, pos) - (5.0*pos).cv) <= 1E-15 @test abs(eval_check(posreal_mult_cc, pos) - (5.0*pos).cc) <= 1E-15 @test abs(eval_check(posreal_mult_lo, pos) - (5.0*pos).Intv.lo) <= 1E-15 @test abs(eval_check(posreal_mult_hi, pos) - (5.0*pos).Intv.hi) <= 1E-15 + @test abs(eval_check_grad_mult(posreal_mult_cvgrad[1], pos) - (5.0*pos).cv_grad[1]) <= 1E-15 + @test abs(eval_check_grad_mult(posreal_mult_ccgrad[1], pos) - (5.0*pos).cc_grad[1]) <= 1E-15 # pos*mix @test abs(eval_check(posreal_mult_cv, mix) - (5.0*mix).cv) <= 1E-15 @test abs(eval_check(posreal_mult_cc, mix) - (5.0*mix).cc) <= 1E-15 @test abs(eval_check(posreal_mult_lo, mix) - (5.0*mix).Intv.lo) <= 1E-15 @test abs(eval_check(posreal_mult_hi, mix) - (5.0*mix).Intv.hi) <= 1E-15 + @test abs(eval_check_grad_mult(posreal_mult_cvgrad[1], mix) - (5.0*mix).cv_grad[1]) <= 1E-15 + @test abs(eval_check_grad_mult(posreal_mult_ccgrad[1], mix) - (5.0*mix).cc_grad[1]) <= 1E-15 # pos*neg @test abs(eval_check(posreal_mult_cv, neg) - (5.0*neg).cv) <= 1E-15 @test abs(eval_check(posreal_mult_cc, neg) - (5.0*neg).cc) <= 1E-15 @test abs(eval_check(posreal_mult_lo, neg) - (5.0*neg).Intv.lo) <= 1E-15 @test abs(eval_check(posreal_mult_hi, neg) - (5.0*neg).Intv.hi) <= 1E-15 + @test abs(eval_check_grad_mult(posreal_mult_cvgrad[1], neg) - (5.0*neg).cv_grad[1]) <= 1E-15 + @test abs(eval_check_grad_mult(posreal_mult_ccgrad[1], neg) - (5.0*neg).cc_grad[1]) <= 1E-15 # neg*pos @test abs(eval_check(negreal_mult_cv, pos) - (-5.0*pos).cv) <= 1E-15 @test abs(eval_check(negreal_mult_cc, pos) - (-5.0*pos).cc) <= 1E-15 @test abs(eval_check(negreal_mult_lo, pos) - (-5.0*pos).Intv.lo) <= 1E-15 @test abs(eval_check(negreal_mult_hi, pos) - (-5.0*pos).Intv.hi) <= 1E-15 + @test abs(eval_check_grad_mult(negreal_mult_cvgrad[1], pos) - (-5.0*pos).cv_grad[1]) <= 1E-15 + 
@test abs(eval_check_grad_mult(negreal_mult_ccgrad[1], pos) - (-5.0*pos).cc_grad[1]) <= 1E-15

     # neg*mix
     @test abs(eval_check(negreal_mult_cv, mix) - (-5.0*mix).cv) <= 1E-15
     @test abs(eval_check(negreal_mult_cc, mix) - (-5.0*mix).cc) <= 1E-15
     @test abs(eval_check(negreal_mult_lo, mix) - (-5.0*mix).Intv.lo) <= 1E-15
     @test abs(eval_check(negreal_mult_hi, mix) - (-5.0*mix).Intv.hi) <= 1E-15
+    @test abs(eval_check_grad_mult(negreal_mult_cvgrad[1], mix) - (-5.0*mix).cv_grad[1]) <= 1E-15
+    @test abs(eval_check_grad_mult(negreal_mult_ccgrad[1], mix) - (-5.0*mix).cc_grad[1]) <= 1E-15

     # neg*neg
     @test abs(eval_check(negreal_mult_cv, neg) - (-5.0*neg).cv) <= 1E-15
     @test abs(eval_check(negreal_mult_cc, neg) - (-5.0*neg).cc) <= 1E-15
     @test abs(eval_check(negreal_mult_lo, neg) - (-5.0*neg).Intv.lo) <= 1E-15
     @test abs(eval_check(negreal_mult_hi, neg) - (-5.0*neg).Intv.hi) <= 1E-15
+    @test abs(eval_check_grad_mult(negreal_mult_cvgrad[1], neg) - (-5.0*neg).cv_grad[1]) <= 1E-15
+    @test abs(eval_check_grad_mult(negreal_mult_ccgrad[1], neg) - (-5.0*neg).cc_grad[1]) <= 1E-15

     # McCormick object times a McCormick object. Verify that cases are satisfied before
@@ -193,6 +215,7 @@ mult_cc_case(A::MC, B::MC) = mult_cv_case(A.cv, A.cc, A.Intv.lo, A.Intv.hi, B.cv
     @variables x, y
     to_compute = x*y
     mult_cv, mult_cc, mult_lo, mult_hi, order = all_evaluators(to_compute)
+    mult_cvgrad, mult_ccgrad, order_grad = all_subgradients(to_compute, expand=true)

     @test mult_cv_case(pos, pos_hi) == 1
     @test mult_cc_case(pos, pos_hi) == 1
@@ -200,6 +223,10 @@ mult_cc_case(A::MC, B::MC) = mult_cv_case(A.cv, A.cc, A.Intv.lo, A.Intv.hi, B.cv
     @test abs(eval_check(mult_cc, pos, pos_hi) - (pos*pos_hi).cc) <= 1E-15
     @test abs(eval_check(mult_lo, pos, pos_hi) - (pos*pos_hi).Intv.lo) <= 1E-15
     @test abs(eval_check(mult_hi, pos, pos_hi) - (pos*pos_hi).Intv.hi) <= 1E-15
+    @test abs(eval_check_grad_mult(mult_cvgrad[1], pos, pos_hi) - (pos*pos_hi).cv_grad[1]) <= 1E-15
+    @test abs(eval_check_grad_mult(mult_cvgrad[2], pos, pos_hi) - (pos*pos_hi).cv_grad[2]) <= 1E-15
+    @test abs(eval_check_grad_mult(mult_ccgrad[1], pos, pos_hi) - (pos*pos_hi).cc_grad[1]) <= 1E-15
+    @test abs(eval_check_grad_mult(mult_ccgrad[2], pos, pos_hi) - (pos*pos_hi).cc_grad[2]) <= 1E-15

     @test mult_cv_case(pos, pos) == 2
     @test mult_cc_case(pos, pos) == 2
@@ -207,6 +234,10 @@ mult_cc_case(A::MC, B::MC) = mult_cv_case(A.cv, A.cc, A.Intv.lo, A.Intv.hi, B.cv
     @test abs(eval_check(mult_cc, pos, pos) - (pos*pos).cc) <= 1E-15
     @test abs(eval_check(mult_lo, pos, pos) - (pos*pos).Intv.lo) <= 1E-15
     @test abs(eval_check(mult_hi, pos, pos) - (pos*pos).Intv.hi) <= 1E-15
+    @test abs(eval_check_grad_mult(mult_cvgrad[1], pos, pos) - (pos*pos).cv_grad[1]) <= 1E-15
+    @test abs(eval_check_grad_mult(mult_cvgrad[2], pos, pos) - (pos*pos).cv_grad[2]) <= 1E-15
+    @test abs(eval_check_grad_mult(mult_ccgrad[1], pos, pos) - (pos*pos).cc_grad[1]) <= 1E-15
+    @test abs(eval_check_grad_mult(mult_ccgrad[2], pos, pos) - (pos*pos).cc_grad[2]) <= 1E-15

     @test mult_cv_case(pos, neg_hi) == 3
     @test mult_cc_case(pos, neg_hi) == 3
@@ -214,6 +245,10 @@ mult_cc_case(A::MC, B::MC) = mult_cv_case(A.cv, A.cc, A.Intv.lo, A.Intv.hi, B.cv
     @test abs(eval_check(mult_cc, pos, neg_hi) - (pos*neg_hi).cc) <= 1E-15
     @test abs(eval_check(mult_lo, pos, neg_hi) - (pos*neg_hi).Intv.lo) <= 1E-15
     @test abs(eval_check(mult_hi, pos, neg_hi) - (pos*neg_hi).Intv.hi) <= 1E-15
+    @test abs(eval_check_grad_mult(mult_cvgrad[1], pos, neg_hi) - (pos*neg_hi).cv_grad[1]) <= 1E-15
+    @test abs(eval_check_grad_mult(mult_cvgrad[2], pos, neg_hi) - (pos*neg_hi).cv_grad[2]) <= 1E-15
+    @test abs(eval_check_grad_mult(mult_ccgrad[1], pos, neg_hi) - (pos*neg_hi).cc_grad[1]) <= 1E-15
+    @test abs(eval_check_grad_mult(mult_ccgrad[2], pos, neg_hi) - (pos*neg_hi).cc_grad[2]) <= 1E-15

     @test mult_cv_case(pos, neg) == 4
     @test mult_cc_case(pos, neg) == 4
@@ -221,6 +256,10 @@ mult_cc_case(A::MC, B::MC) = mult_cv_case(A.cv, A.cc, A.Intv.lo, A.Intv.hi, B.cv
     @test abs(eval_check(mult_cc, pos, neg) - (pos*neg).cc) <= 1E-15
     @test abs(eval_check(mult_lo, pos, neg) - (pos*neg).Intv.lo) <= 1E-15
     @test abs(eval_check(mult_hi, pos, neg) - (pos*neg).Intv.hi) <= 1E-15
+    @test abs(eval_check_grad_mult(mult_cvgrad[1], pos, neg) - (pos*neg).cv_grad[1]) <= 1E-15
+    @test abs(eval_check_grad_mult(mult_cvgrad[2], pos, neg) - (pos*neg).cv_grad[2]) <= 1E-15
+    @test abs(eval_check_grad_mult(mult_ccgrad[1], pos, neg) - (pos*neg).cc_grad[1]) <= 1E-15
+    @test abs(eval_check_grad_mult(mult_ccgrad[2], pos, neg) - (pos*neg).cc_grad[2]) <= 1E-15

     @test mult_cv_case(pos_hi, mix) == 5
     @test mult_cc_case(pos_hi, mix) == 5
@@ -228,6 +267,10 @@ mult_cc_case(A::MC, B::MC) = mult_cv_case(A.cv, A.cc, A.Intv.lo, A.Intv.hi, B.cv
     @test abs(eval_check(mult_cc, pos_hi, mix) - (pos_hi*mix).cc) <= 1E-15
     @test abs(eval_check(mult_lo, pos_hi, mix) - (pos_hi*mix).Intv.lo) <= 1E-15
     @test abs(eval_check(mult_hi, pos_hi, mix) - (pos_hi*mix).Intv.hi) <= 1E-15
+    @test abs(eval_check_grad_mult(mult_cvgrad[1], pos_hi, mix) - (pos_hi*mix).cv_grad[1]) <= 1E-15
+    @test abs(eval_check_grad_mult(mult_cvgrad[2], pos_hi, mix) - (pos_hi*mix).cv_grad[2]) <= 1E-15
+    @test abs(eval_check_grad_mult(mult_ccgrad[1], pos_hi, mix) - (pos_hi*mix).cc_grad[1]) <= 1E-15
+    @test abs(eval_check_grad_mult(mult_ccgrad[2], pos_hi, mix) - (pos_hi*mix).cc_grad[2]) <= 1E-15

     @test mult_cv_case(pos, mix) == 6
     @test mult_cc_case(pos, mix) == 6
@@ -235,6 +278,10 @@ mult_cc_case(A::MC, B::MC) = mult_cv_case(A.cv, A.cc, A.Intv.lo, A.Intv.hi, B.cv
     @test abs(eval_check(mult_cc, pos, mix) - (pos*mix).cc) <= 1E-15
     @test abs(eval_check(mult_lo, pos, mix) - (pos*mix).Intv.lo) <= 1E-15
     @test abs(eval_check(mult_hi, pos, mix) - (pos*mix).Intv.hi) <= 1E-15
+    @test abs(eval_check_grad_mult(mult_cvgrad[1], pos, mix) - (pos*mix).cv_grad[1]) <= 1E-15
+    @test abs(eval_check_grad_mult(mult_cvgrad[2], pos, mix) - (pos*mix).cv_grad[2]) <= 1E-15
+    @test abs(eval_check_grad_mult(mult_ccgrad[1], pos, mix) - (pos*mix).cc_grad[1]) <= 1E-15
+    @test abs(eval_check_grad_mult(mult_ccgrad[2], pos, mix) - (pos*mix).cc_grad[2]) <= 1E-15

     @test mult_cv_case(neg, pos_hi) == 7
     @test mult_cc_case(neg, pos_hi) == 7
@@ -242,6 +289,10 @@ mult_cc_case(A::MC, B::MC) = mult_cv_case(A.cv, A.cc, A.Intv.lo, A.Intv.hi, B.cv
     @test abs(eval_check(mult_cc, neg, pos_hi) - (neg*pos_hi).cc) <= 1E-15
     @test abs(eval_check(mult_lo, neg, pos_hi) - (neg*pos_hi).Intv.lo) <= 1E-15
     @test abs(eval_check(mult_hi, neg, pos_hi) - (neg*pos_hi).Intv.hi) <= 1E-15
+    @test abs(eval_check_grad_mult(mult_cvgrad[1], neg, pos_hi) - (neg*pos_hi).cv_grad[1]) <= 1E-15
+    @test abs(eval_check_grad_mult(mult_cvgrad[2], neg, pos_hi) - (neg*pos_hi).cv_grad[2]) <= 1E-15
+    @test abs(eval_check_grad_mult(mult_ccgrad[1], neg, pos_hi) - (neg*pos_hi).cc_grad[1]) <= 1E-15
+    @test abs(eval_check_grad_mult(mult_ccgrad[2], neg, pos_hi) - (neg*pos_hi).cc_grad[2]) <= 1E-15

     @test mult_cv_case(neg, pos) == 8
     @test mult_cc_case(neg, pos) == 8
@@ -249,6 +300,10 @@ mult_cc_case(A::MC, B::MC) = mult_cv_case(A.cv, A.cc, A.Intv.lo, A.Intv.hi, B.cv
     @test abs(eval_check(mult_cc, neg, pos) - (neg*pos).cc) <= 1E-15
     @test abs(eval_check(mult_lo, neg, pos) - (neg*pos).Intv.lo) <= 1E-15
     @test abs(eval_check(mult_hi, neg, pos) - (neg*pos).Intv.hi) <= 1E-15
+    @test abs(eval_check_grad_mult(mult_cvgrad[1], neg, pos) - (neg*pos).cv_grad[1]) <= 1E-15
+    @test abs(eval_check_grad_mult(mult_cvgrad[2], neg, pos) - (neg*pos).cv_grad[2]) <= 1E-15
+    @test abs(eval_check_grad_mult(mult_ccgrad[1], neg, pos) - (neg*pos).cc_grad[1]) <= 1E-15
+    @test abs(eval_check_grad_mult(mult_ccgrad[2], neg, pos) - (neg*pos).cc_grad[2]) <= 1E-15

     @test mult_cv_case(neg, neg_hi) == 9
     @test mult_cc_case(neg, neg_hi) == 9
@@ -256,6 +311,10 @@ mult_cc_case(A::MC, B::MC) = mult_cv_case(A.cv, A.cc, A.Intv.lo, A.Intv.hi, B.cv
     @test abs(eval_check(mult_cc, neg, neg_hi) - (neg*neg_hi).cc) <= 1E-15
     @test abs(eval_check(mult_lo, neg, neg_hi) - (neg*neg_hi).Intv.lo) <= 1E-15
     @test abs(eval_check(mult_hi, neg, neg_hi) - (neg*neg_hi).Intv.hi) <= 1E-15
+    @test abs(eval_check_grad_mult(mult_cvgrad[1], neg, neg_hi) - (neg*neg_hi).cv_grad[1]) <= 1E-15
+    @test abs(eval_check_grad_mult(mult_cvgrad[2], neg, neg_hi) - (neg*neg_hi).cv_grad[2]) <= 1E-15
+    @test abs(eval_check_grad_mult(mult_ccgrad[1], neg, neg_hi) - (neg*neg_hi).cc_grad[1]) <= 1E-15
+    @test abs(eval_check_grad_mult(mult_ccgrad[2], neg, neg_hi) - (neg*neg_hi).cc_grad[2]) <= 1E-15

     @test mult_cv_case(neg, neg) == 10
     @test mult_cc_case(neg, neg) == 10
@@ -263,6 +322,10 @@ mult_cc_case(A::MC, B::MC) = mult_cv_case(A.cv, A.cc, A.Intv.lo, A.Intv.hi, B.cv
     @test abs(eval_check(mult_cc, neg, neg) - (neg*neg).cc) <= 1E-15
     @test abs(eval_check(mult_lo, neg, neg) - (neg*neg).Intv.lo) <= 1E-15
     @test abs(eval_check(mult_hi, neg, neg) - (neg*neg).Intv.hi) <= 1E-15
+    @test abs(eval_check_grad_mult(mult_cvgrad[1], neg, neg) - (neg*neg).cv_grad[1]) <= 1E-15
+    @test abs(eval_check_grad_mult(mult_cvgrad[2], neg, neg) - (neg*neg).cv_grad[2]) <= 1E-15
+    @test abs(eval_check_grad_mult(mult_ccgrad[1], neg, neg) - (neg*neg).cc_grad[1]) <= 1E-15
+    @test abs(eval_check_grad_mult(mult_ccgrad[2], neg, neg) - (neg*neg).cc_grad[2]) <= 1E-15

     @test mult_cv_case(neg, mix) == 11
     @test mult_cc_case(neg, mix) == 11
@@ -270,6 +333,10 @@ mult_cc_case(A::MC, B::MC) = mult_cv_case(A.cv, A.cc, A.Intv.lo, A.Intv.hi, B.cv
     @test abs(eval_check(mult_cc, neg, mix) - (neg*mix).cc) <= 1E-15
     @test abs(eval_check(mult_lo, neg, mix) - (neg*mix).Intv.lo) <= 1E-15
     @test abs(eval_check(mult_hi, neg, mix) - (neg*mix).Intv.hi) <= 1E-15
+    @test abs(eval_check_grad_mult(mult_cvgrad[1], neg, mix) - (neg*mix).cv_grad[1]) <= 1E-15
+    @test abs(eval_check_grad_mult(mult_cvgrad[2], neg, mix) - (neg*mix).cv_grad[2]) <= 1E-15
+    @test abs(eval_check_grad_mult(mult_ccgrad[1], neg, mix) - (neg*mix).cc_grad[1]) <= 1E-15
+    @test abs(eval_check_grad_mult(mult_ccgrad[2], neg, mix) - (neg*mix).cc_grad[2]) <= 1E-15

     @test mult_cv_case(neg_hi, mix) == 12
     @test mult_cc_case(neg_hi, mix) == 12
@@ -277,6 +344,10 @@ mult_cc_case(A::MC, B::MC) = mult_cv_case(A.cv, A.cc, A.Intv.lo, A.Intv.hi, B.cv
     @test abs(eval_check(mult_cc, neg_hi, mix) - (neg_hi*mix).cc) <= 1E-15
     @test abs(eval_check(mult_lo, neg_hi, mix) - (neg_hi*mix).Intv.lo) <= 1E-15
     @test abs(eval_check(mult_hi, neg_hi, mix) - (neg_hi*mix).Intv.hi) <= 1E-15
+    @test abs(eval_check_grad_mult(mult_cvgrad[1], neg_hi, mix) - (neg_hi*mix).cv_grad[1]) <= 1E-15
+    @test abs(eval_check_grad_mult(mult_cvgrad[2], neg_hi, mix) - (neg_hi*mix).cv_grad[2]) <= 1E-15
+    @test abs(eval_check_grad_mult(mult_ccgrad[1], neg_hi, mix) - (neg_hi*mix).cc_grad[1]) <= 1E-15
+    @test abs(eval_check_grad_mult(mult_ccgrad[2], neg_hi, mix) - (neg_hi*mix).cc_grad[2]) <= 1E-15

     @test mult_cv_case(mix, pos_hi) == 13
     @test mult_cc_case(mix, pos_hi) == 13
@@ -284,6 +355,10 @@ mult_cc_case(A::MC, B::MC) = mult_cv_case(A.cv, A.cc, A.Intv.lo, A.Intv.hi, B.cv
     @test abs(eval_check(mult_cc, mix, pos_hi) - (mix*pos_hi).cc) <= 1E-15
     @test abs(eval_check(mult_lo, mix, pos_hi) - (mix*pos_hi).Intv.lo) <= 1E-15
     @test abs(eval_check(mult_hi, mix, pos_hi) - (mix*pos_hi).Intv.hi) <= 1E-15
+    @test abs(eval_check_grad_mult(mult_cvgrad[1], mix, pos_hi) - (mix*pos_hi).cv_grad[1]) <= 1E-15
+    @test abs(eval_check_grad_mult(mult_cvgrad[2], mix, pos_hi) - (mix*pos_hi).cv_grad[2]) <= 1E-15
+    @test abs(eval_check_grad_mult(mult_ccgrad[1], mix, pos_hi) - (mix*pos_hi).cc_grad[1]) <= 1E-15
+    @test abs(eval_check_grad_mult(mult_ccgrad[2], mix, pos_hi) - (mix*pos_hi).cc_grad[2]) <= 1E-15

     @test mult_cv_case(mix, pos) == 14
     @test mult_cc_case(mix, pos) == 14
@@ -291,6 +366,10 @@ mult_cc_case(A::MC, B::MC) = mult_cv_case(A.cv, A.cc, A.Intv.lo, A.Intv.hi, B.cv
     @test abs(eval_check(mult_cc, mix, pos) - (mix*pos).cc) <= 1E-15
     @test abs(eval_check(mult_lo, mix, pos) - (mix*pos).Intv.lo) <= 1E-15
     @test abs(eval_check(mult_hi, mix, pos) - (mix*pos).Intv.hi) <= 1E-15
+    @test abs(eval_check_grad_mult(mult_cvgrad[1], mix, pos) - (mix*pos).cv_grad[1]) <= 1E-15
+    @test abs(eval_check_grad_mult(mult_cvgrad[2], mix, pos) - (mix*pos).cv_grad[2]) <= 1E-15
+    @test abs(eval_check_grad_mult(mult_ccgrad[1], mix, pos) - (mix*pos).cc_grad[1]) <= 1E-15
+    @test abs(eval_check_grad_mult(mult_ccgrad[2], mix, pos) - (mix*pos).cc_grad[2]) <= 1E-15

     @test mult_cv_case(mix, neg) == 15
     @test mult_cc_case(mix, neg) == 15
@@ -298,6 +377,10 @@ mult_cc_case(A::MC, B::MC) = mult_cv_case(A.cv, A.cc, A.Intv.lo, A.Intv.hi, B.cv
     @test abs(eval_check(mult_cc, mix, neg) - (mix*neg).cc) <= 1E-15
     @test abs(eval_check(mult_lo, mix, neg) - (mix*neg).Intv.lo) <= 1E-15
     @test abs(eval_check(mult_hi, mix, neg) - (mix*neg).Intv.hi) <= 1E-15
+    @test abs(eval_check_grad_mult(mult_cvgrad[1], mix, neg) - (mix*neg).cv_grad[1]) <= 1E-15
+    @test abs(eval_check_grad_mult(mult_cvgrad[2], mix, neg) - (mix*neg).cv_grad[2]) <= 1E-15
+    @test abs(eval_check_grad_mult(mult_ccgrad[1], mix, neg) - (mix*neg).cc_grad[1]) <= 1E-15
+    @test abs(eval_check_grad_mult(mult_ccgrad[2], mix, neg) - (mix*neg).cc_grad[2]) <= 1E-15

     @test mult_cv_case(mix, neg_hi) == 16
     @test mult_cc_case(mix, neg_hi) == 16
@@ -305,6 +388,10 @@ mult_cc_case(A::MC, B::MC) = mult_cv_case(A.cv, A.cc, A.Intv.lo, A.Intv.hi, B.cv
     @test abs(eval_check(mult_cc, mix, neg_hi) - (mix*neg_hi).cc) <= 1E-15
     @test abs(eval_check(mult_lo, mix, neg_hi) - (mix*neg_hi).Intv.lo) <= 1E-15
     @test abs(eval_check(mult_hi, mix, neg_hi) - (mix*neg_hi).Intv.hi) <= 1E-15
+    @test abs(eval_check_grad_mult(mult_cvgrad[1], mix, neg_hi) - (mix*neg_hi).cv_grad[1]) <= 1E-15
+    @test abs(eval_check_grad_mult(mult_cvgrad[2], mix, neg_hi) - (mix*neg_hi).cv_grad[2]) <= 1E-15
+    @test abs(eval_check_grad_mult(mult_ccgrad[1], mix, neg_hi) - (mix*neg_hi).cc_grad[1]) <= 1E-15
+    @test abs(eval_check_grad_mult(mult_ccgrad[2], mix, neg_hi) - (mix*neg_hi).cc_grad[2]) <= 1E-15

     @test mult_cv_case(mix, mix_lo) == 17
     @test mult_cc_case(mix, mix_lo) == 17
@@ -312,6 +399,10 @@ mult_cc_case(A::MC, B::MC) = mult_cv_case(A.cv, A.cc, A.Intv.lo, A.Intv.hi, B.cv
     @test abs(eval_check(mult_cc, mix, mix_lo) - (mix*mix_lo).cc) <= 1E-15
     @test abs(eval_check(mult_lo, mix, mix_lo) - (mix*mix_lo).Intv.lo) <= 1E-15
     @test abs(eval_check(mult_hi, mix, mix_lo) - (mix*mix_lo).Intv.hi) <= 1E-15
+    @test abs(eval_check_grad_mult(mult_cvgrad[1], mix, mix_lo) - (mix*mix_lo).cv_grad[1]) <= 1E-15
+    @test abs(eval_check_grad_mult(mult_cvgrad[2], mix, mix_lo) - (mix*mix_lo).cv_grad[2]) <= 1E-15
+    @test abs(eval_check_grad_mult(mult_ccgrad[1], mix, mix_lo) - (mix*mix_lo).cc_grad[1]) <= 1E-15
+    @test abs(eval_check_grad_mult(mult_ccgrad[2], mix, mix_lo) - (mix*mix_lo).cc_grad[2]) <= 1E-15

     @test mult_cv_case(mix, mix_hi) == 18
     @test mult_cc_case(mix, mix_hi) == 18
@@ -319,4 +410,8 @@ mult_cc_case(A::MC, B::MC) = mult_cv_case(A.cv, A.cc, A.Intv.lo, A.Intv.hi, B.cv
     @test abs(eval_check(mult_cc, mix, mix_hi) - (mix*mix_hi).cc) <= 1E-15
     @test abs(eval_check(mult_lo, mix, mix_hi) - (mix*mix_hi).Intv.lo) <= 1E-15
     @test abs(eval_check(mult_hi, mix, mix_hi) - (mix*mix_hi).Intv.hi) <= 1E-15
+    @test abs(eval_check_grad_mult(mult_cvgrad[1], mix, mix_hi) - (mix*mix_hi).cv_grad[1]) <= 1E-15
+    @test abs(eval_check_grad_mult(mult_cvgrad[2], mix, mix_hi) - (mix*mix_hi).cv_grad[2]) <= 1E-15
+    @test abs(eval_check_grad_mult(mult_ccgrad[1], mix, mix_hi) - (mix*mix_hi).cc_grad[1]) <= 1E-15
+    @test abs(eval_check_grad_mult(mult_ccgrad[2], mix, mix_hi) - (mix*mix_hi).cc_grad[2]) <= 1E-15
 end
diff --git a/test/power.jl b/test/power.jl
index db205b4..721bf1b 100644
--- a/test/power.jl
+++ b/test/power.jl
@@ -5,6 +5,7 @@
     to_compute = x^2
     pow2_cv, pow2_cc, pow2_lo, pow2_hi, pow2_order = all_evaluators(to_compute)
+    pow2_cvgrad, pow2_ccgrad, pow2_order_grad = all_subgradients(to_compute, expand=true)

     # All cases for ^2 are very similar; check positive/negative/mixed, as well as some unique cases where values are the same
     pos = MC{1,NS}(1.0, 1.5, Interval(0.5, 2.0), SVector{1, Float64}(1.0), SVector{1, Float64}(3.0), false)

@@ -20,29 +21,41 @@
     @test abs(eval_check(pow2_cc, pos) - (pos^2).cc) <= 1E-15
     @test abs(eval_check(pow2_lo, pos) - (pos^2).Intv.lo) <= 1E-15
     @test abs(eval_check(pow2_hi, pos) - (pos^2).Intv.hi) <= 1E-15
+    @test abs(eval_check_grad(pow2_cvgrad[1], pos) - (pos^2).cv_grad[1]) <= 1E-15
+    @test abs(eval_check_grad(pow2_ccgrad[1], pos) - (pos^2).cc_grad[1]) <= 1E-15

     @test abs(eval_check(pow2_cv, mix) - (mix^2).cv) <= 1E-15
     @test abs(eval_check(pow2_cc, mix) - (mix^2).cc) <= 1E-15
     @test abs(eval_check(pow2_lo, mix) - (mix^2).Intv.lo) <= 1E-15
     @test abs(eval_check(pow2_hi, mix) - (mix^2).Intv.hi) <= 1E-15
+    @test abs(eval_check_grad(pow2_cvgrad[1], mix) - (mix^2).cv_grad[1]) <= 1E-15
+    @test abs(eval_check_grad(pow2_ccgrad[1], mix) - (mix^2).cc_grad[1]) <= 1E-15

     @test abs(eval_check(pow2_cv, neg) - (neg^2).cv) <= 1E-15
     @test abs(eval_check(pow2_cc, neg) - (neg^2).cc) <= 1E-15
     @test abs(eval_check(pow2_lo, neg) - (neg^2).Intv.lo) <= 1E-15
     @test abs(eval_check(pow2_hi, neg) - (neg^2).Intv.hi) <= 1E-15
+    @test abs(eval_check_grad(pow2_cvgrad[1], neg) - (neg^2).cv_grad[1]) <= 1E-15
+    @test abs(eval_check_grad(pow2_ccgrad[1], neg) - (neg^2).cc_grad[1]) <= 1E-15

     @test abs(eval_check(pow2_cv, pos_same1) - (pos_same1^2).cv) <= 1E-15
     @test abs(eval_check(pow2_cc, pos_same1) - (pos_same1^2).cc) <= 1E-15
     @test abs(eval_check(pow2_lo, pos_same1) - (pos_same1^2).Intv.lo) <= 1E-15
     @test abs(eval_check(pow2_hi, pos_same1) - (pos_same1^2).Intv.hi) <= 1E-15
+    @test abs(eval_check_grad(pow2_cvgrad[1], pos_same1) - (pos_same1^2).cv_grad[1]) <= 1E-15
+    @test abs(eval_check_grad(pow2_ccgrad[1], pos_same1) - (pos_same1^2).cc_grad[1]) <= 1E-15

     @test abs(eval_check(pow2_cv, pos_same2) - (pos_same2^2).cv) <= 1E-15
     @test abs(eval_check(pow2_cc, pos_same2) - (pos_same2^2).cc) <= 1E-15
     @test abs(eval_check(pow2_lo, pos_same2) - (pos_same2^2).Intv.lo) <= 1E-15
     @test abs(eval_check(pow2_hi, pos_same2) - (pos_same2^2).Intv.hi) <= 1E-15
+    @test abs(eval_check_grad(pow2_cvgrad[1], pos_same2) - (pos_same2^2).cv_grad[1]) <= 1E-15
+    @test abs(eval_check_grad(pow2_ccgrad[1], pos_same2) - (pos_same2^2).cc_grad[1]) <= 1E-15

     @test abs(eval_check(pow2_cv, pos_same3) - (pos_same3^2).cv) <= 1E-15
     @test abs(eval_check(pow2_cc, pos_same3) - (pos_same3^2).cc) <= 1E-15
     @test abs(eval_check(pow2_lo, pos_same3) - (pos_same3^2).Intv.lo) <= 1E-15
     @test abs(eval_check(pow2_hi, pos_same3) - (pos_same3^2).Intv.hi) <= 1E-15
+    @test abs(eval_check_grad(pow2_cvgrad[1], pos_same3) - (pos_same3^2).cv_grad[1]) <= 1E-15
+    @test abs(eval_check_grad(pow2_ccgrad[1], pos_same3) - (pos_same3^2).cc_grad[1]) <= 1E-15
 end
diff --git a/test/runtests.jl b/test/runtests.jl
index 683603a..ee8834e 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -4,9 +4,16 @@
 function eval_check(eval_func, MC1, MC2)
     return eval_func(MC1.cv, MC1.cc, MC1.Intv.lo, MC1.Intv.hi, MC2.cv, MC2.cc, MC2.Intv.lo, MC2.Intv.hi)
 end
+function eval_check_grad(eval_func, MC1, MC2)
+    return eval_func(MC1.cv, MC1.cc, MC1.Intv.lo, MC1.Intv.hi, MC1.cv_grad[1], MC1.cv_grad[2], MC1.cc_grad[1], MC1.cc_grad[2],
+                     MC2.cv, MC2.cc, MC2.Intv.lo, MC2.Intv.hi, MC2.cv_grad[1], MC2.cv_grad[2], MC2.cc_grad[1], MC2.cc_grad[2])
+end
 function eval_check(eval_func, MC1)
     return eval_func(MC1.cv, MC1.cc, MC1.Intv.lo, MC1.Intv.hi)
 end
+function eval_check_grad(eval_func, MC1)
+    return eval_func(MC1.cv, MC1.cc, MC1.Intv.lo, MC1.Intv.hi, MC1.cv_grad[1], MC1.cc_grad[1])
+end
 include("multiplication.jl")
 include("division.jl")
 include("addition.jl")