apache · eric-maynard · Apr 30, 2025 · Apr 30, 2025 · Apr 30, 2025 · Apr 30, 2025
@@ -161,4 +161,32 @@ workload {
     # Default: 5
     duration-in-minutes = 5
   }
+
+  # Configuration for the WeightedWorkloadOnTreeDataset simulation
+  weighted-workload-on-tree-dataset {
+    # Seed used for RNG during the test
+    seed = 42
+
+    # Distributions for readers
+    # Each distribution will have `count` threads assigned to it
+    # mean / variance parameterize a normal distribution over the table space, which is
+    # normalized to [0, 1]; samples that fall outside [0, 1] are redrawn
+    # Each reader picks the table to read by sampling from its distribution
+    # Default: [{ count = 8, mean = 0.3, variance = 0.0278 }]
+    readers = [
+      { count = 8, mean = 0.3, variance = 0.0278 }
+    ]
+
+    # Distributions for writers
+    # Each distribution will have `count` threads assigned to it
+    # mean / variance parameterize a normal distribution over the table space, which is
+    # normalized to [0, 1]; samples that fall outside [0, 1] are redrawn
+    # Each writer picks the table to write to by sampling from its distribution
+    # Default: [{ count = 2, mean = 0.7, variance = 0.0278 }]
+    writers = [
+      { count = 2, mean = 0.7, variance = 0.0278 }
+    ]
+
+    # Duration of the simulation in minutes
+    # Default: 5
+    duration-in-minutes = 5
+  }
 }
@@ -31,6 +31,7 @@ import org.apache.polaris.benchmarks.parameters.ConnectionParameters
 import org.slf4j.LoggerFactory
 
 import java.util.concurrent.atomic.AtomicReference
+import scala.concurrent.duration.DurationInt
 
 /**
  * Actions for performance testing authentication operations. This class provides methods to
@@ -85,8 +86,9 @@ case class AuthenticationActions(
         .check(jsonPath("$.access_token").saveAs("accessToken"))
     )
       .exec { session =>
-        if (session.contains("accessToken"))
+        if (session.contains("accessToken") && session("accessToken") != null) {
           accessToken.set(session("accessToken").as[String])
+        }
         session
       }
 
@@ -96,5 +98,9 @@ case class AuthenticationActions(
    * scenario.
    */
   val restoreAccessTokenInSession: ChainBuilder =
-    exec(session => session.set("accessToken", accessToken.get()))
+    asLongAs(_ => accessToken.get() == null) {
+      pause(1.second)
+    }.exec { session =>
+      session.set("accessToken", accessToken.get())
+    }
 }
@@ -184,6 +184,7 @@ case class TableActions(
     http("Fetch Table")
       .get("/api/catalog/v1/#{catalogName}/namespaces/#{multipartNamespace}/tables/#{tableName}")
       .header("Authorization", "Bearer #{accessToken}")
+      .header("If-None-Match", "")
       .check(status.is(200))
       .check(jsonPath("$.metadata.table-uuid").saveAs("tableUuid"))
       .check(jsonPath("$.metadata.location").is("#{location}"))

@@ -42,6 +42,7 @@ object BenchmarkConfig {
       val rtdConfig = workload.getConfig("read-tree-dataset")
       val ctdConfig = workload.getConfig("create-tree-dataset")
       val rutdConfig = workload.getConfig("read-update-tree-dataset")
+      val wwotdConfig = workload.getConfig("weighted-workload-on-tree-dataset")
 
       WorkloadParameters(
         ReadTreeDatasetParameters(
@@ -56,6 +57,12 @@ object BenchmarkConfig {
           rutdConfig.getDouble("read-write-ratio"),
           rutdConfig.getInt("throughput"),
           rutdConfig.getInt("duration-in-minutes")
+        ),
+        WeightedWorkloadOnTreeDatasetParameters(
+          wwotdConfig.getInt("seed"),
+          WeightedWorkloadOnTreeDatasetParameters.loadDistributionsList(wwotdConfig, "readers"),
+          WeightedWorkloadOnTreeDatasetParameters.loadDistributionsList(wwotdConfig, "writers"),
+          wwotdConfig.getInt("duration-in-minutes")
         )
       )
     }

@@ -54,7 +54,7 @@ case class DatasetParameters(
     numViewProperties: Int
 ) {
   val nAryTree: NAryTreeBuilder = NAryTreeBuilder(nsWidth, nsDepth)
-  private val maxPossibleTables = nAryTree.numberOfLastLevelElements * numTablesPerNs
+  val maxPossibleTables = nAryTree.numberOfLastLevelElements * numTablesPerNs
   private val maxPossibleViews = nAryTree.numberOfLastLevelElements * numViewsPerNs
   val numTables: Int = if (numTablesMax <= 0) maxPossibleTables else numTablesMax
   val numViews: Int = if (numViewsMax <= 0) maxPossibleViews else numViewsMax

@@ -0,0 +1,161 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.polaris.benchmarks.parameters
+
+import com.typesafe.config.Config
+import com.typesafe.scalalogging.Logger
+import org.slf4j.LoggerFactory
+
+import scala.jdk.CollectionConverters._
+import scala.collection.immutable.LazyList
+import scala.collection.mutable
+import scala.collection.mutable.ArrayBuffer
+import scala.util.Random
+
+/**
+ * Case class to hold the parameters for the WeightedWorkloadOnTreeDataset simulation.
+ *
+ * Construction fails with an IllegalArgumentException (raised by `require`) when both `readers`
+ * and `writers` are empty, or when `durationInMinutes` is not positive.
+ *
+ * @param seed The RNG seed to use
+ * @param readers A seq of distributions to use for reading tables
+ * @param writers A seq of distributions to use for writing to tables
+ * @param durationInMinutes Duration of the simulation in minutes; must be positive
+ */
+case class WeightedWorkloadOnTreeDatasetParameters(
+    seed: Int,
+    readers: Seq[Distribution],
+    writers: Seq[Distribution],
+    durationInMinutes: Int
+) {
+  // A workload with neither readers nor writers would have nothing to execute
+  require(readers.nonEmpty || writers.nonEmpty, "At least one reader or writer is required")
+  require(durationInMinutes > 0, "Duration in minutes must be positive")
+}
+
+object WeightedWorkloadOnTreeDatasetParameters {
+
+  /**
+   * Reads the list of config objects stored under `key` and converts each one into a
+   * [[Distribution]], preserving the order in which they appear in the configuration.
+   *
+   * @param config The configuration section that contains `key`
+   * @param key Name of the list entry to read (each element must define `count`, `mean` and
+   *   `variance`)
+   * @return One [[Distribution]] per element of the list
+   */
+  def loadDistributionsList(config: Config, key: String): List[Distribution] = {
+    val entries = config.getConfigList(key).asScala
+    entries.iterator.map { entry =>
+      val threadCount = entry.getInt("count")
+      val center = entry.getDouble("mean")
+      val spread = entry.getDouble("variance")
+      Distribution(threadCount, center, spread)
+    }.toList
+  }
+}
+
+/**
+ * A normal distribution over a table space normalized to [0, 1], truncated to that interval by
+ * resampling.
+ *
+ * @param count Number of threads assigned to this distribution
+ * @param mean Mean of the underlying normal distribution, as a fraction of the table space
+ * @param variance Variance of the underlying normal distribution; must be non-negative
+ */
+case class Distribution(count: Int, mean: Double, variance: Double) {
+  // A negative (or NaN) variance has no real standard deviation: every draw would be NaN and
+  // sampling could only fail after exhausting all attempts, so reject it up front.
+  require(variance >= 0, "Variance must be non-negative")
+
+  private val logger = LoggerFactory.getLogger(getClass)
+
+  // Upper bound on the number of raw draws attempted before giving up on a single sample
+  private val maxSamples = 100000
+
+  /** Endless iterator of raw draws from N(mean, variance), not yet restricted to [0, 1]. */
+  private def rawDraws(randomNumberProvider: RandomNumberProvider): Iterator[Double] = {
+    val stddev = math.sqrt(variance)
+    Iterator.continually(randomNumberProvider.next() * stddev + mean)
+  }
+
+  /** Whether a raw draw falls inside the normalized table space. */
+  private def inUnitInterval(draw: Double): Boolean = draw >= 0.0 && draw <= 1.0
+
+  /**
+   * Prints a summary of this distribution for the given dataset and logs a warning when
+   * sampling from it is expected to reject many draws.
+   *
+   * @param dataset Dataset whose `maxPossibleTables` defines the size of the table space
+   */
+  def printDescription(dataset: DatasetParameters): Unit = {
+    println(s"Summary for ${this}:")
+
+    // Visualize distributions
+    printVisualization(dataset.maxPossibleTables)
+
+    // Warn if a large amount of resampling will be needed. The raw draws are inspected here
+    // because `sample` already resamples internally and only ever returns in-range values, so
+    // counting its results could never observe a rejected draw. A fixed seed is used, so the
+    // count is a single-run approximation rather than an expectation.
+    val debugRandomNumberProvider = RandomNumberProvider("debug".hashCode, -1)
+    val firstAccepted = rawDraws(debugRandomNumberProvider)
+      .take(maxSamples)
+      .indexWhere(draw => inUnitInterval(draw))
+    // indexWhere yields -1 when no draw was accepted within the attempt budget
+    val resamples = if (firstAccepted < 0) maxSamples else firstAccepted
+
+    if (resamples > 100) {
+      logger.warn(
+        s"A distribution appears to require aggressive resampling: ${this} took ${resamples + 1} samples!"
+      )
+    }
+  }
+
+  /**
+   * Return a value in [0, items) based on this distribution using truncated normal resampling.
+   *
+   * @param items Size of the table space the sampled fraction is scaled to
+   * @param randomNumberProvider Source of standard normal values
+   * @return The sampled index
+   * @throws RuntimeException if no draw lands in [0, 1] within the attempt budget
+   */
+  def sample(items: Int, randomNumberProvider: RandomNumberProvider): Int = {
+    // Resample until the value is in [0, 1]
+    val value = rawDraws(randomNumberProvider)
+      .take(maxSamples)
+      .find(draw => inUnitInterval(draw))
+      .getOrElse(
+        throw new RuntimeException(
+          s"Failed to sample a value in [0, 1] after ${maxSamples} attempts"
+        )
+      )
+
+    // A draw of exactly 1.0 would scale to `items`, so clamp to the last valid index
+    (value * items).toInt.min(items - 1)
+  }
+
+  /**
+   * Prints a text histogram of where samples from this distribution land in the table space,
+   * followed by the share of samples that hit the most frequently selected table.
+   *
+   * @param tables Size of the table space to sample from
+   * @param samples Number of samples to draw
+   * @param bins Number of equal-width histogram bins
+   */
+  def printVisualization(tables: Int, samples: Int = 100000, bins: Int = 10): Unit = {
+    val binCounts = Array.fill(bins)(0)
+    // Number of times each individual table index was selected
+    val hits = new mutable.HashMap[Int, Int]()
+    // Fixed seed so the printed visualization is reproducible
+    val rng = RandomNumberProvider("visualization".hashCode, -1)
+
+    (1 to samples).foreach { _ =>
+      val value = sample(tables, rng)
+      val bin = ((value.toDouble / tables) * bins).toInt.min(bins - 1)
+      hits.put(value, hits.getOrElse(value, 0) + 1)
+      binCounts(bin) += 1
+    }
+
+    val maxBarWidth = 50
+    val total = binCounts.sum.toDouble
+    println("  Range         | % of Samples | Visualization")
+    println("  --------------|--------------|------------------")
+
+    (0 until bins).foreach { i =>
+      val low = i.toDouble / bins
+      val high = (i + 1).toDouble / bins
+      val percent = binCounts(i) / total * 100
+      // Bar length is proportional to the bin's share; 100% would span maxBarWidth characters
+      val bar = "█" * ((percent / 100 * maxBarWidth).round.toInt)
+      println(f"  [$low%.1f - $high%.1f) | $percent%6.2f%%      | $bar")
+    }
+    println()
+
+    val mode = hits.maxBy(_._2)
+    val modePercentage: Int = Math.round(mode._2.toFloat / samples * 100)
+    println(s"  The most frequently selected table was chosen in ~${modePercentage}% of samples")
+
+    println()
+  }
+}
+
+object Distribution {
+
+  /**
+   * Map a table index back to a table path.
+   *
+   * @param index Index of the table; consecutive runs of `dp.numTablesPerNs` indices share a
+   *   namespace
+   * @param dp Dataset parameters providing the namespace tree and tables-per-namespace setting
+   * @return The catalog name, the namespace path segments and the table name
+   */
+  def tableIndexToIdentifier(index: Int, dp: DatasetParameters): (String, List[String], String) = {
+    require(
+      dp.numTablesMax == -1,
+      "Sampling is incompatible with numTablesMax settings other than -1"
+    )
+
+    // Position of the owning namespace among the last-level namespaces
+    val namespacePosition = index / dp.numTablesPerNs
+    val tree = dp.nAryTree
+    val lastLevel = tree.lastLevelOrdinals.toList
+    val owningOrdinal = lastLevel(namespacePosition)
+    val namespaceSegments = tree.pathToRoot(owningOrdinal).map(ordinal => s"NS_${ordinal}")
+
+    ("C_0", namespaceSegments, s"T_${index}")
+  }
+}
+
+/**
+ * Source of standard normal values whose sequence is determined by `seed` and `threadId`.
+ *
+ * @param seed Base RNG seed
+ * @param threadId Offset added to `seed` to form the seed of the underlying generator
+ */
+case class RandomNumberProvider(seed: Int, threadId: Int) {
+  private[this] val gaussianSource: Random = {
+    val effectiveSeed = seed + threadId
+    new Random(effectiveSeed)
+  }
+
+  /** Returns the next Gaussian value from the underlying generator. */
+  def next(): Double = gaussianSource.nextGaussian()
+}
@@ -22,5 +22,6 @@ package org.apache.polaris.benchmarks.parameters
 case class WorkloadParameters(
     readTreeDataset: ReadTreeDatasetParameters,
     createTreeDataset: CreateTreeDatasetParameters,
-    readUpdateTreeDataset: ReadUpdateTreeDatasetParameters
+    readUpdateTreeDataset: ReadUpdateTreeDatasetParameters,
+    weightedWorkloadOnTreeDataset: WeightedWorkloadOnTreeDatasetParameters
 ) {}