Improve compare.py output to use min times and better column titles #6134

Merged

merged 2 commits on Apr 30, 2023
26 changes: 9 additions & 17 deletions benchmarks/compare.py
@@ -20,7 +20,6 @@
 from __future__ import annotations
 
 import json
-import statistics
 from dataclasses import dataclass
 from typing import Dict, List, Any
 from pathlib import Path
@@ -33,8 +32,6 @@
print("Try `pip install rich` for using this script.")
raise

MEAN_THRESHOLD = 5


@dataclass
class QueryResult:
@@ -64,14 +61,9 @@ def load_from(cls, data: Dict[str, Any]) -> QueryRun:
     def execution_time(self) -> float:
         assert len(self.iterations) >= 1
 
-        # If we don't have enough samples, median() is probably
-        # going to be a worse measure than just an average.
-        if len(self.iterations) < MEAN_THRESHOLD:
-            method = statistics.mean
-        else:
-            method = statistics.median
-
-        return method(iteration.elapsed for iteration in self.iterations)
+        # Use minimum execution time to account for variations / other
+        # things the system was doing
+        return min(iteration.elapsed for iteration in self.iterations)

Contributor:

lgtm, but imho, can min be deceptive compared to avg/mean/median? 🤔

Contributor (Author):

I agree it is misleading -- in terms of measuring a change between datafusion versions, I think min will give us the least variance between runs and represents best-case performance.

However, it doesn't really give a sense of how much variation there is across runs 🤔
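For illustration only (this is not part of the PR, and the elapsed values below are made up), here is how min, mean, and median react when one iteration hits system noise:

# Standalone sketch with hypothetical timings in seconds; the last run hit system noise.
import statistics

elapsed = [1.02, 1.05, 1.03, 1.04, 2.50]

print(f"min:    {min(elapsed):.2f}")                # 1.02 -> best-case time, ignores the outlier
print(f"mean:   {statistics.mean(elapsed):.2f}")    # 1.33 -> pulled up by the outlier
print(f"median: {statistics.median(elapsed):.2f}")  # 1.04 -> robust middle, still above best case

This is the trade-off discussed above: min gives the most stable run-to-run comparison, at the cost of hiding the spread.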



 @dataclass
@@ -81,7 +73,6 @@ class Context:
     num_cpus: int
     start_time: int
     arguments: List[str]
-    branch: str
 
     @classmethod
     def load_from(cls, data: Dict[str, Any]) -> Context:
@@ -91,7 +82,6 @@ def load_from(cls, data: Dict[str, Any]) -> Context:
             num_cpus=data["num_cpus"],
             start_time=data["start_time"],
             arguments=data["arguments"],
-            branch=data["arguments"][9]
         )
 
 
@@ -119,17 +109,19 @@ def compare(
     noise_threshold: float,
 ) -> None:
     baseline = BenchmarkRun.load_from_file(baseline_path)
-    baselineBranch = baseline.context.branch
 
     comparison = BenchmarkRun.load_from_file(comparison_path)
-    comparisonBranch = comparison.context.branch
 
     console = Console()
 
+    # use basename as the column names
+    baseline_header = baseline_path.stem
+    comparison_header = comparison_path.stem
+
     table = Table(show_header=True, header_style="bold magenta")
     table.add_column("Query", style="dim", width=12)
-    table.add_column(baselineBranch, justify="right", style="dim", width=12)
-    table.add_column(comparisonBranch, justify="right", style="dim", width=12)
+    table.add_column(baseline_header, justify="right", style="dim")
+    table.add_column(comparison_header, justify="right", style="dim")
     table.add_column("Change", justify="right", style="dim")
 
     for baseline_result, comparison_result in zip(baseline.queries, comparison.queries):
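
For context, the column titles now come from the result-file names via Path.stem rather than from a positional entry in the recorded arguments. A quick sketch with hypothetical file names:

from pathlib import Path

# Hypothetical result files passed to compare.py
baseline_path = Path("main.json")
comparison_path = Path("my-branch.json")

print(baseline_path.stem)    # "main"      -> title of the first data column
print(comparison_path.stem)  # "my-branch" -> title of the second data column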