Skip to content

Cherry pick Rows out in EXPLAIN ANALYZE #670

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 19 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions src/backend/cdb/cdbvars.c
Original file line number Diff line number Diff line change
Expand Up @@ -278,6 +278,7 @@ int gp_hashjoin_tuples_per_bucket = 5;
int gp_motion_slice_noop = 0;

/* Apache Cloudberry Experimental Feature GUCs */
bool gp_enable_explain_rows_out = false;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It would be better to pass an option to the EXPLAIN command than to use a GUC to control whether "rows out" is printed.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The "rows out" feature was created for auto_explain. If we add "rows out" as a parameter of EXPLAIN, then we'll need to change the auto_explain source, and auto_explain won't be compatible with vanilla.

Copy link
Contributor

@fanfuxiaoran fanfuxiaoran Oct 29, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

 auto_explain wont be compatible with vanilla

Hmm... I don't understand. Could you give more details? Is auto_explain also included in vanilla?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

auto_explain is included in vanilla https://github.com/greenplum-db/gpdb-archive/tree/main/contrib/auto_explain

Also, the GUC gp_enable_explain_rows_out was modeled on the existing GUC gp_enable_explain_allstat.

bool gp_enable_explain_allstat = false;
bool gp_enable_motion_deadlock_sanity = false; /* planning time sanity
* check */
Expand Down
52 changes: 51 additions & 1 deletion src/backend/commands/explain_gp.c
Original file line number Diff line number Diff line change
Expand Up @@ -944,7 +944,7 @@ cdbexplain_collectStatsFromNode(PlanState *planstate, CdbExplain_SendStatCtx *ct
*/
typedef struct CdbExplain_DepStatAcc
{
/* vmax, vsum, vcnt, segmax */
/* vmax, vmin, vsum, vcnt, segmax, segmin */
CdbExplain_Agg agg;
/* max's received StatHdr */
CdbExplain_StatHdr *rshmax;
Expand Down Expand Up @@ -1801,6 +1801,56 @@ cdbexplain_showExecStats(struct PlanState *planstate, ExplainState *es)
}
pfree(extraData.data);

/*
* Print "Rows out"
*/

if (gp_enable_explain_rows_out && es->analyze && ns->ninst > 0)
{
	/*
	 * Snapshot the row-count aggregate for this node: the extreme values
	 * together with the segment id and worker id recorded for each of them,
	 * plus the average over all counted observations.
	 */
	double		ntuples_max = ns->ntuples.vmax;
	int			ntuples_imax = ns->ntuples.imax;
	int			ntuples_wmax = ns->ntuples.wmax;
	double		ntuples_min = ns->ntuples.vmin;
	int			ntuples_imin = ns->ntuples.imin;
	int			ntuples_wmin = ns->ntuples.wmin;
	double		ntuples_avg = cdbexplain_agg_avg(&ns->ntuples);

	/* ninst is reported as the segment count, vcnt as the worker count */
	int			segments = ns->ninst;
	int			workers = ns->ntuples.vcnt;

	if (es->format == EXPLAIN_FORMAT_TEXT)
	{
		/*
		 * Text format: a single indented "Rows out:" line carrying the
		 * average first, then the maximum and the minimum, each followed by
		 * the segment and worker it was observed on.
		 */
		appendStringInfoSpaces(es->str, es->indent * 2);
		appendStringInfoString(es->str, "Rows out: ");

		appendStringInfo(es->str,
						 "%.2f rows avg x %d workers from %d segments, %.0f rows max (seg%d worker%d), %.0f rows min (seg%d worker%d).\n",
						 ntuples_avg,
						 workers,
						 segments,
						 ntuples_max,
						 ntuples_imax,
						 ntuples_wmax,
						 ntuples_min,
						 ntuples_imin,
						 ntuples_wmin);
	}
	else
	{
		/*
		 * Structured formats: one property per value.  Every key must be
		 * unique within the node, so the worker that produced the minimum is
		 * reported as "Min Rows Worker", mirroring "Max Rows Worker".
		 */
		ExplainPropertyInteger("Workers", NULL, workers, es);
		ExplainPropertyInteger("Segments", NULL, segments, es);
		ExplainPropertyFloat("Average Rows", NULL, ntuples_avg, 1, es);
		ExplainPropertyFloat("Max Rows", NULL, ntuples_max, 0, es);
		ExplainPropertyInteger("Max Rows Segment", NULL, ntuples_imax, es);
		ExplainPropertyInteger("Max Rows Worker", NULL, ntuples_wmax, es);
		ExplainPropertyFloat("Min Rows", NULL, ntuples_min, 0, es);
		ExplainPropertyInteger("Min Rows Segment", NULL, ntuples_imin, es);
		ExplainPropertyInteger("Min Rows Worker", NULL, ntuples_wmin, es);
	}
}

/*
* Dump stats for all workers.
*/
Expand Down
11 changes: 11 additions & 0 deletions src/backend/utils/misc/guc_gp.c
Original file line number Diff line number Diff line change
Expand Up @@ -814,6 +814,17 @@ struct config_bool ConfigureNamesBool_gp[] =
NULL, NULL, NULL
},

{
{"gp_enable_explain_rows_out", PGC_USERSET, CLIENT_CONN_OTHER,
gettext_noop("Print avg, min and max rows out and which segments reach them in EXPLAIN ANALYZE."),
NULL,
GUC_NO_SHOW_ALL | GUC_NOT_IN_SAMPLE
},
&gp_enable_explain_rows_out,
false,
NULL, NULL, NULL
},

{
{"gp_enable_explain_allstat", PGC_USERSET, CLIENT_CONN_OTHER,
gettext_noop("Experimental feature: dump stats for all segments in EXPLAIN ANALYZE."),
Expand Down
20 changes: 19 additions & 1 deletion src/include/cdb/cdbexplain.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,18 +26,26 @@ struct CdbExplain_ShowStatCtx; /* private, in "cdb/cdbexplain.c" */
/*
 * CdbExplain_Agg
 *    Running aggregate of a single statistic over a series of observations.
 *    Besides the sum and count it remembers the largest and smallest value
 *    seen, and for each of those the id passed in with the observation and a
 *    worker id.  cdbexplain_agg_upd() records the worker id as (vcnt - 1) at
 *    the moment the extreme value is stored.
 */
typedef struct
{
double vmax; /* maximum value of statistic */
double vmin; /* minimum value of statistic */
double vsum; /* sum of values */
int vcnt; /* count of values > 0 */
int imax; /* id of 1st observation having maximum value */
int imin; /* id of 1st observation having minimum value */
int wmax; /* worker id of 1st observation having maximum value */
int wmin; /* worker id of 1st observation having minimum value */
} CdbExplain_Agg;

/*
 * cdbexplain_agg_init0
 *    Reset an aggregate to its empty state: every accumulator, the
 *    observation counter and all recorded ids are set to zero.
 */
static inline void
cdbexplain_agg_init0(CdbExplain_Agg *agg)
{
	/* floating-point accumulators */
	agg->vmax = agg->vmin = agg->vsum = 0;

	/* number of observations counted so far */
	agg->vcnt = 0;

	/* observation ids and worker ids of the max / min values */
	agg->imax = agg->imin = 0;
	agg->wmax = agg->wmin = 0;
}

static inline bool
Expand All @@ -48,13 +56,23 @@ cdbexplain_agg_upd(CdbExplain_Agg *agg, double v, int id)
agg->vsum += v;
agg->vcnt++;

if (v < agg->vmin ||
agg->vcnt == 1)
{
agg->vmin = v;
agg->imin = id;
agg->wmin = agg->vcnt - 1;
}

if (v > agg->vmax ||
agg->vcnt == 1)
{
agg->vmax = v;
agg->imax = id;
return true;
agg->wmax = agg->vcnt - 1;
}

return agg->imin == id || agg->imax == id;
}
return false;
}
Expand Down
6 changes: 6 additions & 0 deletions src/include/cdb/cdbvars.h
Original file line number Diff line number Diff line change
Expand Up @@ -613,6 +613,12 @@ extern bool gp_enable_agg_pushdown;
*/
extern bool gp_enable_preunique;

/*
 * When true, EXPLAIN ANALYZE prints a "Rows out" summary for each plan node:
 * the average, maximum and minimum row counts, together with the segment and
 * worker on which the maximum and the minimum were observed.
 */
extern bool gp_enable_explain_rows_out;

/* May Cloudberry dump statistics for all segments as a huge ugly string
* during EXPLAIN ANALYZE?
*
Expand Down
1 change: 1 addition & 0 deletions src/include/utils/unsync_guc_name.h
Original file line number Diff line number Diff line change
Expand Up @@ -182,6 +182,7 @@
"gp_enable_agg_pushdown",
"gp_enable_ao_indexscan",
"gp_enable_direct_dispatch",
"gp_enable_explain_rows_out",
"gp_enable_explain_allstat",
"gp_enable_fast_sri",
"gp_enable_global_deadlock_detector",
Expand Down
28 changes: 28 additions & 0 deletions src/test/regress/expected/cbdb_parallel.out
Original file line number Diff line number Diff line change
Expand Up @@ -3068,6 +3068,34 @@ select t1_anti.a, t1_anti.b from t1_anti left join t2_anti on t1_anti.a = t2_ant
2 |
(4 rows)

abort;
-- test rows out
-- start_matchsubs
-- m/\(actual rows=\d+ loops=\d+\)/
-- s/\(actual rows=\d+ loops=\d+\)/(actual rows=# loops=#)/
-- m/Rows Removed by Filter: \d+/
-- s/Rows Removed by Filter: \d+/Rows Removed by Filter: ###/
-- m/\d+ rows max \(seg\d+ worker\d+\), \d+ rows min \(seg\d+ worker\d+\)/
-- s/\d+ rows max \(seg\d+ worker\d+\), \d+ rows min \(seg\d+ worker\d+\)/##### rows max (seg# worker#), ##### rows min (seg# worker#)/
-- end_matchsubs
begin;
create table tt (a int, b int) with(parallel_workers=2) distributed by(a, b);
insert into tt select * from generate_series(1,1000)a,generate_series(1,1000)b;
set local enable_parallel = on;
set local max_parallel_workers_per_gather = 2;
set local gp_enable_explain_rows_out = on;
explain(costs off, summary off, timing off, analyze) select * from tt where a > b;
QUERY PLAN
--------------------------------------------------------------------------------------------------------------------------------
Gather Motion 6:1 (slice1; segments: 6) (actual rows=499500 loops=1)
Rows out: 499500.00 rows avg x 1 workers from 1 segments, 499500 rows max (seg-1 worker0), 499500 rows min (seg-1 worker0).
-> Parallel Seq Scan on tt (actual rows=86074 loops=1)
Filter: (a > b)
Rows Removed by Filter: 84852
Rows out: 83250.00 rows avg x 6 workers from 3 segments, 86144 rows max (seg2 worker4), 80391 rows min (seg1 worker2).
Optimizer: Postgres query optimizer
(7 rows)

abort;
-- start_ignore
drop schema test_parallel cascade;
Expand Down
36 changes: 36 additions & 0 deletions src/test/regress/expected/gp_explain.out
Original file line number Diff line number Diff line change
Expand Up @@ -436,6 +436,42 @@ explain analyze SELECT * FROM explaintest;
(8 rows)

set gp_enable_explain_allstat=DEFAULT;
-- Test explain rows out.
begin;
set local gp_enable_explain_rows_out=on;
create table tt (a int, b int) distributed by(a, b);
explain(costs off, summary off, timing off, analyze)
insert into tt select * from generate_series(1,1000)a,generate_series(1,1000)b;
QUERY PLAN
--------------------------------------------------------------------------------------------------------------------------------------------------
Insert on tt (actual rows=0 loops=1)
Rows out: 0.00 rows avg x 0 workers from 3 segments, 0 rows max (seg0 worker0), 0 rows min (seg0 worker0).
-> Redistribute Motion 1:3 (slice1; segments: 1) (actual rows=333518 loops=1)
Hash Key: a.a, b.b
Rows out: 333333.33 rows avg x 3 workers from 3 segments, 333518 rows max (seg2 worker2), 333150 rows min (seg1 worker1).
-> Nested Loop (actual rows=1000000 loops=1)
Rows out: 1000000.00 rows avg x 1 workers from 1 segments, 1000000 rows max (seg2 worker0), 1000000 rows min (seg2 worker0).
-> Function Scan on generate_series a (actual rows=1000 loops=1)
Rows out: 1000.00 rows avg x 1 workers from 1 segments, 1000 rows max (seg2 worker0), 1000 rows min (seg2 worker0).
-> Function Scan on generate_series b (actual rows=1000 loops=1000)
Rows out: 1000000.00 rows avg x 1 workers from 1 segments, 1000000 rows max (seg2 worker0), 1000000 rows min (seg2 worker0).
Optimizer: Postgres query optimizer
(12 rows)

explain(costs off, summary off, timing off, analyze)
select * from tt where a > b;
QUERY PLAN
-----------------------------------------------------------------------------------------------------------------------------------
Gather Motion 3:1 (slice1; segments: 3) (actual rows=499500 loops=1)
Rows out: 499500.00 rows avg x 1 workers from 1 segments, 499500 rows max (seg-1 worker0), 499500 rows min (seg-1 worker0).
-> Seq Scan on tt (actual rows=166461 loops=1)
Filter: (a > b)
Rows Removed by Filter: 167057
Rows out: 166500.00 rows avg x 3 workers from 3 segments, 166557 rows max (seg0 worker0), 166461 rows min (seg2 worker2).
Optimizer: Postgres query optimizer
(7 rows)

abort;
--
-- Test GPDB-specific EXPLAIN (SLICETABLE) option.
--
Expand Down
36 changes: 36 additions & 0 deletions src/test/regress/expected/gp_explain_optimizer.out
Original file line number Diff line number Diff line change
Expand Up @@ -458,6 +458,42 @@ explain analyze SELECT * FROM explaintest;
(8 rows)

set gp_enable_explain_allstat=DEFAULT;
-- Test explain rows out.
begin;
set local gp_enable_explain_rows_out=on;
create table tt (a int, b int) distributed by(a, b);
explain(costs off, summary off, timing off, analyze)
insert into tt select * from generate_series(1,1000)a,generate_series(1,1000)b;
QUERY PLAN
--------------------------------------------------------------------------------------------------------------------------------------------------
Insert on tt (actual rows=0 loops=1)
Rows out: 0.00 rows avg x 0 workers from 3 segments, 0 rows max (seg0 worker0), 0 rows min (seg0 worker0).
-> Redistribute Motion 1:3 (slice1; segments: 1) (actual rows=333518 loops=1)
Hash Key: a.a, b.b
Rows out: 333333.33 rows avg x 3 workers from 3 segments, 333518 rows max (seg2 worker2), 333150 rows min (seg1 worker1).
-> Nested Loop (actual rows=1000000 loops=1)
Rows out: 1000000.00 rows avg x 1 workers from 1 segments, 1000000 rows max (seg2 worker0), 1000000 rows min (seg2 worker0).
-> Function Scan on generate_series a (actual rows=1000 loops=1)
Rows out: 1000.00 rows avg x 1 workers from 1 segments, 1000 rows max (seg2 worker0), 1000 rows min (seg2 worker0).
-> Function Scan on generate_series b (actual rows=1000 loops=1000)
Rows out: 1000000.00 rows avg x 1 workers from 1 segments, 1000000 rows max (seg2 worker0), 1000000 rows min (seg2 worker0).
Optimizer: Postgres query optimizer
(12 rows)

explain(costs off, summary off, timing off, analyze)
select * from tt where a > b;
QUERY PLAN
-----------------------------------------------------------------------------------------------------------------------------------
Gather Motion 3:1 (slice1; segments: 3) (actual rows=499500 loops=1)
Rows out: 499500.00 rows avg x 1 workers from 1 segments, 499500 rows max (seg-1 worker0), 499500 rows min (seg-1 worker0).
-> Seq Scan on tt (actual rows=166461 loops=1)
Filter: (a > b)
Rows Removed by Filter: 167057
Rows out: 166500.00 rows avg x 3 workers from 3 segments, 166557 rows max (seg0 worker0), 166461 rows min (seg2 worker2).
Optimizer: Postgres query optimizer
(7 rows)

abort;
--
-- Test GPDB-specific EXPLAIN (SLICETABLE) option.
--
Expand Down
18 changes: 18 additions & 0 deletions src/test/regress/sql/cbdb_parallel.sql
Original file line number Diff line number Diff line change
Expand Up @@ -986,6 +986,24 @@ select t1_anti.a, t1_anti.b from t1_anti left join t2_anti on t1_anti.a = t2_ant
select t1_anti.a, t1_anti.b from t1_anti left join t2_anti on t1_anti.a = t2_anti.a where t2_anti.a is null;
abort;

-- test rows out
-- start_matchsubs
-- m/\(actual rows=\d+ loops=\d+\)/
-- s/\(actual rows=\d+ loops=\d+\)/(actual rows=# loops=#)/
-- m/Rows Removed by Filter: \d+/
-- s/Rows Removed by Filter: \d+/Rows Removed by Filter: ###/
-- m/\d+ rows max \(seg\d+ worker\d+\), \d+ rows min \(seg\d+ worker\d+\)/
-- s/\d+ rows max \(seg\d+ worker\d+\), \d+ rows min \(seg\d+ worker\d+\)/##### rows max (seg# worker#), ##### rows min (seg# worker#)/
-- end_matchsubs
begin;
create table tt (a int, b int) with(parallel_workers=2) distributed by(a, b);
insert into tt select * from generate_series(1,1000)a,generate_series(1,1000)b;
set local enable_parallel = on;
set local max_parallel_workers_per_gather = 2;
set local gp_enable_explain_rows_out = on;
explain(costs off, summary off, timing off, analyze) select * from tt where a > b;
abort;

-- start_ignore
drop schema test_parallel cascade;
-- end_ignore
Expand Down
10 changes: 10 additions & 0 deletions src/test/regress/sql/gp_explain.sql
Original file line number Diff line number Diff line change
Expand Up @@ -228,6 +228,16 @@ set gp_enable_explain_allstat=on;
explain analyze SELECT * FROM explaintest;
set gp_enable_explain_allstat=DEFAULT;

-- Test explain rows out.
begin;
set local gp_enable_explain_rows_out=on;
create table tt (a int, b int) distributed by(a, b);
explain(costs off, summary off, timing off, analyze)
insert into tt select * from generate_series(1,1000)a,generate_series(1,1000)b;
explain(costs off, summary off, timing off, analyze)
select * from tt where a > b;
abort;


--
-- Test GPDB-specific EXPLAIN (SLICETABLE) option.
Expand Down
Loading