Skip to content

Commit 46cec76

Browse files
derrickstoleejltobler
authored andcommitted
survey: add report of "largest" paths
Since we are already walking our reachable objects using the path-walk API, let's now collect lists of the paths that contribute most to different metrics. Specifically, we care about

* Number of versions.
* Total size on disk.
* Total inflated size (no delta or zlib compression).

This information can be critical to discovering which parts of the repository are causing the most growth, especially on-disk size. Different packing strategies might help compress data more efficiently, but the total inflated size is a representation of the raw size of all snapshots of those paths. Even when stored efficiently on disk, that size represents how much information must be processed to complete a command such as 'git blame'.

Since the on-disk size is likely to be fragile, stop testing the exact output of 'git survey' and check that the correct set of headers is output.

Signed-off-by: Derrick Stolee <[email protected]>
1 parent 73bed8e commit 46cec76

File tree

2 files changed

+91
-13
lines changed

2 files changed

+91
-13
lines changed

builtin/survey.c

Lines changed: 80 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -80,7 +80,6 @@ struct survey_report_object_size_summary {
8080
typedef int (*survey_top_size_cmp)(struct survey_report_object_size_summary *s1,
8181
struct survey_report_object_size_summary *s2);
8282

83-
MAYBE_UNUSED
8483
static int cmp_by_nr(struct survey_report_object_size_summary *s1,
8584
struct survey_report_object_size_summary *s2)
8685
{
@@ -91,7 +90,6 @@ static int cmp_by_nr(struct survey_report_object_size_summary *s1,
9190
return 0;
9291
}
9392

94-
MAYBE_UNUSED
9593
static int cmp_by_disk_size(struct survey_report_object_size_summary *s1,
9694
struct survey_report_object_size_summary *s2)
9795
{
@@ -102,7 +100,6 @@ static int cmp_by_disk_size(struct survey_report_object_size_summary *s1,
102100
return 0;
103101
}
104102

105-
MAYBE_UNUSED
106103
static int cmp_by_inflated_size(struct survey_report_object_size_summary *s1,
107104
struct survey_report_object_size_summary *s2)
108105
{
@@ -126,7 +123,6 @@ struct survey_report_top_sizes {
126123
size_t alloc;
127124
};
128125

129-
MAYBE_UNUSED
130126
static void init_top_sizes(struct survey_report_top_sizes *top,
131127
size_t limit, const char *name,
132128
survey_top_size_cmp cmp)
@@ -146,7 +142,6 @@ static void clear_top_sizes(struct survey_report_top_sizes *top)
146142
free(top->data);
147143
}
148144

149-
MAYBE_UNUSED
150145
static void maybe_insert_into_top_size(struct survey_report_top_sizes *top,
151146
struct survey_report_object_size_summary *summary)
152147
{
@@ -182,6 +177,10 @@ struct survey_report {
182177
struct survey_report_object_summary reachable_objects;
183178

184179
struct survey_report_object_size_summary *by_type;
180+
181+
struct survey_report_top_sizes *top_paths_by_count;
182+
struct survey_report_top_sizes *top_paths_by_disk;
183+
struct survey_report_top_sizes *top_paths_by_inflate;
185184
};
186185

187186
#define REPORT_TYPE_COMMIT 0
@@ -423,6 +422,13 @@ static void survey_report_object_sizes(const char *title,
423422
clear_table(&table);
424423
}
425424

425+
static void survey_report_plaintext_sorted_size(
426+
struct survey_report_top_sizes *top)
427+
{
428+
survey_report_object_sizes(top->name, _("Path"),
429+
top->data, top->nr);
430+
}
431+
426432
static void survey_report_plaintext(struct survey_context *ctx)
427433
{
428434
printf("GIT SURVEY for \"%s\"\n", ctx->repo->worktree);
@@ -433,6 +439,21 @@ static void survey_report_plaintext(struct survey_context *ctx)
433439
_("Object Type"),
434440
ctx->report.by_type,
435441
REPORT_TYPE_COUNT);
442+
443+
survey_report_plaintext_sorted_size(
444+
&ctx->report.top_paths_by_count[REPORT_TYPE_TREE]);
445+
survey_report_plaintext_sorted_size(
446+
&ctx->report.top_paths_by_count[REPORT_TYPE_BLOB]);
447+
448+
survey_report_plaintext_sorted_size(
449+
&ctx->report.top_paths_by_disk[REPORT_TYPE_TREE]);
450+
survey_report_plaintext_sorted_size(
451+
&ctx->report.top_paths_by_disk[REPORT_TYPE_BLOB]);
452+
453+
survey_report_plaintext_sorted_size(
454+
&ctx->report.top_paths_by_inflate[REPORT_TYPE_TREE]);
455+
survey_report_plaintext_sorted_size(
456+
&ctx->report.top_paths_by_inflate[REPORT_TYPE_BLOB]);
436457
}
437458

438459
static void survey_report_json(struct survey_context *ctx UNUSED)
@@ -668,7 +689,8 @@ static void increment_totals(struct survey_context *ctx,
668689

669690
static void increment_object_totals(struct survey_context *ctx,
670691
struct oid_array *oids,
671-
enum object_type type)
692+
enum object_type type,
693+
const char *path)
672694
{
673695
struct survey_report_object_size_summary *total;
674696
struct survey_report_object_size_summary summary = { 0 };
@@ -696,9 +718,30 @@ static void increment_object_totals(struct survey_context *ctx,
696718
total->disk_size += summary.disk_size;
697719
total->inflated_size += summary.inflated_size;
698720
total->num_missing += summary.num_missing;
721+
722+
if (type == OBJ_TREE || type == OBJ_BLOB) {
723+
int index = type == OBJ_TREE ?
724+
REPORT_TYPE_TREE : REPORT_TYPE_BLOB;
725+
struct survey_report_top_sizes *top;
726+
727+
/*
728+
* Temporarily store (const char *) here, but it will
729+
* be duped if inserted and will not be freed.
730+
*/
731+
summary.label = (char *)path;
732+
733+
top = ctx->report.top_paths_by_count;
734+
maybe_insert_into_top_size(&top[index], &summary);
735+
736+
top = ctx->report.top_paths_by_disk;
737+
maybe_insert_into_top_size(&top[index], &summary);
738+
739+
top = ctx->report.top_paths_by_inflate;
740+
maybe_insert_into_top_size(&top[index], &summary);
741+
}
699742
}
700743

701-
static int survey_objects_path_walk_fn(const char *path UNUSED,
744+
static int survey_objects_path_walk_fn(const char *path,
702745
struct oid_array *oids,
703746
enum object_type type,
704747
void *data)
@@ -707,7 +750,7 @@ static int survey_objects_path_walk_fn(const char *path UNUSED,
707750

708751
increment_object_counts(&ctx->report.reachable_objects,
709752
type, oids->nr);
710-
increment_object_totals(ctx, oids, type);
753+
increment_object_totals(ctx, oids, type, path);
711754

712755
ctx->progress_nr += oids->nr;
713756
display_progress(ctx->progress, ctx->progress_nr);
@@ -752,6 +795,34 @@ static int iterate_tag_chain(struct survey_context *ctx,
752795
return -1;
753796
}
754797

798+
static void initialize_report(struct survey_context *ctx)
799+
{
800+
const int top_limit = 100;
801+
802+
CALLOC_ARRAY(ctx->report.by_type, REPORT_TYPE_COUNT);
803+
ctx->report.by_type[REPORT_TYPE_COMMIT].label = xstrdup(_("Commits"));
804+
ctx->report.by_type[REPORT_TYPE_TREE].label = xstrdup(_("Trees"));
805+
ctx->report.by_type[REPORT_TYPE_BLOB].label = xstrdup(_("Blobs"));
806+
807+
CALLOC_ARRAY(ctx->report.top_paths_by_count, REPORT_TYPE_COUNT);
808+
init_top_sizes(&ctx->report.top_paths_by_count[REPORT_TYPE_TREE],
809+
top_limit, _("TOP DIRECTORIES BY COUNT"), cmp_by_nr);
810+
init_top_sizes(&ctx->report.top_paths_by_count[REPORT_TYPE_BLOB],
811+
top_limit, _("TOP FILES BY COUNT"), cmp_by_nr);
812+
813+
CALLOC_ARRAY(ctx->report.top_paths_by_disk, REPORT_TYPE_COUNT);
814+
init_top_sizes(&ctx->report.top_paths_by_disk[REPORT_TYPE_TREE],
815+
top_limit, _("TOP DIRECTORIES BY DISK SIZE"), cmp_by_disk_size);
816+
init_top_sizes(&ctx->report.top_paths_by_disk[REPORT_TYPE_BLOB],
817+
top_limit, _("TOP FILES BY DISK SIZE"), cmp_by_disk_size);
818+
819+
CALLOC_ARRAY(ctx->report.top_paths_by_inflate, REPORT_TYPE_COUNT);
820+
init_top_sizes(&ctx->report.top_paths_by_inflate[REPORT_TYPE_TREE],
821+
top_limit, _("TOP DIRECTORIES BY INFLATED SIZE"), cmp_by_inflated_size);
822+
init_top_sizes(&ctx->report.top_paths_by_inflate[REPORT_TYPE_BLOB],
823+
top_limit, _("TOP FILES BY INFLATED SIZE"), cmp_by_inflated_size);
824+
}
825+
755826
static void survey_phase_objects(struct survey_context *ctx)
756827
{
757828
struct rev_info revs;
@@ -769,10 +840,7 @@ static void survey_phase_objects(struct survey_context *ctx)
769840
info.blobs = 1;
770841
info.tags = 1;
771842

772-
CALLOC_ARRAY(ctx->report.by_type, REPORT_TYPE_COUNT);
773-
ctx->report.by_type[REPORT_TYPE_COMMIT].label = xstrdup(_("Commits"));
774-
ctx->report.by_type[REPORT_TYPE_TREE].label = xstrdup(_("Trees"));
775-
ctx->report.by_type[REPORT_TYPE_BLOB].label = xstrdup(_("Blobs"));
843+
initialize_report(ctx);
776844

777845
repo_init_revisions(ctx->repo, &revs, "");
778846

t/t8100-git-survey.sh

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,17 @@ test_expect_success 'git survey (default)' '
6060
Blobs | 10 | 191 | 101
6161
EOF
6262
63-
test_cmp expect out
63+
lines=$(wc -l <expect) &&
64+
head -n $lines out >out-trimmed &&
65+
test_cmp expect out-trimmed &&
66+
67+
for type in "DIRECTORIES" "FILES"
68+
do
69+
for metric in "COUNT" "DISK SIZE" "INFLATED SIZE"
70+
do
71+
grep "TOP $type BY $metric" out || return 1
72+
done || return 1
73+
done
6474
'
6575

6676
test_done

0 commit comments

Comments
 (0)