Skip to content

Commit

Permalink
survey: add tree stats
Browse files Browse the repository at this point in the history
Signed-off-by: Jeff Hostetler <[email protected]>
  • Loading branch information
jeffhostetler committed Apr 29, 2024
1 parent 805ff70 commit 9c41f3c
Showing 1 changed file with 305 additions and 2 deletions.
307 changes: 305 additions & 2 deletions builtin/survey.c
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#include "builtin.h"
#include "config.h"
#include "environment.h"
#include "hex.h"
#include "json-writer.h"
#include "list-objects.h"
#include "object-name.h"
Expand All @@ -10,8 +11,10 @@
#include "ref-filter.h"
#include "refs.h"
#include "revision.h"
#include "strbuf.h"
#include "strvec.h"
#include "trace2.h"
#include "tree.h"
#include "tree-walk.h"

static const char * const survey_usage[] = {
Expand Down Expand Up @@ -184,9 +187,96 @@ struct survey_stats_commits {
uint32_t parent_cnt_bin[4]; /* see parent_label[] */
};

#define HBIN_LEN (sizeof(unsigned long) * 2) /* number of hex digits */

/*
 * Map a value onto its HBIN (hexadecimal) histogram bucket.
 *
 * The bucket index is the count of hex digits needed to write the
 * value without leading zeroes, less one: 0x0..0xF land in bucket 0,
 * 0x10..0xFF in bucket 1, and so on.  The largest `unsigned long`
 * lands in bucket HBIN_LEN-1, so the result always indexes an array
 * of HBIN_LEN elements.
 *
 * This exponential binning lets a handful of buckets cover the whole
 * range of an `unsigned long` size.
 */
static int hbin(unsigned long value)
{
	int bucket = 0;

	/* Strip one hex digit per step until a single digit remains. */
	while (value > 0xF) {
		value >>= 4;
		bucket++;
	}

	return bucket;
}

#define QBIN_LEN (sizeof(unsigned long) * 4) /* number of base4 digits */

/*
 * Map a value onto its QBIN (base-4) histogram bucket.
 *
 * The bucket index is the count of base-4 digits needed to write the
 * value without leading zeroes, less one: 0..3 land in bucket 0,
 * 4..15 in bucket 1, 16..63 in bucket 2, and so on.  The largest
 * `unsigned long` lands in bucket QBIN_LEN-1.
 */
static int qbin(unsigned long value)
{
	int bucket = 0;

	/* Strip one base-4 digit per step until a single digit remains. */
	while (value > 0x3) {
		value >>= 2;
		bucket++;
	}

	return bucket;
}

/*
* Stats for reachable trees.
*
* Filled in by traverse_object_cb_tree(), one call per tree object,
* and reported by survey_print_trees() and survey_json().
*/
struct survey_stats_trees {
uint32_t cnt_seen; /* nr_trees -- number of trees observed (includes missing ones) */

/*
* nr_trees grouped by where they are stored on disk. This is
* a function of how the ODB is packed.
*
* Missing trees may mean we have a partial-clone. A tree is
* also counted as missing when the object info lookup reports
* a type other than OBJ_TREE.
*/
uint32_t cnt_missing;
uint32_t cnt_cached; /* see oi.whence */
uint32_t cnt_loose; /* see oi.whence */
uint32_t cnt_packed; /* see oi.whence */
uint32_t cnt_dbcached; /* see oi.whence */

/*
* In the following, tree_size is the expanded size of the tree
* object. disk_size refers to the compressed and/or delta-fied
* representation on disk.
*
* In both cases we group trees by the tree_size so that we have
* the same set in each histogram and therefore can get a feel
* for the effectiveness of the compression and delta-chaining.
*
* Missing trees contribute to none of these sums or histograms.
*/
uint64_t sum_size; /* sum(tree_size) */
uint64_t sum_disk_size; /* sum(disk_size) */

/* Histograms indexed by hbin(tree_size). */
uint32_t cnt_seen_hbin[HBIN_LEN]; /* nr_trees grouped by log16(tree_size) */
uint64_t sum_size_hbin[HBIN_LEN]; /* sum(tree_size) grouped by log16(tree_size) */
uint64_t sum_disk_size_hbin[HBIN_LEN]; /* sum(disk_size) grouped by log16(tree_size) */

/*
* In the following, nr_entries refers to the number of files or
* subdirectories in a tree. We are interested in how wide the
* tree is and if the repo has gigantic directories.
*
* The histogram is indexed by qbin(nr_entries).
*/
uint64_t max_entries; /* max(nr_entries) -- the width of the largest tree */
uint64_t cnt_entries_qbin[QBIN_LEN]; /* nr_trees grouped by log4(nr_entries) */
};

/*
* Everything the survey collects, one sub-structure per kind of
* thing surveyed: refs, commits and trees.
*/
struct survey_stats {
struct survey_stats_refs refs;
struct survey_stats_commits commits;
struct survey_stats_trees trees;
};

/*
* The single file-scope accumulator; starts out all zeroes and is
* updated in place by the traversal callbacks.
*/
static struct survey_stats survey_stats = { 0 };
Expand Down Expand Up @@ -300,9 +390,92 @@ static void traverse_commit_cb(struct commit *commit, void *data)
stats->parent_cnt_bin[k]++;
}

/*
 * Fold one reachable tree object into survey_stats.trees.
 *
 * Every call bumps cnt_seen.  The object info lookup then supplies the
 * type, the expanded size and the on-disk size.  If that lookup fails,
 * or the reported type is not OBJ_TREE, the tree is counted as missing
 * and nothing else is recorded for it.
 *
 * Otherwise the tree is:
 *   - counted under the storage class reported in oi.whence,
 *   - added to the total expanded / on-disk size sums,
 *   - added to the size histograms, bucketed by hbin(expanded size),
 *   - walked entry by entry to update max_entries and the
 *     entry-count histogram, bucketed by qbin(nr_entries).
 */
static void traverse_object_cb_tree(struct object *obj)
{
	struct survey_stats_trees *stats = &survey_stats.trees;
	struct object_info oi = OBJECT_INFO_INIT;
	unsigned oi_flags = OBJECT_INFO_FOR_PREFETCH;
	unsigned long object_length;
	off_t disk_sizep;
	enum object_type type;
	struct tree_desc desc;
	struct name_entry entry;
	struct tree *tree;
	/*
	 * Unsigned so that the comparison with the uint64_t max_entries
	 * and the call to qbin(unsigned long) involve no signed/unsigned
	 * conversion.
	 */
	unsigned long nr_entries;
	int hb;
	int qb;

	stats->cnt_seen++;

	/* Ask for exactly the three facts we record. */
	oi.typep = &type;
	oi.sizep = &object_length;
	oi.disk_sizep = &disk_sizep;

	if (oid_object_info_extended(the_repository, &obj->oid, &oi, oi_flags) < 0 ||
	    type != OBJ_TREE) {
		stats->cnt_missing++;
		return;
	}

	switch (oi.whence) {
	case OI_CACHED:
		stats->cnt_cached++;
		break;
	case OI_LOOSE:
		stats->cnt_loose++;
		break;
	case OI_PACKED:
		stats->cnt_packed++;
		break;
	case OI_DBCACHED:
		stats->cnt_dbcached++;
		break;
	default:
		break;
	}

	stats->sum_size += object_length;
	stats->sum_disk_size += disk_sizep;

	/*
	 * Both size histograms are keyed by the expanded size so that
	 * each bucket describes the same set of trees.
	 */
	hb = hbin(object_length);

	stats->cnt_seen_hbin[hb]++;
	stats->sum_size_hbin[hb] += object_length;
	stats->sum_disk_size_hbin[hb] += disk_sizep;

	/*
	 * NOTE(review): this reads tree->buffer / tree->size without
	 * parsing the tree here; presumably the traversal that invokes
	 * this callback has already loaded the buffer -- confirm.
	 */
	tree = lookup_tree(the_repository, &obj->oid);
	if (!tree)
		return;
	init_tree_desc(&desc, &obj->oid, tree->buffer, tree->size);
	nr_entries = 0;
	while (tree_entry(&desc, &entry))
		nr_entries++;

	if (nr_entries > stats->max_entries)
		stats->max_entries = nr_entries;

	qb = qbin(nr_entries);

	stats->cnt_entries_qbin[qb]++;
}

/*
 * Per-object traversal callback: advance the progress meter and hand
 * tree objects to traverse_object_cb_tree().
 *
 * Every other object type is ignored here:
 *   - blobs are not surveyed yet,
 *   - tags were counted when loading REFS,
 *   - commits are seen in the other callback (one here would be a bug),
 *   - any unknown type is likewise ignored.
 *
 * `name` and `data` are not used.
 */
static void traverse_object_cb(struct object *obj, const char *name, void *data)
{
	display_progress(survey_progress, ++survey_progress_total);

	if (obj->type == OBJ_TREE)
		traverse_object_cb_tree(obj);
}

/*
Expand Down Expand Up @@ -443,8 +616,8 @@ static void survey_print_refs(void)

printf("References\n");

#define CNT_FMT "%-20s: %10d\n"
#define LEN_FMT "%-20s: %10"PRIuMAX"\n"
#define CNT_FMT "%-24s: %10d\n"
#define LEN_FMT "%-24s: %10"PRIuMAX"\n"

Check failure on line 620 in builtin/survey.c

View workflow job for this annotation

GitHub Actions / linux32 (daald/ubuntu32:xenial)

builtin/survey.c:620:17: format '%llu' expects argument of type 'long long unsigned int', but argument 3 has type 'size_t {aka unsigned int}' [-Werror=format=]

Check failure on line 620 in builtin/survey.c

View workflow job for this annotation

GitHub Actions / linux32 (daald/ubuntu32:xenial)

builtin/survey.c:620:17: format '%llu' expects argument of type 'long long unsigned int', but argument 3 has type 'size_t {aka unsigned int}' [-Werror=format=]

printf(CNT_FMT, " Count", prs->cnt_total);

Expand Down Expand Up @@ -488,10 +661,79 @@ static void survey_print_commits(void)
}
}

static void survey_print_trees(void)
{
struct survey_stats_trees *pst = &survey_stats.trees;
struct strbuf buf = STRBUF_INIT;
unsigned long mask;
int k;

printf("Trees\n");

printf(CNT_FMT, " Count", pst->cnt_seen);
printf(CNT_FMT, " Missing", pst->cnt_missing);
printf(CNT_FMT, " Cached", pst->cnt_cached);
printf(CNT_FMT, " Loose", pst->cnt_loose);
printf(CNT_FMT, " Packed", pst->cnt_packed);
printf(CNT_FMT, " DBCached", pst->cnt_dbcached);

printf(LEN_FMT, " MaxEntriesInAnyTree", (uintmax_t)pst->max_entries);
printf(LEN_FMT, " TotalObservedSize", (uintmax_t)pst->sum_size);
printf(LEN_FMT, " TotalOnDiskSize", (uintmax_t)pst->sum_disk_size);

printf(" CountDistributionByNumberOfEntries\n");
mask = 0x3;
for (k = 0; k < QBIN_LEN; k++) {
if (pst->cnt_entries_qbin[k]) {
strbuf_reset(&buf);
strbuf_addf(&buf, " Q%02d(<=0x%08lx)", k+1, mask);
printf(LEN_FMT, buf.buf, (uintmax_t)pst->cnt_entries_qbin[k]);
}
mask = (mask << 2) + 0x3;
}

printf(" CountDistributionByObservedSize\n");
mask = 0xF;
for (k = 0; k < HBIN_LEN; k++) {
if (pst->cnt_seen_hbin[k]) {
strbuf_reset(&buf);
strbuf_addf(&buf, " H%d(<=0x%08lx)", k+1, mask);
printf(CNT_FMT, buf.buf, pst->cnt_seen_hbin[k]);
}
mask = (mask << 4) + 0xF;
}

printf(" SizeDistributionByObservedSize\n");
mask = 0xF;
for (k = 0; k < HBIN_LEN; k++) {
if (pst->cnt_seen_hbin[k]) {
strbuf_reset(&buf);
strbuf_addf(&buf, " H%d(<=0x%08lx)", k+1, mask);
printf(LEN_FMT, buf.buf, (uintmax_t)pst->sum_size_hbin[k]);
}
mask = (mask << 4) + 0xF;
}

printf(" DiskSizeDistributionbyObservedSize\n");
mask = 0xF;
for (k = 0; k < HBIN_LEN; k++) {
if (pst->cnt_seen_hbin[k]) {
strbuf_reset(&buf);
strbuf_addf(&buf, " H%d(<=0x%08lx)", k+1, mask);
printf(LEN_FMT, buf.buf, (uintmax_t)pst->sum_disk_size_hbin[k]);
}
mask = (mask << 4) + 0xF;
}

strbuf_release(&buf);
}

static void survey_json(struct json_writer *jw, int pretty)
{
struct survey_stats_refs *prs = &survey_stats.refs;
struct survey_stats_commits *psc = &survey_stats.commits;
struct survey_stats_trees *pst = &survey_stats.trees;
struct strbuf buf = STRBUF_INIT;
int k;

jw_object_begin(jw, pretty);
Expand Down Expand Up @@ -530,6 +772,66 @@ static void survey_json(struct json_writer *jw, int pretty)
jw_object_intmax(jw, parent_label[k], psc->parent_cnt_bin[k]);
}
jw_end(jw);

jw_object_inline_begin_object(jw, "trees");
{
jw_object_intmax(jw, "total_trees", pst->cnt_seen);
jw_object_intmax(jw, "total_missing", pst->cnt_missing);
jw_object_intmax(jw, "total_cached", pst->cnt_cached);
jw_object_intmax(jw, "total_loose", pst->cnt_loose);
jw_object_intmax(jw, "total_packed", pst->cnt_packed);
jw_object_intmax(jw, "total_dbcached", pst->cnt_dbcached);

jw_object_intmax(jw, "max_entries", pst->max_entries);
jw_object_intmax(jw, "total_size", pst->sum_size);
jw_object_intmax(jw, "total_disk_size", pst->sum_disk_size);

jw_object_inline_begin_object(jw, "count_dist_by_nr_entries");
{
for (k = 0; k < QBIN_LEN; k++)
if (pst->cnt_entries_qbin[k]) {
strbuf_reset(&buf);
strbuf_addf(&buf, "Q%02d", k+1);
jw_object_intmax(jw, buf.buf, pst->cnt_entries_qbin[k]);
}
}
jw_end(jw);

jw_object_inline_begin_object(jw, "count_dist_by_size");
{
for (k = 0; k < HBIN_LEN; k++)
if (pst->cnt_seen_hbin[k]) {
strbuf_reset(&buf);
strbuf_addf(&buf, "H%d", k+1);
jw_object_intmax(jw, buf.buf, pst->cnt_seen_hbin[k]);
}
}
jw_end(jw);

jw_object_inline_begin_object(jw, "size_dist_by_size");
{
for (k = 0; k < HBIN_LEN; k++)
if (pst->cnt_seen_hbin[k]) {
strbuf_reset(&buf);
strbuf_addf(&buf, "H%d", k+1);
jw_object_intmax(jw, buf.buf, pst->sum_size_hbin[k]);
}
}
jw_end(jw);

jw_object_inline_begin_object(jw, "disk_size_dist_by_size");
{
for (k = 0; k < HBIN_LEN; k++)
if (pst->cnt_seen_hbin[k]) {
strbuf_reset(&buf);
strbuf_addf(&buf, "H%d", k+1);
jw_object_intmax(jw, buf.buf, pst->sum_disk_size_hbin[k]);
}
}
jw_end(jw);

}
jw_end(jw);
}
jw_end(jw);
}
Expand All @@ -545,6 +847,7 @@ static void survey_print_results(void)
} else {
survey_print_refs();
survey_print_commits();
survey_print_trees();
}
}

Expand Down

0 comments on commit 9c41f3c

Please sign in to comment.