Skip to content

Commit 8fdcab0

Browse files
philip-stoevMongoDB Bot
authored andcommitted
SERVER-98749 Basic jstest for histogramCE estimation accuracy (#31955)
GitOrigin-RevId: 31d62d6
1 parent ecd1cf2 commit 8fdcab0

File tree

9 files changed

+852
-0
lines changed

9 files changed

+852
-0
lines changed
Lines changed: 131 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,131 @@
1+
/* Check the cardinality estimation of very simple predicates using histograms by running
2+
the predicate itself and comparing the number of documents matched to the estimate.
3+
4+
In this test, we use distributions that allow for "perfect" histograms, that is,
5+
histograms where, even with the information loss, perfect estimates can be made.
6+
7+
Similarly, the predicates used are those that can be estimated perfectly
8+
(except for the occasional off-by-one errors)
9+
*/
10+
11+
import {
12+
getAllPlans,
13+
} from "jstests/libs/query/analyze_plan.js";
14+
import {checkSbeFullyEnabled} from "jstests/libs/query/sbe_util.js";
15+
import {ArrayDataset} from 'jstests/noPassthroughWithMongod/query/cbr/lib/datasets/array.js';
16+
import {BooleanDataset} from 'jstests/noPassthroughWithMongod/query/cbr/lib/datasets/boolean.js';
17+
import {
18+
DateDataset,
19+
TimestampDataset
20+
} from 'jstests/noPassthroughWithMongod/query/cbr/lib/datasets/date_time.js';
21+
import {
22+
OneHoleDataset,
23+
OnePeakDataset,
24+
SkewedDataset,
25+
ThreePeakDataset,
26+
UniformDataset
27+
} from "jstests/noPassthroughWithMongod/query/cbr/lib/datasets/distributions.js";
28+
import {
29+
MixedNumbersDataset,
30+
MixedTypesDataset
31+
} from "jstests/noPassthroughWithMongod/query/cbr/lib/datasets/mixed_types.js";
32+
import {
33+
TwoFieldDataset
34+
} from "jstests/noPassthroughWithMongod/query/cbr/lib/datasets/multifield.js";
35+
import {NumberDataset} from "jstests/noPassthroughWithMongod/query/cbr/lib/datasets/number.js";
36+
import {StringDataset} from "jstests/noPassthroughWithMongod/query/cbr/lib/datasets/string.js";
37+
38+
// TODO SERVER-92589: Remove this exemption
// Bail out early when SBE is fully enabled, since this test does not support it yet.
if (checkSbeFullyEnabled(db)) {
    jsTestLog(`Skipping ${jsTestName()} as SBE executor is not supported yet`);
    quit();
}

// The test collection is named after the test itself.
const collName = jsTestName();
const coll = db.getCollection(collName);
46+
47+
/**
 * Loads a dataset into the test collection, runs 'analyze' on the requested keys and then,
 * for every predicate of the dataset, compares the number of documents actually matched with
 * the 'cardinalityEstimate' reported by each plan in the explain output. The two may differ
 * by at most one.
 *
 * 'planRankerMode' is switched to "histogramCE" for the duration of the run and is always
 * restored to "multiPlanning" afterwards, even if an assertion fails.
 *
 * @param {Object} params
 * @param {Object} params.dataset - Object exposing docs() and predicates().
 * @param {Array<Object>} [params.indexes] - Index specs to create; none when omitted.
 * @param {Array<string>} [params.analyze] - Keys to run 'analyze' on; ["a"] when omitted.
 * @param {number} [params.numberBuckets=1000] - 'numberBuckets' passed to 'analyze'.
 */
function runOneTest({dataset, indexes, analyze, numberBuckets = 1000}) {
    const indexSpecs = indexes || [];
    const analyzeKeys = analyze || ["a"];

    // 'Histogram', 'Code' and 'Metadata' all imply a confident estimate,
    // so we accept all of them.
    const confidentSources = ["Histogram", "Code", "Metadata"];

    try {
        assert.commandWorked(db.adminCommand({setParameter: 1, planRankerMode: "histogramCE"}));

        coll.drop();
        assert.commandWorked(coll.insertMany(dataset.docs()));

        for (const indexSpec of indexSpecs) {
            assert.commandWorked(coll.createIndex(indexSpec));
        }

        for (const analyzeKey of analyzeKeys) {
            const analyzeCmd = {analyze: collName, key: analyzeKey, numberBuckets: numberBuckets};
            assert.commandWorked(coll.runCommand(analyzeCmd));
        }

        for (const predicate of dataset.predicates()) {
            const cursor = coll.find(predicate);
            const actualDocuments = cursor.count();

            const plans = getAllPlans(cursor.explain());
            for (const plan of plans) {
                assert(plan.hasOwnProperty("cardinalityEstimate"),
                       `Plan has no cardinalityEstimate for predicate: ${tojson(predicate)}`);
                const cardinalityEstimate = plan.cardinalityEstimate;

                assert(confidentSources.includes(plan.estimatesMetadata.ceSource),
                       `Unexpected ceSource for predicate: ${tojson(predicate)}`);

                printjsononeline(predicate);
                print(`actualDocuments: ${actualDocuments}; cardinalityEstimate: ${
                    cardinalityEstimate}`);

                // Off-by-one differences are tolerated.
                if (Math.abs(actualDocuments - cardinalityEstimate) > 1) {
                    printjsononeline(plan);
                    // Serialize the index specs explicitly: interpolating the array directly
                    // would render every spec as "[object Object]".
                    assert(
                        false,
                        `Got cardinalityEstimate = ${cardinalityEstimate} but actualDocuments = ${
                            actualDocuments} for predicate: ${tojson(predicate)}; dataset: ${
                            dataset.constructor.name}; indexes: ${tojson(indexSpecs)};`);
                }
            }
        }
    } finally {
        // Make sure that we restore the default no matter what
        assert.commandWorked(db.adminCommand({setParameter: 1, planRankerMode: "multiPlanning"}));
    }
}
100+
101+
// Single-field datasets: run once without any index and once with an index on 'a'.
for (const indexes of [[], [{a: 1}]]) {
    // Datasets analyzed with the default number of buckets.
    const defaultBucketDatasets = [
        new ArrayDataset(),
        new BooleanDataset(),
        new DateDataset(),
        new TimestampDataset(),
        new SkewedDataset(),
        new MixedTypesDataset(),
        new MixedNumbersDataset(),
        new NumberDataset(),
        new StringDataset(),
    ];
    defaultBucketDatasets.forEach((dataset) => runOneTest({dataset: dataset, indexes: indexes}));

    /* Skewed datasets under a constrained number of buckets. We give each
       dataset just enough buckets so that it can be estimated accurately.
    */
    const constrainedBucketCases = [
        {dataset: new UniformDataset(), numberBuckets: 2},
        {dataset: new OnePeakDataset(), numberBuckets: 4},
        {dataset: new OneHoleDataset(), numberBuckets: 3},
        {dataset: new ThreePeakDataset(), numberBuckets: 8},
        {dataset: new SkewedDataset(), numberBuckets: 10},
    ];
    constrainedBucketCases.forEach(({dataset, numberBuckets}) => {
        runOneTest({dataset: dataset, numberBuckets: numberBuckets, indexes: indexes});
    });
}

// Multi-field predicates: one compound index versus two single-field indexes.
for (const indexes of [[{a: 1, b: 1}], [{a: 1}, {b: 1}]]) {
    const twoFieldDataset = new TwoFieldDataset();
    runOneTest({dataset: twoFieldDataset, indexes: indexes, analyze: ["a", "b"]});
}
Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
/*
2+
A dataset with an array column and a set of relevant predicates.
3+
*/
4+
5+
export class ArrayDataset {
    /**
     * Produces 100 documents whose 'a' field holds the growing prefixes
     * [], [0], [0, 1], ... of the sequence 0..98. Every document gets its own array.
     */
    docs() {
        return Array.from({length: 100}, (_, prefixLength) => {
            const prefix = Array.from({length: prefixLength}, (_, value) => value);
            return {a: prefix};
        });
    }

    /**
     * Returns the predicates to run against docs(). Predicates that are not yet
     * supported are kept as comments together with their tracking tickets.
     */
    predicates() {
        const equalityPredicates = [
            // TODO(SERVER-99630): {a: null},
            {a: -1},
            {a: 50},
        ];

        const allPredicates = [
            // TODO(SERVER-99634): {a: {$all:[]}},
            {a: {$all: [-1]}},
            {a: {$all: [-1, 50]}},
            // TODO(SERVER-98085): {a: {$all:[50,75]}},
        ];

        // Not estimated via histograms: {a: {$size: 50}},
        // TODO(SERVER-99025): {a: {$gt: 900}},
        // TODO(SERVER-99025): {a: {$gt: 250, $lt: 750}},

        /* TODO(SERVER-100451): Not supported under histogramCE:
           {a: {$elemMatch: {$eq: 50}}},
           {a: {$elemMatch: {$gt: 50}}},
           {a: {$elemMatch: {$ne: 50}}}
        */
        return [...equalityPredicates, ...allPredicates];
    }
}
Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
/*
2+
A dataset with a boolean column and relevant predicates for it
3+
*/
4+
5+
export class BooleanDataset {
    /**
     * Produces 100 repetitions of the same group of eight documents: 'a' set to
     * 0, 1, 1.0, "", true, false and null, plus one document that has only 'b'.
     */
    docs() {
        const makeGroup = () => [
            {a: 0},
            {a: 1},
            {a: 1.0},
            {a: ""},
            {a: true},
            {a: false},
            {a: null},
            {b: 123},
        ];

        return Array.from({length: 100}, makeGroup).flat();
    }

    /**
     * For each boolean value, returns an equality predicate followed by
     * $gt, $gte, $lt, $lte and $ne predicates on 'a'.
     */
    predicates() {
        const comparisonOps = ["$gt", "$gte", "$lt", "$lte", "$ne"];
        const values = [
            true,
            false,
            // TODO(SERVER-98094): null
        ];

        return values.flatMap((val) => {
            const comparisons = comparisonOps.map((op) => ({a: {[op]: val}}));
            return [{a: val}, ...comparisons];
        });
    }
}
Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
/*
2+
Datasets and relevant predicates for Date and Timestamp
3+
*/
4+
5+
export class DateDataset {
    /**
     * Produces 2001 documents with a date in 'a':
     *  - 1000 dates that differ only in the year (2000 through 2999),
     *  - 1000 dates that differ only in the millisecond component (.000 through .999),
     *  - one extra document built from ISODate(0).
     */
    docs() {
        const dateDocs = [];

        // Vary the year component of the date. The zero-padded text is kept in its own
        // variable so that the numeric loop counter is never overwritten with a string.
        for (let year = 0; year < 1000; year++) {
            const paddedYear = String(year).padStart(3, '0');
            dateDocs.push({a: ISODate(`2${paddedYear}-01-01T01:01:01.001`)});
        }

        // Vary the subsecond component of the date
        for (let millis = 0; millis < 1000; millis++) {
            const paddedMillis = String(millis).padStart(3, '0');
            dateDocs.push({a: ISODate(`2050-05-05T05:05:05.${paddedMillis}`)});
        }

        // NOTE(review): ISODate(0) is kept exactly as in the original. In the legacy shell a
        // falsy argument may be treated like "no argument" and yield the current time rather
        // than the epoch -- confirm, and consider `new Date(0)` if the epoch was intended.
        dateDocs.push({a: ISODate(0)});
        return dateDocs;
    }

    /**
     * Returns equality, range and $ne predicates around one date from each of the two
     * batches generated by docs(), plus predicates on ISODate(0).
     */
    predicates() {
        // Each corresponds to one of the two batches of dates from docs()
        const date1 = ISODate("2010-01-01T01:01:01.001");
        const date2 = ISODate("2050-05-05T05:05:05.050");
        return [
            {a: date1},
            {a: {$gt: date1}},
            {a: {$gte: date1}},
            {a: {$lt: date1}},
            {a: {$lte: date1}},
            {a: {$ne: date1}},

            {a: date2},
            {a: {$lt: date2}},
            {a: {$gt: date2}},

            // NOTE(review): see the remark on ISODate(0) in docs(); each call here may
            // produce a different value than the one stored in the collection -- confirm.
            {a: ISODate(0)},
            {a: {$ne: ISODate(0)}},
            {a: {$gt: ISODate(0)}},
        ];
    }
}
47+
48+
export class TimestampDataset {
    /**
     * Produces 200 documents: for each n in 0..99, one with Timestamp(0, n)
     * followed by one with Timestamp(n, 0).
     */
    docs() {
        return Array.from({length: 100}, (_, n) => [
                   {a: Timestamp(0, n)},
                   {a: Timestamp(n, 0)},
               ]).flat();
    }

    /**
     * Returns equality, $gt and $ne predicates for Timestamp(0, 50) and Timestamp(50, 0).
     * A fresh Timestamp is constructed for every predicate.
     */
    predicates() {
        const pivots = [[0, 50], [50, 0]];
        return pivots.flatMap(([first, second]) => [
            {a: Timestamp(first, second)},
            {a: {$gt: Timestamp(first, second)}},
            {a: {$ne: Timestamp(first, second)}},
        ]);
    }
}

0 commit comments

Comments
 (0)