Commit 1cb2bd5

Merge pull request #691 from NatLabRockies/analytics-cache
Improvements to analytics summary API
2 parents eac5a7c + 2e2dadb commit 1cb2bd5

14 files changed: 455 additions & 123 deletions


.github/workflows/main.yml

Lines changed: 1 addition & 1 deletion
@@ -7,7 +7,7 @@ on:
 
 env:
   REGISTRY: ghcr.io
-  IMAGE_NAME: nrel/api-umbrella
+  IMAGE_NAME: natlabrockies/api-umbrella
   DOCKER_BUILDKIT: 1
   TESTS_GLOB: "test/**/test_*.rb"

.github/workflows/release.yml

Lines changed: 1 addition & 1 deletion
@@ -7,7 +7,7 @@ on:
 
 env:
   REGISTRY: ghcr.io
-  IMAGE_NAME: nrel/api-umbrella
+  IMAGE_NAME: natlabrockies/api-umbrella
   DOCKER_BUILDKIT: 1
 
 jobs:

Dockerfile-opensearch

Lines changed: 1 addition & 1 deletion
@@ -1,2 +1,2 @@
-FROM public.ecr.aws/opensearchproject/opensearch:2.17.1
+FROM public.ecr.aws/opensearchproject/opensearch:3.3.2
 RUN /usr/share/opensearch/bin/opensearch-plugin install --batch mapper-murmur3

config/schema.cue

Lines changed: 2 additions & 0 deletions
@@ -380,6 +380,8 @@ import "path"
   analytics_v0_summary_start_time: string | *"2013-07-01T00:00:00.000Z"
   analytics_v0_summary_end_time?: string
   analytics_v0_summary_filter?: string
+  analytics_v0_summary_db_timeout: uint | *900 // 15 minutes
+  analytics_v0_summary_analytics_timeout: uint | *2400 // 40 minutes
   max_body_size: string | *"1m"
   allowed_signup_embed_urls_regex?: string
   default_host?: string
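
The two new keys feed the web app's query limits: in the Lua changes further below, analytics_v0_summary_db_timeout is multiplied by 1000 and passed as a per-query statement_timeout, while analytics_v0_summary_analytics_timeout is handed to search:set_timeout(). A minimal sketch of the database side, assuming the timeout is applied per statement via PostgreSQL's statement_timeout setting (the sample query is illustrative only):

-- Illustrative only: applying the 900 s default from analytics_v0_summary_db_timeout
-- as a local statement timeout around a long-running aggregation query.
BEGIN;
SET LOCAL statement_timeout = '900s';  -- 900 * 1000 ms, matching db_statement_timeout_ms
SELECT count(*) FROM api_umbrella.analytics_cache;
COMMIT;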

config/test.yml

Lines changed: 1 addition & 1 deletion
@@ -83,7 +83,7 @@ web:
   user_name: api_umbrella
   password: dev_password
   contact_form_email: default-test-contact-email@example.com
-  analytics_v0_summary_start_time: "2013-07-01T00:00:00.000-06:00"
+  analytics_v0_summary_start_time: "2013-06-01T00:00:00.000-06:00"
   analytics_v0_summary_end_time: "2013-08-31T23:59:59.999-06:00"
 router:
   trusted_proxies:

db/schema.sql

Lines changed: 7 additions & 1 deletion
@@ -67,7 +67,7 @@ CREATE FUNCTION api_umbrella.analytics_cache_extract_unique_user_ids() RETURNS t
     AS $$
 BEGIN
   IF (jsonb_typeof(NEW.data->'aggregations'->'unique_user_ids'->'buckets') = 'array') THEN
-    NEW.unique_user_ids := (SELECT array_agg(DISTINCT bucket->>'key')::uuid[] FROM jsonb_array_elements(NEW.data->'aggregations'->'unique_user_ids'->'buckets') AS bucket);
+    NEW.unique_user_ids := (SELECT array_agg(DISTINCT bucket->'key'->>'user_id')::uuid[] FROM jsonb_array_elements(NEW.data->'aggregations'->'unique_user_ids'->'buckets') AS bucket);
   END IF;
 
   RETURN NEW;
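
The trigger now pulls the user id out of a composite bucket key (bucket->'key'->>'user_id') rather than the scalar bucket key. A self-contained sketch of that expression against made-up bucket data (the UUIDs and doc_count values are illustrative):

-- Illustrative only: with buckets keyed by an object, the id lives at
-- bucket->'key'->>'user_id' instead of bucket->>'key'.
SELECT array_agg(DISTINCT bucket->'key'->>'user_id')::uuid[] AS unique_user_ids
FROM jsonb_array_elements('[
  {"key": {"user_id": "2f0b1c6e-8f3a-4d2b-9c1e-0a7d5b4e6f21"}, "doc_count": 3},
  {"key": {"user_id": "2f0b1c6e-8f3a-4d2b-9c1e-0a7d5b4e6f21"}, "doc_count": 1},
  {"key": {"user_id": "7c9d3e54-2b1a-4f6c-8d0e-1a2b3c4d5e6f"}, "doc_count": 2}
]'::jsonb) AS bucket;
-- Result: the two distinct UUIDs, de-duplicated.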
@@ -765,6 +765,9 @@ CREATE TABLE api_umbrella.analytics_cache (
     created_at timestamp with time zone DEFAULT transaction_timestamp() NOT NULL,
     updated_at timestamp with time zone DEFAULT transaction_timestamp() NOT NULL,
     unique_user_ids uuid[],
+    data_date character varying GENERATED ALWAYS AS (((((((data -> 'aggregations'::text) -> 'hits_over_time'::text) -> 'buckets'::text) -> 0) ->> 'key_as_string'::text))::character varying) STORED,
+    hit_count bigint GENERATED ALWAYS AS (((((((data -> 'aggregations'::text) -> 'hits_over_time'::text) -> 'buckets'::text) -> 0) ->> 'doc_count'::text))::bigint) STORED,
+    response_time_average bigint GENERATED ALWAYS AS (round(((((data -> 'aggregations'::text) -> 'response_time_average'::text) ->> 'value'::text))::numeric)) STORED,
     CONSTRAINT analytics_cache_enforce_single_date_bucket CHECK ((NOT (jsonb_array_length((((data -> 'aggregations'::text) -> 'hits_over_time'::text) -> 'buckets'::text)) > 1)))
 );
 
@@ -2801,6 +2804,7 @@ ALTER TABLE ONLY api_umbrella.rate_limits
 -- PostgreSQL database dump complete
 --
 
+
 INSERT INTO api_umbrella.lapis_migrations (name) VALUES ('1498350289');
 INSERT INTO api_umbrella.lapis_migrations (name) VALUES ('1554823736');
 INSERT INTO api_umbrella.lapis_migrations (name) VALUES ('1560722058');
@@ -2820,3 +2824,5 @@ INSERT INTO api_umbrella.lapis_migrations (name) VALUES ('1701483732');
 INSERT INTO api_umbrella.lapis_migrations (name) VALUES ('1721347955');
 INSERT INTO api_umbrella.lapis_migrations (name) VALUES ('1738353016');
 INSERT INTO api_umbrella.lapis_migrations (name) VALUES ('1753472899');
+INSERT INTO api_umbrella.lapis_migrations (name) VALUES ('1769633747');
+INSERT INTO api_umbrella.lapis_migrations (name) VALUES ('1769732670');
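
The three generated columns precompute, per cached row, the date bucket, hit count, and rounded average response time straight out of the stored aggregation JSON, so the rewritten summary query can aggregate plain columns instead of re-parsing JSONB. A minimal sketch of the same expressions evaluated against a made-up document (the sample values are illustrative only):

-- Illustrative only: the generated-column expressions applied to a sample
-- aggregation document shaped like an OpenSearch response.
SELECT
  (data->'aggregations'->'hits_over_time'->'buckets'->0->>'key_as_string')::varchar AS data_date,
  (data->'aggregations'->'hits_over_time'->'buckets'->0->>'doc_count')::bigint      AS hit_count,
  round((data->'aggregations'->'response_time_average'->>'value')::numeric)         AS response_time_average
FROM (VALUES ('{
  "aggregations": {
    "hits_over_time": {"buckets": [{"key_as_string": "2013-07-01", "doc_count": 1200}]},
    "response_time_average": {"value": 48.7}
  }
}'::jsonb)) AS sample(data);
-- data_date = 2013-07-01, hit_count = 1200, response_time_average = 49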

docker-compose.yml

Lines changed: 1 addition & 1 deletion
@@ -5,7 +5,7 @@ services:
       dockerfile: Dockerfile
       target: test
       cache_from:
-        - ${DOCKER_IMAGE_CACHE_FROM:-ghcr.io/nrel/api-umbrella:dev-env-main}
+        - ${DOCKER_IMAGE_CACHE_FROM:-ghcr.io/natlabrockies/api-umbrella:dev-env-main}
     entrypoint: /app/docker/dev/docker-entrypoint
     command: /app/docker/dev/docker-start
     volumes:

src/api-umbrella/cli/migrate.lua

Lines changed: 1 addition & 1 deletion
@@ -62,7 +62,7 @@ return function()
   local clean_lines = {}
   local removing_comments = true
   for _, line in ipairs(lines) do
-    if not removing_comments or (line ~= "" and not startswith(line, "--")) then
+    if (not removing_comments or (line ~= "" and not startswith(line, "--"))) and not startswith(line, "\\restrict") and not startswith(line, "\\unrestrict") then
       if startswith(line, "COMMENT ON EXTENSION") then
         line = "-- " .. line
       end

src/api-umbrella/version.txt

Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
-1.6.0
+1.6.6

src/api-umbrella/web-app/actions/v0/analytics.lua

Lines changed: 109 additions & 55 deletions
@@ -15,10 +15,17 @@ local stable_object_hash = require "api-umbrella.utils.stable_object_hash"
 local t = require("api-umbrella.web-app.utils.gettext").gettext
 local time = require "api-umbrella.utils.time"
 
+local db_statement_timeout_ms = config["web"]["analytics_v0_summary_db_timeout"] * 1000
+
 local _M = {}
 
-local function generate_organization_summary(start_time, end_time, recent_start_time, filters)
-  local cache_id = "analytics_summary:organization:" .. start_time .. ":" .. end_time .. ":" .. recent_start_time .. ":" .. stable_object_hash(filters)
+local function generate_organization_summary(organization_name, start_time, end_time, recent_start_time, filters)
+  local cache_id = "analytics_summary:organization:" .. organization_name .. ":" .. start_time .. ":" .. end_time .. ":" .. recent_start_time .. ":" .. stable_object_hash({
+    filters = filters,
+    timezone = config["analytics"]["timezone"],
+    max_buckets = config["opensearch"]["max_buckets"],
+    analytics_v0_summary_filter = config["web"]["analytics_v0_summary_filter"],
+  })
   local cache = Cache:find(cache_id)
   if cache then
     ngx.log(ngx.NOTICE, "Using cached analytics response for " .. cache_id)
@@ -38,50 +45,65 @@ local function generate_organization_summary(start_time, end_time, recent_start_
   if config["web"]["analytics_v0_summary_filter"] then
     search:set_search_query_string(config["web"]["analytics_v0_summary_filter"])
   end
-  search:set_timeout(20 * 60) -- 20 minutes
+  search:set_timeout(config["web"]["analytics_v0_summary_analytics_timeout"])
   search:set_permission_scope(filters)
 
   local aggregate_sql = [[
+    WITH interval_rows AS (
+      SELECT
+        substring(data_date from 1 for :date_key_length) AS interval_date,
+        hit_count,
+        unique_user_ids,
+        response_time_average
+      FROM analytics_cache
+      WHERE id IN :ids
+    ),
+    interval_unique_user_ids AS (
+      SELECT
+        interval_date,
+        array_agg(DISTINCT user_ids.user_id) FILTER (WHERE user_ids.user_id IS NOT NULL) AS unique_user_ids
+      FROM interval_rows
+      CROSS JOIN LATERAL unnest(unique_user_ids) AS user_ids(user_id)
+      GROUP BY interval_date
+    ),
+    interval_counts AS (
+      SELECT
+        interval_date,
+        SUM(hit_count) AS hit_count,
+        SUM(response_time_average) AS response_time_average
+      FROM interval_rows
+      GROUP BY interval_date
+    ),
+    interval_totals AS (
+      SELECT
+        interval_counts.interval_date,
+        interval_counts.hit_count,
+        interval_unique_user_ids.unique_user_ids,
+        interval_counts.response_time_average
+      FROM interval_counts
+      NATURAL LEFT JOIN interval_unique_user_ids
+      ORDER BY interval_date
+    ),
+    all_unique_users AS (
+      SELECT COUNT(DISTINCT user_ids.user_id) FILTER (WHERE user_ids.user_id IS NOT NULL) AS total_unique_users
+      FROM interval_totals
+      LEFT JOIN LATERAL unnest(interval_totals.unique_user_ids) AS user_ids(user_id) ON true
+    )
     SELECT jsonb_build_object(
       'hits', jsonb_build_object(
         :interval_name, jsonb_agg(jsonb_build_array(interval_totals.interval_date, COALESCE(interval_totals.hit_count, 0))),
         'total', SUM(interval_totals.hit_count)
       ),
       'active_api_keys', jsonb_build_object(
         :interval_name, jsonb_agg(jsonb_build_array(interval_totals.interval_date, COALESCE(array_length(interval_totals.unique_user_ids, 1), 0))),
-        'total', (
-          SELECT COUNT(DISTINCT user_ids.id)
-          FROM unnest(array_accum(interval_totals.unique_user_ids)) AS user_ids(id)
-        )
+        'total', (SELECT total_unique_users FROM all_unique_users)
       ),
       'average_response_times', jsonb_build_object(
         :interval_name, jsonb_agg(jsonb_build_array(interval_totals.interval_date, interval_totals.response_time_average)),
         'average', ROUND(SUM(CASE WHEN interval_totals.response_time_average IS NOT NULL AND interval_totals.hit_count IS NOT NULL THEN interval_totals.response_time_average * interval_totals.hit_count END) / SUM(CASE WHEN interval_totals.response_time_average IS NOT NULL AND interval_totals.hit_count IS NOT NULL THEN interval_totals.hit_count END))
       )
     ) AS response
-    FROM (
-      SELECT
-        interval_date,
-        hit_count,
-        response_time_average,
-        (
-          SELECT array_agg(DISTINCT user_id)
-          FROM unnest(interval_agg.user_ids) AS user_id
-          LEFT JOIN api_users ON user_id = api_users.id
-          WHERE user_id IS NOT NULL AND api_users.disabled_at IS NULL
-        ) AS unique_user_ids
-      FROM (
-        SELECT
-          substring(data->'aggregations'->'hits_over_time'->'buckets'->0->>'key_as_string' from 1 for :date_key_length) AS interval_date,
-          SUM((data->'aggregations'->'hits_over_time'->'buckets'->0->>'doc_count')::bigint) AS hit_count,
-          array_accum(unique_user_ids) AS user_ids,
-          SUM(ROUND((data->'aggregations'->'response_time_average'->>'value')::numeric)) AS response_time_average
-        FROM analytics_cache
-        WHERE id IN :ids
-        GROUP BY interval_date
-        ORDER BY interval_date
-      ) AS interval_agg
-    ) AS interval_totals
+    FROM interval_totals
   ]]
 
   -- Expire the monthly data in 3 months. While the historical data shouldn't
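
The rewritten aggregate_sql leans on the new generated columns and a chain of CTEs instead of re-parsing JSONB per row. A self-contained sketch of the same roll-up idea, with VALUES standing in for analytics_cache rows and illustrative UUIDs (a simplification of the per-interval counting and user de-duplication, not the full production query):

-- Illustrative only: per-interval hit totals plus de-duplicated user ids, the same
-- shape the interval_rows/interval_counts/interval_unique_user_ids CTEs produce.
WITH interval_rows AS (
  SELECT * FROM (VALUES
    ('2013-07', 10::bigint, ARRAY['2f0b1c6e-8f3a-4d2b-9c1e-0a7d5b4e6f21'::uuid]),
    ('2013-07',  5::bigint, ARRAY['7c9d3e54-2b1a-4f6c-8d0e-1a2b3c4d5e6f'::uuid]),
    ('2013-08',  7::bigint, ARRAY['2f0b1c6e-8f3a-4d2b-9c1e-0a7d5b4e6f21'::uuid])
  ) AS t(interval_date, hit_count, unique_user_ids)
),
interval_counts AS (
  SELECT interval_date, SUM(hit_count) AS hit_count
  FROM interval_rows
  GROUP BY interval_date
),
interval_unique_user_ids AS (
  SELECT interval_date, array_agg(DISTINCT u.user_id) AS unique_user_ids
  FROM interval_rows
  CROSS JOIN LATERAL unnest(unique_user_ids) AS u(user_id)
  GROUP BY interval_date
)
SELECT
  interval_counts.interval_date,
  interval_counts.hit_count,
  COALESCE(array_length(interval_unique_user_ids.unique_user_ids, 1), 0) AS active_api_keys
FROM interval_counts
NATURAL LEFT JOIN interval_unique_user_ids
ORDER BY interval_date;
-- 2013-07 | 15 | 2
-- 2013-08 |  7 | 1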
@@ -97,7 +119,7 @@ local function generate_organization_summary(start_time, end_time, recent_start_
     date_key_length = 7,
   }, {
     fatal = true,
-    statement_timeout = 5 * 60 * 1000, -- 5 minutes
+    statement_timeout = db_statement_timeout_ms,
   })[1]["response"]
 
   search:set_start_time(recent_start_time)
@@ -112,13 +134,26 @@ local function generate_organization_summary(start_time, end_time, recent_start_
     date_key_length = 10,
   }, {
     fatal = true,
-    statement_timeout = 5 * 60 * 1000, -- 5 minutes
+    statement_timeout = db_statement_timeout_ms,
   })[1]["response"]
 
   response["hits"]["recent"] = recent_response["hits"]
   response["active_api_keys"]["recent"] = recent_response["active_api_keys"]
   response["average_response_times"]["recent"] = recent_response["average_response_times"]
 
+  -- Only cache the data if it includes the expected latest month of data, and
+  -- also includes all months/days expected. This prevents returning and
+  -- caching incomplete data due to the underlying analytics queries failing
+  -- for certain time periods and forces the data to wait until all of the
+  -- underlying data is cached before returning the overall summary data.
+  local last_month = response["hits"]["monthly"][#response["hits"]["monthly"]]
+  local expected_last_month = string.sub(end_time, 1, 7)
+  local last_day = response["hits"]["recent"]["daily"][#response["hits"]["recent"]["daily"]]
+  local expected_last_day = string.sub(end_time, 1, 10)
+  if last_month[1] ~= expected_last_month or last_day[1] ~= expected_last_day or #analytics_cache_ids ~= #response["hits"]["monthly"] or #recent_analytics_cache_ids ~= #response["hits"]["recent"]["daily"] then
+    return nil, "incomplete data"
+  end
+
   local response_json = json_encode(response)
   expires_at = ngx.now() + 60 * 60 * 24 * 2 -- 2 days
   Cache:upsert(cache_id, response_json, expires_at)
@@ -127,6 +162,8 @@ local function generate_organization_summary(start_time, end_time, recent_start_
 end
 
 local function generate_production_apis_summary(start_time, end_time, recent_start_time)
+  local any_err = false
+
   local data = {
     organizations = {},
   }
@@ -139,7 +176,7 @@ local function generate_production_apis_summary(start_time, end_time, recent_sta
     WHERE api_backends.status_description = 'Production'
   ]], nil, {
     fatal = true,
-    statement_timeout = 5 * 60 * 1000, -- 5 minutes
+    statement_timeout = db_statement_timeout_ms,
   })
   data["organization_count"] = int64_to_json_number(counts[1]["organization_count"])
   data["api_backend_count"] = int64_to_json_number(counts[1]["api_backend_count"])
@@ -162,7 +199,7 @@ local function generate_production_apis_summary(start_time, end_time, recent_sta
     ORDER BY api_backends.organization_name
   ]], nil, {
     fatal = true,
-    statement_timeout = 5 * 60 * 1000, -- 5 minutes
+    statement_timeout = db_statement_timeout_ms,
   })
   for _, organization in ipairs(organizations) do
     local filters = {
@@ -190,18 +227,32 @@ local function generate_production_apis_summary(start_time, end_time, recent_sta
     end
 
     ngx.log(ngx.NOTICE, 'Fetching analytics for organization "' .. organization["organization_name"] .. '"')
-    local organization_data = generate_organization_summary(start_time, end_time, recent_start_time, filters)
-    organization_data["name"] = organization["organization_name"]
-    organization_data["api_backend_count"] = int64_to_json_number(organization["api_backend_count"])
-    organization_data["api_backend_url_match_count"] = int64_to_json_number(organization["api_backend_url_match_count"])
-    table.insert(data["organizations"], organization_data)
+    local organization_data, organization_data_err = generate_organization_summary(organization["organization_name"], start_time, end_time, recent_start_time, filters)
+    if organization_data_err then
+      ngx.log(ngx.ERR, 'Analytics for organization "' .. organization["organization_name"] .. '" failed: ', organization_data_err)
+      any_err = true
+    else
+      organization_data["name"] = organization["organization_name"]
+      organization_data["api_backend_count"] = int64_to_json_number(organization["api_backend_count"])
+      organization_data["api_backend_url_match_count"] = int64_to_json_number(organization["api_backend_url_match_count"])
+      table.insert(data["organizations"], organization_data)
+    end
   end
 
   ngx.log(ngx.NOTICE, "Fetching analytics for all organizations")
-  local all_data = generate_organization_summary(start_time, end_time, recent_start_time, all_filters)
-  data["all"] = all_data
+  local all_data, all_data_err = generate_organization_summary("all", start_time, end_time, recent_start_time, all_filters)
+  if all_data_err then
+    ngx.log(ngx.ERR, "Analytics for all organization failed: ", all_data_err)
+    any_err = true
+  else
+    data["all"] = all_data
+  end
 
-  return data
+  if any_err then
+    return nil, "incomplete data"
+  else
+    return data
+  end
 end
 
 local function generate_summary()
@@ -236,21 +287,24 @@ local function generate_summary()
   date_tz:set(icu_date.fields.MILLISECOND, 0)
   local recent_start_time = date_tz:format(format_iso8601)
 
-  local response = {
-    production_apis = generate_production_apis_summary(start_time, end_time, recent_start_time),
-    start_time = time.timestamp_ms_to_iso8601(start_time_ms),
-    end_time = time.timestamp_ms_to_iso8601(end_time_ms),
-    timezone = date_tz:get_time_zone_id(),
-  }
-
-  response["cached_at"] = time.timestamp_to_iso8601(ngx.now())
+  local production_apis, production_apis_err = generate_production_apis_summary(start_time, end_time, recent_start_time)
+  if production_apis_err then
+    ngx.log(ngx.ERR, "Production APIs summary error: ", production_apis_err)
+  else
+    local response = {
+      production_apis = production_apis,
+      start_time = time.timestamp_ms_to_iso8601(start_time_ms),
+      end_time = time.timestamp_ms_to_iso8601(end_time_ms),
+      timezone = date_tz:get_time_zone_id(),
+    }
 
-  local cache_id = "analytics_summary"
-  local response_json = json_encode(response)
-  local expires_at = ngx.now() + 60 * 60 * 24 * 2 -- 2 days
-  Cache:upsert(cache_id, response_json, expires_at)
+    response["cached_at"] = time.timestamp_to_iso8601(ngx.now())
 
-  return response_json
+    local cache_id = "analytics_summary"
+    local response_json = json_encode(response)
+    local expires_at = nil -- Never expire
+    Cache:upsert(cache_id, response_json, expires_at)
+  end
 end
 
 function _M.summary(self)
