Commit bde87bb

New tech report DB and APIs migration (#88)
* new api draft
* ranks and geos
* cleanup
* lint
* flat mobile origins
* lint
* firestore timeout
* perf settings refactored
* bulkWriter
* lint
* lint
1 parent a5194f1 commit bde87bb

20 files changed: +338 −189 lines changed

.github/dependabot.yml

Lines changed: 1 addition & 0 deletions
@@ -1,3 +1,4 @@
+---
 # To get started with Dependabot version updates, you'll need to specify which
 # package ecosystems to update and where the package manifests are located.
 # Please see the documentation for all configuration options:

.github/workflows/linter.yaml

Lines changed: 1 addition & 0 deletions
@@ -1,3 +1,4 @@
+---
 name: Linter
 
 on:

Makefile

Lines changed: 1 addition & 1 deletion
@@ -13,7 +13,7 @@ tf_apply:
 	terraform -chdir=infra/tf init && terraform -chdir=infra/tf apply -auto-approve
 
 bigquery_export_deploy:
-	cd infra/bigquery-export && npm install && npm run buildpack
+	cd infra/bigquery-export && npm run build
 
 #bigquery_export_spark_deploy:
 #	cd infra/bigquery_export_spark && gcloud builds submit --region=global --tag us-docker.pkg.dev/httparchive/bigquery-spark-procedures/firestore_export:latest
tech_report_geos definition (new file)

Lines changed: 31 additions & 0 deletions
@@ -0,0 +1,31 @@
+const pastMonth = constants.fnPastMonth(constants.currentMonth)
+
+publish('tech_report_geos', {
+  schema: 'reports',
+  type: 'table',
+  tags: ['tech_report']
+}).query(ctx => `
+SELECT
+  geo,
+  adoption.mobile AS mobile_origins
+FROM ${ctx.ref('reports', 'tech_report_adoption')}
+WHERE
+  date = '${pastMonth}'
+  AND rank = 'ALL'
+  AND technology = 'ALL'
+  AND version = 'ALL'
+  ${constants.devRankFilter}
+`).postOps(ctx => `
+SELECT
+  reports.run_export_job(
+    JSON '''{
+      "destination": "firestore",
+      "config": {
+        "database": "tech-report-api-${constants.environment}",
+        "collection": "geos",
+        "type": "dict"
+      },
+      "query": "SELECT * FROM ${ctx.self()}"
+    }'''
+  );
+`)
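
The postOps step above hands the export off to a reports.run_export_job BigQuery routine as a JSON payload naming the Firestore database, collection, and collection type. A minimal sketch of how a consumer might unpack that payload and drive the exporter — the handler wiring is hypothetical; only the payload shape and the export(query, exportConfig) signature come from this commit:

// Hypothetical handler; payload shape taken from the postOps block above.
import { FirestoreBatch } from './firestore.js'

export async function handleExportJob (payload) {
  const { destination, config, query } = JSON.parse(payload)
  if (destination !== 'firestore') {
    throw new Error(`Unsupported destination: ${destination}`)
  }
  // config carries database, collection, and type ('dict' here);
  // 'report' collections would also carry a date to scope deletions.
  await new FirestoreBatch().export(query, config)
}
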
tech_report_ranks definition (new file)

Lines changed: 31 additions & 0 deletions
@@ -0,0 +1,31 @@
+const pastMonth = constants.fnPastMonth(constants.currentMonth)
+
+publish('tech_report_ranks', {
+  schema: 'reports',
+  type: 'table',
+  tags: ['tech_report']
+}).query(ctx => `
+SELECT
+  rank,
+  adoption.mobile AS mobile_origins
+FROM ${ctx.ref('reports', 'tech_report_adoption')}
+WHERE
+  date = '${pastMonth}'
+  AND geo = 'ALL'
+  AND technology = 'ALL'
+  AND version = 'ALL'
+  ${constants.devRankFilter}
+`).postOps(ctx => `
+SELECT
+  reports.run_export_job(
+    JSON '''{
+      "destination": "firestore",
+      "config": {
+        "database": "tech-report-api-${constants.environment}",
+        "collection": "ranks",
+        "type": "dict"
+      },
+      "query": "SELECT * FROM ${ctx.self()}"
+    }'''
+  );
+`)
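
tech_report_ranks mirrors tech_report_geos: the same filters on tech_report_adoption, but grouped by rank with geo = 'ALL' and exported to the ranks collection. Each query row becomes one Firestore document, so the written documents look roughly like this (field names come from the SELECT lists; the values are illustrative):

// Illustrative document shapes for the two exports above; values are made up.
const geosDoc = { geo: 'US', mobile_origins: 1234567 } // 'geos' collection
const ranksDoc = { rank: 'ALL', mobile_origins: 8901234 } // 'ranks' collection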

infra/bigquery-export/.dockerignore

Lines changed: 9 additions & 0 deletions
@@ -0,0 +1,9 @@
+node_modules
+npm-debug.log
+.git
+.gitignore
+.env
+.nyc_output
+coverage
+*.md
+.DS_Store

infra/bigquery-export/Dockerfile

Lines changed: 6 additions & 4 deletions
@@ -3,11 +3,13 @@ FROM node:current-slim
 
 WORKDIR /usr/src/app
 
-COPY . .
+# Copy package files first for better caching
+COPY package*.json ./
 
-# Clean up the node_modules directory
-RUN rm -rf node_modules
+# Install dependencies
+RUN npm ci --only=production --quiet --no-fund --no-audit
 
-RUN npm ci --only=production
+# Copy source code
+COPY . .
 
 CMD ["node", "index.js"]

infra/bigquery-export/cloudbuild.yaml

Lines changed: 7 additions & 0 deletions
@@ -0,0 +1,7 @@
+---
+steps:
+  - name: "gcr.io/cloud-builders/docker"
+    args:
+      ["build", "-t", "us.gcr.io/httparchive/cloud-run/bigquery-export", "."]
+images:
+  - "us.gcr.io/httparchive/cloud-run/bigquery-export"

infra/bigquery-export/firestore.js

Lines changed: 134 additions & 84 deletions
@@ -3,135 +3,185 @@ import { BigQueryExport } from './bigquery.js'
 
 export class FirestoreBatch {
   constructor () {
-    this.firestore = new Firestore()
+    this.firestore = new Firestore({
+      gaxOptions: {
+        grpc: {
+          max_receive_message_length: 500 * 1024 * 1024, // 500MB
+          max_send_message_length: 500 * 1024 * 1024, // 500MB
+          'grpc.max_connection_idle_ms': 5 * 60 * 1000, // 5 minutes
+          'grpc.keepalive_time_ms': 30 * 1000, // 30 seconds
+          'grpc.keepalive_timeout_ms': 60 * 1000, // 1 minute
+          'grpc.keepalive_permit_without_calls': true
+        }
+      }
+    })
     this.bigquery = new BigQueryExport()
-    this.batchSize = 500
-    this.maxConcurrentBatches = 200
+
+    // Configuration constants
+    this.config = {
+      timeout: 10 * 60 * 1000, // 10 minutes
+      progressReportInterval: 200000, // Report progress every N operations
+      flushThreshold: 200000 // Flush BulkWriter every N operations
+    }
+
+    this.reset()
+  }
+
+  reset () {
+    this.processedDocs = 0
+    this.totalDocs = 0
+    this.bulkWriter = null
   }
 
-  queueBatch (operation) {
-    const batch = this.firestore.batch()
-
-    this.currentBatch.forEach((doc) => {
-      if (operation === 'delete') {
-        batch.delete(doc.ref)
-      } else if (operation === 'set') {
-        const docRef = this.firestore.collection(this.collectionName).doc()
-        batch.set(docRef, doc)
-      } else {
-        throw new Error('Invalid operation')
+  createBulkWriter (operation) {
+    const bulkWriter = this.firestore.bulkWriter()
+
+    // Configure error handling with progress info
+    bulkWriter.onWriteError((error) => {
+      const progressInfo = this.totalDocs > 0 ? ` (${this.processedDocs}/${this.totalDocs})` : ''
+      console.warn(`${operation} operation failed${progressInfo}:`, error.message)
+
+      // Retry on transient errors, fail on permanent ones
+      const retryableErrors = ['deadline-exceeded', 'unavailable', 'resource-exhausted']
+      return retryableErrors.includes(error.code)
+    })
+
+    // Track progress on successful writes
+    bulkWriter.onWriteResult(() => {
+      this.processedDocs++
+
+      // Report progress periodically
+      if (this.processedDocs % this.config.progressReportInterval === 0) {
+        const progressInfo = this.totalDocs > 0 ? ` (${this.processedDocs}/${this.totalDocs})` : ` (${this.processedDocs} processed)`
+        console.log(`Progress${progressInfo} - ${operation}ing documents in ${this.collectionName}`)
       }
     })
-    this.batchPromises.push(batch)
-    this.currentBatch = []
-  }
 
-  async commitBatches () {
-    console.log(`Committing ${this.batchPromises.length} batches to ${this.collectionName}`)
-    await Promise.all(
-      this.batchPromises.map(async (batchPromise) => await batchPromise.commit()
-        .catch((error) => {
-          console.error('Error committing batch:', error)
-          throw error
-        })
-      )
-    )
-    this.batchPromises = []
+    return bulkWriter
   }
 
-  async finalFlush (operation) {
-    if (this.currentBatch.length > 0) {
-      this.queueBatch(operation)
+  buildQuery (collectionRef) {
+    const queryMap = {
+      report: () => {
+        console.info(`Deleting documents from ${this.collectionName} for date ${this.date}`)
+        return collectionRef.where('date', '==', this.date)
+      },
+      dict: () => {
+        console.info(`Deleting documents from ${this.collectionName}`)
+        return collectionRef
+      }
+    }
+
+    const queryBuilder = queryMap[this.collectionType]
+    if (!queryBuilder) {
+      throw new Error(`Invalid collection type: ${this.collectionType}`)
     }
 
-    if (this.batchPromises.length > 0) {
-      await this.commitBatches()
+    return queryBuilder()
+  }
+
+  async getDocumentCount (query) {
+    try {
+      const countSnapshot = await query.count().get()
+      return countSnapshot.data().count
+    } catch (error) {
+      console.warn('Could not get document count for progress tracking:', error.message)
+      return 0
     }
   }
 
   async batchDelete () {
     console.info('Starting batch deletion...')
     const startTime = Date.now()
-    this.currentBatch = []
-    this.batchPromises = []
+    this.reset()
 
-    let totalDocsDeleted = 0
     const collectionRef = this.firestore.collection(this.collectionName)
+    const collectionQuery = this.buildQuery(collectionRef)
 
-    let collectionQuery
-    if (this.collectionType === 'report') {
-      console.info('Deleting documents from ' + this.collectionName + ' for date ' + this.date)
-      // Query to fetch monthly documents
-      collectionQuery = collectionRef.where('date', '==', this.date)
-    } else if (this.collectionType === 'dict') {
-      console.info('Deleting documents from ' + this.collectionName)
-      collectionQuery = collectionRef
-    } else {
-      throw new Error('Invalid collection type')
+    // Get total count for progress tracking
+    this.totalDocs = await this.getDocumentCount(collectionQuery)
+    if (this.totalDocs > 0) {
+      console.info(`Total documents to delete: ${this.totalDocs}`)
    }
 
-    while (true) {
-      const snapshot = await collectionQuery.limit(this.batchSize * this.maxConcurrentBatches).get()
-      if (snapshot.empty) {
-        break
-      }
+    // Create BulkWriter for delete operations
+    this.bulkWriter = this.createBulkWriter('delet')
 
-      for await (const doc of snapshot.docs) {
-        this.currentBatch.push(doc)
+    let deletedCount = 0
+    const batchSize = this.config.flushThreshold // Process documents in chunks
 
-        if (this.currentBatch.length >= this.batchSize) {
-          this.queueBatch('delete')
-        }
-        if (this.batchPromises.length >= this.maxConcurrentBatches) {
-          await this.commitBatches()
-        }
-        totalDocsDeleted++
-      }
+    while (deletedCount < this.totalDocs || this.totalDocs === 0) {
+      const snapshot = await collectionQuery.limit(batchSize).get()
+      if (snapshot.empty) break
+
+      // Add all delete operations to BulkWriter
+      snapshot.docs.forEach(doc => {
+        this.bulkWriter.delete(doc.ref)
+        deletedCount++
+      })
+
+      // Periodically flush to manage memory
+      // if (deletedCount % this.config.flushThreshold === 0) {
+      console.log(`Flushing BulkWriter at ${deletedCount} operations...`)
+      await this.bulkWriter.flush()
+      // }
     }
-    await this.finalFlush('delete')
+
+    // Final flush and close
+    console.log('Finalizing deletion operations...')
+    await this.bulkWriter.close()
 
     const duration = (Date.now() - startTime) / 1000
-    console.info(`Deletion complete. Total docs deleted: ${totalDocsDeleted}. Time: ${duration} seconds`)
+    console.info(`Deletion complete. Total docs deleted: ${this.processedDocs}. Time: ${duration} seconds`)
   }
 
-  /**
-   * Streams BigQuery query results into a Firestore collection using batch commits.
-   * @param {string} query - The BigQuery SQL query.
-   */
   async streamFromBigQuery (rowStream) {
    console.info('Starting BigQuery to Firestore transfer...')
    const startTime = Date.now()
-    let totalRowsProcessed = 0
+    this.reset()
 
-    this.currentBatch = []
-    this.batchPromises = []
+    // Create BulkWriter for write operations
+    this.bulkWriter = this.createBulkWriter('writ')
+
+    let rowCount = 0
+    const collectionRef = this.firestore.collection(this.collectionName)
 
    for await (const row of rowStream) {
-      this.currentBatch.push(row)
+      // Add document to BulkWriter
+      const docRef = collectionRef.doc()
+      this.bulkWriter.set(docRef, row)
 
-      // Write batch when it reaches specified size
-      if (this.currentBatch.length >= this.batchSize) {
-        this.queueBatch('set')
-      }
+      rowCount++
+      this.totalDocs = rowCount // Update total as we go since we can't predict BigQuery result size
 
-      if (this.batchPromises.length >= this.maxConcurrentBatches) {
-        await this.commitBatches()
+      // Periodically flush to manage memory
+      if (rowCount % this.config.flushThreshold === 0) {
+        console.log(`Flushing BulkWriter at ${rowCount} operations...`)
+        await this.bulkWriter.flush()
      }
-      totalRowsProcessed++
    }
-    await this.finalFlush('set')
+
+    // Final flush and close
+    console.log('Finalizing write operations...')
+    await this.bulkWriter.close()
 
    const duration = (Date.now() - startTime) / 1000
-    console.info(`Transfer to ${this.collectionName} complete. Total rows processed: ${totalRowsProcessed}. Time: ${duration} seconds`)
+    console.info(`Transfer to ${this.collectionName} complete. Total rows processed: ${this.processedDocs}. Time: ${duration} seconds`)
  }
 
  async export (query, exportConfig) {
+    // Configure Firestore settings
    this.firestore.settings({
-      databaseId: exportConfig.database
+      databaseId: exportConfig.database,
+      timeout: this.config.timeout
+    })
+
+    // Set instance properties
+    Object.assign(this, {
+      collectionName: exportConfig.collection,
+      collectionType: exportConfig.type,
+      date: exportConfig.date
    })
-    this.collectionName = exportConfig.collection
-    this.collectionType = exportConfig.type
-    this.date = exportConfig.date
 
    await this.batchDelete()
 
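
The firestore.js rewrite replaces the hand-rolled batch()/commit() queue with Firestore's BulkWriter, which manages batching, throttling, and retries itself; onWriteError retries only transient error codes, and the truncated 'delet'/'writ' operation strings exist so the progress log's `${operation}ing` interpolates to "deleting"/"writing". A minimal usage sketch of the new entry point — the database and collection values are illustrative, but the fields match the exportConfig that export() reads:

// Sketch: invoking the BulkWriter-based exporter directly.
// 'dict' clears the whole collection before writing; a 'report'
// export would also pass a date to scope the deletion.
import { FirestoreBatch } from './firestore.js'

const exporter = new FirestoreBatch()
await exporter.export('SELECT * FROM reports.tech_report_geos', {
  database: 'tech-report-api-dev', // illustrative environment suffix
  collection: 'geos',
  type: 'dict'
})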

infra/bigquery-export/package.json

Lines changed: 1 addition & 1 deletion
@@ -4,7 +4,7 @@
   "main": "index.js",
   "scripts": {
     "start": "node index.js",
-    "buildpack": "rm -rf node_modules; gcloud builds submit --pack image=us.gcr.io/httparchive/cloud-run/bigquery-export"
+    "build": "gcloud builds submit"
   },
   "type": "module",
   "dependencies": {
