@@ -25,7 +25,7 @@ Count Example 1: Get # of records with a certain subject_id
25
25
26
26
credentials = DocumentDbSSHCredentials()
27
27
with DocumentDbSSHClient(credentials = credentials) as doc_db_client:
28
- filter = {" subject.subject_id" : " 689418 " }
28
+ filter = {" subject.subject_id" : " 731015 " }
29
29
count = doc_db_client.collection.count_documents(filter )
30
30
print (count)
31
31
@@ -35,7 +35,7 @@ Filter Example 1: Get records with a certain subject_id
35
35
.. code :: python
36
36
37
37
with DocumentDbSSHClient(credentials = credentials) as doc_db_client:
38
- filter = {" subject.subject_id" : " 689418 " }
38
+ filter = {" subject.subject_id" : " 731015 " }
39
39
records = list (
40
40
doc_db_client.collection.find(filter = filter )
41
41
)
@@ -47,7 +47,7 @@ With projection (recommended):
47
47
.. code :: python
48
48
49
49
with DocumentDbSSHClient(credentials = credentials) as doc_db_client:
50
- filter = {" subject.subject_id" : " 689418 " }
50
+ filter = {" subject.subject_id" : " 731015 " }
51
51
projection = {
52
52
" name" : 1 ,
53
53
" created" : 1 ,
@@ -68,7 +68,7 @@ Filter Example 2: Get records with a certain breeding group
68
68
69
69
with DocumentDbSSHClient(credentials = credentials) as doc_db_client:
70
70
filter = {
71
- " subject.breeding_info.breeding_group" : " Chat -IRES-Cre_Jax006410 "
71
+ " subject.breeding_info.breeding_group" : " Slc17a6 -IRES-Cre;Ai230-hyg(ND) "
72
72
}
73
73
records = list (
74
74
doc_db_client.collection.find(filter = filter )
@@ -82,7 +82,7 @@ With projection (recommended):
82
82
83
83
with DocumentDbSSHClient(credentials = credentials) as doc_db_client:
84
84
filter = {
85
- " subject.breeding_info.breeding_group" : " Chat -IRES-Cre_Jax006410 "
85
+ " subject.breeding_info.breeding_group" : " Slc17a6 -IRES-Cre;Ai230-hyg(ND) "
86
86
}
87
87
projection = {
88
88
" name" : 1 ,
@@ -125,37 +125,19 @@ https://www.mongodb.com/docs/manual/aggregation/
125
125
Updating Metadata
126
126
~~~~~~~~~~~~~~~~~~~~~~
127
127
128
- We provide several utility functions for interacting with DocDB within the
129
- ``aind_data_access_api.utils `` module. Below is an example of how to use these
130
- functions to update records in DocDB.
128
+ Below is an example of how to update records in DocDB using ``DocumentDbSSHClient``.
131
129
132
130
.. code :: python
133
131
134
- import json
135
132
import logging
136
- from typing import List, Optional
137
133
138
134
from aind_data_access_api.document_db_ssh import (
139
135
DocumentDbSSHClient,
140
136
DocumentDbSSHCredentials,
141
137
)
142
- from aind_data_schema.core.metadata import Metadata
143
-
144
- from aind_data_access_api.utils import paginate_docdb, is_dict_corrupt
145
138
146
139
logging.basicConfig(level = " INFO" )
147
140
148
- def _process_docdb_records (records : List[dict ], doc_db_client : DocumentDbSSHClient, dryrun : bool ) -> None :
149
- """
150
- Process records.
151
- Parameters
152
- ----------
153
- records : List[dict]
154
-
155
- """
156
- for record in records:
157
- _process_docdb_record(record = record, doc_db_client = doc_db_client, dryrun = dryrun)
158
-
159
141
def _process_docdb_record (record : dict , doc_db_client : DocumentDbSSHClient, dryrun : bool ) -> None :
160
142
"""
161
143
Process record. This example updates the data_description.name field
@@ -171,58 +153,17 @@ functions to update records in DocDB.
171
153
location = record.get(" location" )
172
154
if _id:
173
155
if record.get(" data_description" ) and record[" data_description" ].get(" name" ) != name:
174
- # Option 1: update specific fields(s) only
156
+ # update specific field(s) only
175
157
new_fields = {
176
158
" data_description.name" : name
177
159
}
178
160
update_docdb_record_partial(record_id = _id, new_fields = new_fields, doc_db_client = doc_db_client, dryrun = dryrun)
179
- # Option 2: build new record Metadata.py and replace entire document with new record
180
- # new_record = build_new_docdb_record(record=record)
181
- # if new_record is not None:
182
- # update_docdb_record_entire(record_id=_id, new_record=new_record, doc_db_client=doc_db_client, dryrun=dryrun)
183
161
# else:
184
162
# logging.info(f"Record for {location} does not need to be updated.")
185
163
else :
186
164
logging.warning(f " Record for { location} does not have an _id field! Skipping. " )
187
165
188
166
189
- def build_new_docdb_record (record : Optional[dict ]) -> Optional[dict ]:
190
- """ Build new record from existing record. This example updates the
191
- data_description.name field if it does not match the record.name field.
192
-
193
- Parameters
194
- ----------
195
- record : Optional[dict]
196
-
197
- Returns
198
- -------
199
- Optional[dict]
200
- The new record, or None if the record cannot be constructed.
201
- """
202
- # Example: Update record.data_description.name if not matching record.name
203
- new_record = None
204
- if record.get(" data_description" ) and record[" data_description" ].get(" name" ) != name:
205
- _id = record.get(" _id" )
206
- name = record.get(" name" )
207
- location = record.get(" location" )
208
- created = record.get(" created" )
209
- if _id is None or name is None or location is None or created is None :
210
- logging.warning(f " Record does not have _id, name, location, or created! Skipping. " )
211
- return None
212
- try :
213
- new_record = record.copy()
214
- new_record[" data_description" ][" name" ] = name
215
- new_record_str = Metadata.model_construct(
216
- ** new_record
217
- ).model_dump_json(warnings = False , by_alias = True )
218
- new_record = json.loads(new_record_str)
219
- if is_dict_corrupt(new_record):
220
- logging.warning(f " New record for { location} is corrupt! Skipping. " )
221
- new_record = None
222
- except Exception :
223
- new_record = None
224
- return new_record
225
-
226
167
def update_docdb_record_partial (record_id : str , new_fields : dict , doc_db_client : DocumentDbSSHClient, dryrun : bool ) -> None :
227
168
"""
228
169
Update record in docdb by updating specific fields only.
@@ -244,54 +185,24 @@ functions to update records in DocDB.
244
185
upsert = False ,
245
186
)
246
187
logging.info(response.raw_result)
247
-
248
-
249
- def update_docdb_record_entire (record_id : str , new_record : dict , doc_db_client : DocumentDbSSHClient, dryrun : bool ) -> None :
250
- """
251
- Update record in docdb by replacing the entire document with new record.
252
- Parameters
253
- ----------
254
- record_id : str
255
- The _id of the record to update.
256
- new_record : dict
257
- The new record to replace the existing record with.
258
-
259
- """
260
- if is_dict_corrupt(new_record) or record_id != new_record.get(" _id" ):
261
- logging.warning(f " Attempting to update corrupt record { record_id} ! Skipping. " )
262
- return
263
- if dryrun:
264
- logging.info(f " (dryrun) doc_db_client.collection.update_one: { record_id} " )
265
- else :
266
- logging.info(f " doc_db_client.collection.update_one: { record_id} " )
267
- response = doc_db_client.collection.update_one(
268
- {" _id" : record_id},
269
- {" $set" : new_record},
270
- upsert = False ,
271
- )
272
- logging.info(response.raw_result)
273
188
274
189
275
190
if __name__ == " __main__" :
276
191
credentials = DocumentDbSSHCredentials() # credentials in environment
277
192
dryrun = True
278
- filter = {" location" : {" $regex" : " .*s3://codeocean-s3datasetsbucket .*" }}
193
+ filter = {" location" : {" $regex" : " .*s3://aind-open-data .*" }}
279
194
projection = None
280
195
281
196
with DocumentDbSSHClient(credentials = credentials) as doc_db_client:
282
197
db_name = doc_db_client.database_name
283
198
col_name = doc_db_client.collection_name
284
199
# count = doc_db_client.collection.count_documents(filter)
285
- # logging.info(f"{db_name}.{col_name}: Found {count} records with {filter}: {count} ")
200
+ # logging.info(f"{db_name}.{col_name}: Found {count} records with {filter}")
286
201
287
202
logging.info(f " { db_name} . { col_name} : Starting to scan for { filter } . " )
288
- docdb_pages = paginate_docdb(
289
- db_name = doc_db_client.database_name,
290
- collection_name = doc_db_client.collection_name,
291
- docdb_client = doc_db_client._client,
292
- page_size = 500 ,
293
- filter_query = filter ,
203
+ records = doc_db_client.collection.find(
204
+ filter = filter ,
294
205
)
295
- for page in docdb_pages :
296
- _process_docdb_records( records = page , doc_db_client = doc_db_client, dryrun = dryrun)
206
+ for record in records :
207
+ _process_docdb_record( record = record , doc_db_client = doc_db_client, dryrun = dryrun)
297
208
logging.info(f " { db_name} . { col_name} :Finished scanning through DocDb. " )
0 commit comments