|
3 | 3 | {
|
4 | 4 | "cell_type": "markdown",
|
5 | 5 | "metadata": {
|
6 |
| - "collapsed": false |
| 6 | + "collapsed": false, |
| 7 | + "jupyter": { |
| 8 | + "outputs_hidden": false |
| 9 | + } |
7 | 10 | },
|
8 | 11 | "source": [
|
9 | 12 | "# NLP text search using a Hugging Face transformer model\n",
|
|
44 | 47 | },
|
45 | 48 | "outputs": [],
|
46 | 49 | "source": [
|
47 |
| - "# install packages\n", |
48 |
| - "!python3 -m pip install -qU sentence-transformers eland elasticsearch transformers\n", |
49 |
| - "\n", |
| 50 | + "!python3 -m pip install -qU sentence-transformers eland elasticsearch transformers" |
| 51 | + ] |
| 52 | + }, |
| 53 | + { |
| 54 | + "cell_type": "code", |
| 55 | + "execution_count": null, |
| 56 | + "metadata": {}, |
| 57 | + "outputs": [], |
| 58 | + "source": [ |
50 | 59 | "# import modules\n",
|
51 |
| - "import pandas as pd, json\n", |
52 | 60 | "from elasticsearch import Elasticsearch\n",
|
53 | 61 | "from getpass import getpass\n",
|
54 |
| - "from urllib.request import urlopen" |
| 62 | + "from urllib.request import urlopen\n", |
| 63 | + "import json" |
55 | 64 | ]
|
56 | 65 | },
|
57 | 66 | {
|
|
93 | 102 | "ELASTIC_CLOUD_ID = getpass(\"Elastic Cloud ID: \")\n",
|
94 | 103 | "\n",
|
95 | 104 | "# https://www.elastic.co/search-labs/tutorials/install-elasticsearch/elastic-cloud#creating-an-api-key\n",
|
96 |
| - "ELASTIC_API_KEY = getpass(\"Elastic Api Key: \")\n", |
97 |
| - "\n", |
| 105 | + "ELASTIC_API_KEY = getpass(\"Elastic Api Key: \")" |
| 106 | + ] |
| 107 | + }, |
| 108 | + { |
| 109 | + "cell_type": "code", |
| 110 | + "execution_count": null, |
| 111 | + "metadata": {}, |
| 112 | + "outputs": [], |
| 113 | + "source": [ |
98 | 114 | "!eland_import_hub_model --cloud-id $ELASTIC_CLOUD_ID --hub-model-id sentence-transformers/all-MiniLM-L6-v2 --task-type text_embedding --es-api-key $ELASTIC_API_KEY --start"
|
99 | 115 | ]
|
100 | 116 | },
|
|
304 | 320 | },
|
305 | 321 | {
|
306 | 322 | "cell_type": "code",
|
307 |
| - "execution_count": 106, |
| 323 | + "execution_count": 22, |
308 | 324 | "metadata": {
|
309 | 325 | "colab": {
|
310 | 326 | "base_uri": "https://localhost:8080/",
|
|
315 | 331 | },
|
316 | 332 | "outputs": [
|
317 | 333 | {
|
318 |
| - "data": { |
319 |
| - "text/html": [ |
320 |
| - "<div>\n", |
321 |
| - "<style scoped>\n", |
322 |
| - " .dataframe tbody tr th:only-of-type {\n", |
323 |
| - " vertical-align: middle;\n", |
324 |
| - " }\n", |
325 |
| - "\n", |
326 |
| - " .dataframe tbody tr th {\n", |
327 |
| - " vertical-align: top;\n", |
328 |
| - " }\n", |
329 |
| - "\n", |
330 |
| - " .dataframe thead th {\n", |
331 |
| - " text-align: right;\n", |
332 |
| - " }\n", |
333 |
| - "</style>\n", |
334 |
| - "<table border=\"1\" class=\"dataframe\">\n", |
335 |
| - " <thead>\n", |
336 |
| - " <tr style=\"text-align: right;\">\n", |
337 |
| - " <th></th>\n", |
338 |
| - " <th>_id</th>\n", |
339 |
| - " <th>_score</th>\n", |
340 |
| - " <th>fields.title</th>\n", |
341 |
| - " </tr>\n", |
342 |
| - " </thead>\n", |
343 |
| - " <tbody>\n", |
344 |
| - " <tr>\n", |
345 |
| - " <th>0</th>\n", |
346 |
| - " <td>TxUU-YkBAHcz2kFqAun2</td>\n", |
347 |
| - " <td>0.591786</td>\n", |
348 |
| - " <td>[Brewing in Beats: Track network connections]</td>\n", |
349 |
| - " </tr>\n", |
350 |
| - " <tr>\n", |
351 |
| - " <th>1</th>\n", |
352 |
| - " <td>SxUU-YkBAHcz2kFqAun2</td>\n", |
353 |
| - " <td>0.401099</td>\n", |
354 |
| - " <td>[Machine Learning for Nginx Logs - Identifying...</td>\n", |
355 |
| - " </tr>\n", |
356 |
| - " <tr>\n", |
357 |
| - " <th>2</th>\n", |
358 |
| - " <td>UxUU-YkBAHcz2kFqAun2</td>\n", |
359 |
| - " <td>0.390279</td>\n", |
360 |
| - " <td>[Data Visualization For Machine Learning]</td>\n", |
361 |
| - " </tr>\n", |
362 |
| - " <tr>\n", |
363 |
| - " <th>3</th>\n", |
364 |
| - " <td>TBUU-YkBAHcz2kFqAun2</td>\n", |
365 |
| - " <td>0.368995</td>\n", |
366 |
| - " <td>[Logstash Lines: Introduce integration plugins]</td>\n", |
367 |
| - " </tr>\n", |
368 |
| - " <tr>\n", |
369 |
| - " <th>4</th>\n", |
370 |
| - " <td>UhUU-YkBAHcz2kFqAun2</td>\n", |
371 |
| - " <td>0.368995</td>\n", |
372 |
| - " <td>[Logstash Lines: Introduce integration plugins]</td>\n", |
373 |
| - " </tr>\n", |
374 |
| - " <tr>\n", |
375 |
| - " <th>5</th>\n", |
376 |
| - " <td>URUU-YkBAHcz2kFqAun2</td>\n", |
377 |
| - " <td>0.356903</td>\n", |
378 |
| - " <td>[Keeping up with Kibana: This week in Kibana f...</td>\n", |
379 |
| - " </tr>\n", |
380 |
| - " <tr>\n", |
381 |
| - " <th>6</th>\n", |
382 |
| - " <td>UBUU-YkBAHcz2kFqAun2</td>\n", |
383 |
| - " <td>0.341939</td>\n", |
384 |
| - " <td>[Kibana 4 Video Tutorials, Part 3]</td>\n", |
385 |
| - " </tr>\n", |
386 |
| - " <tr>\n", |
387 |
| - " <th>7</th>\n", |
388 |
| - " <td>VBUU-YkBAHcz2kFqAun2</td>\n", |
389 |
| - " <td>0.337294</td>\n", |
390 |
| - " <td>[Introducing approximate nearest neighbor sear...</td>\n", |
391 |
| - " </tr>\n", |
392 |
| - " <tr>\n", |
393 |
| - " <th>8</th>\n", |
394 |
| - " <td>ThUU-YkBAHcz2kFqAun2</td>\n", |
395 |
| - " <td>0.336460</td>\n", |
396 |
| - " <td>[Where in the World is Elastic? - QCon Beijing...</td>\n", |
397 |
| - " </tr>\n", |
398 |
| - " <tr>\n", |
399 |
| - " <th>9</th>\n", |
400 |
| - " <td>TRUU-YkBAHcz2kFqAun2</td>\n", |
401 |
| - " <td>0.320756</td>\n", |
402 |
| - " <td>[EQL for the masses]</td>\n", |
403 |
| - " </tr>\n", |
404 |
| - " </tbody>\n", |
405 |
| - "</table>\n", |
406 |
| - "</div>" |
407 |
| - ], |
408 |
| - "text/plain": [ |
409 |
| - " _id _score \\\n", |
410 |
| - "0 TxUU-YkBAHcz2kFqAun2 0.591786 \n", |
411 |
| - "1 SxUU-YkBAHcz2kFqAun2 0.401099 \n", |
412 |
| - "2 UxUU-YkBAHcz2kFqAun2 0.390279 \n", |
413 |
| - "3 TBUU-YkBAHcz2kFqAun2 0.368995 \n", |
414 |
| - "4 UhUU-YkBAHcz2kFqAun2 0.368995 \n", |
415 |
| - "5 URUU-YkBAHcz2kFqAun2 0.356903 \n", |
416 |
| - "6 UBUU-YkBAHcz2kFqAun2 0.341939 \n", |
417 |
| - "7 VBUU-YkBAHcz2kFqAun2 0.337294 \n", |
418 |
| - "8 ThUU-YkBAHcz2kFqAun2 0.336460 \n", |
419 |
| - "9 TRUU-YkBAHcz2kFqAun2 0.320756 \n", |
420 |
| - "\n", |
421 |
| - " fields.title \n", |
422 |
| - "0 [Brewing in Beats: Track network connections] \n", |
423 |
| - "1 [Machine Learning for Nginx Logs - Identifying... \n", |
424 |
| - "2 [Data Visualization For Machine Learning] \n", |
425 |
| - "3 [Logstash Lines: Introduce integration plugins] \n", |
426 |
| - "4 [Logstash Lines: Introduce integration plugins] \n", |
427 |
| - "5 [Keeping up with Kibana: This week in Kibana f... \n", |
428 |
| - "6 [Kibana 4 Video Tutorials, Part 3] \n", |
429 |
| - "7 [Introducing approximate nearest neighbor sear... \n", |
430 |
| - "8 [Where in the World is Elastic? - QCon Beijing... \n", |
431 |
| - "9 [EQL for the masses] " |
432 |
| - ] |
433 |
| - }, |
434 |
| - "execution_count": 106, |
435 |
| - "metadata": {}, |
436 |
| - "output_type": "execute_result" |
| 334 | + "name": "stdout", |
| 335 | + "output_type": "stream", |
| 336 | + "text": [ |
| 337 | + "['Brewing in Beats: Track network connections']\n", |
| 338 | + "Score: 0.5917864\n", |
| 339 | + "\n", |
| 340 | + "['Machine Learning for Nginx Logs - Identifying Operational Issues with Your Website']\n", |
| 341 | + "Score: 0.40109876\n", |
| 342 | + "\n", |
| 343 | + "['Data Visualization For Machine Learning']\n", |
| 344 | + "Score: 0.39027885\n", |
| 345 | + "\n", |
| 346 | + "['Logstash Lines: Introduce integration plugins']\n", |
| 347 | + "Score: 0.36899462\n", |
| 348 | + "\n", |
| 349 | + "['Keeping up with Kibana: This week in Kibana for November 29th, 2019']\n", |
| 350 | + "Score: 0.35690257\n", |
| 351 | + "\n", |
| 352 | + "['How to implement similarity image search | Elastic.co | Elastic Blog']\n", |
| 353 | + "Score: 0.34473613\n", |
| 354 | + "\n", |
| 355 | + "['Kibana 4 Video Tutorials, Part 3']\n", |
| 356 | + "Score: 0.34193927\n", |
| 357 | + "\n", |
| 358 | + "['Introducing approximate nearest neighbor search in Elasticsearch 8.0 | Elastic Blog']\n", |
| 359 | + "Score: 0.3372936\n", |
| 360 | + "\n", |
| 361 | + "['Where in the World is Elastic? - QCon Beijing, Devoxx France, Percona Live & AWS Summit Chicago']\n", |
| 362 | + "Score: 0.33645985\n", |
| 363 | + "\n", |
| 364 | + "['EQL for the masses']\n", |
| 365 | + "Score: 0.3207562\n", |
| 366 | + "\n" |
| 367 | + ] |
437 | 368 | }
|
438 | 369 | ],
|
439 | 370 | "source": [
|
|
458 | 389 | " knn=query,\n",
|
459 | 390 | " source=False)\n",
|
460 | 391 | "\n",
|
461 |
| - "\n", |
462 |
| - "results = pd.json_normalize(json.loads(json.dumps(response.body['hits']['hits'])))\n", |
463 |
| - "\n", |
464 |
| - "# shows the result\n", |
465 |
| - "results[['_id', '_score', 'fields.title']]\n" |
| 392 | + "def show_results(results):\n", |
| 393 | + " for result in results:\n", |
| 394 | + " print(f'{result[\"fields\"][\"title\"]}\\nScore: {result[\"_score\"]}\\n')\n", |
| 395 | + " \n", |
| 396 | + "show_results(response.body['hits']['hits'])" |
466 | 397 | ]
|
| 398 | + }, |
| 399 | + { |
| 400 | + "cell_type": "code", |
| 401 | + "execution_count": null, |
| 402 | + "metadata": {}, |
| 403 | + "outputs": [], |
| 404 | + "source": [] |
467 | 405 | }
|
468 | 406 | ],
|
469 | 407 | "metadata": {
|
470 | 408 | "colab": {
|
471 | 409 | "provenance": []
|
472 | 410 | },
|
473 | 411 | "kernelspec": {
|
474 |
| - "display_name": "Python 3.11.3 64-bit", |
| 412 | + "display_name": "Python 3 (ipykernel)", |
475 | 413 | "language": "python",
|
476 | 414 | "name": "python3"
|
477 | 415 | },
|
478 | 416 | "language_info": {
|
| 417 | + "codemirror_mode": { |
| 418 | + "name": "ipython", |
| 419 | + "version": 3 |
| 420 | + }, |
| 421 | + "file_extension": ".py", |
| 422 | + "mimetype": "text/x-python", |
479 | 423 | "name": "python",
|
480 |
| - "version": "3.9.6" |
| 424 | + "nbconvert_exporter": "python", |
| 425 | + "pygments_lexer": "ipython3", |
| 426 | + "version": "3.11.6" |
481 | 427 | },
|
482 | 428 | "vscode": {
|
483 | 429 | "interpreter": {
|
|
486 | 432 | }
|
487 | 433 | },
|
488 | 434 | "nbformat": 4,
|
489 |
| - "nbformat_minor": 0 |
| 435 | + "nbformat_minor": 4 |
490 | 436 | }
|
0 commit comments