Skip to content

Commit a99a6c1

Browse files
authored
Feature/update luceneknn (#443)
* Updating luceneknn * updating docker file
1 parent d4de07f commit a99a6c1

File tree

2 files changed

+26
-18
lines changed

2 files changed

+26
-18
lines changed

ann_benchmarks/algorithms/luceneknn/Dockerfile

Lines changed: 13 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -8,25 +8,29 @@ RUN apt-get install -y wget apt-transport-https gnupg
88
RUN wget -O - https://packages.adoptium.net/artifactory/api/gpg/key/public | apt-key add -
99
RUN echo "deb https://packages.adoptium.net/artifactory/deb $(awk -F= '/^VERSION_CODENAME/{print$2}' /etc/os-release) main" | tee /etc/apt/sources.list.d/adoptium.list
1010
RUN apt-get update
11-
RUN apt-get install -y temurin-17-jdk
11+
RUN apt-get install -y temurin-20-jdk temurin-17-jdk
1212

1313
# Install PyLucene & JCC
14-
RUN wget https://dlcdn.apache.org/lucene/pylucene/pylucene-9.4.1-src.tar.gz
15-
RUN tar -xzf pylucene-9.4.1-src.tar.gz
16-
ENV JCC_JDK=/usr/lib/jvm/temurin-17-jdk-amd64
17-
WORKDIR /home/app/pylucene-9.4.1/jcc
14+
RUN wget https://dlcdn.apache.org/lucene/pylucene/pylucene-9.7.0-src.tar.gz
15+
RUN tar -xzf pylucene-9.7.0-src.tar.gz
16+
ENV JCC_JDK=/usr/lib/jvm/temurin-20-jdk-amd64
17+
WORKDIR /home/app/pylucene-9.7.0/jcc
1818
RUN python3 ./setup.py build
1919
RUN python3 ./setup.py install
2020

21-
WORKDIR /home/app/pylucene-9.4.1
21+
WORKDIR /home/app/pylucene-9.7.0
2222
ENV PYTHON=python3
2323
ENV JCC="$(PYTHON) -m jcc --shared"
2424
ENV NUM_FILES=16
25-
RUN make lucene-java-9.4.1
26-
RUN mkdir lucene-java-9.4.1/lucene/extensions/src/java/org/apache/pylucene/codecs
27-
RUN wget -O ./lucene-java-9.4.1/lucene/extensions/src/java/org/apache/pylucene/codecs/PyLucene94Codec.java https://gist.githubusercontent.com/benwtrent/f3a6c4a9ce9749e702285dc82f39a129/raw/4742cf91401103f86809655d5c708b833beae43f/PyLucene94Codec.java
25+
# Needed because the Gradle version Lucene currently uses doesn't support Java 20 for the build
26+
ENV JAVA_HOME=/usr/lib/jvm/temurin-17-jdk-amd64
27+
RUN make lucene-java-9.7.0
28+
RUN mkdir lucene-java-9.7.0/lucene/extensions/src/java/org/apache/pylucene/codecs
29+
RUN wget -O ./lucene-java-9.7.0/lucene/extensions/src/java/org/apache/pylucene/codecs/PyLucene95Codec.java https://gist.githubusercontent.com/benwtrent/79d70d59716f0e25833c5ea84d956c12/raw/8f529f3437c2fb8318f0127ecd71c960e43e0a7f/PyLucene95Codec.java
2830
RUN make
2931
RUN make install
32+
# Switch back to jdk20 for panama vectorization support
33+
ENV JAVA_HOME=/usr/lib/jvm/temurin-20-jdk-amd64
3034

3135
# Reset the work dir so scripts can be run
3236
WORKDIR /home/app

ann_benchmarks/algorithms/luceneknn/module.py

Lines changed: 13 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -7,21 +7,21 @@
77
import sklearn.preprocessing
88
from java.nio.file import Paths
99
from lucene import JArray
10-
from org.apache.lucene.codecs.lucene94 import Lucene94HnswVectorsFormat
10+
from org.apache.lucene.codecs.lucene95 import Lucene95HnswVectorsFormat
1111
from org.apache.lucene.document import Document, KnnVectorField, StoredField
1212
from org.apache.lucene.index import (DirectoryReader, IndexWriter,
1313
IndexWriterConfig,
1414
VectorSimilarityFunction)
1515
from org.apache.lucene.search import IndexSearcher, KnnVectorQuery
1616
from org.apache.lucene.store import FSDirectory
17-
from org.apache.pylucene.codecs import PyLucene94Codec
17+
from org.apache.pylucene.codecs import PyLucene95Codec
1818

1919
from ..base.module import BaseANN
2020

2121

22-
class Codec(PyLucene94Codec):
22+
class Codec(PyLucene95Codec):
2323
"""
24-
Custom codec so that the appropriate Lucene94 codec can be returned with the configured M and efConstruction
24+
Custom codec that returns a Lucene95 HNSW vectors format configured with the given M and efConstruction
2525
"""
2626

2727
def __init__(self, M, efConstruction):
@@ -30,7 +30,7 @@ def __init__(self, M, efConstruction):
3030
self.efConstruction = efConstruction
3131

3232
def getKnnVectorsFormatForField(self, field):
33-
return Lucene94HnswVectorsFormat(self.M, self.efConstruction)
33+
return Lucene95HnswVectorsFormat(self.M, self.efConstruction)
3434

3535

3636
class PyLuceneKNN(BaseANN):
@@ -40,9 +40,13 @@ class PyLuceneKNN(BaseANN):
4040

4141
def __init__(self, metric: str, dimension: int, param):
4242
try:
43-
lucene.initVM(vmargs=["-Djava.awt.headless=true -Xmx6g -Xms6g"])
44-
except ValueError:
45-
print("VM already initialized")
43+
lucene.initVM(
44+
initialheap="6g",
45+
maxheap="6g",
46+
vmargs=["--add-modules=jdk.incubator.vector"]
47+
)
48+
except ValueError as e:
49+
print(f"VM already initialized: {e}")
4650
self.metric = metric
4751
self.dimension = dimension
4852
self.param = param
@@ -78,7 +82,7 @@ def fit(self, X):
7882
doc.add(StoredField("id", id))
7983
iw.addDocument(doc)
8084
id += 1
81-
if id + 1 % 1000 == 0:
85+
if (id + 1) % 1000 == 0:
8286
print(f"LuceneKNN: written {id} docs")
8387
# Force merge so only one HNSW graph is searched.
8488
iw.forceMerge(1)

0 commit comments

Comments
 (0)