Skip to content

Commit c59d0ca

Browse files
authored
Expand Python Bindings to include S2RegionTermIndexer (google#160)
* Python: Implement S2Earth class
* Python: Implement some key S2Testing methods
* Python: Implement S2RegionTermIndexer and tests
* Python: Port the `term_index` C++ example to Python
* Update authors/contributors
1 parent c69344a commit c59d0ca

File tree

6 files changed

+439
-2
lines changed

6 files changed

+439
-2
lines changed

AUTHORS

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,3 +10,4 @@
1010

1111
Dan Larkin-York <[email protected]>
1212
Google Inc.
13+
Koordinates Limited

CONTRIBUTORS

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,3 +27,4 @@ Eric Veach <[email protected]>
2727
Jesse Rosenstock <[email protected]>
2828
Julien Basch <[email protected]>
2929
Phil Elson <[email protected]>
30+
Robert Coup <[email protected]>

doc/examples/term_index.py

Lines changed: 101 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,101 @@
1+
"""
2+
This example shows how to add spatial data to an information retrieval
3+
system. Such systems work by converting documents into a collection of
4+
"index terms" (e.g., representing words or phrases), and then building an
5+
"inverted index" that maps each term to a list of documents (and document
6+
positions) where that term occurs.
7+
8+
This example shows how to convert spatial data into index terms, which can
9+
then be indexed along with the other document information.
10+
11+
This is a port of the C++ term_index.cc example for the Python API.
12+
"""
13+
import argparse
14+
from collections import defaultdict
15+
16+
import pywraps2 as s2
17+
18+
19+
def main():
    """Index random points as spatial terms and count cap-query matches.

    Parses the command-line flags --num_documents, --num_queries and
    --query_radius_km, indexes that many random points with an
    S2RegionTermIndexer, runs that many random S2Cap queries of the given
    radius, and prints the total number of points found.
    """
    arg_parser = argparse.ArgumentParser(
        description=(
            "This example shows how to convert spatial data into index terms, "
            "which can then be indexed along with the other document "
            "information."
        )
    )
    # (flag, type, default, help) for every supported command-line option.
    flag_specs = (
        ("--num_documents", int, 10000, "Number of documents"),
        ("--num_queries", int, 10000, "Number of queries"),
        ("--query_radius_km", float, 100, "Query radius in kilometers"),
    )
    for flag, flag_type, flag_default, flag_help in flag_specs:
        arg_parser.add_argument(
            flag, type=flag_type, default=flag_default, help=flag_help
        )
    options = arg_parser.parse_args()

    # Spatial terms carry this prefix so they cannot collide with other
    # index terms (e.g. ones representing words or phrases).
    term_prefix = "s2:"

    # The "documents" to index: each one is a single random point. Any
    # other S2Region type (polygons, polylines, rectangles, discs, buffered
    # geometry, ...) or a mixture of region types could be indexed instead.
    points = [
        s2.S2Testing.RandomPoint() for _ in range(options.num_documents)
    ]

    # The inverted index: index term -> set of ids of the documents that
    # produced that term.
    inverted_index = defaultdict(set)

    # The index holds points only, so tell the indexer. (min_level() and
    # max_level() may also need adjusting when querying very large or very
    # small regions.)
    term_indexer = s2.S2RegionTermIndexer()
    term_indexer.set_index_contains_points_only(True)

    # Populate the index.
    for doc_id, point in enumerate(points):
        for term in term_indexer.GetIndexTerms(point, term_prefix):
            inverted_index[term].add(doc_id)

    # The query radius is given in kilometers; S2Cap needs an angle.
    query_radius = s2.S1Angle.Radians(
        s2.S2Earth.KmToRadians(options.query_radius_km)
    )

    # Total number of documents (points) found across all queries.
    total_matches = 0
    for _ in range(options.num_queries):
        # Each query is a cap of the requested radius at a random center.
        query_cap = s2.S2Cap(s2.S2Testing.RandomPoint(), query_radius)

        # Union the document ids of every query term. (An actual
        # information retrieval system would do something more
        # sophisticated.)
        candidate_ids = set()
        for term in term_indexer.GetQueryTerms(query_cap, term_prefix):
            candidate_ids.update(inverted_index[term])

        # The candidates include every document that intersects the query
        # region plus some that only nearly intersect it, so re-check each
        # one against the original geometry.
        matches = [
            doc_id
            for doc_id in candidate_ids
            if query_cap.Contains(points[doc_id])
        ]

        # Do something with the results; this example only counts them.
        total_matches += len(matches)

    print(
        "Found %d points in %d queries"
        % (total_matches, options.num_queries)
    )
98+
99+
100+
# Run the example only when this file is executed as a script.
if __name__ == "__main__":
    main()

src/python/pywraps2_test.py

Lines changed: 178 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,9 +14,9 @@
1414
# limitations under the License.
1515
#
1616

17-
18-
1917
import unittest
18+
from collections import defaultdict
19+
2020
import pywraps2 as s2
2121

2222

@@ -606,5 +606,181 @@ def testS2CellIdToFromFaceIJ(self):
606606
self.assertEqual(1234, i)
607607
self.assertEqual(5678, j)
608608

609+
def testS2EarthMetricRadians(self):
    """Round-trip a distance through S2Earth: km -> radians -> meters."""
    radius_rad = s2.S2Earth.KmToRadians(12.34)
    self.assertAlmostEqual(radius_rad, 0.0019368985451286374)
    angle = s2.S1Angle.Radians(radius_rad)
    radius_m = s2.S2Earth.RadiansToMeters(angle.radians())
    # The value went through two floating-point conversions, so compare
    # approximately instead of demanding bit-exact equality.
    self.assertAlmostEqual(radius_m, 12340.0)
615+
616+
617+
class RegionTermIndexerTest(unittest.TestCase):
    """Randomized consistency tests for s2.S2RegionTermIndexer."""

    @staticmethod
    def _randomCapForLevels(indexer):
        """Return a random cap sized relative to the indexer's level range.

        The area bounds passed to S2Testing.GetRandomCap are 0.3 times the
        average cell area at indexer.max_level() and 4.0 times the average
        cell area at indexer.min_level().
        """
        return s2.S2Testing.GetRandomCap(
            0.3 * s2.S2Cell.AverageArea(indexer.max_level()),
            4.0 * s2.S2Cell.AverageArea(indexer.min_level())
        )

    def _randomCaps(self, query_type, **indexer_options):
        """Compare term-based lookups against brute-force coverings.

        Builds an index of ITERATIONS regions -- single points if the
        indexer's index_contains_points_only() option is true, otherwise
        caps of random size -- and then runs ITERATIONS queries. For every
        query, the set of indexed regions reached through the query terms
        must equal the set of indexed regions whose covering intersects the
        query's covering.

        Args:
          query_type: 'point' to query with random points, or 'cap' to
            query with caps of random size.
          **indexer_options: each key K is applied as indexer.set_K(value),
            and as coverer.set_K(value) when the coverer has that setter;
            e.g. _randomCaps('cap', min_level=0) calls
            indexer.set_min_level(0).

        Raises:
          ValueError: if query_type is neither 'point' nor 'cap'.
        """
        ITERATIONS = 400

        if query_type not in ('point', 'cap'):
            raise ValueError("Unknown query_type: %r" % (query_type,))

        indexer = s2.S2RegionTermIndexer()
        coverer = s2.S2RegionCoverer()

        # Apply the requested options to the indexer, and to the coverer
        # wherever it offers the same setter.
        for opt_key, opt_value in indexer_options.items():
            setter = "set_%s" % opt_key
            getattr(indexer, setter)(opt_value)
            if hasattr(coverer, setter):
                getattr(coverer, setter)(opt_value)

        # The brute-force coverings below must be built over the same level
        # range the indexer uses, including when min_level/max_level were
        # not passed explicitly and the two classes' defaults might differ.
        for level_opt in ("min_level", "max_level"):
            level_setter = getattr(coverer, "set_%s" % level_opt, None)
            if level_setter is not None:
                level_setter(getattr(indexer, level_opt)())

        coverings = []
        index = defaultdict(set)

        index_terms = 0
        query_terms = 0
        for doc_id in range(ITERATIONS):
            # Choose the region to be indexed: either a single point or a
            # cap of random size.
            if indexer.index_contains_points_only():
                cap = s2.S2Cap.FromPoint(s2.S2Testing.RandomPoint())
                terms = indexer.GetIndexTerms(cap.center(), "")
            else:
                cap = self._randomCapForLevels(indexer)
                terms = indexer.GetIndexTerms(cap, "")

            coverings.append(s2.S2CellUnion(coverer.GetCovering(cap)))
            for term in terms:
                index[term].add(doc_id)

            index_terms += len(terms)

        for _ in range(ITERATIONS):
            # Choose the region to be queried: either a random point or a
            # cap of random size.
            if query_type == 'point':
                cap = s2.S2Cap.FromPoint(s2.S2Testing.RandomPoint())
                terms = indexer.GetQueryTerms(cap.center(), "")
            else:
                cap = self._randomCapForLevels(indexer)
                terms = indexer.GetQueryTerms(cap, "")

            # Compute the expected results of the query by brute force.
            covering = s2.S2CellUnion(coverer.GetCovering(cap))
            expected = {
                doc_id
                for doc_id, indexed_covering in enumerate(coverings)
                if covering.Intersects(indexed_covering)
            }

            # Look terms up with .get() so that missing terms do not add
            # empty entries to the index.
            actual = set()
            for term in terms:
                actual.update(index.get(term, ()))

            self.assertEqual(expected, actual)
            query_terms += len(terms)

        print("Index terms/doc: %0.2f, Query terms/doc: %0.2f" % (
            float(index_terms) / ITERATIONS,
            float(query_terms) / ITERATIONS)
        )

    # We run one test case for each combination of space vs. time
    # optimization, and indexing regions vs. only points.

    def testIndexRegionsQueryRegionsOptimizeTime(self):
        self._randomCaps(
            "cap",
            optimize_for_space=False,
            min_level=0,
            max_level=16,
            max_cells=20,
        )

    def testIndexRegionsQueryPointsOptimizeTime(self):
        self._randomCaps(
            "point",
            optimize_for_space=False,
            min_level=0,
            max_level=16,
            max_cells=20,
        )

    def testIndexRegionsQueryRegionsOptimizeTimeWithLevelMod(self):
        self._randomCaps(
            "cap",
            optimize_for_space=False,
            min_level=6,
            max_level=12,
            level_mod=3,
        )

    def testIndexRegionsQueryRegionsOptimizeSpace(self):
        self._randomCaps(
            "cap",
            optimize_for_space=True,
            min_level=4,
            max_level=s2.S2CellId.kMaxLevel,
            max_cells=8,
        )

    def testIndexPointsQueryRegionsOptimizeTime(self):
        self._randomCaps(
            "cap",
            optimize_for_space=False,
            min_level=0,
            max_level=s2.S2CellId.kMaxLevel,
            level_mod=2,
            max_cells=20,
            index_contains_points_only=True,
        )

    def testIndexPointsQueryRegionsOptimizeSpace(self):
        self._randomCaps(
            "cap",
            optimize_for_space=True,
            index_contains_points_only=True,
        )

    def testMaxLevelSetLoosely(self):
        """Terms must agree when max_level is not aligned with level_mod.

        One indexer uses max_level=19 and the other max_level=20, both
        with min_level=1 and level_mod=2, so (max_level - min_level) is
        not a multiple of level_mod for the second one. Both must still
        generate the same index and query terms.
        """
        indexer1 = s2.S2RegionTermIndexer()
        indexer1.set_min_level(1)
        indexer1.set_level_mod(2)
        indexer1.set_max_level(19)

        indexer2 = s2.S2RegionTermIndexer()
        indexer2.set_min_level(1)
        indexer2.set_level_mod(2)
        indexer2.set_max_level(20)

        point = s2.S2Testing.RandomPoint()

        self.assertEqual(
            indexer1.GetIndexTerms(point, ""),
            indexer2.GetIndexTerms(point, "")
        )
        self.assertEqual(
            indexer1.GetQueryTerms(point, ""),
            indexer2.GetQueryTerms(point, "")
        )

        cap = s2.S2Testing.GetRandomCap(0.0, 1.0)
        self.assertEqual(
            indexer1.GetIndexTerms(cap, ""),
            indexer2.GetIndexTerms(cap, "")
        )
        self.assertEqual(
            indexer1.GetQueryTerms(cap, ""),
            indexer2.GetQueryTerms(cap, "")
        )
783+
784+
609785
# Run the test suite only when this file is executed as a script.
if __name__ == "__main__":
    unittest.main()

src/python/s2.i

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
%include "stdint.i"
44

55
%template() std::vector<unsigned long long>;
6+
%template() std::vector<string>;
67
%template() std::vector<S2CellId>;
78
%template() std::vector<S2Point>;
89
%template() std::vector<S2LatLng>;

0 commit comments

Comments
 (0)