Expand Python Bindings to include S2RegionTermIndexer (google#160)

rcoup · web-flow · commit c59d0ca01ae3 · 2020-12-14T12:49:52.000+01:00
* Python: Implement S2Earth class

* Python: implement some key S2Testing methods

* Python: implement S2RegionTermIndexer and tests

* Python: Port the `term_index` C++ example to Python.

* Update authors/contributors
diff --git a/AUTHORS b/AUTHORS
@@ -10,3 +10,4 @@
 
 Dan Larkin-York <dan@arangodb.com>
 Google Inc.
+Koordinates Limited
diff --git a/CONTRIBUTORS b/CONTRIBUTORS
@@ -27,3 +27,4 @@ Eric Veach <ericv@google.com>
 Jesse Rosenstock <jmr@google.com>
 Julien Basch <julienbasch@google.com>
 Phil Elson <pelson.pub@gmail.com>
+Robert Coup <robert.coup@koordinates.com>
diff --git a/doc/examples/term_index.py b/doc/examples/term_index.py
@@ -0,0 +1,101 @@
+"""
+This example shows how to add spatial data to an information retrieval
+system.  Such systems work by converting documents into a collection of
+"index terms" (e.g., representing words or phrases), and then building an
+"inverted index" that maps each term to a list of documents (and document
+positions) where that term occurs.
+
+This example shows how to convert spatial data into index terms, which can
+then be indexed along with the other document information.
+
+This is a port of the C++ term_index.cc example for the Python API.
+"""
+import argparse
+from collections import defaultdict
+
+import pywraps2 as s2
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description=(
+            "This example shows how to convert spatial data into index terms, "
+            "which can then be indexed along with the other document "
+            "information."
+        )
+    )
+    parser.add_argument(
+        '--num_documents', type=int, default=10000, help="Number of documents"
+    )
+    parser.add_argument(
+        '--num_queries', type=int, default=10000, help="Number of queries"
+    )
+    parser.add_argument(
+        '--query_radius_km', type=float, default=100,
+        help="Query radius in kilometers"
+    )
+
+    args = parser.parse_args()
+
+    # A prefix added to spatial terms to distinguish them from other index terms
+    # (e.g. representing words or phrases).
+    PREFIX = "s2:"
+
+    # Create a set of "documents" to be indexed.  Each document consists of a
+    # single point.  (You can easily substitute any S2Region type here, or even
+    # index a mixture of region types using S2Region.  Other
+    # region types include polygons, polylines, rectangles, discs, buffered
+    # geometry, etc.)
+    documents = []
+    for i in range(args.num_documents):
+        documents.append(s2.S2Testing.RandomPoint())
+
+    # We use a dict as our inverted index.  The key is an index term, and
+    # the value is the set of "document ids" where this index term is present.
+    index = defaultdict(set)
+
+    # Create an indexer suitable for an index that contains points only.
+    # (You may also want to adjust min_level() or max_level() if you plan
+    # on querying very large or very small regions.)
+    indexer = s2.S2RegionTermIndexer()
+    indexer.set_index_contains_points_only(True)
+
+    # Add the documents to the index.
+    for docid, index_region in enumerate(documents):
+        for term in indexer.GetIndexTerms(index_region, PREFIX):
+            index[term].add(docid)
+
+    # Convert the query radius to an angle representation.
+    radius = s2.S1Angle.Radians(s2.S2Earth.KmToRadians(args.query_radius_km))
+
+    # Count the number of documents (points) found in all queries.
+    num_found = 0
+    for i in range(args.num_queries):
+        # Choose a random center for query.
+        query_region = s2.S2Cap(s2.S2Testing.RandomPoint(), radius)
+
+        # Convert the query region to a set of terms, and compute the union of
+        # the document ids associated with those terms.  (An actual information
+        # retrieval system would do something more sophisticated.)
+        candidates = set()
+        for term in indexer.GetQueryTerms(query_region, PREFIX):
+            candidates |= index[term]
+
+        # "candidates" now contains all documents that intersect the query
+        # region, along with some documents that nearly intersect it.  We can
+        # prune the results by retrieving the original "document" and checking
+        # the distance more precisely.
+        result = []
+        for docid in candidates:
+            if query_region.Contains(documents[docid]):
+                result.append(docid)
+
+        # Now do something with the results (in this example we just count
+        # them).
+        num_found += len(result)
+
+    print("Found %d points in %d queries" % (num_found, args.num_queries))
+
+
+# Run the example only when this file is executed as a script, not when it
+# is imported.
+if __name__ == "__main__":
+    main()
diff --git a/src/python/pywraps2_test.py b/src/python/pywraps2_test.py
@@ -14,9 +14,9 @@
 # limitations under the License.
 #
 
-
-
 import unittest
+from collections import defaultdict
+
 import pywraps2 as s2
 
 
@@ -606,5 +606,181 @@ def testS2CellIdToFromFaceIJ(self):
     self.assertEqual(1234, i)
     self.assertEqual(5678, j)
 
+  def testS2EarthMetricRadians(self):
+    radius_rad = s2.S2Earth.KmToRadians(12.34)
+    self.assertAlmostEqual(radius_rad, 0.0019368985451286374)
+    angle = s2.S1Angle.Radians(radius_rad)
+    radius_m = s2.S2Earth.RadiansToMeters(angle.radians())
+    self.assertEqual(radius_m, 12340.0)
+
+
+class RegionTermIndexerTest(unittest.TestCase):
+  def _randomCaps(self, query_type, **indexer_options):
+    # This function creates an index consisting either of points (if
+    # options.index_contains_points_only() is true) or S2Caps of random size.
+    # It then executes queries consisting of points (if query_type == POINT)
+    # or S2Caps of random size (if query_type == CAP).
+    #
+    # indexer_options are set on both the indexer & coverer (if relevant)
+    # eg. _randomCaps('cap', min_level=0) calls indexer.set_min_level(0)
+    ITERATIONS = 400
+
+    indexer = s2.S2RegionTermIndexer()
+    coverer = s2.S2RegionCoverer()
+
+    # set indexer options
+    for opt_key, opt_value in indexer_options.items():
+      setter = "set_%s" % opt_key
+      getattr(indexer, setter)(opt_value)
+      if hasattr(coverer, setter):
+        getattr(coverer, setter)(opt_value)
+
+    caps = []
+    coverings = []
+    index = defaultdict(set)
+
+    index_terms = 0
+    query_terms = 0
+    for i in range(ITERATIONS):
+      # Choose the region to be indexed: either a single point or a cap
+      # of random size (up to a full sphere).
+      terms = []
+      if indexer.index_contains_points_only():
+        cap = s2.S2Cap.FromPoint(s2.S2Testing.RandomPoint())
+        terms = indexer.GetIndexTerms(cap.center(), "")
+      else:
+        cap = s2.S2Testing.GetRandomCap(
+          0.3 * s2.S2Cell.AverageArea(indexer.max_level()),
+          4.0 * s2.S2Cell.AverageArea(indexer.min_level())
+        )
+        terms = indexer.GetIndexTerms(cap, "")
+
+      caps.append(cap)
+      coverings.append(s2.S2CellUnion(coverer.GetCovering(cap)))
+      for term in terms:
+        index[term].add(i)
+
+      index_terms += len(terms)
+
+    for i in range(ITERATIONS):
+      # Choose the region to be queried: either a random point or a cap of
+      # random size.
+      terms = []
+
+      if query_type == 'cap':
+        cap = s2.S2Cap.FromPoint(s2.S2Testing.RandomPoint())
+        terms = indexer.GetQueryTerms(cap.center(), "")
+      else:
+        cap = s2.S2Testing.GetRandomCap(
+          0.3 * s2.S2Cell.AverageArea(indexer.max_level()),
+          4.0 * s2.S2Cell.AverageArea(indexer.min_level())
+        )
+        terms = indexer.GetQueryTerms(cap, "")
+
+      # Compute the expected results of the S2Cell query by brute force.
+      covering = s2.S2CellUnion(coverer.GetCovering(cap))
+      expected, actual = set(), set()
+      for j in range(len(caps)):
+        if covering.Intersects(coverings[j]):
+          expected.add(j)
+      
+      for term in terms:
+        actual |= index[term]
+
+      self.assertEqual(expected, actual)
+      query_terms += len(terms)
+
+      print("Index terms/doc: %0.2f, Query terms/doc: %0.2f" % (
+        float(index_terms) / ITERATIONS,
+        float(query_terms) / ITERATIONS)
+      )
+
+  # We run one test case for each combination of space vs. time optimization,
+  # and indexing regions vs. only points.
+
+  def testIndexRegionsQueryRegionsOptimizeTime(self):
+    self._randomCaps("cap",
+      optimize_for_space=False,
+      min_level=0,
+      max_level=16,
+      max_cells=20,
+    )
+
+  def testIndexRegionsQueryPointsOptimizeTime(self):
+    self._randomCaps("point",
+      optimize_for_space=False,
+      min_level=0,
+      max_level=16,
+      max_cells=20,
+    )
+
+  def testIndexRegionsQueryRegionsOptimizeTimeWithLevelMod(self):
+    self._randomCaps("cap",
+      optimize_for_space=False,
+      min_level=6,
+      max_level=12,
+      level_mod=3,
+    )
+
+  def testIndexRegionsQueryRegionsOptimizeSpace(self):
+    self._randomCaps("cap",
+      optimize_for_space=True,
+      min_level=4,
+      max_level=s2.S2CellId.kMaxLevel,
+      max_cells=8,
+    )
+
+  def testIndexPointsQueryRegionsOptimizeTime(self):
+    self._randomCaps("cap",
+      optimize_for_space=False,
+      min_level=0,
+      max_level=s2.S2CellId.kMaxLevel,
+      level_mod=2,
+      max_cells=20,
+      index_contains_points_only=True,
+    )
+
+  def testIndexPointsQueryRegionsOptimizeSpace(self):
+    self._randomCaps("cap",
+      optimize_for_space=True,
+      index_contains_points_only=True,
+    )
+
+  def testMaxLevelSetLoosely(self):
+    # Test that correct terms are generated even when (max_level - min_level)
+    # is not a multiple of level_mod.
+    indexer1 = s2.S2RegionTermIndexer()
+    indexer1.set_min_level(1)
+    indexer1.set_level_mod(2)
+    indexer1.set_max_level(19)
+
+    indexer2 = s2.S2RegionTermIndexer()
+    indexer2.set_min_level(1)
+    indexer2.set_level_mod(2)
+    indexer2.set_max_level(19)
+    indexer2.set_max_level(20)
+
+    point = s2.S2Testing.RandomPoint()
+
+    self.assertEqual(
+      indexer1.GetIndexTerms(point, ""),
+      indexer2.GetIndexTerms(point, "")
+    )
+    self.assertEqual(
+      indexer1.GetQueryTerms(point, ""),
+      indexer2.GetQueryTerms(point, "")
+    )
+
+    cap = s2.S2Testing.GetRandomCap(0.0, 1.0)
+    self.assertEqual(
+      indexer1.GetIndexTerms(cap, ""),
+      indexer2.GetIndexTerms(cap, "")
+    )
+    self.assertEqual(
+      indexer1.GetQueryTerms(cap, ""),
+      indexer2.GetQueryTerms(cap, "")
+    )
+
+
 if __name__ == "__main__":
   unittest.main()
diff --git a/src/python/s2.i b/src/python/s2.i
@@ -3,6 +3,7 @@
 %include "stdint.i"
 
 %template() std::vector<unsigned long long>;
+%template() std::vector<string>;
 %template() std::vector<S2CellId>;
 %template() std::vector<S2Point>;
 %template() std::vector<S2LatLng>;
diff --git a/src/python/s2_common.i b/src/python/s2_common.i

Original file line number	Diff line number	Diff line change
`@@ -10,3 +10,4 @@`
`10`	`10`
`11`	`11`	`Dan Larkin-York <dan@arangodb.com>`
`12`	`12`	`Google Inc.`
	`13`	`+Koordinates Limited`
-Original file line number
+Diff line change
 Jesse Rosenstock <jmr@google.com>
 Julien Basch <julienbasch@google.com>
 Phil Elson <pelson.pub@gmail.com>
 +Robert Coup <robert.coup@koordinates.com>