Skip to content

kmeans incorrect result #297

@DrTodd13

Description

@DrTodd13

`sum_points` is not recognized as a reduction by Numba. Keep this issue open to track the corresponding Numba issue; once that is fixed, we will presumably need some work on the Bodo side to support the new reduction type.

import numpy as np
import time
from bodo import jit, prange

@jit
def kmeans(num_points, num_features, k, num_iterations):
    """Run k-means on points read from the file ``kmeans_data``.

    Reproducer for the incorrect-result issue: the logic is intentionally
    kept exactly as reported; only the indentation of the function body
    (lost in the paste) has been restored so the snippet is valid Python.

    Args:
        num_points: Number of points to take from the input file.
        num_features: Number of float64 values per point.
        k: Number of clusters.
        num_iterations: Number of assign/update iterations to run.
            ``sum_points`` and ``count_points`` are only assigned inside
            the iteration loop, so this is expected to be at least 1.

    Side effects:
        Reads ``kmeans_data``; writes ``orig_centroids.out``, ``sum_points``,
        ``count_points``, ``centroids.out`` and ``labels.out``; prints the
        elapsed time of the iteration loop.
    """
    # Load the raw float64 values and view the first
    # num_points * num_features of them as a (num_points, num_features) array.
    data = np.fromfile("kmeans_data", dtype=np.float64)
    data = np.ascontiguousarray(data[:num_points*num_features].reshape((num_points, num_features)))

    # Initial centroids: evenly spaced values in [0, 1], saved for reference.
    centroids = np.linspace(0, 1, k * num_features).reshape(k, num_features)
    centroids.tofile("orig_centroids.out")
    labels = np.zeros(num_points, dtype=np.int32)

    start_time = time.time()
    for it in range(num_iterations):
        # Assign labels based on closest centroid
        for i in prange(num_points):
            distances = np.array([np.linalg.norm(data[i] - c) for c in centroids])
            labels[i] = np.argmin(distances)

        # Per-cluster accumulators, reset on every iteration.
        sum_points = np.zeros((k, num_features))
        count_points = np.zeros(k, dtype=np.int32)

        # NOTE: iterations of this prange loop that share a label update the
        # same row of sum_points and the same element of count_points. Per
        # the issue description, Numba does not recognize sum_points as a
        # reduction; this loop is the behavior under investigation and is
        # deliberately left as written.
        for i in prange(num_points):
            cluster_id = labels[i]
            sum_points[cluster_id, :] += data[i, :]
            count_points[cluster_id] += 1

        # Move each non-empty cluster's centroid to the mean of its points;
        # empty clusters keep their previous centroid.
        for j in range(k):
            if count_points[j] > 0:
                centroids[j] = sum_points[j] / count_points[j]

    end_time = time.time()
    print("internal time", end_time - start_time)

    # Dump the last iteration's accumulators and the final results.
    sum_points.tofile("sum_points")
    count_points.tofile("count_points")
    centroids.tofile("centroids.out")
    labels.tofile("labels.out")

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions