From 75a10f34e6ba5abeddeb54a270c2e57e2b5ca54e Mon Sep 17 00:00:00 2001
From: Evgeni Burovski <evgeny.burovskiy@gmail.com>
Date: Sat, 15 Jul 2023 15:22:56 +0300
Subject: [PATCH 1/2] e2e: add kmeans inner op

---
 e2e/kmeans/kmeans.py | 58 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 58 insertions(+)
 create mode 100644 e2e/kmeans/kmeans.py

diff --git a/e2e/kmeans/kmeans.py b/e2e/kmeans/kmeans.py
new file mode 100644
index 00000000..204d9ce2
--- /dev/null
+++ b/e2e/kmeans/kmeans.py
@@ -0,0 +1,58 @@
+# k-means step, given data, `X` and centroids
+# https://realpython.com/numpy-array-programming/#clustering-algorithms
+import numpy as np
+import torch
+torch.set_default_device("cpu")
+import torch._dynamo.config as cfg
+cfg.numpy_ndarray_as_tensor = True
+
+
+# np.linalg.norm replacement (2-norm only), https://github.com/pytorch/pytorch/issues/105269
+def norm(a, axis):
+    s = (a.conj() * a).real
+    return np.sqrt(s.sum(axis=axis))
+
+
+#@torch.compile
+def get_labels(X, centroids) -> np.ndarray:
+    return np.argmin(norm(X - centroids[:, None], axis=2),
+                     axis=0)
+
+
+def init(npts):
+    np.random.seed(12345)
+    X = np.repeat([[5, 5], [10, 10]], [npts, npts], axis=0)
+    X = X + np.random.randn(*X.shape)  # 2 distinct "blobs"
+    centroids = np.array([[5, 5], [10, 10]])
+    return X, centroids
+
+
+################ benchmark #####################
+import time
+
+# ### numpy ###
+npts = int(2e7)
+X, centroids = init(npts)
+
+start_time = time.time()
+labels = get_labels(X, centroids)
+end_time = time.time()
+numpy_time = end_time - start_time
+print("\n\nnumpy:    elapsed=", numpy_time)
+
+
+# ### compile ###
+get_labels_c = torch.compile(get_labels)
+
+# ### warm up ###
+for _ in range(5):
+    get_labels_c(X, centroids)
+
+
+# ### measure ###
+start_time = time.time()
+labels = get_labels_c(X, centroids)
+end_time = time.time()
+compiled_time = end_time - start_time
+print("compiled: elapsed=", compiled_time, '  speedup = ', numpy_time / compiled_time)
+

From fb14b5508623976a75fc78d9cb2a9c3d4df49067 Mon Sep 17 00:00:00 2001
From: Evgeni Burovski <evgeny.burovskiy@gmail.com>
Date: Sat, 15 Jul 2023 16:07:27 +0300
Subject: [PATCH 2/2] DOC: update Readme

---
 Readme.md | 40 +++++++++++++++++++++++++++++++++++++++-
 1 file changed, 39 insertions(+), 1 deletion(-)

diff --git a/Readme.md b/Readme.md
index 38da1a2b..4869aa27 100644
--- a/Readme.md
+++ b/Readme.md
@@ -3,7 +3,9 @@ To test our wrapper, we use two strategies:
 - port parts of the numpy test suite
 - run several small examples which use NumPy and check that the results are identical to original NumPy.
 
-We only run tests and examples in the eager mode by replacing `import numpy as np` by `import torch_np as np`.
+We only run tests in the eager mode by replacing `import numpy as np` by `import torch_np as np`.
+Examples we run in both eager and JIT modes. 
+
 
 For numpy tests, see `torch_np/testing/numpy_tests` folder.
 
@@ -13,6 +15,42 @@ For numpy tests, see `torch_np/testing/numpy_tests` folder.
 - Build a random maze and find a path in it
 - Simulate a diffusion/advection process
 - Construct and visualize the Mandelbrot fractal
+- Inner operation of the k-means clustering
+
+# JIT compiled mode
+
+The main observation is that `torch.dynamo` unrolls python-level loops. For
+iterative algorithms this leads to very long compile times. We therefore
+often only compile the inner loop.
+
+## Maze path-finding
+
+The Bellman-Ford algorithm simply does not compile because it contains a
+data-dependent loop `while point != start`.
+
+
+## CFD diffusion/advecton process
+
+We compile the inner loop of the diffusion-advection simulation. While the code
+compiles, the performance is on par or slightly worse than the original NumPy.
+
+## Mandelbrot fractal
+
+Results strongly depend on an implementation: a straighforward NumPy implementation
+uses a data-dependent loop, which does not compile.
+
+The implementation based on the [Mojo benchmark](https://shashankprasanna.com/benchmarking-modular-mojo-and-pytorch-torch.compile-on-mandelbrot-function/index.html#benchmarking-pytorch-cpu-with-torchcompile) allows to compile the inner loop. The performance
+increase relative to numpy is substantial and strongly data size and machine
+dependent: x8 for smaller inputs and up to x50 for unputs larger than the cache size of the machine.
+
+
+## K-means clustering
+
+The internal loop of the k-means algorithm compiles into a straighforward
+C++ loop and offers up to x30 speedups versus NumPy.
+
+
+# Eager mode
 
 In short, the main changes to examples are: