KAFKA-19312 Avoiding concurrent execution of onComplete and tryComplete (#19759)

apoorvmittal10 · web-flow · commit adb76779ed14 · 2025-05-25T14:36:43.000+08:00
The `onComplete` method in DelayedOperation is guaranteed to run only
once, through `forceComplete`, invoked either by `tryComplete` or when
the operation expires (`run` method). `tryComplete` is invoked while
holding `lock`, so no concurrent execution of `tryComplete` happens for
the same delayed operation. However, `tryComplete` and `onComplete` can
execute concurrently, as the `expiration` thread can trigger a separate
run of `onComplete` while `tryComplete` is still executing. This
behaviour is not ideal, as it allows parallel runs in which one
thread's work is wasteful, i.e. if `onComplete` has already been
invoked by another thread then execution of `tryComplete` is not
required.

I ran some tests and the performance is the same.

### After the changes:

```
--num 10000 --rate 100 --timeout 1000 --pct50 0.5 --pct75 0.75

# latency samples: pct75 = 0, pct50 = 0, min = 0, max = 7
# interval samples: rate = 100.068948, min = 0, max = 129
# enqueue rate (10000 requests):
# <elapsed time ms>	<target rate>	<actual rate>	<process cpu time ms>	<G1 Old Generation count> <G1 Young Generation count>	<G1 Old Generation time ms> <G1 Young Generation time ms>
101196	99.809364	99.806376	3240	0 2	0 8
```

```
--num 10000 --rate 1000 --timeout 1000 --pct50 0.5 --pct75 0.75

# latency samples: pct75 = 0, pct50 = 0, min = 0, max = 9
# interval samples: rate = 999.371395, min = 0, max = 14
# enqueue rate (10000 requests):
# <elapsed time ms>	<target rate>	<actual rate>	<process cpu time ms>	<G1 Old Generation count> <G1 Young Generation count>	<G1 Old Generation time ms> <G1 Young Generation time ms>
11104	989.902990	989.805008	1349	0 2	0 7
```

### Before changes:

```
--num 10000 --rate 100 --timeout 1000 --pct50 0.5 --pct75 0.75

# latency samples: pct75 = 0, pct50 = 0, min = 0, max = 9
# interval samples: rate = 100.020304, min = 0, max = 130
# enqueue rate (10000 requests):
# <elapsed time ms>	<target rate>	<actual rate>	<process cpu time ms>	<G1 Old Generation count> <G1 Young Generation count>	<G1 Old Generation time ms> <G1 Young Generation time ms>
102366	98.657274	98.652408	3444	0 2	0 8
```

```
--num 10000 --rate 1000 --timeout 1000 --pct50 0.5 --pct75 0.75

# latency samples: pct75 = 0, pct50 = 0, min = 0, max = 8
# interval samples: rate = 997.134236, min = 0, max = 14
# enqueue rate (10000 requests):
# <elapsed time ms>	<target rate>	<actual rate>	<process cpu time ms>	<G1 Old Generation count> <G1 Young Generation count>	<G1 Old Generation time ms> <G1 Young Generation time ms>
11218	978.665101	978.665101	1624	0 2	0 7
```

Reviewers: Jun Rao <junrao@gmail.com>, Andrew Schofield
 <aschofield@confluent.io>, Chia-Ping Tsai <chia7712@gmail.com>
diff --git a/core/src/main/java/kafka/server/share/DelayedShareFetch.java b/core/src/main/java/kafka/server/share/DelayedShareFetch.java
@@ -202,32 +202,29 @@ public void onExpiration() {
      * Complete the share fetch operation by fetching records for all partitions in the share fetch request irrespective
      * of whether they have any acquired records. This is called when the fetch operation is forced to complete either
      * because records can be acquired for some partitions or due to MaxWaitMs timeout.
+     * <p>
+     * On operation timeout, onComplete is invoked, last try occurs to acquire partitions and read
+     * from log, if acquired. The fetch will only happen from local log and not remote storage, on
+     * operation expiration.
      */
     @Override
     public void onComplete() {
-        // We are utilizing lock so that onComplete doesn't do a dirty read for instance variables -
-        // partitionsAcquired and localPartitionsAlreadyFetched, since these variables can get updated in a different tryComplete thread.
-        lock.lock();
         log.trace("Completing the delayed share fetch request for group {}, member {}, "
             + "topic partitions {}", shareFetch.groupId(), shareFetch.memberId(),
             partitionsAcquired.keySet());
 
-        try {
-            if (remoteStorageFetchException.isPresent()) {
-                completeErroneousRemoteShareFetchRequest();
-            } else if (pendingRemoteFetchesOpt.isPresent()) {
-                if (maybeRegisterCallbackPendingRemoteFetch()) {
-                    log.trace("Registered remote storage fetch callback for group {}, member {}, "
-                            + "topic partitions {}", shareFetch.groupId(), shareFetch.memberId(),
-                        partitionsAcquired.keySet());
-                    return;
-                }
-                completeRemoteStorageShareFetchRequest();
-            } else {
-                completeLocalLogShareFetchRequest();
+        if (remoteStorageFetchException.isPresent()) {
+            completeErroneousRemoteShareFetchRequest();
+        } else if (pendingRemoteFetchesOpt.isPresent()) {
+            if (maybeRegisterCallbackPendingRemoteFetch()) {
+                log.trace("Registered remote storage fetch callback for group {}, member {}, "
+                        + "topic partitions {}", shareFetch.groupId(), shareFetch.memberId(),
+                    partitionsAcquired.keySet());
+                return;
             }
-        } finally {
-            lock.unlock();
+            completeRemoteStorageShareFetchRequest();
+        } else {
+            completeLocalLogShareFetchRequest();
         }
     }
 
@@ -358,15 +355,15 @@ public boolean tryComplete() {
                 if (anyPartitionHasLogReadError(replicaManagerReadResponse) || isMinBytesSatisfied(topicPartitionData, partitionMaxBytesStrategy.maxBytes(shareFetch.fetchParams().maxBytes, topicPartitionData.keySet(), topicPartitionData.size()))) {
                     partitionsAcquired = topicPartitionData;
                     localPartitionsAlreadyFetched = replicaManagerReadResponse;
-                    return forceCompleteRequest();
+                    return forceComplete();
                 } else {
                     log.debug("minBytes is not satisfied for the share fetch request for group {}, member {}, " +
                             "topic partitions {}", shareFetch.groupId(), shareFetch.memberId(),
                         sharePartitions.keySet());
                     releasePartitionLocks(topicPartitionData.keySet());
                 }
             } else {
-                log.trace("Can't acquire records for any partition in the share fetch request for group {}, member {}, " +
+                log.trace("Can't acquire any partitions in the share fetch request for group {}, member {}, " +
                         "topic partitions {}", shareFetch.groupId(), shareFetch.memberId(),
                     sharePartitions.keySet());
             }
@@ -381,10 +378,8 @@ public boolean tryComplete() {
                 releasePartitionLocks(topicPartitionData.keySet());
                 partitionsAcquired.clear();
                 localPartitionsAlreadyFetched.clear();
-                return forceCompleteRequest();
-            } else {
-                return forceCompleteRequest();
             }
+            return forceComplete();
         }
     }
 
@@ -785,7 +780,7 @@ private boolean maybeCompletePendingRemoteFetch() {
         }
 
         if (canComplete || pendingRemoteFetchesOpt.get().isDone()) { // Case d
-            return forceCompleteRequest();
+            return forceComplete();
         } else
             return false;
     }
@@ -944,16 +939,6 @@ private void cancelRemoteFetchTask(RemoteFetch remoteFetch) {
         }
     }
 
-    private boolean forceCompleteRequest() {
-        boolean completedByMe = forceComplete();
-        // If the delayed operation is completed by me, the acquired locks are already released in onComplete().
-        // Otherwise, we need to release the acquired locks.
-        if (!completedByMe) {
-            releasePartitionLocksAndAddToActionQueue(partitionsAcquired.keySet());
-        }
-        return completedByMe;
-    }
-
     private void completeRemoteShareFetchRequestOutsidePurgatory() {
         if (outsidePurgatoryCallbackLock.compareAndSet(false, true)) {
             completeRemoteStorageShareFetchRequest();
diff --git a/server-common/src/main/java/org/apache/kafka/server/purgatory/DelayedOperation.java b/server-common/src/main/java/org/apache/kafka/server/purgatory/DelayedOperation.java
@@ -19,7 +19,6 @@
 import org.apache.kafka.server.util.timer.TimerTask;
 
 import java.util.Optional;
-import java.util.concurrent.atomic.AtomicBoolean;
 import java.util.concurrent.locks.Lock;
 import java.util.concurrent.locks.ReentrantLock;
 
@@ -29,10 +28,9 @@
  * a delayed fetch operation could be waiting for a given number of bytes to accumulate.
  * <br/>
  * The logic upon completing a delayed operation is defined in onComplete() and will be called exactly once.
- * Once an operation is completed, isCompleted() will return true. onComplete() can be triggered by either
- * forceComplete(), which forces calling onComplete() after delayMs if the operation is not yet completed,
- * or tryComplete(), which first checks if the operation can be completed or not now, and if yes calls
- * forceComplete().
+ * Once an operation is completed, isCompleted() will return true. onComplete() is called from forceComplete(),
+ * which is triggered by either expiration, if the operation is not completed after delayMs; or tryComplete(),
+ * if the operation can be completed now.
  * <br/>
  * A subclass of DelayedOperation needs to provide an implementation of both onComplete() and tryComplete().
  * <br/>
@@ -41,7 +39,7 @@
  */
 public abstract class DelayedOperation extends TimerTask {
 
-    private final AtomicBoolean completed = new AtomicBoolean(false);
+    private volatile boolean completed = false;
 
     protected final Lock lock;
 
@@ -68,24 +66,36 @@ public DelayedOperation(long delayMs, Lock lock) {
      * Return true iff the operation is completed by the caller: note that
      * concurrent threads can try to complete the same operation, but only
      * the first thread will succeed in completing the operation and return
-     * true, others will still return false
+     * true, others will still return false.
      */
     public boolean forceComplete() {
-        if (completed.compareAndSet(false, true)) {
-            // cancel the timeout timer
-            cancel();
-            onComplete();
-            return true;
-        } else {
+        // Do not proceed if the operation is already completed.
+        if (completed) {
             return false;
         }
+        // Attain lock prior completing the request.
+        lock.lock();
+        try {
+            // Re-check, if the operation is already completed by some other thread.
+            if (!completed) {
+                completed = true;
+                // cancel the timeout timer
+                cancel();
+                onComplete();
+                return true;
+            } else {
+                return false;
+            }
+        } finally {
+            lock.unlock();
+        }
     }
 
     /**
      * Check if the delayed operation is already completed
      */
     public boolean isCompleted() {
-        return completed.get();
+        return completed;
     }
 
     /**