diff --git a/vpr/src/place/net_cost_handler.cpp b/vpr/src/place/net_cost_handler.cpp
index 539e837090c..63fd0bf07fd 100644
--- a/vpr/src/place/net_cost_handler.cpp
+++ b/vpr/src/place/net_cost_handler.cpp
@@ -31,8 +31,9 @@
 #include "placer_state.h"
 #include "move_utils.h"
 #include "place_timing_update.h"
-#include "noc_place_utils.h"
 #include "vtr_math.h"
+#include "vtr_ndmatrix.h"
+#include "vtr_ndoffsetmatrix.h"
 
 #include <array>
 
@@ -53,9 +54,6 @@ constexpr std::array<float, MAX_FANOUT_CROSSING_COUNT> cross_count = {1.0000, 1.
                                                                       2.5610, 2.5864, 2.6117, 2.6371, 2.6625, 2.6887, 2.7148, 2.7410,
                                                                       2.7671, 2.7933};
 
-
-
-
 /**
  * @brief If the moving pin is of type type SINK, update bb_pin_sink_count_new which stores the number of sink pins on each layer of "net_id"
  * @param pin_old_loc Old location of the moving pin
@@ -229,6 +227,70 @@ void NetCostHandler::alloc_and_load_chan_w_factors_for_place_cost_(float place_c
             chany_place_cost_fac_[high][low] = pow((double)chany_place_cost_fac_[high][low], (double)place_cost_exp);
         }
     }
+    
+    if (device_ctx.grid.get_num_layers() > 1) {
+        alloc_and_load_for_fast_vertical_cost_update_(place_cost_exp);
+    }
+}
+
+/**
+ * @brief Precomputes chanz_place_cost_fac_[x_high][y_high][x_low][y_low] for every bounding box: tiles in the
+ * box divided by the inter-layer RR edges driven from inside it, raised to place_cost_exp (1 if there are none).
+ * @param place_cost_exp Exponent applied to the tiles-per-connection ratio of each bounding box.
+ */
+void NetCostHandler::alloc_and_load_for_fast_vertical_cost_update_(float place_cost_exp) {
+    const auto& device_ctx = g_vpr_ctx.device();
+    const auto& rr_graph = device_ctx.rr_graph;
+    const size_t grid_height = device_ctx.grid.height();
+    const size_t grid_width = device_ctx.grid.width();
+
+    chanz_place_cost_fac_ = vtr::NdMatrix<float, 4>({grid_width, grid_height, grid_width, grid_height}, 0.);
+
+    // acc_conn[x + 1][y + 1] first collects the number of inter-layer edges driven from tile (x, y).
+    // Row 0 and column 0 stay zero so that the prefix sums below need no boundary checks.
+    vtr::NdMatrix<int, 2> acc_conn({grid_width + 1, grid_height + 1}, 0);
+
+    // Counts the edge for the tile of its source node when source and sink lie on different layers.
+    auto count_if_inter_layer = [&](const auto& src_rr_node, const auto& rr_edge_idx) {
+        const auto& sink_rr_node = rr_graph.edge_sink_node(src_rr_node, rr_edge_idx);
+        if (rr_graph.node_layer(src_rr_node) != rr_graph.node_layer(sink_rr_node)) {
+            // We assume that a node driving an inter-layer connection is not stretched across multiple tiles.
+            const int src_x = rr_graph.node_xhigh(src_rr_node);
+            const int src_y = rr_graph.node_yhigh(src_rr_node);
+            VTR_ASSERT(rr_graph.node_xlow(src_rr_node) == src_x && rr_graph.node_ylow(src_rr_node) == src_y);
+            acc_conn[src_x + 1][src_y + 1]++;
+        }
+    };
+
+    for (const auto& src_rr_node : rr_graph.nodes()) {
+        for (const auto& rr_edge_idx : rr_graph.configurable_edges(src_rr_node)) {
+            count_if_inter_layer(src_rr_node, rr_edge_idx);
+        }
+        for (const auto& rr_edge_idx : rr_graph.non_configurable_edges(src_rr_node)) {
+            count_if_inter_layer(src_rr_node, rr_edge_idx);
+        }
+    }
+
+    // Turn the per-tile counts into 2D prefix sums in place, so the total of any box costs four look-ups.
+    for (size_t x = 1; x <= grid_width; x++) {
+        for (size_t y = 1; y <= grid_height; y++) {
+            acc_conn[x][y] += acc_conn[x - 1][y] + acc_conn[x][y - 1] - acc_conn[x - 1][y - 1];
+        }
+    }
+
+    for (size_t x_high = 0; x_high < grid_width; x_high++) {
+        for (size_t y_high = 0; y_high < grid_height; y_high++) {
+            for (size_t x_low = 0; x_low <= x_high; x_low++) {
+                for (size_t y_low = 0; y_low <= y_high; y_low++) {
+                    const int num_inter_die_conn = acc_conn[x_high + 1][y_high + 1] - acc_conn[x_low][y_high + 1] - acc_conn[x_high + 1][y_low] + acc_conn[x_low][y_low];
+                    const float num_tiles = static_cast<float>((x_high - x_low + 1) * (y_high - y_low + 1));
+                    // A box without inter-die connections gets factor 1: dividing by zero would give inf, and NaN once multiplied by a zero layer span.
+                    const float tiles_per_conn = (num_inter_die_conn > 0) ? num_tiles / num_inter_die_conn : 1.f;
+                    chanz_place_cost_fac_[x_high][y_high][x_low][y_low] = static_cast<float>(pow(static_cast<double>(tiles_per_conn), static_cast<double>(place_cost_exp)));
+                }
+            }
+        }
+    }
 }
 
 double NetCostHandler::comp_bb_cost(e_cost_methods method) {
@@ -1395,6 +1457,8 @@ double NetCostHandler::get_net_cube_bb_cost_(ClusterNetId net_id, bool use_ts) {
 
     const t_bb& bb = use_ts ? ts_bb_coord_new_[net_id] : placer_state_.move().bb_coords[net_id];
 
+    const bool is_multi_layer = (g_vpr_ctx.device().grid.get_num_layers() > 1);
+
     double crossing = wirelength_crossing_count(cluster_ctx.clb_nlist.net_pins(net_id).size());
 
     /* Could insert a check for xmin == xmax.  In that case, assume  *
@@ -1413,6 +1477,9 @@ double NetCostHandler::get_net_cube_bb_cost_(ClusterNetId net_id, bool use_ts) {
     double ncost;
     ncost = (bb.xmax - bb.xmin + 1) * crossing * chanx_place_cost_fac_[bb.ymax][bb.ymin - 1];
     ncost += (bb.ymax - bb.ymin + 1) * crossing * chany_place_cost_fac_[bb.xmax][bb.xmin - 1];
+    if (is_multi_layer) {
+        ncost += (bb.layer_max - bb.layer_min) * crossing * chanz_place_cost_fac_[bb.xmax][bb.ymax][bb.xmin][bb.ymin];
+    }
 
     return ncost;
 }
diff --git a/vpr/src/place/net_cost_handler.h b/vpr/src/place/net_cost_handler.h
index 65fab00afc2..3048b7637ea 100644
--- a/vpr/src/place/net_cost_handler.h
+++ b/vpr/src/place/net_cost_handler.h
@@ -195,6 +195,13 @@ class NetCostHandler {
      */
     vtr::NdOffsetMatrix<float, 2> chanx_place_cost_fac_; // [-1...device_ctx.grid.width()-1]
     vtr::NdOffsetMatrix<float, 2> chany_place_cost_fac_; // [-1...device_ctx.grid.height()-1]
+    /**
+     * @brief Cost factor for 3D (inter-layer) connections, analogous to the matrices described above.
+     * It is indexed as [x_high][y_high][x_low][y_low] by the corners of a bounding box. The placement
+     * cost function multiplies the layer span of a net's bounding box by this factor, i.e. the span is
+     * effectively divided by the average number of inter-die connections per tile within the bounding box.
+     */
+    vtr::NdMatrix<float, 4> chanz_place_cost_fac_; // [0...device_ctx.grid.width()-1][0...device_ctx.grid.height()-1][0...device_ctx.grid.width()-1][0...device_ctx.grid.height()-1]
 
 
   private:
@@ -249,6 +256,18 @@ class NetCostHandler {
      */
     void alloc_and_load_chan_w_factors_for_place_cost_(float place_cost_exp);
 
+    /**
+    * @brief Allocates and loads the chanz_place_cost_fac array with the inverse of
+    * the average number of inter-die connections per tile, for every bounding box on the grid.
+    *
+    * @details This is only useful for multi-die FPGAs. The place_cost_exp factor specifies to
+    * what power that inverse average should be taken -- larger numbers make regions with few inter-die connections more expensive.
+    *
+    * @param place_cost_exp It is an exponent to which you take the inverse of the average number of inter-die connections;
+    * a higher value would favour areas with more inter-die connections over areas with fewer of those during placement (usually we use 1).
+    */
+    void alloc_and_load_for_fast_vertical_cost_update_(float place_cost_exp);
+
     /**
      * @brief Calculate the new connection delay and timing cost of all the
      * sink pins affected by moving a specific pin to a new location. Also