
Commit b390afd

zhijianli88 authored and Juan Quintela committed
migration/rdma: Fix out of order wrid
destination:
../qemu/build/qemu-system-x86_64 -enable-kvm -netdev tap,id=hn0,script=/etc/qemu-ifup,downscript=/etc/qemu-ifdown -device e1000,netdev=hn0,mac=50:52:54:00:11:22 -boot c -drive if=none,file=./Fedora-rdma-server-migration.qcow2,id=drive-virtio-disk0 -device virtio-blk-pci,bus=pci.0,addr=0x4,drive=drive-virtio-disk0,id=virtio-disk0 -m 2048 -smp 2 -device piix3-usb-uhci -device usb-tablet -monitor stdio -vga qxl -spice streaming-video=filter,port=5902,disable-ticketing -incoming rdma:192.168.22.23:8888

qemu-system-x86_64: -spice streaming-video=filter,port=5902,disable-ticketing: warning: short-form boolean option 'disable-ticketing' deprecated
Please use disable-ticketing=on instead
QEMU 6.0.50 monitor - type 'help' for more information
(qemu) trace-event qemu_rdma_block_for_wrid_miss on
(qemu) dest_init RDMA Device opened: kernel name rxe_eth0 uverbs device name uverbs2, infiniband_verbs class device path /sys/class/infiniband_verbs/uverbs2, infiniband class device path /sys/class/infiniband/rxe_eth0, transport: (2) Ethernet
qemu_rdma_block_for_wrid_miss A Wanted wrid CONTROL SEND (2000) but got CONTROL RECV (4000)

source:
../qemu/build/qemu-system-x86_64 -enable-kvm -netdev tap,id=hn0,script=/etc/qemu-ifup,downscript=/etc/qemu-ifdown -device e1000,netdev=hn0,mac=50:52:54:00:11:22 -boot c -drive if=none,file=./Fedora-rdma-server.qcow2,id=drive-virtio-disk0 -device virtio-blk-pci,bus=pci.0,addr=0x4,drive=drive-virtio-disk0,id=virtio-disk0 -m 2048 -smp 2 -device piix3-usb-uhci -device usb-tablet -monitor stdio -vga qxl -spice streaming-video=filter,port=5901,disable-ticketing -S

qemu-system-x86_64: -spice streaming-video=filter,port=5901,disable-ticketing: warning: short-form boolean option 'disable-ticketing' deprecated
Please use disable-ticketing=on instead
QEMU 6.0.50 monitor - type 'help' for more information
(qemu) trace-event qemu_rdma_block_for_wrid_miss on
(qemu) migrate -d rdma:192.168.22.23:8888
source_resolve_host RDMA Device opened: kernel name rxe_eth0 uverbs device name uverbs2, infiniband_verbs class device path /sys/class/infiniband_verbs/uverbs2, infiniband class device path /sys/class/infiniband/rxe_eth0, transport: (2) Ethernet
(qemu) qemu_rdma_block_for_wrid_miss A Wanted wrid WRITE RDMA (1) but got CONTROL RECV (4000)

NOTE: we use soft RoCE as the RDMA device:
[root@iaas-rpma images]# rdma link show rxe_eth0/1
link rxe_eth0/1 state ACTIVE physical_state LINK_UP netdev eth0

This migration cannot complete when an out-of-order (OOO) CQ event occurs.

The send queue and the receive queue share one completion queue, and
qemu_rdma_block_for_wrid() drops the CQ events it is not interested in. But an
event dropped by qemu_rdma_block_for_wrid() may be exactly the one a later
wait needs; in that case qemu_rdma_block_for_wrid() blocks forever.

OOO cases occur on both the source side and the destination side, but the
permanent block happens only when SEND and RECV are out of order; OOO between
'WRITE RDMA' and 'RECV' doesn't matter.

Below is the OOO sequence:

             source                          destination
     rdma_write_one()                qemu_rdma_registration_handle()
 1.  S1: post_recv X                 D1: post_recv Y
 2.  wait for recv CQ event X
 3.                                  D2: post_send X ----------------+
 4.                                  wait for send CQ event X (D2)   |
 5.  recv CQ event X reaches (D2)                                    |
 6.  +-S2: post_send Y                                               |
 7.  | wait for send CQ event Y                                      |
 8.  |                               recv CQ event Y (S2) (drop it)  |
 9.  +-send CQ event Y reaches (S2)                                  |
10.                                  send CQ event X reaches (D2) <--+
11.                                  wait recv CQ event Y (dropped by (8))

Although hardware IB works fine in a hundred of my runs, the IB specification
doesn't guarantee the CQ order in such a case.

Here we introduce an independent send completion queue to distinguish
ibv_post_send completions from the original mixed completion queue. It lets us
poll for the specific CQE we are really interested in.

Signed-off-by: Li Zhijian <[email protected]>
Reviewed-by: Juan Quintela <[email protected]>
Signed-off-by: Juan Quintela <[email protected]>
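Editor's note: the failure mode is easiest to see in isolation. The following
is a minimal, hypothetical libibverbs sketch (not QEMU code; the function name
and control flow are invented) of the shared-CQ polling pattern that
qemu_rdma_block_for_wrid() used before this patch:

    #include <infiniband/verbs.h>

    /*
     * Sketch of the buggy pattern: one CQ serves both the send and the
     * receive queue, and the poller discards any completion whose wr_id
     * is not the one it is blocking on.  A discarded CQE is gone for
     * good, so whoever waits for that wr_id later blocks forever.
     */
    static int block_for_wrid_shared_cq(struct ibv_cq *shared_cq,
                                        uint64_t wrid_requested)
    {
        struct ibv_wc wc;

        for (;;) {
            int n = ibv_poll_cq(shared_cq, 1, &wc); /* dequeues the CQE */
            if (n < 0) {
                return -1;                          /* polling error */
            }
            if (n == 0) {
                continue; /* empty; the real code sleeps on a completion
                           * channel here rather than spinning */
            }
            if (wc.wr_id == wrid_requested) {
                return 0;                           /* the CQE we wanted */
            }
            /*
             * Any other CQE (e.g. a CONTROL RECV while waiting for a
             * CONTROL SEND) is dropped, which is exactly step 8 of the
             * sequence above.
             */
        }
    }

Splitting sends onto their own CQ makes such a mismatch impossible: each
waiter polls only the queue its wrid can appear on.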
1 parent af53175 commit b390afd

File tree

1 file changed: +101 -37 lines changed

migration/rdma.c

Lines changed: 101 additions & 37 deletions
@@ -358,9 +358,11 @@ typedef struct RDMAContext {
     struct ibv_context *verbs;
     struct rdma_event_channel *channel;
     struct ibv_qp *qp; /* queue pair */
-    struct ibv_comp_channel *comp_channel; /* completion channel */
+    struct ibv_comp_channel *recv_comp_channel; /* recv completion channel */
+    struct ibv_comp_channel *send_comp_channel; /* send completion channel */
     struct ibv_pd *pd; /* protection domain */
-    struct ibv_cq *cq; /* completion queue */
+    struct ibv_cq *recv_cq; /* receive completion queue */
+    struct ibv_cq *send_cq; /* send completion queue */
 
     /*
      * If a previous write failed (perhaps because of a failed
@@ -1059,21 +1061,34 @@ static int qemu_rdma_alloc_pd_cq(RDMAContext *rdma)
         return -1;
     }
 
-    /* create completion channel */
-    rdma->comp_channel = ibv_create_comp_channel(rdma->verbs);
-    if (!rdma->comp_channel) {
-        error_report("failed to allocate completion channel");
+    /* create receive completion channel */
+    rdma->recv_comp_channel = ibv_create_comp_channel(rdma->verbs);
+    if (!rdma->recv_comp_channel) {
+        error_report("failed to allocate receive completion channel");
         goto err_alloc_pd_cq;
     }
 
     /*
-     * Completion queue can be filled by both read and write work requests,
-     * so must reflect the sum of both possible queue sizes.
+     * Completion queue can be filled by read work requests.
      */
-    rdma->cq = ibv_create_cq(rdma->verbs, (RDMA_SIGNALED_SEND_MAX * 3),
-                             NULL, rdma->comp_channel, 0);
-    if (!rdma->cq) {
-        error_report("failed to allocate completion queue");
+    rdma->recv_cq = ibv_create_cq(rdma->verbs, (RDMA_SIGNALED_SEND_MAX * 3),
+                                  NULL, rdma->recv_comp_channel, 0);
+    if (!rdma->recv_cq) {
+        error_report("failed to allocate receive completion queue");
+        goto err_alloc_pd_cq;
+    }
+
+    /* create send completion channel */
+    rdma->send_comp_channel = ibv_create_comp_channel(rdma->verbs);
+    if (!rdma->send_comp_channel) {
+        error_report("failed to allocate send completion channel");
+        goto err_alloc_pd_cq;
+    }
+
+    rdma->send_cq = ibv_create_cq(rdma->verbs, (RDMA_SIGNALED_SEND_MAX * 3),
+                                  NULL, rdma->send_comp_channel, 0);
+    if (!rdma->send_cq) {
+        error_report("failed to allocate send completion queue");
         goto err_alloc_pd_cq;
     }
 
@@ -1083,11 +1098,19 @@ static int qemu_rdma_alloc_pd_cq(RDMAContext *rdma)
     if (rdma->pd) {
         ibv_dealloc_pd(rdma->pd);
     }
-    if (rdma->comp_channel) {
-        ibv_destroy_comp_channel(rdma->comp_channel);
+    if (rdma->recv_comp_channel) {
+        ibv_destroy_comp_channel(rdma->recv_comp_channel);
+    }
+    if (rdma->send_comp_channel) {
+        ibv_destroy_comp_channel(rdma->send_comp_channel);
+    }
+    if (rdma->recv_cq) {
+        ibv_destroy_cq(rdma->recv_cq);
+        rdma->recv_cq = NULL;
     }
     rdma->pd = NULL;
-    rdma->comp_channel = NULL;
+    rdma->recv_comp_channel = NULL;
+    rdma->send_comp_channel = NULL;
     return -1;
 
 }
@@ -1104,8 +1127,8 @@ static int qemu_rdma_alloc_qp(RDMAContext *rdma)
     attr.cap.max_recv_wr = 3;
     attr.cap.max_send_sge = 1;
     attr.cap.max_recv_sge = 1;
-    attr.send_cq = rdma->cq;
-    attr.recv_cq = rdma->cq;
+    attr.send_cq = rdma->send_cq;
+    attr.recv_cq = rdma->recv_cq;
     attr.qp_type = IBV_QPT_RC;
 
     ret = rdma_create_qp(rdma->cm_id, rdma->pd, &attr);
@@ -1496,14 +1519,14 @@ static void qemu_rdma_signal_unregister(RDMAContext *rdma, uint64_t index,
  * (of any kind) has completed.
  * Return the work request ID that completed.
  */
-static uint64_t qemu_rdma_poll(RDMAContext *rdma, uint64_t *wr_id_out,
-                               uint32_t *byte_len)
+static uint64_t qemu_rdma_poll(RDMAContext *rdma, struct ibv_cq *cq,
+                               uint64_t *wr_id_out, uint32_t *byte_len)
 {
     int ret;
     struct ibv_wc wc;
     uint64_t wr_id;
 
-    ret = ibv_poll_cq(rdma->cq, 1, &wc);
+    ret = ibv_poll_cq(cq, 1, &wc);
 
     if (!ret) {
         *wr_id_out = RDMA_WRID_NONE;
@@ -1575,7 +1598,8 @@ static uint64_t qemu_rdma_poll(RDMAContext *rdma, uint64_t *wr_id_out,
 /* Wait for activity on the completion channel.
  * Returns 0 on success, non-0 on error.
  */
-static int qemu_rdma_wait_comp_channel(RDMAContext *rdma)
+static int qemu_rdma_wait_comp_channel(RDMAContext *rdma,
+                                       struct ibv_comp_channel *comp_channel)
 {
     struct rdma_cm_event *cm_event;
     int ret = -1;
@@ -1586,7 +1610,7 @@ static int qemu_rdma_wait_comp_channel(RDMAContext *rdma)
      */
     if (rdma->migration_started_on_destination &&
         migration_incoming_get_current()->state == MIGRATION_STATUS_ACTIVE) {
-        yield_until_fd_readable(rdma->comp_channel->fd);
+        yield_until_fd_readable(comp_channel->fd);
     } else {
         /* This is the source side, we're in a separate thread
          * or destination prior to migration_fd_process_incoming()
@@ -1597,7 +1621,7 @@ static int qemu_rdma_wait_comp_channel(RDMAContext *rdma)
      */
     while (!rdma->error_state && !rdma->received_error) {
         GPollFD pfds[2];
-        pfds[0].fd = rdma->comp_channel->fd;
+        pfds[0].fd = comp_channel->fd;
         pfds[0].events = G_IO_IN | G_IO_HUP | G_IO_ERR;
         pfds[0].revents = 0;
 
@@ -1655,6 +1679,17 @@ static int qemu_rdma_wait_comp_channel(RDMAContext *rdma)
     return rdma->error_state;
 }
 
+static struct ibv_comp_channel *to_channel(RDMAContext *rdma, int wrid)
+{
+    return wrid < RDMA_WRID_RECV_CONTROL ? rdma->send_comp_channel :
+           rdma->recv_comp_channel;
+}
+
+static struct ibv_cq *to_cq(RDMAContext *rdma, int wrid)
+{
+    return wrid < RDMA_WRID_RECV_CONTROL ? rdma->send_cq : rdma->recv_cq;
+}
+
 /*
  * Block until the next work request has completed.
  *
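Editor's note: the routing in to_channel()/to_cq() relies on the wrid
numbering visible in the traces above, where WRITE RDMA (1) and CONTROL SEND
(2000) sort below CONTROL RECV (4000). A usage sketch (variable names
invented; RDMA_WRID_RDMA_WRITE is assumed to be the value-1 wrid printed as
"WRITE RDMA (1)"):

    /* wrids below RDMA_WRID_RECV_CONTROL are send-side work requests
     * (RDMA WRITE, CONTROL SEND) and complete on the send CQ; CONTROL
     * RECV and above complete on the receive CQ. */
    struct ibv_cq *wcq = to_cq(rdma, RDMA_WRID_RDMA_WRITE);   /* -> rdma->send_cq */
    struct ibv_cq *rcq = to_cq(rdma, RDMA_WRID_RECV_CONTROL); /* -> rdma->recv_cq */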
@@ -1675,13 +1710,15 @@ static int qemu_rdma_block_for_wrid(RDMAContext *rdma, int wrid_requested,
     struct ibv_cq *cq;
     void *cq_ctx;
     uint64_t wr_id = RDMA_WRID_NONE, wr_id_in;
+    struct ibv_comp_channel *ch = to_channel(rdma, wrid_requested);
+    struct ibv_cq *poll_cq = to_cq(rdma, wrid_requested);
 
-    if (ibv_req_notify_cq(rdma->cq, 0)) {
+    if (ibv_req_notify_cq(poll_cq, 0)) {
         return -1;
     }
     /* poll cq first */
     while (wr_id != wrid_requested) {
-        ret = qemu_rdma_poll(rdma, &wr_id_in, byte_len);
+        ret = qemu_rdma_poll(rdma, poll_cq, &wr_id_in, byte_len);
         if (ret < 0) {
             return ret;
         }
@@ -1702,12 +1739,12 @@ static int qemu_rdma_block_for_wrid(RDMAContext *rdma, int wrid_requested,
     }
 
     while (1) {
-        ret = qemu_rdma_wait_comp_channel(rdma);
+        ret = qemu_rdma_wait_comp_channel(rdma, ch);
         if (ret) {
             goto err_block_for_wrid;
         }
 
-        ret = ibv_get_cq_event(rdma->comp_channel, &cq, &cq_ctx);
+        ret = ibv_get_cq_event(ch, &cq, &cq_ctx);
         if (ret) {
             perror("ibv_get_cq_event");
             goto err_block_for_wrid;
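Editor's note: ibv_get_cq_event() is now handed whichever channel matches the
awaited wrid. For context, a sketch of the canonical event cycle per the
ibv_get_cq_event(3) man page (names invented, error handling elided; QEMU's
loop performs the same steps spread across qemu_rdma_block_for_wrid()):

    struct ibv_cq *ev_cq;
    void *ev_ctx;

    if (ibv_get_cq_event(ch, &ev_cq, &ev_ctx) == 0) { /* blocks on ch->fd */
        ibv_ack_cq_events(ev_cq, 1);  /* every event must be acked */
        ibv_req_notify_cq(ev_cq, 0);  /* re-arm before polling again */
        /* ...then drain ev_cq with ibv_poll_cq() until it returns 0... */
    }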
@@ -1721,7 +1758,7 @@ static int qemu_rdma_block_for_wrid(RDMAContext *rdma, int wrid_requested,
         }
 
         while (wr_id != wrid_requested) {
-            ret = qemu_rdma_poll(rdma, &wr_id_in, byte_len);
+            ret = qemu_rdma_poll(rdma, poll_cq, &wr_id_in, byte_len);
             if (ret < 0) {
                 goto err_block_for_wrid;
             }
@@ -2437,13 +2474,21 @@ static void qemu_rdma_cleanup(RDMAContext *rdma)
         rdma_destroy_qp(rdma->cm_id);
         rdma->qp = NULL;
     }
-    if (rdma->cq) {
-        ibv_destroy_cq(rdma->cq);
-        rdma->cq = NULL;
+    if (rdma->recv_cq) {
+        ibv_destroy_cq(rdma->recv_cq);
+        rdma->recv_cq = NULL;
+    }
+    if (rdma->send_cq) {
+        ibv_destroy_cq(rdma->send_cq);
+        rdma->send_cq = NULL;
+    }
+    if (rdma->recv_comp_channel) {
+        ibv_destroy_comp_channel(rdma->recv_comp_channel);
+        rdma->recv_comp_channel = NULL;
     }
-    if (rdma->comp_channel) {
-        ibv_destroy_comp_channel(rdma->comp_channel);
-        rdma->comp_channel = NULL;
+    if (rdma->send_comp_channel) {
+        ibv_destroy_comp_channel(rdma->send_comp_channel);
+        rdma->send_comp_channel = NULL;
     }
     if (rdma->pd) {
         ibv_dealloc_pd(rdma->pd);
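Editor's note: the cleanup order above is load-bearing. A CQ must be destroyed
before the completion channel it is bound to, since ibv_destroy_comp_channel()
fails with EBUSY while any CQ still references the channel. A sketch of the
required order, mirroring the hunk above (NULL checks and assignments elided):

    ibv_destroy_cq(rdma->recv_cq);                     /* CQs first */
    ibv_destroy_cq(rdma->send_cq);
    ibv_destroy_comp_channel(rdma->recv_comp_channel); /* then channels */
    ibv_destroy_comp_channel(rdma->send_comp_channel);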
@@ -3115,10 +3160,14 @@ static void qio_channel_rdma_set_aio_fd_handler(QIOChannel *ioc,
 {
     QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc);
     if (io_read) {
-        aio_set_fd_handler(ctx, rioc->rdmain->comp_channel->fd,
+        aio_set_fd_handler(ctx, rioc->rdmain->recv_comp_channel->fd,
+                           false, io_read, io_write, NULL, opaque);
+        aio_set_fd_handler(ctx, rioc->rdmain->send_comp_channel->fd,
                            false, io_read, io_write, NULL, opaque);
     } else {
-        aio_set_fd_handler(ctx, rioc->rdmaout->comp_channel->fd,
+        aio_set_fd_handler(ctx, rioc->rdmaout->recv_comp_channel->fd,
+                           false, io_read, io_write, NULL, opaque);
+        aio_set_fd_handler(ctx, rioc->rdmaout->send_comp_channel->fd,
                            false, io_read, io_write, NULL, opaque);
     }
 }
@@ -3332,7 +3381,22 @@ static size_t qemu_rdma_save_page(QEMUFile *f, void *opaque,
      */
     while (1) {
         uint64_t wr_id, wr_id_in;
-        int ret = qemu_rdma_poll(rdma, &wr_id_in, NULL);
+        int ret = qemu_rdma_poll(rdma, rdma->recv_cq, &wr_id_in, NULL);
+        if (ret < 0) {
+            error_report("rdma migration: polling error! %d", ret);
+            goto err;
+        }
+
+        wr_id = wr_id_in & RDMA_WRID_TYPE_MASK;
+
+        if (wr_id == RDMA_WRID_NONE) {
+            break;
+        }
+    }
+
+    while (1) {
+        uint64_t wr_id, wr_id_in;
+        int ret = qemu_rdma_poll(rdma, rdma->send_cq, &wr_id_in, NULL);
         if (ret < 0) {
             error_report("rdma migration: polling error! %d", ret);
             goto err;

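Editor's note: with two completion queues, the flush at the top of
qemu_rdma_save_page() becomes two back-to-back copies of the same drain loop,
first for the receive CQ and then for the send CQ. A sketch of that loop
factored into a hypothetical helper (the patch itself open-codes both copies,
jumping to the function's err label on failure):

    static void drain_cq(RDMAContext *rdma, struct ibv_cq *cq)
    {
        uint64_t wr_id, wr_id_in;

        while (1) {
            int ret = qemu_rdma_poll(rdma, cq, &wr_id_in, NULL);
            if (ret < 0) {
                error_report("rdma migration: polling error! %d", ret);
                return;
            }
            wr_id = wr_id_in & RDMA_WRID_TYPE_MASK;
            if (wr_id == RDMA_WRID_NONE) {
                break; /* no completions left on this CQ */
            }
        }
    }

    /* qemu_rdma_save_page() then drains both queues in turn:
     *     drain_cq(rdma, rdma->recv_cq);
     *     drain_cq(rdma, rdma->send_cq);
     */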