Peter Eriksson
2024-05-09 17:58:32 UTC
Hi,
I’m trying to fix a bug in the cciss driver that has been there “forever” when using it with an HP H241 SAS HBA card.
I think I could use some input/suggestions from someone more knowledgeable with CAM and device drivers in general…
An example (this is with extra instrumentation added so not quite a stock kernel driver):
ciss1: *** Drive failure imminent, Port=1E Box=1 Bay=17 reason=0x37 [ts=4460, class=4, subclass=0, detail=1, tag=0x2]
(da122:ciss1:32:78:0): WRITE(16). CDB: 8a 00 00 00 00 04 7f 17 51 98 00 00 00 10 00 00
(da122:ciss1:32:78:0): CAM status: CCB request completed with an error
(da122:ciss1:32:78:0): Retrying command, 3 more tries remain
(da122:ciss1:32:78:0): WRITE(16). CDB: 8a 00 00 00 00 04 7f 17 52 18 00 00 00 10 00 00
(da122:ciss1:32:78:0): CAM status: CCB request completed with an error
(da122:ciss1:32:78:0): Retrying command, 3 more tries remain
(da122:ciss1:32:78:0): READ(16). CDB: 88 00 00 00 00 04 7f 17 65 c8 00 00 02 00 00 00
(da122:ciss1:32:78:0): CAM status: CCB request completed with an error
(da122:ciss1:32:78:0): Retrying command, 3 more tries remain
(da122:ciss1:32:78:0): READ(16). CDB: 88 00 00 00 00 04 7f 17 63 c8 00 00 02 00 00 00
(da122:ciss1:32:78:0): CAM status: CCB request completed with an error
(da122:ciss1:32:78:0): Retrying command, 3 more tries remain
(da122:ciss1:32:78:0): READ(16). CDB: 88 00 00 00 00 04 7f 17 61 c8 00 00 02 00 00 00
(da122:ciss1:32:78:0): CAM status: CCB request completed with an error
(da122:ciss1:32:78:0): Retrying command, 3 more tries remain
(da122:ciss1:32:78:0): READ(16). CDB: 88 00 00 00 00 04 7f 17 5f c8 00 00 02 00 00 00
(da122:ciss1:32:78:0): CAM status: CCB request completed with an error
(da122:ciss1:32:78:0): Retrying command, 3 more tries remain
(da122:ciss1:32:78:0): READ(16). CDB: 88 00 00 00 00 04 7f 17 5f a8 00 00 00 20 00 00
(da122:ciss1:32:78:0): CAM status: CCB request completed with an error
(da122:ciss1:32:78:0): Retrying command, 3 more tries remain
(da122:ciss1:32:78:0): READ(16). CDB: 88 00 00 00 00 04 7f 17 5f 38 00 00 00 70 00 00
(da122:ciss1:32:78:0): CAM status: CCB request completed with an error
(da122:ciss1:32:78:0): Retrying command, 3 more tries remain
(da122:ciss1:32:78:0): READ(16). CDB: 88 00 00 00 00 04 7f 17 5d 38 00 00 02 00 00 00
(da122:ciss1:32:78:0): CAM status: CCB request completed with an error
(da122:ciss1:32:78:0): Retrying command, 3 more tries remain
(da122:ciss1:32:78:0): READ(16). CDB: 88 00 00 00 00 04 7f 17 5b 38 00 00 02 00 00 00
(da122:ciss1:32:78:0): CAM status: CCB request completed with an error
(da122:ciss1:32:78:0): Retrying command, 3 more tries remain
(da122:ciss1:32:78:0): READ(16). CDB: 88 00 00 00 00 04 7f 17 59 38 00 00 02 00 00 00
(da122:ciss1:32:78:0): CAM status: CCB request completed with an error
(da122:ciss1:32:78:0): Retrying command, 3 more tries remain
(da122:ciss1:32:78:0): READ(16). CDB: 88 00 00 00 00 04 7f 17 58 48 00 00 00 f0 00 00
(da122:ciss1:32:78:0): CAM status: CCB request completed with an error
(da122:ciss1:32:78:0): Retrying command, 3 more tries remain
(da122:ciss1:32:78:0): READ(16). CDB: 88 00 00 00 00 04 7f 17 56 48 00 00 02 00 00 00
(da122:ciss1:32:78:0): CAM status: CCB request completed with an error
(da122:ciss1:32:78:0): Retrying command, 3 more tries remain
(da122:ciss1:32:78:0): WRITE(16). CDB: 8a 00 00 00 00 04 88 d8 48 d0 00 00 00 10 00 00
(da122:ciss1:32:78:0): CAM status: CCB request completed with an error
(da122:ciss1:32:78:0): Retrying command, 3 more tries remain
(da122:ciss1:32:78:0): WRITE(16). CDB: 8a 00 00 00 00 04 88 d8 46 e0 00 00 01 f0 00 00
(da122:ciss1:32:78:0): CAM status: CCB request completed with an error
(da122:ciss1:32:78:0): Retrying command, 3 more tries remain
(da122:ciss1:32:78:0): READ(16). CDB: 88 00 00 00 00 04 88 d8 8f d0 00 00 02 00 00 00
(da122:ciss1:32:78:0): CAM status: CCB request completed with an error
(da122:ciss1:32:78:0): Retrying command, 3 more tries remain
(da122:ciss1:32:78:0): READ(16). CDB: 88 00 00 00 00 04 88 d8 8d d0 00 00 02 00 00 00
(da122:ciss1:32:78:0): CAM status: CCB request completed with an error
(da122:ciss1:32:78:0): Retrying command, 3 more tries remain
ciss1: WARNING: completing non-busy request #1 [onq=0, length=262144, ccphys=1791048448, tag=72, flags=0x50 (----I-C), sg_tag=0x3, cr_complete=0xffffffff806059a0]
ciss1: WARNING: clearing cr_complete due to ciss_cam_complete & nonbusy
ciss1: WARNING: completing non-busy request #1 [onq=0, length=262144, ccphys=1791318368, tag=313, flags=0x50 (----I-C), sg_tag=0x3, cr_complete=0xffffffff806059a0]
ciss1: WARNING: clearing cr_complete due to ciss_cam_complete & nonbusy
ciss1: WARNING: completing non-busy request #1 [onq=0, length=8192, ccphys=1791226528, tag=231, flags=0x50 (----I-C), sg_tag=0x3, cr_complete=0xffffffff806059a0]
ciss1: WARNING: clearing cr_complete due to ciss_cam_complete & nonbusy
ciss1: WARNING: completing non-busy request #1 [onq=0, length=262144, ccphys=1791313888, tag=309, flags=0x50 (----I-C), sg_tag=0x5, cr_complete=0xffffffff806059a0]
ciss1: WARNING: clearing cr_complete due to ciss_cam_complete & nonbusy
ciss1: WARNING: completing non-busy request #1 [onq=0, length=8192, ccphys=1791206368, tag=213, flags=0x50 (----I-C), sg_tag=0x3, cr_complete=0xffffffff806059a0]
ciss1: WARNING: clearing cr_complete due to ciss_cam_complete & nonbusy
panic: camq_remove: Attempt to remove out-of-bounds index -1 from queue 0xfffff80107fa9838 of size 1
cpuid = 7
time = 1715266713
KDB: stack backtrace:
#0 0xffffffff80c542d5 at kdb_backtrace+0x65
#1 0xffffffff80c06c51 at vpanic+0x151
#2 0xffffffff80c06af3 at panic+0x43
#3 0xffffffff80390676 at camq_remove+0xf6
#4 0xffffffff803936c8 at xpt_run_devq+0x148
#5 0xffffffff80392b0c at xpt_action_default+0x35c
#6 0xffffffff803c3544 at dastart+0x314
#7 0xffffffff80394f73 at xpt_run_allocq+0x83
#8 0xffffffff803c5152 at dastrategy+0x82
#9 0xffffffff80b438d4 at g_disk_start+0x314
#10 0xffffffff80b46b55 at g_io_request+0x1d5
#11 0xffffffff80b46b55 at g_io_request+0x1d5
#12 0xffffffff821a34dc at vdev_geom_io_start+0x23c
#13 0xffffffff82347fb4 at zio_vdev_io_start+0x204
#14 0xffffffff8234204f at zio_nowait+0x12f
#15 0xffffffff822a0ce7 at vdev_mirror_io_start+0x177
#16 0xffffffff82347fb4 at zio_vdev_io_start+0x204
#17 0xffffffff8234204f at zio_nowait+0x12f
Uptime: 10h16m26s
The code where I’m trying to figure out what’s going on is this part in sys/dev/ciss/ciss.c:
static void
ciss_complete(struct ciss_softc *sc, cr_qhead_t *qh)
{
struct ciss_request *cr;
debug_called(2);
/*
* Loop taking requests off the completed queue and performing
* completion processing on them.
*/
for (;;) {
int got_nonbusy = 0;
if ((cr = ciss_dequeue_complete(sc, qh)) == NULL)
break;
ciss_unmap_request(cr);
if ((cr->cr_flags & CISS_REQ_BUSY) == 0) {
cr->nonbusy_request_counter++;
ciss_printf(sc,
"WARNING: completing non-busy request #%d [onq=%d, length=%u, ccphys=%u, tag=%d, flags=0x%x (%s), sg_tag=0x%x, cr_complete=%p]\n",
cr->nonbusy_request_counter,
cr->cr_onq, cr->cr_length, cr->cr_ccphys, cr->cr_tag,
cr->cr_flags, cr_flags2str(cr->cr_flags),
cr->cr_sg_tag, cr->cr_complete
);
got_nonbusy = 1;
}
cr->cr_flags &= ~CISS_REQ_BUSY;
/*
* If the request has a callback, invoke it.
*/
if (cr->cr_complete != NULL) {
cr->cr_complete(cr);
/* pen: Attempt to make sure ciss_cam_complete() doesn’t get called multiple times */
if (cr->cr_complete == ciss_cam_complete && got_nonbusy) {
ciss_printf(sc, "WARNING: clearing cr_complete due to ciss_cam_complete & nonbusy\n");
cr->cr_complete = NULL;
}
continue;
}
/*
* If someone is sleeping on this request, wake them up.
*/
if (cr->cr_flags & CISS_REQ_SLEEP) {
cr->cr_flags &= ~CISS_REQ_SLEEP;
wakeup(cr);
continue;
}
/*
* If someone is polling this request for completion, signal.
*/
if (cr->cr_flags & CISS_REQ_POLL) {
cr->cr_flags &= ~CISS_REQ_POLL;
continue;
}
/*
* Give up and throw the request back on the free queue. This
* should never happen; resources will probably be lost.
*/
ciss_printf(sc, "WARNING: completed command with no submitter\n");
ciss_enqueue_free(cr);
}
}
- Any suggestions? (Probably not but I thought I’d ask anyway :-)
- Peter
--
Posted automagically by a mail2news gateway at muc.de e.V.
Please direct questions, flames, donations, etc. to news-***@muc.de
I’m trying to fix a bug in the cciss driver that has been there “forever” when using it with an HP H241 SAS HBA card.
ciss1: WARNING: completing non-busy request
ciss1: WARNING: completing non-busy request
ciss1: WARNING: completing non-busy request
panic: cam_periph_done_panic: already done with ccb 0xfffff88abefc4000
cpuid = 0
time = 1714852260
#0 0xffffffff80c541b5 at kdb_backtrace+0x65
#1 0xffffffff80c06b31 at vpanic+0x151
#2 0xffffffff80c069d3 at panic+0x43
#3 0xffffffff8039017c at cam_periph_done_panic+0x1c
#4 0xffffffff80397193 at xpt_done_process+0x313
#5 0xffffffff803990a5 at xpt_done_td+0xf5
#6 0xffffffff80bc333e at fork_exit+0x7e
#7 0xffffffff8108a44e at fork_trampoline+0xe
but it’s still there…ciss1: WARNING: completing non-busy request
ciss1: WARNING: completing non-busy request
panic: cam_periph_done_panic: already done with ccb 0xfffff88abefc4000
cpuid = 0
time = 1714852260
#0 0xffffffff80c541b5 at kdb_backtrace+0x65
#1 0xffffffff80c06b31 at vpanic+0x151
#2 0xffffffff80c069d3 at panic+0x43
#3 0xffffffff8039017c at cam_periph_done_panic+0x1c
#4 0xffffffff80397193 at xpt_done_process+0x313
#5 0xffffffff803990a5 at xpt_done_td+0xf5
#6 0xffffffff80bc333e at fork_exit+0x7e
#7 0xffffffff8108a44e at fork_trampoline+0xe
I think I could use some input/suggestions from someone more knowledgeable with CAM and device drivers in general…
An example (this is with extra instrumentation added so not quite a stock kernel driver):
ciss1: *** Drive failure imminent, Port=1E Box=1 Bay=17 reason=0x37 [ts=4460, class=4, subclass=0, detail=1, tag=0x2]
(da122:ciss1:32:78:0): WRITE(16). CDB: 8a 00 00 00 00 04 7f 17 51 98 00 00 00 10 00 00
(da122:ciss1:32:78:0): CAM status: CCB request completed with an error
(da122:ciss1:32:78:0): Retrying command, 3 more tries remain
(da122:ciss1:32:78:0): WRITE(16). CDB: 8a 00 00 00 00 04 7f 17 52 18 00 00 00 10 00 00
(da122:ciss1:32:78:0): CAM status: CCB request completed with an error
(da122:ciss1:32:78:0): Retrying command, 3 more tries remain
(da122:ciss1:32:78:0): READ(16). CDB: 88 00 00 00 00 04 7f 17 65 c8 00 00 02 00 00 00
(da122:ciss1:32:78:0): CAM status: CCB request completed with an error
(da122:ciss1:32:78:0): Retrying command, 3 more tries remain
(da122:ciss1:32:78:0): READ(16). CDB: 88 00 00 00 00 04 7f 17 63 c8 00 00 02 00 00 00
(da122:ciss1:32:78:0): CAM status: CCB request completed with an error
(da122:ciss1:32:78:0): Retrying command, 3 more tries remain
(da122:ciss1:32:78:0): READ(16). CDB: 88 00 00 00 00 04 7f 17 61 c8 00 00 02 00 00 00
(da122:ciss1:32:78:0): CAM status: CCB request completed with an error
(da122:ciss1:32:78:0): Retrying command, 3 more tries remain
(da122:ciss1:32:78:0): READ(16). CDB: 88 00 00 00 00 04 7f 17 5f c8 00 00 02 00 00 00
(da122:ciss1:32:78:0): CAM status: CCB request completed with an error
(da122:ciss1:32:78:0): Retrying command, 3 more tries remain
(da122:ciss1:32:78:0): READ(16). CDB: 88 00 00 00 00 04 7f 17 5f a8 00 00 00 20 00 00
(da122:ciss1:32:78:0): CAM status: CCB request completed with an error
(da122:ciss1:32:78:0): Retrying command, 3 more tries remain
(da122:ciss1:32:78:0): READ(16). CDB: 88 00 00 00 00 04 7f 17 5f 38 00 00 00 70 00 00
(da122:ciss1:32:78:0): CAM status: CCB request completed with an error
(da122:ciss1:32:78:0): Retrying command, 3 more tries remain
(da122:ciss1:32:78:0): READ(16). CDB: 88 00 00 00 00 04 7f 17 5d 38 00 00 02 00 00 00
(da122:ciss1:32:78:0): CAM status: CCB request completed with an error
(da122:ciss1:32:78:0): Retrying command, 3 more tries remain
(da122:ciss1:32:78:0): READ(16). CDB: 88 00 00 00 00 04 7f 17 5b 38 00 00 02 00 00 00
(da122:ciss1:32:78:0): CAM status: CCB request completed with an error
(da122:ciss1:32:78:0): Retrying command, 3 more tries remain
(da122:ciss1:32:78:0): READ(16). CDB: 88 00 00 00 00 04 7f 17 59 38 00 00 02 00 00 00
(da122:ciss1:32:78:0): CAM status: CCB request completed with an error
(da122:ciss1:32:78:0): Retrying command, 3 more tries remain
(da122:ciss1:32:78:0): READ(16). CDB: 88 00 00 00 00 04 7f 17 58 48 00 00 00 f0 00 00
(da122:ciss1:32:78:0): CAM status: CCB request completed with an error
(da122:ciss1:32:78:0): Retrying command, 3 more tries remain
(da122:ciss1:32:78:0): READ(16). CDB: 88 00 00 00 00 04 7f 17 56 48 00 00 02 00 00 00
(da122:ciss1:32:78:0): CAM status: CCB request completed with an error
(da122:ciss1:32:78:0): Retrying command, 3 more tries remain
(da122:ciss1:32:78:0): WRITE(16). CDB: 8a 00 00 00 00 04 88 d8 48 d0 00 00 00 10 00 00
(da122:ciss1:32:78:0): CAM status: CCB request completed with an error
(da122:ciss1:32:78:0): Retrying command, 3 more tries remain
(da122:ciss1:32:78:0): WRITE(16). CDB: 8a 00 00 00 00 04 88 d8 46 e0 00 00 01 f0 00 00
(da122:ciss1:32:78:0): CAM status: CCB request completed with an error
(da122:ciss1:32:78:0): Retrying command, 3 more tries remain
(da122:ciss1:32:78:0): READ(16). CDB: 88 00 00 00 00 04 88 d8 8f d0 00 00 02 00 00 00
(da122:ciss1:32:78:0): CAM status: CCB request completed with an error
(da122:ciss1:32:78:0): Retrying command, 3 more tries remain
(da122:ciss1:32:78:0): READ(16). CDB: 88 00 00 00 00 04 88 d8 8d d0 00 00 02 00 00 00
(da122:ciss1:32:78:0): CAM status: CCB request completed with an error
(da122:ciss1:32:78:0): Retrying command, 3 more tries remain
ciss1: WARNING: completing non-busy request #1 [onq=0, length=262144, ccphys=1791048448, tag=72, flags=0x50 (----I-C), sg_tag=0x3, cr_complete=0xffffffff806059a0]
ciss1: WARNING: clearing cr_complete due to ciss_cam_complete & nonbusy
ciss1: WARNING: completing non-busy request #1 [onq=0, length=262144, ccphys=1791318368, tag=313, flags=0x50 (----I-C), sg_tag=0x3, cr_complete=0xffffffff806059a0]
ciss1: WARNING: clearing cr_complete due to ciss_cam_complete & nonbusy
ciss1: WARNING: completing non-busy request #1 [onq=0, length=8192, ccphys=1791226528, tag=231, flags=0x50 (----I-C), sg_tag=0x3, cr_complete=0xffffffff806059a0]
ciss1: WARNING: clearing cr_complete due to ciss_cam_complete & nonbusy
ciss1: WARNING: completing non-busy request #1 [onq=0, length=262144, ccphys=1791313888, tag=309, flags=0x50 (----I-C), sg_tag=0x5, cr_complete=0xffffffff806059a0]
ciss1: WARNING: clearing cr_complete due to ciss_cam_complete & nonbusy
ciss1: WARNING: completing non-busy request #1 [onq=0, length=8192, ccphys=1791206368, tag=213, flags=0x50 (----I-C), sg_tag=0x3, cr_complete=0xffffffff806059a0]
ciss1: WARNING: clearing cr_complete due to ciss_cam_complete & nonbusy
panic: camq_remove: Attempt to remove out-of-bounds index -1 from queue 0xfffff80107fa9838 of size 1
cpuid = 7
time = 1715266713
KDB: stack backtrace:
#0 0xffffffff80c542d5 at kdb_backtrace+0x65
#1 0xffffffff80c06c51 at vpanic+0x151
#2 0xffffffff80c06af3 at panic+0x43
#3 0xffffffff80390676 at camq_remove+0xf6
#4 0xffffffff803936c8 at xpt_run_devq+0x148
#5 0xffffffff80392b0c at xpt_action_default+0x35c
#6 0xffffffff803c3544 at dastart+0x314
#7 0xffffffff80394f73 at xpt_run_allocq+0x83
#8 0xffffffff803c5152 at dastrategy+0x82
#9 0xffffffff80b438d4 at g_disk_start+0x314
#10 0xffffffff80b46b55 at g_io_request+0x1d5
#11 0xffffffff80b46b55 at g_io_request+0x1d5
#12 0xffffffff821a34dc at vdev_geom_io_start+0x23c
#13 0xffffffff82347fb4 at zio_vdev_io_start+0x204
#14 0xffffffff8234204f at zio_nowait+0x12f
#15 0xffffffff822a0ce7 at vdev_mirror_io_start+0x177
#16 0xffffffff82347fb4 at zio_vdev_io_start+0x204
#17 0xffffffff8234204f at zio_nowait+0x12f
Uptime: 10h16m26s
connector 1E box 1 bay 17 HP MB010000JWAYK 7PH8LU3G HPD5 S.M.A.R.T. predictive failure.
It is _not_ da122 though… The drive with a SMART error (da85) is not in use so basically is just sitting there.The code where I’m trying to figure out what’s going on is this part in sys/dev/ciss/ciss.c:
static void
ciss_complete(struct ciss_softc *sc, cr_qhead_t *qh)
{
struct ciss_request *cr;
debug_called(2);
/*
* Loop taking requests off the completed queue and performing
* completion processing on them.
*/
for (;;) {
int got_nonbusy = 0;
if ((cr = ciss_dequeue_complete(sc, qh)) == NULL)
break;
ciss_unmap_request(cr);
if ((cr->cr_flags & CISS_REQ_BUSY) == 0) {
cr->nonbusy_request_counter++;
ciss_printf(sc,
"WARNING: completing non-busy request #%d [onq=%d, length=%u, ccphys=%u, tag=%d, flags=0x%x (%s), sg_tag=0x%x, cr_complete=%p]\n",
cr->nonbusy_request_counter,
cr->cr_onq, cr->cr_length, cr->cr_ccphys, cr->cr_tag,
cr->cr_flags, cr_flags2str(cr->cr_flags),
cr->cr_sg_tag, cr->cr_complete
);
got_nonbusy = 1;
}
cr->cr_flags &= ~CISS_REQ_BUSY;
/*
* If the request has a callback, invoke it.
*/
if (cr->cr_complete != NULL) {
cr->cr_complete(cr);
/* pen: Attempt to make sure ciss_cam_complete() doesn’t get called multiple times */
if (cr->cr_complete == ciss_cam_complete && got_nonbusy) {
ciss_printf(sc, "WARNING: clearing cr_complete due to ciss_cam_complete & nonbusy\n");
cr->cr_complete = NULL;
}
continue;
}
/*
* If someone is sleeping on this request, wake them up.
*/
if (cr->cr_flags & CISS_REQ_SLEEP) {
cr->cr_flags &= ~CISS_REQ_SLEEP;
wakeup(cr);
continue;
}
/*
* If someone is polling this request for completion, signal.
*/
if (cr->cr_flags & CISS_REQ_POLL) {
cr->cr_flags &= ~CISS_REQ_POLL;
continue;
}
/*
* Give up and throw the request back on the free queue. This
* should never happen; resources will probably be lost.
*/
ciss_printf(sc, "WARNING: completed command with no submitter\n");
ciss_enqueue_free(cr);
}
}
- Any suggestions? (Probably not but I thought I’d ask anyway :-)
- Peter
--
Posted automagically by a mail2news gateway at muc.de e.V.
Please direct questions, flames, donations, etc. to news-***@muc.de