From be26f9ae022ad09967be7a83c58ce605014e939a Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Mon, 1 Feb 2016 17:48:42 -0800
Subject: nfit, tools/testing/nvdimm: add format interface code definitions

ACPI 6.1 and JEDEC Annex L Release 3 formalize the format interface
code.  Add definitions and update their usage in the unit test.

Signed-off-by: Dan Williams <dan.j.williams@intel.com>

diff --git a/drivers/acpi/nfit.h b/drivers/acpi/nfit.h
index 3d549a3..6689b0a 100644
--- a/drivers/acpi/nfit.h
+++ b/drivers/acpi/nfit.h
@@ -40,6 +40,12 @@ enum nfit_uuids {
 	NFIT_UUID_MAX,
 };
 
+enum nfit_fic {
+	NFIT_FIC_BYTE = 0x101, /* byte-addressable energy backed */
+	NFIT_FIC_BLK = 0x201, /* block-addressable non-energy backed */
+	NFIT_FIC_BYTEN = 0x301, /* byte-addressable non-energy backed */
+};
+
 enum {
 	ND_BLK_READ_FLUSH = 1,
 	ND_BLK_DCR_LATCH = 2,
diff --git a/tools/testing/nvdimm/test/nfit.c b/tools/testing/nvdimm/test/nfit.c
index b3281dc..6e831c4 100644
--- a/tools/testing/nvdimm/test/nfit.c
+++ b/tools/testing/nvdimm/test/nfit.c
@@ -823,6 +823,7 @@ static void nfit_test0_setup(struct nfit_test *t)
 	dcr->device_id = 0;
 	dcr->revision_id = 1;
 	dcr->serial_number = ~handle[0];
+	dcr->code = NFIT_FIC_BLK;
 	dcr->windows = 1;
 	dcr->window_size = DCR_SIZE;
 	dcr->command_offset = 0;
@@ -839,6 +840,7 @@ static void nfit_test0_setup(struct nfit_test *t)
 	dcr->device_id = 0;
 	dcr->revision_id = 1;
 	dcr->serial_number = ~handle[1];
+	dcr->code = NFIT_FIC_BLK;
 	dcr->windows = 1;
 	dcr->window_size = DCR_SIZE;
 	dcr->command_offset = 0;
@@ -855,6 +857,7 @@ static void nfit_test0_setup(struct nfit_test *t)
 	dcr->device_id = 0;
 	dcr->revision_id = 1;
 	dcr->serial_number = ~handle[2];
+	dcr->code = NFIT_FIC_BLK;
 	dcr->windows = 1;
 	dcr->window_size = DCR_SIZE;
 	dcr->command_offset = 0;
@@ -871,6 +874,7 @@ static void nfit_test0_setup(struct nfit_test *t)
 	dcr->device_id = 0;
 	dcr->revision_id = 1;
 	dcr->serial_number = ~handle[3];
+	dcr->code = NFIT_FIC_BLK;
 	dcr->windows = 1;
 	dcr->window_size = DCR_SIZE;
 	dcr->command_offset = 0;
@@ -967,6 +971,7 @@ static void nfit_test0_setup(struct nfit_test *t)
 		dcr->device_id = 0;
 		dcr->revision_id = 1;
 		dcr->serial_number = ~handle[4];
+		dcr->code = NFIT_FIC_BLK;
 		dcr->windows = 1;
 		dcr->window_size = DCR_SIZE;
 		dcr->command_offset = 0;
@@ -1136,7 +1141,7 @@ static void nfit_test1_setup(struct nfit_test *t)
 	dcr->device_id = 0;
 	dcr->revision_id = 1;
 	dcr->serial_number = ~0;
-	dcr->code = 0x201;
+	dcr->code = NFIT_FIC_BYTE;
 	dcr->windows = 0;
 	dcr->window_size = 0;
 	dcr->command_offset = 0;
-- 
cgit v0.10.2


From 3b87356f50aa12ae7f9bd60f630eb17fe9b4b253 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Mon, 1 Feb 2016 17:45:54 -0800
Subject: nfit, tools/testing/nvdimm: test multiple control regions per-dimm

ACPI 6.1 clarifies that "The system shall include an NVDIMM Control
Region Structure for every Function Interface in the NVDIMM."
Implement this clarification in nfit_test.

Signed-off-by: Dan Williams <dan.j.williams@intel.com>

diff --git a/tools/testing/nvdimm/test/nfit.c b/tools/testing/nvdimm/test/nfit.c
index 6e831c4..27b808e 100644
--- a/tools/testing/nvdimm/test/nfit.c
+++ b/tools/testing/nvdimm/test/nfit.c
@@ -429,6 +429,8 @@ static int nfit_test0_alloc(struct nfit_test *t)
 	size_t nfit_size = sizeof(struct acpi_nfit_system_address) * NUM_SPA
 			+ sizeof(struct acpi_nfit_memory_map) * NUM_MEM
 			+ sizeof(struct acpi_nfit_control_region) * NUM_DCR
+			+ offsetof(struct acpi_nfit_control_region,
+					window_size) * NUM_DCR
 			+ sizeof(struct acpi_nfit_data_region) * NUM_BDW
 			+ sizeof(struct acpi_nfit_flush_address) * NUM_DCR;
 	int i;
@@ -478,7 +480,7 @@ static int nfit_test1_alloc(struct nfit_test *t)
 {
 	size_t nfit_size = sizeof(struct acpi_nfit_system_address)
 		+ sizeof(struct acpi_nfit_memory_map)
-		+ sizeof(struct acpi_nfit_control_region);
+		+ offsetof(struct acpi_nfit_control_region, window_size);
 
 	t->nfit_buf = test_alloc(t, nfit_size, &t->nfit_dma);
 	if (!t->nfit_buf)
@@ -611,7 +613,7 @@ static void nfit_test0_setup(struct nfit_test *t)
 	memdev->physical_id = 0;
 	memdev->region_id = 0;
 	memdev->range_index = 0+1;
-	memdev->region_index = 0+1;
+	memdev->region_index = 4+1;
 	memdev->region_size = SPA0_SIZE/2;
 	memdev->region_offset = t->spa_set_dma[0];
 	memdev->address = 0;
@@ -626,7 +628,7 @@ static void nfit_test0_setup(struct nfit_test *t)
 	memdev->physical_id = 1;
 	memdev->region_id = 0;
 	memdev->range_index = 0+1;
-	memdev->region_index = 1+1;
+	memdev->region_index = 5+1;
 	memdev->region_size = SPA0_SIZE/2;
 	memdev->region_offset = t->spa_set_dma[0] + SPA0_SIZE/2;
 	memdev->address = 0;
@@ -641,7 +643,7 @@ static void nfit_test0_setup(struct nfit_test *t)
 	memdev->physical_id = 0;
 	memdev->region_id = 1;
 	memdev->range_index = 1+1;
-	memdev->region_index = 0+1;
+	memdev->region_index = 4+1;
 	memdev->region_size = SPA1_SIZE/4;
 	memdev->region_offset = t->spa_set_dma[1];
 	memdev->address = SPA0_SIZE/2;
@@ -656,7 +658,7 @@ static void nfit_test0_setup(struct nfit_test *t)
 	memdev->physical_id = 1;
 	memdev->region_id = 1;
 	memdev->range_index = 1+1;
-	memdev->region_index = 1+1;
+	memdev->region_index = 5+1;
 	memdev->region_size = SPA1_SIZE/4;
 	memdev->region_offset = t->spa_set_dma[1] + SPA1_SIZE/4;
 	memdev->address = SPA0_SIZE/2;
@@ -671,7 +673,7 @@ static void nfit_test0_setup(struct nfit_test *t)
 	memdev->physical_id = 2;
 	memdev->region_id = 0;
 	memdev->range_index = 1+1;
-	memdev->region_index = 2+1;
+	memdev->region_index = 6+1;
 	memdev->region_size = SPA1_SIZE/4;
 	memdev->region_offset = t->spa_set_dma[1] + 2*SPA1_SIZE/4;
 	memdev->address = SPA0_SIZE/2;
@@ -686,7 +688,7 @@ static void nfit_test0_setup(struct nfit_test *t)
 	memdev->physical_id = 3;
 	memdev->region_id = 0;
 	memdev->range_index = 1+1;
-	memdev->region_index = 3+1;
+	memdev->region_index = 7+1;
 	memdev->region_size = SPA1_SIZE/4;
 	memdev->region_offset = t->spa_set_dma[1] + 3*SPA1_SIZE/4;
 	memdev->address = SPA0_SIZE/2;
@@ -814,7 +816,7 @@ static void nfit_test0_setup(struct nfit_test *t)
 	memdev->interleave_ways = 1;
 
 	offset = offset + sizeof(struct acpi_nfit_memory_map) * 14;
-	/* dcr-descriptor0 */
+	/* dcr-descriptor0: blk */
 	dcr = nfit_buf + offset;
 	dcr->header.type = ACPI_NFIT_TYPE_CONTROL_REGION;
 	dcr->header.length = sizeof(struct acpi_nfit_control_region);
@@ -831,7 +833,7 @@ static void nfit_test0_setup(struct nfit_test *t)
 	dcr->status_offset = 8;
 	dcr->status_size = 4;
 
-	/* dcr-descriptor1 */
+	/* dcr-descriptor1: blk */
 	dcr = nfit_buf + offset + sizeof(struct acpi_nfit_control_region);
 	dcr->header.type = ACPI_NFIT_TYPE_CONTROL_REGION;
 	dcr->header.length = sizeof(struct acpi_nfit_control_region);
@@ -848,7 +850,7 @@ static void nfit_test0_setup(struct nfit_test *t)
 	dcr->status_offset = 8;
 	dcr->status_size = 4;
 
-	/* dcr-descriptor2 */
+	/* dcr-descriptor2: blk */
 	dcr = nfit_buf + offset + sizeof(struct acpi_nfit_control_region) * 2;
 	dcr->header.type = ACPI_NFIT_TYPE_CONTROL_REGION;
 	dcr->header.length = sizeof(struct acpi_nfit_control_region);
@@ -865,7 +867,7 @@ static void nfit_test0_setup(struct nfit_test *t)
 	dcr->status_offset = 8;
 	dcr->status_size = 4;
 
-	/* dcr-descriptor3 */
+	/* dcr-descriptor3: blk */
 	dcr = nfit_buf + offset + sizeof(struct acpi_nfit_control_region) * 3;
 	dcr->header.type = ACPI_NFIT_TYPE_CONTROL_REGION;
 	dcr->header.length = sizeof(struct acpi_nfit_control_region);
@@ -883,6 +885,63 @@ static void nfit_test0_setup(struct nfit_test *t)
 	dcr->status_size = 4;
 
 	offset = offset + sizeof(struct acpi_nfit_control_region) * 4;
+	/* dcr-descriptor0: pmem */
+	dcr = nfit_buf + offset;
+	dcr->header.type = ACPI_NFIT_TYPE_CONTROL_REGION;
+	dcr->header.length = offsetof(struct acpi_nfit_control_region,
+			window_size);
+	dcr->region_index = 4+1;
+	dcr->vendor_id = 0xabcd;
+	dcr->device_id = 0;
+	dcr->revision_id = 1;
+	dcr->serial_number = ~handle[0];
+	dcr->code = NFIT_FIC_BYTEN;
+	dcr->windows = 0;
+
+	/* dcr-descriptor1: pmem */
+	dcr = nfit_buf + offset + offsetof(struct acpi_nfit_control_region,
+			window_size);
+	dcr->header.type = ACPI_NFIT_TYPE_CONTROL_REGION;
+	dcr->header.length = offsetof(struct acpi_nfit_control_region,
+			window_size);
+	dcr->region_index = 5+1;
+	dcr->vendor_id = 0xabcd;
+	dcr->device_id = 0;
+	dcr->revision_id = 1;
+	dcr->serial_number = ~handle[1];
+	dcr->code = NFIT_FIC_BYTEN;
+	dcr->windows = 0;
+
+	/* dcr-descriptor2: pmem */
+	dcr = nfit_buf + offset + offsetof(struct acpi_nfit_control_region,
+			window_size) * 2;
+	dcr->header.type = ACPI_NFIT_TYPE_CONTROL_REGION;
+	dcr->header.length = offsetof(struct acpi_nfit_control_region,
+			window_size);
+	dcr->region_index = 6+1;
+	dcr->vendor_id = 0xabcd;
+	dcr->device_id = 0;
+	dcr->revision_id = 1;
+	dcr->serial_number = ~handle[2];
+	dcr->code = NFIT_FIC_BYTEN;
+	dcr->windows = 0;
+
+	/* dcr-descriptor3: pmem */
+	dcr = nfit_buf + offset + offsetof(struct acpi_nfit_control_region,
+			window_size) * 3;
+	dcr->header.type = ACPI_NFIT_TYPE_CONTROL_REGION;
+	dcr->header.length = offsetof(struct acpi_nfit_control_region,
+			window_size);
+	dcr->region_index = 7+1;
+	dcr->vendor_id = 0xabcd;
+	dcr->device_id = 0;
+	dcr->revision_id = 1;
+	dcr->serial_number = ~handle[3];
+	dcr->code = NFIT_FIC_BYTEN;
+	dcr->windows = 0;
+
+	offset = offset + offsetof(struct acpi_nfit_control_region,
+			window_size) * 4;
 	/* bdw0 (spa/dcr0, dimm0) */
 	bdw = nfit_buf + offset;
 	bdw->header.type = ACPI_NFIT_TYPE_DATA_REGION;
@@ -962,11 +1021,11 @@ static void nfit_test0_setup(struct nfit_test *t)
 
 	if (t->setup_hotplug) {
 		offset = offset + sizeof(struct acpi_nfit_flush_address) * 4;
-		/* dcr-descriptor4 */
+		/* dcr-descriptor4: blk */
 		dcr = nfit_buf + offset;
 		dcr->header.type = ACPI_NFIT_TYPE_CONTROL_REGION;
 		dcr->header.length = sizeof(struct acpi_nfit_control_region);
-		dcr->region_index = 4+1;
+		dcr->region_index = 8+1;
 		dcr->vendor_id = 0xabcd;
 		dcr->device_id = 0;
 		dcr->revision_id = 1;
@@ -980,11 +1039,26 @@ static void nfit_test0_setup(struct nfit_test *t)
 		dcr->status_size = 4;
 
 		offset = offset + sizeof(struct acpi_nfit_control_region);
+		/* dcr-descriptor4: pmem */
+		dcr = nfit_buf + offset;
+		dcr->header.type = ACPI_NFIT_TYPE_CONTROL_REGION;
+		dcr->header.length = offsetof(struct acpi_nfit_control_region,
+				window_size);
+		dcr->region_index = 9+1;
+		dcr->vendor_id = 0xabcd;
+		dcr->device_id = 0;
+		dcr->revision_id = 1;
+		dcr->serial_number = ~handle[4];
+		dcr->code = NFIT_FIC_BYTEN;
+		dcr->windows = 0;
+
+		offset = offset + offsetof(struct acpi_nfit_control_region,
+				window_size);
 		/* bdw4 (spa/dcr4, dimm4) */
 		bdw = nfit_buf + offset;
 		bdw->header.type = ACPI_NFIT_TYPE_DATA_REGION;
 		bdw->header.length = sizeof(struct acpi_nfit_data_region);
-		bdw->region_index = 4+1;
+		bdw->region_index = 8+1;
 		bdw->windows = 1;
 		bdw->offset = 0;
 		bdw->size = BDW_SIZE;
@@ -1032,7 +1106,7 @@ static void nfit_test0_setup(struct nfit_test *t)
 		memdev->physical_id = 4;
 		memdev->region_id = 0;
 		memdev->range_index = 10+1;
-		memdev->region_index = 4+1;
+		memdev->region_index = 8+1;
 		memdev->region_size = 0;
 		memdev->region_offset = 0;
 		memdev->address = 0;
@@ -1048,14 +1122,14 @@ static void nfit_test0_setup(struct nfit_test *t)
 		memdev->physical_id = 4;
 		memdev->region_id = 0;
 		memdev->range_index = 11+1;
-		memdev->region_index = 4+1;
+		memdev->region_index = 9+1;
 		memdev->region_size = SPA0_SIZE;
 		memdev->region_offset = t->spa_set_dma[2];
 		memdev->address = 0;
 		memdev->interleave_index = 0;
 		memdev->interleave_ways = 1;
 
-		/* mem-region16 (spa/dcr4, dimm4) */
+		/* mem-region16 (spa/bdw4, dimm4) */
 		memdev = nfit_buf + offset +
 				sizeof(struct acpi_nfit_memory_map) * 2;
 		memdev->header.type = ACPI_NFIT_TYPE_MEMORY_MAP;
@@ -1064,7 +1138,7 @@ static void nfit_test0_setup(struct nfit_test *t)
 		memdev->physical_id = 4;
 		memdev->region_id = 0;
 		memdev->range_index = 12+1;
-		memdev->region_index = 4+1;
+		memdev->region_index = 8+1;
 		memdev->region_size = 0;
 		memdev->region_offset = 0;
 		memdev->address = 0;
@@ -1135,7 +1209,8 @@ static void nfit_test1_setup(struct nfit_test *t)
 	/* dcr-descriptor0 */
 	dcr = nfit_buf + offset;
 	dcr->header.type = ACPI_NFIT_TYPE_CONTROL_REGION;
-	dcr->header.length = sizeof(struct acpi_nfit_control_region);
+	dcr->header.length = offsetof(struct acpi_nfit_control_region,
+			window_size);
 	dcr->region_index = 0+1;
 	dcr->vendor_id = 0xabcd;
 	dcr->device_id = 0;
@@ -1143,11 +1218,6 @@ static void nfit_test1_setup(struct nfit_test *t)
 	dcr->serial_number = ~0;
 	dcr->code = NFIT_FIC_BYTE;
 	dcr->windows = 0;
-	dcr->window_size = 0;
-	dcr->command_offset = 0;
-	dcr->command_size = 0;
-	dcr->status_offset = 0;
-	dcr->status_size = 0;
 
 	acpi_desc = &t->acpi_desc;
 	set_bit(ND_CMD_ARS_CAP, &acpi_desc->bus_dsm_force_en);
-- 
cgit v0.10.2


From aef25338226660cdd4df908c2eff1abdcfca65e5 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Fri, 12 Feb 2016 17:01:11 -0800
Subject: libnvdimm, nfit: centralize command status translation

The return value from an 'ndctl_fn' reports the command execution
status, i.e. was the command properly formatted and was it successfully
submitted to the bus provider.  The new 'cmd_rc' parameter allows the bus
provider to communicate command specific results, translated into
common error codes.

Convert the ARS commands to this scheme to:

1/ Consolidate status reporting

2/ Prepare for for expanding ars unit test cases

3/ Make the implementation more generic

Cc: Vishal Verma <vishal.l.verma@intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>

diff --git a/drivers/acpi/nfit.c b/drivers/acpi/nfit.c
index 35947ac..4dd2b68 100644
--- a/drivers/acpi/nfit.c
+++ b/drivers/acpi/nfit.c
@@ -72,9 +72,80 @@ static struct acpi_device *to_acpi_dev(struct acpi_nfit_desc *acpi_desc)
 	return to_acpi_device(acpi_desc->dev);
 }
 
+static int xlat_status(void *buf, unsigned int cmd)
+{
+	struct nd_cmd_ars_status *ars_status;
+	struct nd_cmd_ars_start *ars_start;
+	struct nd_cmd_ars_cap *ars_cap;
+	u16 flags;
+
+	switch (cmd) {
+	case ND_CMD_ARS_CAP:
+		ars_cap = buf;
+		if ((ars_cap->status & 0xffff) == NFIT_ARS_CAP_NONE)
+			return -ENOTTY;
+
+		/* Command failed */
+		if (ars_cap->status & 0xffff)
+			return -EIO;
+
+		/* No supported scan types for this range */
+		flags = ND_ARS_PERSISTENT | ND_ARS_VOLATILE;
+		if ((ars_cap->status >> 16 & flags) == 0)
+			return -ENOTTY;
+		break;
+	case ND_CMD_ARS_START:
+		ars_start = buf;
+		/* ARS is in progress */
+		if ((ars_start->status & 0xffff) == NFIT_ARS_START_BUSY)
+			return -EBUSY;
+
+		/* Command failed */
+		if (ars_start->status & 0xffff)
+			return -EIO;
+		break;
+	case ND_CMD_ARS_STATUS:
+		ars_status = buf;
+		/* Command failed */
+		if (ars_status->status & 0xffff)
+			return -EIO;
+		/* Check extended status (Upper two bytes) */
+		if (ars_status->status == NFIT_ARS_STATUS_DONE)
+			return 0;
+
+		/* ARS is in progress */
+		if (ars_status->status == NFIT_ARS_STATUS_BUSY)
+			return -EBUSY;
+
+		/* No ARS performed for the current boot */
+		if (ars_status->status == NFIT_ARS_STATUS_NONE)
+			return -EAGAIN;
+
+		/*
+		 * ARS interrupted, either we overflowed or some other
+		 * agent wants the scan to stop.  If we didn't overflow
+		 * then just continue with the returned results.
+		 */
+		if (ars_status->status == NFIT_ARS_STATUS_INTR) {
+			if (ars_status->flags & NFIT_ARS_F_OVERFLOW)
+				return -ENOSPC;
+			return 0;
+		}
+
+		/* Unknown status */
+		if (ars_status->status >> 16)
+			return -EIO;
+		break;
+	default:
+		break;
+	}
+
+	return 0;
+}
+
 static int acpi_nfit_ctl(struct nvdimm_bus_descriptor *nd_desc,
 		struct nvdimm *nvdimm, unsigned int cmd, void *buf,
-		unsigned int buf_len)
+		unsigned int buf_len, int *cmd_rc)
 {
 	struct acpi_nfit_desc *acpi_desc = to_acpi_nfit_desc(nd_desc);
 	const struct nd_cmd_desc *desc = NULL;
@@ -185,6 +256,8 @@ static int acpi_nfit_ctl(struct nvdimm_bus_descriptor *nd_desc,
 			 * unfilled in the output buffer
 			 */
 			rc = buf_len - offset - in_buf.buffer.length;
+			if (cmd_rc)
+				*cmd_rc = xlat_status(buf, cmd);
 		} else {
 			dev_err(dev, "%s:%s underrun cmd: %s buf_len: %d out_len: %d\n",
 					__func__, dimm_name, cmd_name, buf_len,
@@ -1105,7 +1178,7 @@ static void write_blk_ctl(struct nfit_blk *nfit_blk, unsigned int bw,
 	writeq(cmd, mmio->addr.base + offset);
 	wmb_blk(nfit_blk);
 
-	if (nfit_blk->dimm_flags & ND_BLK_DCR_LATCH)
+	if (nfit_blk->dimm_flags & NFIT_BLK_DCR_LATCH)
 		readq(mmio->addr.base + offset);
 }
 
@@ -1141,7 +1214,7 @@ static int acpi_nfit_blk_single_io(struct nfit_blk *nfit_blk,
 			memcpy_to_pmem(mmio->addr.aperture + offset,
 					iobuf + copied, c);
 		else {
-			if (nfit_blk->dimm_flags & ND_BLK_READ_FLUSH)
+			if (nfit_blk->dimm_flags & NFIT_BLK_READ_FLUSH)
 				mmio_flush_range((void __force *)
 					mmio->addr.aperture + offset, c);
 
@@ -1328,13 +1401,13 @@ static int acpi_nfit_blk_get_flags(struct nvdimm_bus_descriptor *nd_desc,
 
 	memset(&flags, 0, sizeof(flags));
 	rc = nd_desc->ndctl(nd_desc, nvdimm, ND_CMD_DIMM_FLAGS, &flags,
-			sizeof(flags));
+			sizeof(flags), NULL);
 
 	if (rc >= 0 && flags.status == 0)
 		nfit_blk->dimm_flags = flags.flags;
 	else if (rc == -ENOTTY) {
 		/* fall back to a conservative default */
-		nfit_blk->dimm_flags = ND_BLK_DCR_LATCH | ND_BLK_READ_FLUSH;
+		nfit_blk->dimm_flags = NFIT_BLK_DCR_LATCH | NFIT_BLK_READ_FLUSH;
 		rc = 0;
 	} else
 		rc = -ENXIO;
@@ -1473,19 +1546,27 @@ static void acpi_nfit_blk_region_disable(struct nvdimm_bus *nvdimm_bus,
 	/* devm will free nfit_blk */
 }
 
-static int ars_get_cap(struct nvdimm_bus_descriptor *nd_desc,
+static int ars_get_cap(struct acpi_nfit_desc *acpi_desc,
 		struct nd_cmd_ars_cap *cmd, u64 addr, u64 length)
 {
+	struct nvdimm_bus_descriptor *nd_desc = &acpi_desc->nd_desc;
+	int cmd_rc, rc;
+
 	cmd->address = addr;
 	cmd->length = length;
-
-	return nd_desc->ndctl(nd_desc, NULL, ND_CMD_ARS_CAP, cmd,
-			sizeof(*cmd));
+	rc = nd_desc->ndctl(nd_desc, NULL, ND_CMD_ARS_CAP, cmd,
+			sizeof(*cmd), &cmd_rc);
+	if (rc < 0)
+		return rc;
+	if (cmd_rc < 0)
+		return cmd_rc;
+	return 0;
 }
 
 static int ars_do_start(struct nvdimm_bus_descriptor *nd_desc,
 		struct nd_cmd_ars_start *cmd, u64 addr, u64 length)
 {
+	int cmd_rc;
 	int rc;
 
 	cmd->address = addr;
@@ -1494,52 +1575,49 @@ static int ars_do_start(struct nvdimm_bus_descriptor *nd_desc,
 
 	while (1) {
 		rc = nd_desc->ndctl(nd_desc, NULL, ND_CMD_ARS_START, cmd,
-				sizeof(*cmd));
-		if (rc)
+				sizeof(*cmd), &cmd_rc);
+
+		if (rc < 0)
 			return rc;
-		switch (cmd->status) {
-		case 0:
-			return 0;
-		case 1:
-			/* ARS unsupported, but we should never get here */
-			return 0;
-		case 6:
+
+		if (cmd_rc == -EBUSY) {
 			/* ARS is in progress */
 			msleep(1000);
-			break;
-		default:
-			return -ENXIO;
+			continue;
 		}
+
+		if (cmd_rc < 0)
+			return cmd_rc;
+
+		return 0;
 	}
 }
 
 static int ars_get_status(struct nvdimm_bus_descriptor *nd_desc,
 		struct nd_cmd_ars_status *cmd, u32 size)
 {
-	int rc;
+	int rc, cmd_rc;
 
 	while (1) {
 		rc = nd_desc->ndctl(nd_desc, NULL, ND_CMD_ARS_STATUS, cmd,
-			size);
-		if (rc || cmd->status & 0xffff)
-			return -ENXIO;
+			size, &cmd_rc);
+		if (rc < 0)
+			return rc;
 
-		/* Check extended status (Upper two bytes) */
-		switch (cmd->status >> 16) {
-		case 0:
-			return 0;
-		case 1:
-			/* ARS is in progress */
+		/* FIXME make async and have a timeout */
+		if (cmd_rc == -EBUSY) {
 			msleep(1000);
-			break;
-		case 2:
-			/* No ARS performed for the current boot */
+			continue;
+		}
+
+		if (cmd_rc == -EAGAIN || cmd_rc == 0)
 			return 0;
-		case 3:
-			/* TODO: error list overflow support */
-		default:
+
+		/* TODO: error list overflow support */
+		if (cmd_rc == -ENOSPC)
 			return -ENXIO;
-		}
+
+		return cmd_rc;
 	}
 }
 
@@ -1590,28 +1668,11 @@ static int acpi_nfit_find_poison(struct acpi_nfit_desc *acpi_desc,
 	start = ndr_desc->res->start;
 	len = ndr_desc->res->end - ndr_desc->res->start + 1;
 
-	/*
-	 * If ARS is unimplemented, unsupported, or if the 'Persistent Memory
-	 * Scrub' flag in extended status is not set, skip this but continue
-	 * initialization
-	 */
-	rc = ars_get_cap(nd_desc, ars_cap, start, len);
+	rc = ars_get_cap(acpi_desc, ars_cap, start, len);
 	if (rc == -ENOTTY) {
-		dev_dbg(acpi_desc->dev,
-			"Address Range Scrub is not implemented, won't create an error list\n");
 		rc = 0;
 		goto out;
 	}
-	if (rc)
-		goto out;
-
-	if ((ars_cap->status & 0xffff) ||
-		!(ars_cap->status >> 16 & ND_ARS_PERSISTENT)) {
-		dev_warn(acpi_desc->dev,
-			"ARS unsupported (status: 0x%x), won't create an error list\n",
-			ars_cap->status);
-		goto out;
-	}
 
 	/*
 	 * Check if a full-range ARS has been run. If so, use those results
@@ -1651,15 +1712,15 @@ static int acpi_nfit_find_poison(struct acpi_nfit_desc *acpi_desc,
 		u64 done, end;
 
 		rc = ars_do_start(nd_desc, ars_start, cur, remaining);
-		if (rc)
+		if (rc < 0)
 			goto out;
 
 		rc = ars_get_status(nd_desc, ars_status, ars_status_size);
-		if (rc)
+		if (rc < 0)
 			goto out;
 
 		rc = ars_status_process_records(nvdimm_bus, ars_status, cur);
-		if (rc)
+		if (rc < 0)
 			goto out;
 
 		end = min(cur + remaining,
diff --git a/drivers/acpi/nfit.h b/drivers/acpi/nfit.h
index 6689b0a..f45b7d9 100644
--- a/drivers/acpi/nfit.h
+++ b/drivers/acpi/nfit.h
@@ -47,8 +47,15 @@ enum nfit_fic {
 };
 
 enum {
-	ND_BLK_READ_FLUSH = 1,
-	ND_BLK_DCR_LATCH = 2,
+	NFIT_BLK_READ_FLUSH = 1,
+	NFIT_BLK_DCR_LATCH = 2,
+	NFIT_ARS_STATUS_DONE = 0,
+	NFIT_ARS_STATUS_BUSY = 1 << 16,
+	NFIT_ARS_STATUS_NONE = 2 << 16,
+	NFIT_ARS_STATUS_INTR = 3 << 16,
+	NFIT_ARS_START_BUSY = 6,
+	NFIT_ARS_CAP_NONE = 1,
+	NFIT_ARS_F_OVERFLOW = 1,
 };
 
 struct nfit_spa {
diff --git a/drivers/nvdimm/bus.c b/drivers/nvdimm/bus.c
index 5d28e94..c3ba888 100644
--- a/drivers/nvdimm/bus.c
+++ b/drivers/nvdimm/bus.c
@@ -587,7 +587,7 @@ static int __nd_ioctl(struct nvdimm_bus *nvdimm_bus, struct nvdimm *nvdimm,
 	if (rc)
 		goto out_unlock;
 
-	rc = nd_desc->ndctl(nd_desc, nvdimm, cmd, buf, buf_len);
+	rc = nd_desc->ndctl(nd_desc, nvdimm, cmd, buf, buf_len, NULL);
 	if (rc < 0)
 		goto out_unlock;
 	if (copy_to_user(p, buf, buf_len))
diff --git a/drivers/nvdimm/dimm_devs.c b/drivers/nvdimm/dimm_devs.c
index 651b8d1..c56f882 100644
--- a/drivers/nvdimm/dimm_devs.c
+++ b/drivers/nvdimm/dimm_devs.c
@@ -75,7 +75,7 @@ int nvdimm_init_nsarea(struct nvdimm_drvdata *ndd)
 	memset(cmd, 0, sizeof(*cmd));
 	nd_desc = nvdimm_bus->nd_desc;
 	return nd_desc->ndctl(nd_desc, to_nvdimm(ndd->dev),
-			ND_CMD_GET_CONFIG_SIZE, cmd, sizeof(*cmd));
+			ND_CMD_GET_CONFIG_SIZE, cmd, sizeof(*cmd), NULL);
 }
 
 int nvdimm_init_config_data(struct nvdimm_drvdata *ndd)
@@ -120,7 +120,7 @@ int nvdimm_init_config_data(struct nvdimm_drvdata *ndd)
 		cmd->in_offset = offset;
 		rc = nd_desc->ndctl(nd_desc, to_nvdimm(ndd->dev),
 				ND_CMD_GET_CONFIG_DATA, cmd,
-				cmd->in_length + sizeof(*cmd));
+				cmd->in_length + sizeof(*cmd), NULL);
 		if (rc || cmd->status) {
 			rc = -ENXIO;
 			break;
@@ -171,7 +171,7 @@ int nvdimm_set_config_data(struct nvdimm_drvdata *ndd, size_t offset,
 		status = ((void *) cmd) + cmd_size - sizeof(u32);
 
 		rc = nd_desc->ndctl(nd_desc, to_nvdimm(ndd->dev),
-				ND_CMD_SET_CONFIG_DATA, cmd, cmd_size);
+				ND_CMD_SET_CONFIG_DATA, cmd, cmd_size, NULL);
 		if (rc || *status) {
 			rc = rc ? rc : -ENXIO;
 			break;
diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h
index 141ffdd..f398953 100644
--- a/include/linux/libnvdimm.h
+++ b/include/linux/libnvdimm.h
@@ -48,7 +48,7 @@ struct nvdimm;
 struct nvdimm_bus_descriptor;
 typedef int (*ndctl_fn)(struct nvdimm_bus_descriptor *nd_desc,
 		struct nvdimm *nvdimm, unsigned int cmd, void *buf,
-		unsigned int buf_len);
+		unsigned int buf_len, int *cmd_rc);
 
 struct nd_namespace_label;
 struct nvdimm_drvdata;
diff --git a/tools/testing/nvdimm/test/nfit.c b/tools/testing/nvdimm/test/nfit.c
index 27b808e..9ead779 100644
--- a/tools/testing/nvdimm/test/nfit.c
+++ b/tools/testing/nvdimm/test/nfit.c
@@ -261,7 +261,7 @@ static int nfit_test_cmd_ars_status(struct nd_cmd_ars_status *nd_cmd,
 
 static int nfit_test_ctl(struct nvdimm_bus_descriptor *nd_desc,
 		struct nvdimm *nvdimm, unsigned int cmd, void *buf,
-		unsigned int buf_len)
+		unsigned int buf_len, int *cmd_rc)
 {
 	struct acpi_nfit_desc *acpi_desc = to_acpi_desc(nd_desc);
 	struct nfit_test *t = container_of(acpi_desc, typeof(*t), acpi_desc);
@@ -315,6 +315,9 @@ static int nfit_test_ctl(struct nvdimm_bus_descriptor *nd_desc,
 		}
 	}
 
+	/* TODO: error status tests */
+	if (cmd_rc)
+		*cmd_rc = 0;
 	return rc;
 }
 
-- 
cgit v0.10.2


From 5faecf4eb0d7d67e809a4bc9059c764c27670832 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Wed, 17 Feb 2016 15:25:36 -0800
Subject: libnvdimm: protect nvdimm_{bus|namespace}_add_poison() with
 nvdimm_bus_lock()

In preparation for making poison list retrieval asynchronus to region
registration, add protection for walking and mutating the bus-level
poison list.

Cc: Vishal Verma <vishal.l.verma@intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>

diff --git a/drivers/nvdimm/core.c b/drivers/nvdimm/core.c
index 2e2832b..f309e6b 100644
--- a/drivers/nvdimm/core.c
+++ b/drivers/nvdimm/core.c
@@ -408,33 +408,11 @@ static void __add_badblock_range(struct badblocks *bb, u64 ns_offset, u64 len)
 		set_badblock(bb, start_sector, num_sectors);
 }
 
-/**
- * nvdimm_namespace_add_poison() - Convert a list of poison ranges to badblocks
- * @ndns:	the namespace containing poison ranges
- * @bb:		badblocks instance to populate
- * @offset:	offset at the start of the namespace before 'sector 0'
- *
- * The poison list generated during NFIT initialization may contain multiple,
- * possibly overlapping ranges in the SPA (System Physical Address) space.
- * Compare each of these ranges to the namespace currently being initialized,
- * and add badblocks to the gendisk for all matching sub-ranges
- */
-void nvdimm_namespace_add_poison(struct nd_namespace_common *ndns,
-		struct badblocks *bb, resource_size_t offset)
+static void namespace_add_poison(struct list_head *poison_list,
+		struct badblocks *bb, struct resource *res)
 {
-	struct nd_namespace_io *nsio = to_nd_namespace_io(&ndns->dev);
-	struct nd_region *nd_region = to_nd_region(ndns->dev.parent);
-	struct nvdimm_bus *nvdimm_bus;
-	struct list_head *poison_list;
-	u64 ns_start, ns_end, ns_size;
 	struct nd_poison *pl;
 
-	ns_size = nvdimm_namespace_capacity(ndns) - offset;
-	ns_start = nsio->res.start + offset;
-	ns_end = nsio->res.end;
-
-	nvdimm_bus = to_nvdimm_bus(nd_region->dev.parent);
-	poison_list = &nvdimm_bus->poison_list;
 	if (list_empty(poison_list))
 		return;
 
@@ -442,37 +420,69 @@ void nvdimm_namespace_add_poison(struct nd_namespace_common *ndns,
 		u64 pl_end = pl->start + pl->length - 1;
 
 		/* Discard intervals with no intersection */
-		if (pl_end < ns_start)
+		if (pl_end < res->start)
 			continue;
-		if (pl->start > ns_end)
+		if (pl->start >  res->end)
 			continue;
 		/* Deal with any overlap after start of the namespace */
-		if (pl->start >= ns_start) {
+		if (pl->start >= res->start) {
 			u64 start = pl->start;
 			u64 len;
 
-			if (pl_end <= ns_end)
+			if (pl_end <= res->end)
 				len = pl->length;
 			else
-				len = ns_start + ns_size - pl->start;
-			__add_badblock_range(bb, start - ns_start, len);
+				len = res->start + resource_size(res)
+					- pl->start;
+			__add_badblock_range(bb, start - res->start, len);
 			continue;
 		}
 		/* Deal with overlap for poison starting before the namespace */
-		if (pl->start < ns_start) {
+		if (pl->start < res->start) {
 			u64 len;
 
-			if (pl_end < ns_end)
-				len = pl->start + pl->length - ns_start;
+			if (pl_end < res->end)
+				len = pl->start + pl->length - res->start;
 			else
-				len = ns_size;
+				len = resource_size(res);
 			__add_badblock_range(bb, 0, len);
 		}
 	}
 }
+
+/**
+ * nvdimm_namespace_add_poison() - Convert a list of poison ranges to badblocks
+ * @ndns:	the namespace containing poison ranges
+ * @bb:		badblocks instance to populate
+ * @offset:	offset at the start of the namespace before 'sector 0'
+ *
+ * The poison list generated during NFIT initialization may contain multiple,
+ * possibly overlapping ranges in the SPA (System Physical Address) space.
+ * Compare each of these ranges to the namespace currently being initialized,
+ * and add badblocks to the gendisk for all matching sub-ranges
+ */
+void nvdimm_namespace_add_poison(struct nd_namespace_common *ndns,
+		struct badblocks *bb, resource_size_t offset)
+{
+	struct nd_namespace_io *nsio = to_nd_namespace_io(&ndns->dev);
+	struct nd_region *nd_region = to_nd_region(ndns->dev.parent);
+	struct nvdimm_bus *nvdimm_bus;
+	struct list_head *poison_list;
+	struct resource res = {
+		.start = nsio->res.start + offset,
+		.end = nsio->res.end,
+	};
+
+	nvdimm_bus = to_nvdimm_bus(nd_region->dev.parent);
+	poison_list = &nvdimm_bus->poison_list;
+
+	nvdimm_bus_lock(&nvdimm_bus->dev);
+	namespace_add_poison(poison_list, bb, &res);
+	nvdimm_bus_unlock(&nvdimm_bus->dev);
+}
 EXPORT_SYMBOL_GPL(nvdimm_namespace_add_poison);
 
-static int __add_poison(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length)
+static int add_poison(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length)
 {
 	struct nd_poison *pl;
 
@@ -487,12 +497,12 @@ static int __add_poison(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length)
 	return 0;
 }
 
-int nvdimm_bus_add_poison(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length)
+static int bus_add_poison(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length)
 {
 	struct nd_poison *pl;
 
 	if (list_empty(&nvdimm_bus->poison_list))
-		return __add_poison(nvdimm_bus, addr, length);
+		return add_poison(nvdimm_bus, addr, length);
 
 	/*
 	 * There is a chance this is a duplicate, check for those first.
@@ -512,7 +522,18 @@ int nvdimm_bus_add_poison(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length)
 	 * as any overlapping ranges will get resolved when the list is consumed
 	 * and converted to badblocks
 	 */
-	return __add_poison(nvdimm_bus, addr, length);
+	return add_poison(nvdimm_bus, addr, length);
+}
+
+int nvdimm_bus_add_poison(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length)
+{
+	int rc;
+
+	nvdimm_bus_lock(&nvdimm_bus->dev);
+	rc = bus_add_poison(nvdimm_bus, addr, length);
+	nvdimm_bus_unlock(&nvdimm_bus->dev);
+
+	return rc;
 }
 EXPORT_SYMBOL_GPL(nvdimm_bus_add_poison);
 
@@ -553,7 +574,11 @@ void nvdimm_bus_unregister(struct nvdimm_bus *nvdimm_bus)
 
 	nd_synchronize();
 	device_for_each_child(&nvdimm_bus->dev, NULL, child_unregister);
+
+	nvdimm_bus_lock(&nvdimm_bus->dev);
 	free_poison_list(&nvdimm_bus->poison_list);
+	nvdimm_bus_unlock(&nvdimm_bus->dev);
+
 	nvdimm_bus_destroy_ndctl(nvdimm_bus);
 
 	device_unregister(&nvdimm_bus->dev);
-- 
cgit v0.10.2


From 719994660c249a086a7493205c7f1562e30c38cb Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Thu, 18 Feb 2016 10:29:49 -0800
Subject: libnvdimm: async notification support

In preparation for asynchronous address range scrub support add an
ability for the pmem driver to dynamically consume address range scrub
results.

Signed-off-by: Dan Williams <dan.j.williams@intel.com>

diff --git a/drivers/nvdimm/bus.c b/drivers/nvdimm/bus.c
index c3ba888..2508251 100644
--- a/drivers/nvdimm/bus.c
+++ b/drivers/nvdimm/bus.c
@@ -133,6 +133,32 @@ static int nvdimm_bus_remove(struct device *dev)
 	return rc;
 }
 
+void nd_device_notify(struct device *dev, enum nvdimm_event event)
+{
+	device_lock(dev);
+	if (dev->driver) {
+		struct nd_device_driver *nd_drv;
+
+		nd_drv = to_nd_device_driver(dev->driver);
+		if (nd_drv->notify)
+			nd_drv->notify(dev, event);
+	}
+	device_unlock(dev);
+}
+EXPORT_SYMBOL(nd_device_notify);
+
+void nvdimm_region_notify(struct nd_region *nd_region, enum nvdimm_event event)
+{
+	struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(&nd_region->dev);
+
+	if (!nvdimm_bus)
+		return;
+
+	/* caller is responsible for holding a reference on the device */
+	nd_device_notify(&nd_region->dev, event);
+}
+EXPORT_SYMBOL_GPL(nvdimm_region_notify);
+
 static struct bus_type nvdimm_bus_type = {
 	.name = "nd",
 	.uevent = nvdimm_bus_uevent,
diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h
index ba1633b..78b82f6 100644
--- a/drivers/nvdimm/nd.h
+++ b/drivers/nvdimm/nd.h
@@ -18,6 +18,7 @@
 #include <linux/mutex.h>
 #include <linux/ndctl.h>
 #include <linux/types.h>
+#include <linux/nd.h>
 #include "label.h"
 
 enum {
@@ -168,6 +169,7 @@ int nd_integrity_init(struct gendisk *disk, unsigned long meta_size);
 void wait_nvdimm_bus_probe_idle(struct device *dev);
 void nd_device_register(struct device *dev);
 void nd_device_unregister(struct device *dev, enum nd_async_mode mode);
+void nd_device_notify(struct device *dev, enum nvdimm_event event);
 int nd_uuid_store(struct device *dev, u8 **uuid_out, const char *buf,
 		size_t len);
 ssize_t nd_sector_size_show(unsigned long current_lbasize,
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index 8d0b546..efc2a5e 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -488,12 +488,27 @@ static int nd_pmem_remove(struct device *dev)
 	return 0;
 }
 
+static void nd_pmem_notify(struct device *dev, enum nvdimm_event event)
+{
+	struct pmem_device *pmem = dev_get_drvdata(dev);
+	struct nd_namespace_common *ndns = pmem->ndns;
+
+	if (event != NVDIMM_REVALIDATE_POISON)
+		return;
+
+	if (is_nd_btt(dev))
+		nvdimm_namespace_add_poison(ndns, &pmem->bb, 0);
+	else
+		nvdimm_namespace_add_poison(ndns, &pmem->bb, pmem->data_offset);
+}
+
 MODULE_ALIAS("pmem");
 MODULE_ALIAS_ND_DEVICE(ND_DEVICE_NAMESPACE_IO);
 MODULE_ALIAS_ND_DEVICE(ND_DEVICE_NAMESPACE_PMEM);
 static struct nd_device_driver nd_pmem_driver = {
 	.probe = nd_pmem_probe,
 	.remove = nd_pmem_remove,
+	.notify = nd_pmem_notify,
 	.drv = {
 		.name = "nd_pmem",
 	},
diff --git a/drivers/nvdimm/region.c b/drivers/nvdimm/region.c
index 7da63ea..4b7715e 100644
--- a/drivers/nvdimm/region.c
+++ b/drivers/nvdimm/region.c
@@ -93,9 +93,21 @@ static int nd_region_remove(struct device *dev)
 	return 0;
 }
 
+static int child_notify(struct device *dev, void *data)
+{
+	nd_device_notify(dev, *(enum nvdimm_event *) data);
+	return 0;
+}
+
+static void nd_region_notify(struct device *dev, enum nvdimm_event event)
+{
+	device_for_each_child(dev, &event, child_notify);
+}
+
 static struct nd_device_driver nd_region_driver = {
 	.probe = nd_region_probe,
 	.remove = nd_region_remove,
+	.notify = nd_region_notify,
 	.drv = {
 		.name = "nd_region",
 	},
diff --git a/include/linux/nd.h b/include/linux/nd.h
index 507e47c..5489ab7 100644
--- a/include/linux/nd.h
+++ b/include/linux/nd.h
@@ -16,11 +16,16 @@
 #include <linux/ndctl.h>
 #include <linux/device.h>
 
+enum nvdimm_event {
+	NVDIMM_REVALIDATE_POISON,
+};
+
 struct nd_device_driver {
 	struct device_driver drv;
 	unsigned long type;
 	int (*probe)(struct device *dev);
 	int (*remove)(struct device *dev);
+	void (*notify)(struct device *dev, enum nvdimm_event event);
 };
 
 static inline struct nd_device_driver *to_nd_device_driver(
@@ -144,6 +149,8 @@ static inline int nvdimm_write_bytes(struct nd_namespace_common *ndns,
 	MODULE_ALIAS("nd:t" __stringify(type) "*")
 #define ND_DEVICE_MODALIAS_FMT "nd:t%d"
 
+struct nd_region;
+void nvdimm_region_notify(struct nd_region *nd_region, enum nvdimm_event event);
 int __must_check __nd_driver_register(struct nd_device_driver *nd_drv,
 		struct module *module, const char *mod_name);
 #define nd_driver_register(driver) \
-- 
cgit v0.10.2


From a61fe6f7902ecaa89d5e6c709490fc4324927134 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Fri, 19 Feb 2016 12:29:32 -0800
Subject: nfit, tools/testing/nvdimm: unify common init for acpi_nfit_desc

The nvdimm unit test infrastructure performs its own initialization of
an acpi_nfit_desc to specify test overrides over the native
implementation.  Make it clear which attributes and operations it is
overriding by re-using acpi_nfit_init_desc() as a common starting point.

Signed-off-by: Dan Williams <dan.j.williams@intel.com>

diff --git a/drivers/acpi/nfit.c b/drivers/acpi/nfit.c
index 4dd2b68..76c9444 100644
--- a/drivers/acpi/nfit.c
+++ b/drivers/acpi/nfit.c
@@ -748,12 +748,11 @@ static struct attribute_group acpi_nfit_attribute_group = {
 	.attrs = acpi_nfit_attributes,
 };
 
-const struct attribute_group *acpi_nfit_attribute_groups[] = {
+static const struct attribute_group *acpi_nfit_attribute_groups[] = {
 	&nvdimm_bus_attribute_group,
 	&acpi_nfit_attribute_group,
 	NULL,
 };
-EXPORT_SYMBOL_GPL(acpi_nfit_attribute_groups);
 
 static struct acpi_nfit_memory_map *to_nfit_memdev(struct device *dev)
 {
@@ -1962,15 +1961,9 @@ int acpi_nfit_init(struct acpi_nfit_desc *acpi_desc, acpi_size sz)
 }
 EXPORT_SYMBOL_GPL(acpi_nfit_init);
 
-static struct acpi_nfit_desc *acpi_nfit_desc_init(struct acpi_device *adev)
+void acpi_nfit_desc_init(struct acpi_nfit_desc *acpi_desc, struct device *dev)
 {
 	struct nvdimm_bus_descriptor *nd_desc;
-	struct acpi_nfit_desc *acpi_desc;
-	struct device *dev = &adev->dev;
-
-	acpi_desc = devm_kzalloc(dev, sizeof(*acpi_desc), GFP_KERNEL);
-	if (!acpi_desc)
-		return ERR_PTR(-ENOMEM);
 
 	dev_set_drvdata(dev, acpi_desc);
 	acpi_desc->dev = dev;
@@ -1980,12 +1973,6 @@ static struct acpi_nfit_desc *acpi_nfit_desc_init(struct acpi_device *adev)
 	nd_desc->ndctl = acpi_nfit_ctl;
 	nd_desc->attr_groups = acpi_nfit_attribute_groups;
 
-	acpi_desc->nvdimm_bus = nvdimm_bus_register(dev, nd_desc);
-	if (!acpi_desc->nvdimm_bus) {
-		devm_kfree(dev, acpi_desc);
-		return ERR_PTR(-ENXIO);
-	}
-
 	INIT_LIST_HEAD(&acpi_desc->spa_maps);
 	INIT_LIST_HEAD(&acpi_desc->spas);
 	INIT_LIST_HEAD(&acpi_desc->dcrs);
@@ -1996,9 +1983,8 @@ static struct acpi_nfit_desc *acpi_nfit_desc_init(struct acpi_device *adev)
 	INIT_LIST_HEAD(&acpi_desc->dimms);
 	mutex_init(&acpi_desc->spa_map_mutex);
 	mutex_init(&acpi_desc->init_mutex);
-
-	return acpi_desc;
 }
+EXPORT_SYMBOL_GPL(acpi_nfit_desc_init);
 
 static int acpi_nfit_add(struct acpi_device *adev)
 {
@@ -2017,12 +2003,13 @@ static int acpi_nfit_add(struct acpi_device *adev)
 		return 0;
 	}
 
-	acpi_desc = acpi_nfit_desc_init(adev);
-	if (IS_ERR(acpi_desc)) {
-		dev_err(dev, "%s: error initializing acpi_desc: %ld\n",
-				__func__, PTR_ERR(acpi_desc));
-		return PTR_ERR(acpi_desc);
-	}
+	acpi_desc = devm_kzalloc(dev, sizeof(*acpi_desc), GFP_KERNEL);
+	if (!acpi_desc)
+		return -ENOMEM;
+	acpi_nfit_desc_init(acpi_desc, &adev->dev);
+	acpi_desc->nvdimm_bus = nvdimm_bus_register(dev, &acpi_desc->nd_desc);
+	if (!acpi_desc->nvdimm_bus)
+		return -ENOMEM;
 
 	/*
 	 * Save the acpi header for later and then skip it,
@@ -2085,12 +2072,13 @@ static void acpi_nfit_notify(struct acpi_device *adev, u32 event)
 	}
 
 	if (!acpi_desc) {
-		acpi_desc = acpi_nfit_desc_init(adev);
-		if (IS_ERR(acpi_desc)) {
-			dev_err(dev, "%s: error initializing acpi_desc: %ld\n",
-				__func__, PTR_ERR(acpi_desc));
+		acpi_desc = devm_kzalloc(dev, sizeof(*acpi_desc), GFP_KERNEL);
+		if (!acpi_desc)
+			goto out_unlock;
+		acpi_nfit_desc_init(acpi_desc, &adev->dev);
+		acpi_desc->nvdimm_bus = nvdimm_bus_register(dev, &acpi_desc->nd_desc);
+		if (!acpi_desc->nvdimm_bus)
 			goto out_unlock;
-		}
 	}
 
 	/* Evaluate _FIT */
diff --git a/drivers/acpi/nfit.h b/drivers/acpi/nfit.h
index f45b7d9..524dec0 100644
--- a/drivers/acpi/nfit.h
+++ b/drivers/acpi/nfit.h
@@ -195,5 +195,5 @@ static inline struct acpi_nfit_desc *to_acpi_desc(
 
 const u8 *to_nfit_uuid(enum nfit_uuids id);
 int acpi_nfit_init(struct acpi_nfit_desc *nfit, acpi_size sz);
-extern const struct attribute_group *acpi_nfit_attribute_groups[];
+void acpi_nfit_desc_init(struct acpi_nfit_desc *acpi_desc, struct device *dev);
 #endif /* __NFIT_H__ */
diff --git a/tools/testing/nvdimm/test/nfit.c b/tools/testing/nvdimm/test/nfit.c
index 9ead779..a66842e 100644
--- a/tools/testing/nvdimm/test/nfit.c
+++ b/tools/testing/nvdimm/test/nfit.c
@@ -499,7 +499,6 @@ static int nfit_test1_alloc(struct nfit_test *t)
 
 static void nfit_test0_setup(struct nfit_test *t)
 {
-	struct nvdimm_bus_descriptor *nd_desc;
 	struct acpi_nfit_desc *acpi_desc;
 	struct acpi_nfit_memory_map *memdev;
 	void *nfit_buf = t->nfit_buf;
@@ -1165,8 +1164,6 @@ static void nfit_test0_setup(struct nfit_test *t)
 	set_bit(ND_CMD_ARS_CAP, &acpi_desc->bus_dsm_force_en);
 	set_bit(ND_CMD_ARS_START, &acpi_desc->bus_dsm_force_en);
 	set_bit(ND_CMD_ARS_STATUS, &acpi_desc->bus_dsm_force_en);
-	nd_desc = &acpi_desc->nd_desc;
-	nd_desc->ndctl = nfit_test_ctl;
 }
 
 static void nfit_test1_setup(struct nfit_test *t)
@@ -1176,7 +1173,6 @@ static void nfit_test1_setup(struct nfit_test *t)
 	struct acpi_nfit_memory_map *memdev;
 	struct acpi_nfit_control_region *dcr;
 	struct acpi_nfit_system_address *spa;
-	struct nvdimm_bus_descriptor *nd_desc;
 	struct acpi_nfit_desc *acpi_desc;
 
 	offset = 0;
@@ -1226,8 +1222,6 @@ static void nfit_test1_setup(struct nfit_test *t)
 	set_bit(ND_CMD_ARS_CAP, &acpi_desc->bus_dsm_force_en);
 	set_bit(ND_CMD_ARS_START, &acpi_desc->bus_dsm_force_en);
 	set_bit(ND_CMD_ARS_STATUS, &acpi_desc->bus_dsm_force_en);
-	nd_desc = &acpi_desc->nd_desc;
-	nd_desc->ndctl = nfit_test_ctl;
 }
 
 static int nfit_test_blk_do_io(struct nd_blk_region *ndbr, resource_size_t dpa,
@@ -1310,26 +1304,16 @@ static int nfit_test_probe(struct platform_device *pdev)
 
 	nfit_test->setup(nfit_test);
 	acpi_desc = &nfit_test->acpi_desc;
-	acpi_desc->dev = &pdev->dev;
+	acpi_nfit_desc_init(acpi_desc, &pdev->dev);
 	acpi_desc->nfit = nfit_test->nfit_buf;
 	acpi_desc->blk_do_io = nfit_test_blk_do_io;
 	nd_desc = &acpi_desc->nd_desc;
-	nd_desc->attr_groups = acpi_nfit_attribute_groups;
+	nd_desc->provider_name = NULL;
+	nd_desc->ndctl = nfit_test_ctl;
 	acpi_desc->nvdimm_bus = nvdimm_bus_register(&pdev->dev, nd_desc);
 	if (!acpi_desc->nvdimm_bus)
 		return -ENXIO;
 
-	INIT_LIST_HEAD(&acpi_desc->spa_maps);
-	INIT_LIST_HEAD(&acpi_desc->spas);
-	INIT_LIST_HEAD(&acpi_desc->dcrs);
-	INIT_LIST_HEAD(&acpi_desc->bdws);
-	INIT_LIST_HEAD(&acpi_desc->idts);
-	INIT_LIST_HEAD(&acpi_desc->flushes);
-	INIT_LIST_HEAD(&acpi_desc->memdevs);
-	INIT_LIST_HEAD(&acpi_desc->dimms);
-	mutex_init(&acpi_desc->spa_map_mutex);
-	mutex_init(&acpi_desc->init_mutex);
-
 	rc = acpi_nfit_init(acpi_desc, nfit_test->nfit_size);
 	if (rc) {
 		nvdimm_bus_unregister(acpi_desc->nvdimm_bus);
-- 
cgit v0.10.2


From 7ae0fa439faff000744b234d04cb470bfd83593b Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Fri, 19 Feb 2016 12:16:34 -0800
Subject: nfit, libnvdimm: async region scrub workqueue

Introduce a workqueue that will be used to run address range scrub
asynchronously with the rest of nvdimm device probing.

Userspace still wants notification when probing operations complete, so
introduce a new callback to flush this workqueue when userspace is
awaiting probe completion.

Signed-off-by: Dan Williams <dan.j.williams@intel.com>

diff --git a/drivers/acpi/nfit.c b/drivers/acpi/nfit.c
index 76c9444..4aa2bd5 100644
--- a/drivers/acpi/nfit.c
+++ b/drivers/acpi/nfit.c
@@ -34,6 +34,8 @@ static bool force_enable_dimms;
 module_param(force_enable_dimms, bool, S_IRUGO|S_IWUSR);
 MODULE_PARM_DESC(force_enable_dimms, "Ignore _STA (ACPI DIMM device) status");
 
+static struct workqueue_struct *nfit_wq;
+
 struct nfit_table_prev {
 	struct list_head spas;
 	struct list_head memdevs;
@@ -1961,6 +1963,39 @@ int acpi_nfit_init(struct acpi_nfit_desc *acpi_desc, acpi_size sz)
 }
 EXPORT_SYMBOL_GPL(acpi_nfit_init);
 
+struct acpi_nfit_flush_work {
+	struct work_struct work;
+	struct completion cmp;
+};
+
+static void flush_probe(struct work_struct *work)
+{
+	struct acpi_nfit_flush_work *flush;
+
+	flush = container_of(work, typeof(*flush), work);
+	complete(&flush->cmp);
+}
+
+static int acpi_nfit_flush_probe(struct nvdimm_bus_descriptor *nd_desc)
+{
+	struct acpi_nfit_desc *acpi_desc = to_acpi_nfit_desc(nd_desc);
+	struct device *dev = acpi_desc->dev;
+	struct acpi_nfit_flush_work flush;
+
+	/* bounce the device lock to flush acpi_nfit_add / acpi_nfit_notify */
+	device_lock(dev);
+	device_unlock(dev);
+
+	/*
+	 * Scrub work could take 10s of seconds, userspace may give up so we
+	 * need to be interruptible while waiting.
+	 */
+	INIT_WORK_ONSTACK(&flush.work, flush_probe);
+	COMPLETION_INITIALIZER_ONSTACK(flush.cmp);
+	queue_work(nfit_wq, &flush.work);
+	return wait_for_completion_interruptible(&flush.cmp);
+}
+
 void acpi_nfit_desc_init(struct acpi_nfit_desc *acpi_desc, struct device *dev)
 {
 	struct nvdimm_bus_descriptor *nd_desc;
@@ -1971,6 +2006,7 @@ void acpi_nfit_desc_init(struct acpi_nfit_desc *acpi_desc, struct device *dev)
 	nd_desc = &acpi_desc->nd_desc;
 	nd_desc->provider_name = "ACPI.NFIT";
 	nd_desc->ndctl = acpi_nfit_ctl;
+	nd_desc->flush_probe = acpi_nfit_flush_probe;
 	nd_desc->attr_groups = acpi_nfit_attribute_groups;
 
 	INIT_LIST_HEAD(&acpi_desc->spa_maps);
@@ -2048,6 +2084,8 @@ static int acpi_nfit_remove(struct acpi_device *adev)
 {
 	struct acpi_nfit_desc *acpi_desc = dev_get_drvdata(&adev->dev);
 
+	acpi_desc->cancel = 1;
+	flush_workqueue(nfit_wq);
 	nvdimm_bus_unregister(acpi_desc->nvdimm_bus);
 	return 0;
 }
@@ -2079,6 +2117,12 @@ static void acpi_nfit_notify(struct acpi_device *adev, u32 event)
 		acpi_desc->nvdimm_bus = nvdimm_bus_register(dev, &acpi_desc->nd_desc);
 		if (!acpi_desc->nvdimm_bus)
 			goto out_unlock;
+	} else {
+		/*
+		 * Finish previous registration before considering new
+		 * regions.
+		 */
+		flush_workqueue(nfit_wq);
 	}
 
 	/* Evaluate _FIT */
@@ -2146,12 +2190,17 @@ static __init int nfit_init(void)
 	acpi_str_to_uuid(UUID_NFIT_BUS, nfit_uuid[NFIT_DEV_BUS]);
 	acpi_str_to_uuid(UUID_NFIT_DIMM, nfit_uuid[NFIT_DEV_DIMM]);
 
+	nfit_wq = create_singlethread_workqueue("nfit");
+	if (!nfit_wq)
+		return -ENOMEM;
+
 	return acpi_bus_register_driver(&acpi_nfit_driver);
 }
 
 static __exit void nfit_exit(void)
 {
 	acpi_bus_unregister_driver(&acpi_nfit_driver);
+	destroy_workqueue(nfit_wq);
 }
 
 module_init(nfit_init);
diff --git a/drivers/acpi/nfit.h b/drivers/acpi/nfit.h
index 524dec0..e8388fe 100644
--- a/drivers/acpi/nfit.h
+++ b/drivers/acpi/nfit.h
@@ -14,6 +14,7 @@
  */
 #ifndef __NFIT_H__
 #define __NFIT_H__
+#include <linux/workqueue.h>
 #include <linux/libnvdimm.h>
 #include <linux/types.h>
 #include <linux/uuid.h>
@@ -123,6 +124,8 @@ struct acpi_nfit_desc {
 	struct list_head idts;
 	struct nvdimm_bus *nvdimm_bus;
 	struct device *dev;
+	struct work_struct work;
+	unsigned int cancel:1;
 	unsigned long dimm_dsm_force_en;
 	unsigned long bus_dsm_force_en;
 	int (*blk_do_io)(struct nd_blk_region *ndbr, resource_size_t dpa,
diff --git a/drivers/nvdimm/core.c b/drivers/nvdimm/core.c
index f309e6b..79646d0 100644
--- a/drivers/nvdimm/core.c
+++ b/drivers/nvdimm/core.c
@@ -298,6 +298,15 @@ static int flush_regions_dimms(struct device *dev, void *data)
 static ssize_t wait_probe_show(struct device *dev,
 		struct device_attribute *attr, char *buf)
 {
+	struct nvdimm_bus *nvdimm_bus = to_nvdimm_bus(dev);
+	struct nvdimm_bus_descriptor *nd_desc = nvdimm_bus->nd_desc;
+	int rc;
+
+	if (nd_desc->flush_probe) {
+		rc = nd_desc->flush_probe(nd_desc);
+		if (rc)
+			return rc;
+	}
 	nd_synchronize();
 	device_for_each_child(dev, NULL, flush_regions_dimms);
 	return sprintf(buf, "1\n");
diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h
index f398953..2299f87 100644
--- a/include/linux/libnvdimm.h
+++ b/include/linux/libnvdimm.h
@@ -71,6 +71,7 @@ struct nvdimm_bus_descriptor {
 	unsigned long dsm_mask;
 	char *provider_name;
 	ndctl_fn ndctl;
+	int (*flush_probe)(struct nvdimm_bus_descriptor *nd_desc);
 };
 
 struct nd_cmd_desc {
-- 
cgit v0.10.2


From 1cf03c00e7c17d3cf13a267dac83b3162a16ba8c Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Wed, 17 Feb 2016 13:01:23 -0800
Subject: nfit: scrub and register regions in a workqueue

Address range scrub is a potentially long running process that we want
to complete before any pmem regions are registered.  Perform this
operation asynchronously to allow other drivers to load in the meantime.

Platform firmware may have initiated a partial scrub prior to the driver
loading, so we must be careful to consume those results before kicking
off kernel initiated scrubs on other regions.

This rework also makes the registration path more tolerant of scrub
errors in that it splits scrubbing into 2 phases.  The first phase
synchronously waits for a platform-firmware initiated scrub to complete.
The second phase scans the remaining address ranges asynchronously and
notifies the related driver(s) when the scrub completes.

Signed-off-by: Dan Williams <dan.j.williams@intel.com>

diff --git a/drivers/acpi/nfit.c b/drivers/acpi/nfit.c
index 4aa2bd5..3646501 100644
--- a/drivers/acpi/nfit.c
+++ b/drivers/acpi/nfit.c
@@ -21,6 +21,7 @@
 #include <linux/sort.h>
 #include <linux/pmem.h>
 #include <linux/io.h>
+#include <linux/nd.h>
 #include <asm/cacheflush.h>
 #include "nfit.h"
 
@@ -34,6 +35,16 @@ static bool force_enable_dimms;
 module_param(force_enable_dimms, bool, S_IRUGO|S_IWUSR);
 MODULE_PARM_DESC(force_enable_dimms, "Ignore _STA (ACPI DIMM device) status");
 
+static unsigned int scrub_timeout = NFIT_ARS_TIMEOUT;
+module_param(scrub_timeout, uint, S_IRUGO|S_IWUSR);
+MODULE_PARM_DESC(scrub_timeout, "Initial scrub timeout in seconds");
+
+/* after three payloads of overflow, it's dead jim */
+static unsigned int scrub_overflow_abort = 3;
+module_param(scrub_overflow_abort, uint, S_IRUGO|S_IWUSR);
+MODULE_PARM_DESC(scrub_overflow_abort,
+		"Number of times we overflow ARS results before abort");
+
 static struct workqueue_struct *nfit_wq;
 
 struct nfit_table_prev {
@@ -1548,97 +1559,84 @@ static void acpi_nfit_blk_region_disable(struct nvdimm_bus *nvdimm_bus,
 }
 
 static int ars_get_cap(struct acpi_nfit_desc *acpi_desc,
-		struct nd_cmd_ars_cap *cmd, u64 addr, u64 length)
+		struct nd_cmd_ars_cap *cmd, struct nfit_spa *nfit_spa)
 {
 	struct nvdimm_bus_descriptor *nd_desc = &acpi_desc->nd_desc;
+	struct acpi_nfit_system_address *spa = nfit_spa->spa;
 	int cmd_rc, rc;
 
-	cmd->address = addr;
-	cmd->length = length;
+	cmd->address = spa->address;
+	cmd->length = spa->length;
 	rc = nd_desc->ndctl(nd_desc, NULL, ND_CMD_ARS_CAP, cmd,
 			sizeof(*cmd), &cmd_rc);
 	if (rc < 0)
 		return rc;
-	if (cmd_rc < 0)
-		return cmd_rc;
-	return 0;
+	return cmd_rc;
 }
 
-static int ars_do_start(struct nvdimm_bus_descriptor *nd_desc,
-		struct nd_cmd_ars_start *cmd, u64 addr, u64 length)
+static int ars_start(struct acpi_nfit_desc *acpi_desc, struct nfit_spa *nfit_spa)
 {
-	int cmd_rc;
 	int rc;
+	int cmd_rc;
+	struct nd_cmd_ars_start ars_start;
+	struct acpi_nfit_system_address *spa = nfit_spa->spa;
+	struct nvdimm_bus_descriptor *nd_desc = &acpi_desc->nd_desc;
 
-	cmd->address = addr;
-	cmd->length = length;
-	cmd->type = ND_ARS_PERSISTENT;
-
-	while (1) {
-		rc = nd_desc->ndctl(nd_desc, NULL, ND_CMD_ARS_START, cmd,
-				sizeof(*cmd), &cmd_rc);
-
-		if (rc < 0)
-			return rc;
-
-		if (cmd_rc == -EBUSY) {
-			/* ARS is in progress */
-			msleep(1000);
-			continue;
-		}
+	memset(&ars_start, 0, sizeof(ars_start));
+	ars_start.address = spa->address;
+	ars_start.length = spa->length;
+	if (nfit_spa_type(spa) == NFIT_SPA_PM)
+		ars_start.type = ND_ARS_PERSISTENT;
+	else if (nfit_spa_type(spa) == NFIT_SPA_VOLATILE)
+		ars_start.type = ND_ARS_VOLATILE;
+	else
+		return -ENOTTY;
 
-		if (cmd_rc < 0)
-			return cmd_rc;
+	rc = nd_desc->ndctl(nd_desc, NULL, ND_CMD_ARS_START, &ars_start,
+			sizeof(ars_start), &cmd_rc);
 
-		return 0;
-	}
+	if (rc < 0)
+		return rc;
+	return cmd_rc;
 }
 
-static int ars_get_status(struct nvdimm_bus_descriptor *nd_desc,
-		struct nd_cmd_ars_status *cmd, u32 size)
+static int ars_continue(struct acpi_nfit_desc *acpi_desc)
 {
 	int rc, cmd_rc;
+	struct nd_cmd_ars_start ars_start;
+	struct nvdimm_bus_descriptor *nd_desc = &acpi_desc->nd_desc;
+	struct nd_cmd_ars_status *ars_status = acpi_desc->ars_status;
+
+	memset(&ars_start, 0, sizeof(ars_start));
+	ars_start.address = ars_status->restart_address;
+	ars_start.length = ars_status->restart_length;
+	ars_start.type = ars_status->type;
+	rc = nd_desc->ndctl(nd_desc, NULL, ND_CMD_ARS_START, &ars_start,
+			sizeof(ars_start), &cmd_rc);
+	if (rc < 0)
+		return rc;
+	return cmd_rc;
+}
 
-	while (1) {
-		rc = nd_desc->ndctl(nd_desc, NULL, ND_CMD_ARS_STATUS, cmd,
-			size, &cmd_rc);
-		if (rc < 0)
-			return rc;
-
-		/* FIXME make async and have a timeout */
-		if (cmd_rc == -EBUSY) {
-			msleep(1000);
-			continue;
-		}
-
-		if (cmd_rc == -EAGAIN || cmd_rc == 0)
-			return 0;
-
-		/* TODO: error list overflow support */
-		if (cmd_rc == -ENOSPC)
-			return -ENXIO;
+static int ars_get_status(struct acpi_nfit_desc *acpi_desc)
+{
+	struct nvdimm_bus_descriptor *nd_desc = &acpi_desc->nd_desc;
+	struct nd_cmd_ars_status *ars_status = acpi_desc->ars_status;
+	int rc, cmd_rc;
 
-		return cmd_rc;
-	}
+	rc = nd_desc->ndctl(nd_desc, NULL, ND_CMD_ARS_STATUS, ars_status,
+			acpi_desc->ars_status_size, &cmd_rc);
+	if (rc < 0)
+		return rc;
+	return cmd_rc;
 }
 
 static int ars_status_process_records(struct nvdimm_bus *nvdimm_bus,
-		struct nd_cmd_ars_status *ars_status, u64 start)
+		struct nd_cmd_ars_status *ars_status)
 {
 	int rc;
 	u32 i;
 
-	/*
-	 * The address field returned by ars_status should be either
-	 * less than or equal to the address we last started ARS for.
-	 * The (start, length) returned by ars_status should also have
-	 * non-zero overlap with the range we started ARS for.
-	 * If this is not the case, bail.
-	 */
-	if (ars_status->address > start ||
-			(ars_status->address + ars_status->length < start))
-		return -ENXIO;
-
 	for (i = 0; i < ars_status->num_records; i++) {
 		rc = nvdimm_bus_add_poison(nvdimm_bus,
 				ars_status->records[i].err_address,
@@ -1650,101 +1648,14 @@ static int ars_status_process_records(struct nvdimm_bus *nvdimm_bus,
 	return 0;
 }
 
-static int acpi_nfit_find_poison(struct acpi_nfit_desc *acpi_desc,
-		struct nd_region_desc *ndr_desc)
-{
-	struct nvdimm_bus_descriptor *nd_desc = &acpi_desc->nd_desc;
-	struct nvdimm_bus *nvdimm_bus = acpi_desc->nvdimm_bus;
-	struct nd_cmd_ars_status *ars_status = NULL;
-	struct nd_cmd_ars_start *ars_start = NULL;
-	struct nd_cmd_ars_cap *ars_cap = NULL;
-	u64 start, len, cur, remaining;
-	u32 ars_status_size;
-	int rc;
-
-	ars_cap = kzalloc(sizeof(*ars_cap), GFP_KERNEL);
-	if (!ars_cap)
-		return -ENOMEM;
-
-	start = ndr_desc->res->start;
-	len = ndr_desc->res->end - ndr_desc->res->start + 1;
-
-	rc = ars_get_cap(acpi_desc, ars_cap, start, len);
-	if (rc == -ENOTTY) {
-		rc = 0;
-		goto out;
-	}
-
-	/*
-	 * Check if a full-range ARS has been run. If so, use those results
-	 * without having to start a new ARS.
-	 */
-	ars_status_size = ars_cap->max_ars_out;
-	ars_status = kzalloc(ars_status_size, GFP_KERNEL);
-	if (!ars_status) {
-		rc = -ENOMEM;
-		goto out;
-	}
-
-	rc = ars_get_status(nd_desc, ars_status, ars_status_size);
-	if (rc)
-		goto out;
-
-	if (ars_status->address <= start &&
-		(ars_status->address + ars_status->length >= start + len)) {
-		rc = ars_status_process_records(nvdimm_bus, ars_status, start);
-		goto out;
-	}
-
-	/*
-	 * ARS_STATUS can overflow if the number of poison entries found is
-	 * greater than the maximum buffer size (ars_cap->max_ars_out)
-	 * To detect overflow, check if the length field of ars_status
-	 * is less than the length we supplied. If so, process the
-	 * error entries we got, adjust the start point, and start again
-	 */
-	ars_start = kzalloc(sizeof(*ars_start), GFP_KERNEL);
-	if (!ars_start)
-		return -ENOMEM;
-
-	cur = start;
-	remaining = len;
-	do {
-		u64 done, end;
-
-		rc = ars_do_start(nd_desc, ars_start, cur, remaining);
-		if (rc < 0)
-			goto out;
-
-		rc = ars_get_status(nd_desc, ars_status, ars_status_size);
-		if (rc < 0)
-			goto out;
-
-		rc = ars_status_process_records(nvdimm_bus, ars_status, cur);
-		if (rc < 0)
-			goto out;
-
-		end = min(cur + remaining,
-			ars_status->address + ars_status->length);
-		done = end - cur;
-		cur += done;
-		remaining -= done;
-	} while (remaining);
-
- out:
-	kfree(ars_cap);
-	kfree(ars_start);
-	kfree(ars_status);
-	return rc;
-}
-
 static int acpi_nfit_init_mapping(struct acpi_nfit_desc *acpi_desc,
 		struct nd_mapping *nd_mapping, struct nd_region_desc *ndr_desc,
 		struct acpi_nfit_memory_map *memdev,
-		struct acpi_nfit_system_address *spa)
+		struct nfit_spa *nfit_spa)
 {
 	struct nvdimm *nvdimm = acpi_nfit_dimm_by_handle(acpi_desc,
 			memdev->device_handle);
+	struct acpi_nfit_system_address *spa = nfit_spa->spa;
 	struct nd_blk_region_desc *ndbr_desc;
 	struct nfit_mem *nfit_mem;
 	int blk_valid = 0;
@@ -1780,7 +1691,9 @@ static int acpi_nfit_init_mapping(struct acpi_nfit_desc *acpi_desc,
 		ndbr_desc->enable = acpi_nfit_blk_region_enable;
 		ndbr_desc->disable = acpi_nfit_blk_region_disable;
 		ndbr_desc->do_io = acpi_desc->blk_do_io;
-		if (!nvdimm_blk_region_create(acpi_desc->nvdimm_bus, ndr_desc))
+		nfit_spa->nd_region = nvdimm_blk_region_create(acpi_desc->nvdimm_bus,
+				ndr_desc);
+		if (!nfit_spa->nd_region)
 			return -ENOMEM;
 		break;
 	}
@@ -1800,7 +1713,7 @@ static int acpi_nfit_register_region(struct acpi_nfit_desc *acpi_desc,
 	struct resource res;
 	int count = 0, rc;
 
-	if (nfit_spa->is_registered)
+	if (nfit_spa->nd_region)
 		return 0;
 
 	if (spa->range_index == 0) {
@@ -1837,47 +1750,324 @@ static int acpi_nfit_register_region(struct acpi_nfit_desc *acpi_desc,
 		}
 		nd_mapping = &nd_mappings[count++];
 		rc = acpi_nfit_init_mapping(acpi_desc, nd_mapping, ndr_desc,
-				memdev, spa);
+				memdev, nfit_spa);
 		if (rc)
-			return rc;
+			goto out;
 	}
 
 	ndr_desc->nd_mapping = nd_mappings;
 	ndr_desc->num_mappings = count;
 	rc = acpi_nfit_init_interleave_set(acpi_desc, ndr_desc, spa);
 	if (rc)
-		return rc;
+		goto out;
 
 	nvdimm_bus = acpi_desc->nvdimm_bus;
 	if (nfit_spa_type(spa) == NFIT_SPA_PM) {
-		rc = acpi_nfit_find_poison(acpi_desc, ndr_desc);
-		if (rc) {
-			dev_err(acpi_desc->dev,
-				"error while performing ARS to find poison: %d\n",
-				rc);
-			return rc;
-		}
-		if (!nvdimm_pmem_region_create(nvdimm_bus, ndr_desc))
-			return -ENOMEM;
+		nfit_spa->nd_region = nvdimm_pmem_region_create(nvdimm_bus,
+				ndr_desc);
+		if (!nfit_spa->nd_region)
+			rc = -ENOMEM;
 	} else if (nfit_spa_type(spa) == NFIT_SPA_VOLATILE) {
-		if (!nvdimm_volatile_region_create(nvdimm_bus, ndr_desc))
-			return -ENOMEM;
+		nfit_spa->nd_region = nvdimm_volatile_region_create(nvdimm_bus,
+				ndr_desc);
+		if (!nfit_spa->nd_region)
+			rc = -ENOMEM;
 	}
 
-	nfit_spa->is_registered = 1;
+ out:
+	if (rc)
+		dev_err(acpi_desc->dev, "failed to register spa range %d\n",
+				nfit_spa->spa->range_index);
+	return rc;
+}
+
+static int ars_status_alloc(struct acpi_nfit_desc *acpi_desc,
+		u32 max_ars)
+{
+	struct device *dev = acpi_desc->dev;
+	struct nd_cmd_ars_status *ars_status;
+
+	if (acpi_desc->ars_status && acpi_desc->ars_status_size >= max_ars) {
+		memset(acpi_desc->ars_status, 0, acpi_desc->ars_status_size);
+		return 0;
+	}
+
+	if (acpi_desc->ars_status)
+		devm_kfree(dev, acpi_desc->ars_status);
+	acpi_desc->ars_status = NULL;
+	ars_status = devm_kzalloc(dev, max_ars, GFP_KERNEL);
+	if (!ars_status)
+		return -ENOMEM;
+	acpi_desc->ars_status = ars_status;
+	acpi_desc->ars_status_size = max_ars;
 	return 0;
 }
 
-static int acpi_nfit_register_regions(struct acpi_nfit_desc *acpi_desc)
+static int acpi_nfit_query_poison(struct acpi_nfit_desc *acpi_desc,
+		struct nfit_spa *nfit_spa)
+{
+	struct acpi_nfit_system_address *spa = nfit_spa->spa;
+	int rc;
+
+	if (!nfit_spa->max_ars) {
+		struct nd_cmd_ars_cap ars_cap;
+
+		memset(&ars_cap, 0, sizeof(ars_cap));
+		rc = ars_get_cap(acpi_desc, &ars_cap, nfit_spa);
+		if (rc < 0)
+			return rc;
+		nfit_spa->max_ars = ars_cap.max_ars_out;
+		nfit_spa->clear_err_unit = ars_cap.clear_err_unit;
+		/* check that the supported scrub types match the spa type */
+		if (nfit_spa_type(spa) == NFIT_SPA_VOLATILE &&
+				((ars_cap.status >> 16) & ND_ARS_VOLATILE) == 0)
+			return -ENOTTY;
+		else if (nfit_spa_type(spa) == NFIT_SPA_PM &&
+				((ars_cap.status >> 16) & ND_ARS_PERSISTENT) == 0)
+			return -ENOTTY;
+	}
+
+	if (ars_status_alloc(acpi_desc, nfit_spa->max_ars))
+		return -ENOMEM;
+
+	rc = ars_get_status(acpi_desc);
+	if (rc < 0 && rc != -ENOSPC)
+		return rc;
+
+	if (ars_status_process_records(acpi_desc->nvdimm_bus,
+				acpi_desc->ars_status))
+		return -ENOMEM;
+
+	return 0;
+}
+
+static void acpi_nfit_async_scrub(struct acpi_nfit_desc *acpi_desc,
+		struct nfit_spa *nfit_spa)
+{
+	struct acpi_nfit_system_address *spa = nfit_spa->spa;
+	unsigned int overflow_retry = scrub_overflow_abort;
+	u64 init_ars_start = 0, init_ars_len = 0;
+	struct device *dev = acpi_desc->dev;
+	unsigned int tmo = scrub_timeout;
+	int rc;
+
+	if (nfit_spa->ars_done || !nfit_spa->nd_region)
+		return;
+
+	rc = ars_start(acpi_desc, nfit_spa);
+	/*
+	 * If we timed out the initial scan we'll still be busy here,
+	 * and will wait another timeout before giving up permanently.
+	 */
+	if (rc < 0 && rc != -EBUSY)
+		return;
+
+	do {
+		u64 ars_start, ars_len;
+
+		if (acpi_desc->cancel)
+			break;
+		rc = acpi_nfit_query_poison(acpi_desc, nfit_spa);
+		if (rc == -ENOTTY)
+			break;
+		if (rc == -EBUSY && !tmo) {
+			dev_warn(dev, "range %d ars timeout, aborting\n",
+					spa->range_index);
+			break;
+		}
+
+		if (rc == -EBUSY) {
+			/*
+			 * Note, entries may be appended to the list
+			 * while the lock is dropped, but the workqueue
+			 * being active prevents entries being deleted /
+			 * freed.
+			 */
+			mutex_unlock(&acpi_desc->init_mutex);
+			ssleep(1);
+			tmo--;
+			mutex_lock(&acpi_desc->init_mutex);
+			continue;
+		}
+
+		/* we got some results, but there are more pending... */
+		if (rc == -ENOSPC && overflow_retry--) {
+			if (!init_ars_len) {
+				init_ars_len = acpi_desc->ars_status->length;
+				init_ars_start = acpi_desc->ars_status->address;
+			}
+			rc = ars_continue(acpi_desc);
+		}
+
+		if (rc < 0) {
+			dev_warn(dev, "range %d ars continuation failed\n",
+					spa->range_index);
+			break;
+		}
+
+		if (init_ars_len) {
+			ars_start = init_ars_start;
+			ars_len = init_ars_len;
+		} else {
+			ars_start = acpi_desc->ars_status->address;
+			ars_len = acpi_desc->ars_status->length;
+		}
+		dev_dbg(dev, "spa range: %d ars from %#llx + %#llx complete\n",
+				spa->range_index, ars_start, ars_len);
+		/* notify the region about new poison entries */
+		nvdimm_region_notify(nfit_spa->nd_region,
+				NVDIMM_REVALIDATE_POISON);
+		break;
+	} while (1);
+}
+
+static void acpi_nfit_scrub(struct work_struct *work)
 {
+	struct device *dev;
+	u64 init_scrub_length = 0;
 	struct nfit_spa *nfit_spa;
+	u64 init_scrub_address = 0;
+	bool init_ars_done = false;
+	struct acpi_nfit_desc *acpi_desc;
+	unsigned int tmo = scrub_timeout;
+	unsigned int overflow_retry = scrub_overflow_abort;
+
+	acpi_desc = container_of(work, typeof(*acpi_desc), work);
+	dev = acpi_desc->dev;
 
+	/*
+	 * We scrub in 2 phases.  The first phase waits for any platform
+	 * firmware initiated scrubs to complete and then we go search for the
+	 * affected spa regions to mark them scanned.  In the second phase we
+	 * initiate a directed scrub for every range that was not scrubbed in
+	 * phase 1.
+	 */
+
+	/* process platform firmware initiated scrubs */
+ retry:
+	mutex_lock(&acpi_desc->init_mutex);
 	list_for_each_entry(nfit_spa, &acpi_desc->spas, list) {
-		int rc = acpi_nfit_register_region(acpi_desc, nfit_spa);
+		struct nd_cmd_ars_status *ars_status;
+		struct acpi_nfit_system_address *spa;
+		u64 ars_start, ars_len;
+		int rc;
 
-		if (rc)
-			return rc;
+		if (acpi_desc->cancel)
+			break;
+
+		if (nfit_spa->nd_region)
+			continue;
+
+		if (init_ars_done) {
+			/*
+			 * No need to re-query, we're now just
+			 * reconciling all the ranges covered by the
+			 * initial scrub
+			 */
+			rc = 0;
+		} else
+			rc = acpi_nfit_query_poison(acpi_desc, nfit_spa);
+
+		if (rc == -ENOTTY) {
+			/* no ars capability, just register spa and move on */
+			acpi_nfit_register_region(acpi_desc, nfit_spa);
+			continue;
+		}
+
+		if (rc == -EBUSY && !tmo) {
+			/* fallthrough to directed scrub in phase 2 */
+			dev_warn(dev, "timeout awaiting ars results, continuing...\n");
+			break;
+		} else if (rc == -EBUSY) {
+			mutex_unlock(&acpi_desc->init_mutex);
+			ssleep(1);
+			tmo--;
+			goto retry;
+		}
+
+		/* we got some results, but there are more pending... */
+		if (rc == -ENOSPC && overflow_retry--) {
+			ars_status = acpi_desc->ars_status;
+			/*
+			 * Record the original scrub range, so that we
+			 * can recall all the ranges impacted by the
+			 * initial scrub.
+			 */
+			if (!init_scrub_length) {
+				init_scrub_length = ars_status->length;
+				init_scrub_address = ars_status->address;
+			}
+			rc = ars_continue(acpi_desc);
+			if (rc == 0) {
+				mutex_unlock(&acpi_desc->init_mutex);
+				goto retry;
+			}
+		}
+
+		if (rc < 0) {
+			/*
+			 * Initial scrub failed, we'll give it one more
+			 * try below...
+			 */
+			break;
+		}
+
+		/* We got some final results, record completed ranges */
+		ars_status = acpi_desc->ars_status;
+		if (init_scrub_length) {
+			ars_start = init_scrub_address;
+			ars_len = ars_start + init_scrub_length;
+		} else {
+			ars_start = ars_status->address;
+			ars_len = ars_status->length;
+		}
+		spa = nfit_spa->spa;
+
+		if (!init_ars_done) {
+			init_ars_done = true;
+			dev_dbg(dev, "init scrub %#llx + %#llx complete\n",
+					ars_start, ars_len);
+		}
+		if (ars_start <= spa->address && ars_start + ars_len
+				>= spa->address + spa->length)
+			acpi_nfit_register_region(acpi_desc, nfit_spa);
+	}
+
+	/*
+	 * For all the ranges not covered by an initial scrub we still
+	 * want to see if there are errors, but it's ok to discover them
+	 * asynchronously.
+	 */
+	list_for_each_entry(nfit_spa, &acpi_desc->spas, list) {
+		/*
+		 * Flag all the ranges that still need scrubbing, but
+		 * register them now to make data available.
+		 */
+		if (nfit_spa->nd_region)
+			nfit_spa->ars_done = 1;
+		else
+			acpi_nfit_register_region(acpi_desc, nfit_spa);
 	}
+
+	list_for_each_entry(nfit_spa, &acpi_desc->spas, list)
+		acpi_nfit_async_scrub(acpi_desc, nfit_spa);
+	mutex_unlock(&acpi_desc->init_mutex);
+}
+
+static int acpi_nfit_register_regions(struct acpi_nfit_desc *acpi_desc)
+{
+	struct nfit_spa *nfit_spa;
+	int rc;
+
+	list_for_each_entry(nfit_spa, &acpi_desc->spas, list)
+		if (nfit_spa_type(nfit_spa->spa) == NFIT_SPA_DCR) {
+			/* BLK regions don't need to wait for ars results */
+			rc = acpi_nfit_register_region(acpi_desc, nfit_spa);
+			if (rc)
+				return rc;
+		}
+
+	queue_work(nfit_wq, &acpi_desc->work);
 	return 0;
 }
 
@@ -2019,6 +2209,7 @@ void acpi_nfit_desc_init(struct acpi_nfit_desc *acpi_desc, struct device *dev)
 	INIT_LIST_HEAD(&acpi_desc->dimms);
 	mutex_init(&acpi_desc->spa_map_mutex);
 	mutex_init(&acpi_desc->init_mutex);
+	INIT_WORK(&acpi_desc->work, acpi_nfit_scrub);
 }
 EXPORT_SYMBOL_GPL(acpi_nfit_desc_init);
 
diff --git a/drivers/acpi/nfit.h b/drivers/acpi/nfit.h
index e8388fe..c75576b 100644
--- a/drivers/acpi/nfit.h
+++ b/drivers/acpi/nfit.h
@@ -57,12 +57,16 @@ enum {
 	NFIT_ARS_START_BUSY = 6,
 	NFIT_ARS_CAP_NONE = 1,
 	NFIT_ARS_F_OVERFLOW = 1,
+	NFIT_ARS_TIMEOUT = 90,
 };
 
 struct nfit_spa {
 	struct acpi_nfit_system_address *spa;
 	struct list_head list;
-	int is_registered;
+	struct nd_region *nd_region;
+	unsigned int ars_done:1;
+	u32 clear_err_unit;
+	u32 max_ars;
 };
 
 struct nfit_dcr {
@@ -124,6 +128,8 @@ struct acpi_nfit_desc {
 	struct list_head idts;
 	struct nvdimm_bus *nvdimm_bus;
 	struct device *dev;
+	struct nd_cmd_ars_status *ars_status;
+	size_t ars_status_size;
 	struct work_struct work;
 	unsigned int cancel:1;
 	unsigned long dimm_dsm_force_en;
-- 
cgit v0.10.2


From 87bf572e19a092cc9cc77d5a00d543a2b628c142 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Mon, 22 Feb 2016 21:50:31 -0800
Subject: nfit: disable userspace initiated ars during scrub

While the nfit driver is issuing address range scrub commands and
reaping the results do not permit an ars_start command issued from
userspace.  The scrub thread assumes that all ars completions are for
scrubs initiated by platform firmware at boot, or by the nfit driver.

Signed-off-by: Dan Williams <dan.j.williams@intel.com>

diff --git a/drivers/acpi/nfit.c b/drivers/acpi/nfit.c
index 3646501..0def4eb 100644
--- a/drivers/acpi/nfit.c
+++ b/drivers/acpi/nfit.c
@@ -2186,6 +2186,28 @@ static int acpi_nfit_flush_probe(struct nvdimm_bus_descriptor *nd_desc)
 	return wait_for_completion_interruptible(&flush.cmp);
 }
 
+static int acpi_nfit_clear_to_send(struct nvdimm_bus_descriptor *nd_desc,
+		struct nvdimm *nvdimm, unsigned int cmd)
+{
+	struct acpi_nfit_desc *acpi_desc = to_acpi_nfit_desc(nd_desc);
+
+	if (nvdimm)
+		return 0;
+	if (cmd != ND_CMD_ARS_START)
+		return 0;
+
+	/*
+	 * The kernel and userspace may race to initiate a scrub, but
+	 * the scrub thread is prepared to lose that initial race.  It
+	 * just needs guarantees that any ars it initiates are not
+	 * interrupted by any intervening start reqeusts from userspace.
+	 */
+	if (work_busy(&acpi_desc->work))
+		return -EBUSY;
+
+	return 0;
+}
+
 void acpi_nfit_desc_init(struct acpi_nfit_desc *acpi_desc, struct device *dev)
 {
 	struct nvdimm_bus_descriptor *nd_desc;
@@ -2197,6 +2219,7 @@ void acpi_nfit_desc_init(struct acpi_nfit_desc *acpi_desc, struct device *dev)
 	nd_desc->provider_name = "ACPI.NFIT";
 	nd_desc->ndctl = acpi_nfit_ctl;
 	nd_desc->flush_probe = acpi_nfit_flush_probe;
+	nd_desc->clear_to_send = acpi_nfit_clear_to_send;
 	nd_desc->attr_groups = acpi_nfit_attribute_groups;
 
 	INIT_LIST_HEAD(&acpi_desc->spa_maps);
diff --git a/drivers/nvdimm/bus.c b/drivers/nvdimm/bus.c
index 2508251..228c0e9 100644
--- a/drivers/nvdimm/bus.c
+++ b/drivers/nvdimm/bus.c
@@ -490,16 +490,24 @@ void wait_nvdimm_bus_probe_idle(struct device *dev)
 }
 
 /* set_config requires an idle interleave set */
-static int nd_cmd_clear_to_send(struct nvdimm *nvdimm, unsigned int cmd)
+static int nd_cmd_clear_to_send(struct nvdimm_bus *nvdimm_bus,
+		struct nvdimm *nvdimm, unsigned int cmd)
 {
-	struct nvdimm_bus *nvdimm_bus;
+	struct nvdimm_bus_descriptor *nd_desc = nvdimm_bus->nd_desc;
+
+	/* ask the bus provider if it would like to block this request */
+	if (nd_desc->clear_to_send) {
+		int rc = nd_desc->clear_to_send(nd_desc, nvdimm, cmd);
+
+		if (rc)
+			return rc;
+	}
 
 	if (!nvdimm || cmd != ND_CMD_SET_CONFIG_DATA)
 		return 0;
 
-	nvdimm_bus = walk_to_nvdimm_bus(&nvdimm->dev);
+	/* prevent label manipulation while the kernel owns label updates */
 	wait_nvdimm_bus_probe_idle(&nvdimm_bus->dev);
-
 	if (atomic_read(&nvdimm->busy))
 		return -EBUSY;
 	return 0;
@@ -609,7 +617,7 @@ static int __nd_ioctl(struct nvdimm_bus *nvdimm_bus, struct nvdimm *nvdimm,
 	}
 
 	nvdimm_bus_lock(&nvdimm_bus->dev);
-	rc = nd_cmd_clear_to_send(nvdimm, cmd);
+	rc = nd_cmd_clear_to_send(nvdimm_bus, nvdimm, cmd);
 	if (rc)
 		goto out_unlock;
 
diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h
index 2299f87..833867b 100644
--- a/include/linux/libnvdimm.h
+++ b/include/linux/libnvdimm.h
@@ -72,6 +72,8 @@ struct nvdimm_bus_descriptor {
 	char *provider_name;
 	ndctl_fn ndctl;
 	int (*flush_probe)(struct nvdimm_bus_descriptor *nd_desc);
+	int (*clear_to_send)(struct nvdimm_bus_descriptor *nd_desc,
+			struct nvdimm *nvdimm, unsigned int cmd);
 };
 
 struct nd_cmd_desc {
-- 
cgit v0.10.2


From f471f1a7d0aa58c609e665514010650b2afa24b6 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Sat, 20 Feb 2016 15:12:47 -0800
Subject: tools/testing/nvdimm: expand ars unit testing

Simulate platform-firmware-initiated and asynchronous scrub results.
This injects poison in the middle of all nfit_test pmem address ranges.

Signed-off-by: Dan Williams <dan.j.williams@intel.com>

diff --git a/tools/testing/nvdimm/test/nfit.c b/tools/testing/nvdimm/test/nfit.c
index a66842e..1555c09 100644
--- a/tools/testing/nvdimm/test/nfit.c
+++ b/tools/testing/nvdimm/test/nfit.c
@@ -151,6 +151,11 @@ struct nfit_test {
 	int (*alloc)(struct nfit_test *t);
 	void (*setup)(struct nfit_test *t);
 	int setup_hotplug;
+	struct ars_state {
+		struct nd_cmd_ars_status *ars_status;
+		unsigned long deadline;
+		spinlock_t lock;
+	} ars_state;
 };
 
 static struct nfit_test *to_nfit_test(struct device *dev)
@@ -232,30 +237,72 @@ static int nfit_test_cmd_ars_cap(struct nd_cmd_ars_cap *nd_cmd,
 	return 0;
 }
 
-static int nfit_test_cmd_ars_start(struct nd_cmd_ars_start *nd_cmd,
-		unsigned int buf_len)
+/*
+ * Initialize the ars_state to return an ars_result 1 second in the future with
+ * a 4K error range in the middle of the requested address range.
+ */
+static void post_ars_status(struct ars_state *ars_state, u64 addr, u64 len)
 {
-	if (buf_len < sizeof(*nd_cmd))
+	struct nd_cmd_ars_status *ars_status;
+	struct nd_ars_record *ars_record;
+
+	ars_state->deadline = jiffies + 1*HZ;
+	ars_status = ars_state->ars_status;
+	ars_status->status = 0;
+	ars_status->out_length = sizeof(struct nd_cmd_ars_status)
+		+ sizeof(struct nd_ars_record);
+	ars_status->address = addr;
+	ars_status->length = len;
+	ars_status->type = ND_ARS_PERSISTENT;
+	ars_status->num_records = 1;
+	ars_record = &ars_status->records[0];
+	ars_record->handle = 0;
+	ars_record->err_address = addr + len / 2;
+	ars_record->length = SZ_4K;
+}
+
+static int nfit_test_cmd_ars_start(struct ars_state *ars_state,
+		struct nd_cmd_ars_start *ars_start, unsigned int buf_len,
+		int *cmd_rc)
+{
+	if (buf_len < sizeof(*ars_start))
 		return -EINVAL;
 
-	nd_cmd->status = 0;
+	spin_lock(&ars_state->lock);
+	if (time_before(jiffies, ars_state->deadline)) {
+		ars_start->status = NFIT_ARS_START_BUSY;
+		*cmd_rc = -EBUSY;
+	} else {
+		ars_start->status = 0;
+		ars_start->scrub_time = 1;
+		post_ars_status(ars_state, ars_start->address,
+				ars_start->length);
+		*cmd_rc = 0;
+	}
+	spin_unlock(&ars_state->lock);
 
 	return 0;
 }
 
-static int nfit_test_cmd_ars_status(struct nd_cmd_ars_status *nd_cmd,
-		unsigned int buf_len)
+static int nfit_test_cmd_ars_status(struct ars_state *ars_state,
+		struct nd_cmd_ars_status *ars_status, unsigned int buf_len,
+		int *cmd_rc)
 {
-	if (buf_len < sizeof(*nd_cmd))
+	if (buf_len < ars_state->ars_status->out_length)
 		return -EINVAL;
 
-	nd_cmd->out_length = sizeof(struct nd_cmd_ars_status);
-	/* TODO: emit error records */
-	nd_cmd->num_records = 0;
-	nd_cmd->address = 0;
-	nd_cmd->length = -1ULL;
-	nd_cmd->status = 0;
-
+	spin_lock(&ars_state->lock);
+	if (time_before(jiffies, ars_state->deadline)) {
+		memset(ars_status, 0, buf_len);
+		ars_status->status = NFIT_ARS_STATUS_BUSY;
+		ars_status->out_length = sizeof(*ars_status);
+		*cmd_rc = -EBUSY;
+	} else {
+		memcpy(ars_status, ars_state->ars_status,
+				ars_state->ars_status->out_length);
+		*cmd_rc = 0;
+	}
+	spin_unlock(&ars_state->lock);
 	return 0;
 }
 
@@ -265,7 +312,11 @@ static int nfit_test_ctl(struct nvdimm_bus_descriptor *nd_desc,
 {
 	struct acpi_nfit_desc *acpi_desc = to_acpi_desc(nd_desc);
 	struct nfit_test *t = container_of(acpi_desc, typeof(*t), acpi_desc);
-	int i, rc = 0;
+	int i, rc = 0, __cmd_rc;
+
+	if (!cmd_rc)
+		cmd_rc = &__cmd_rc;
+	*cmd_rc = 0;
 
 	if (nvdimm) {
 		struct nfit_mem *nfit_mem = nvdimm_provider_data(nvdimm);
@@ -297,6 +348,8 @@ static int nfit_test_ctl(struct nvdimm_bus_descriptor *nd_desc,
 			return -ENOTTY;
 		}
 	} else {
+		struct ars_state *ars_state = &t->ars_state;
+
 		if (!nd_desc || !test_bit(cmd, &nd_desc->dsm_mask))
 			return -ENOTTY;
 
@@ -305,19 +358,18 @@ static int nfit_test_ctl(struct nvdimm_bus_descriptor *nd_desc,
 			rc = nfit_test_cmd_ars_cap(buf, buf_len);
 			break;
 		case ND_CMD_ARS_START:
-			rc = nfit_test_cmd_ars_start(buf, buf_len);
+			rc = nfit_test_cmd_ars_start(ars_state, buf, buf_len,
+					cmd_rc);
 			break;
 		case ND_CMD_ARS_STATUS:
-			rc = nfit_test_cmd_ars_status(buf, buf_len);
+			rc = nfit_test_cmd_ars_status(ars_state, buf, buf_len,
+					cmd_rc);
 			break;
 		default:
 			return -ENOTTY;
 		}
 	}
 
-	/* TODO: error status tests */
-	if (cmd_rc)
-		*cmd_rc = 0;
 	return rc;
 }
 
@@ -427,6 +479,18 @@ static struct nfit_test_resource *nfit_test_lookup(resource_size_t addr)
 	return NULL;
 }
 
+static int ars_state_init(struct device *dev, struct ars_state *ars_state)
+{
+	ars_state->ars_status = devm_kzalloc(dev,
+			sizeof(struct nd_cmd_ars_status)
+			+ sizeof(struct nd_ars_record) * NFIT_TEST_ARS_RECORDS,
+			GFP_KERNEL);
+	if (!ars_state->ars_status)
+		return -ENOMEM;
+	spin_lock_init(&ars_state->lock);
+	return 0;
+}
+
 static int nfit_test0_alloc(struct nfit_test *t)
 {
 	size_t nfit_size = sizeof(struct acpi_nfit_system_address) * NUM_SPA
@@ -476,7 +540,7 @@ static int nfit_test0_alloc(struct nfit_test *t)
 			return -ENOMEM;
 	}
 
-	return 0;
+	return ars_state_init(&t->pdev.dev, &t->ars_state);
 }
 
 static int nfit_test1_alloc(struct nfit_test *t)
@@ -494,7 +558,7 @@ static int nfit_test1_alloc(struct nfit_test *t)
 	if (!t->spa_set[0])
 		return -ENOMEM;
 
-	return 0;
+	return ars_state_init(&t->pdev.dev, &t->ars_state);
 }
 
 static void nfit_test0_setup(struct nfit_test *t)
@@ -1157,6 +1221,8 @@ static void nfit_test0_setup(struct nfit_test *t)
 		flush->hint_address[0] = t->flush_dma[4];
 	}
 
+	post_ars_status(&t->ars_state, t->spa_set_dma[0], SPA0_SIZE);
+
 	acpi_desc = &t->acpi_desc;
 	set_bit(ND_CMD_GET_CONFIG_SIZE, &acpi_desc->dimm_dsm_force_en);
 	set_bit(ND_CMD_GET_CONFIG_DATA, &acpi_desc->dimm_dsm_force_en);
@@ -1218,6 +1284,8 @@ static void nfit_test1_setup(struct nfit_test *t)
 	dcr->code = NFIT_FIC_BYTE;
 	dcr->windows = 0;
 
+	post_ars_status(&t->ars_state, t->spa_set_dma[0], SPA2_SIZE);
+
 	acpi_desc = &t->acpi_desc;
 	set_bit(ND_CMD_ARS_CAP, &acpi_desc->bus_dsm_force_en);
 	set_bit(ND_CMD_ARS_START, &acpi_desc->bus_dsm_force_en);
-- 
cgit v0.10.2


From 4dc0e7be884e0f99211107dd75e6e1884b7b3754 Mon Sep 17 00:00:00 2001
From: Jerry Hoemann <jerry.hoemann@hpe.com>
Date: Wed, 6 Jan 2016 16:03:39 -0700
Subject: libnvdimm: Clean-up access mode check.

Change nd_ioctl and nvdimm_ioctl access mode check to use O_RDONLY.

Signed-off-by: Jerry Hoemann <jerry.hoemann@hpe.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>

diff --git a/drivers/nvdimm/bus.c b/drivers/nvdimm/bus.c
index 228c0e9..f85227a 100644
--- a/drivers/nvdimm/bus.c
+++ b/drivers/nvdimm/bus.c
@@ -636,14 +636,14 @@ static int __nd_ioctl(struct nvdimm_bus *nvdimm_bus, struct nvdimm *nvdimm,
 static long nd_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 {
 	long id = (long) file->private_data;
-	int rc = -ENXIO, read_only;
+	int rc = -ENXIO, ro;
 	struct nvdimm_bus *nvdimm_bus;
 
-	read_only = (O_RDWR != (file->f_flags & O_ACCMODE));
+	ro = ((file->f_flags & O_ACCMODE) == O_RDONLY);
 	mutex_lock(&nvdimm_bus_list_mutex);
 	list_for_each_entry(nvdimm_bus, &nvdimm_bus_list, list) {
 		if (nvdimm_bus->id == id) {
-			rc = __nd_ioctl(nvdimm_bus, NULL, read_only, cmd, arg);
+			rc = __nd_ioctl(nvdimm_bus, NULL, ro, cmd, arg);
 			break;
 		}
 	}
@@ -667,10 +667,10 @@ static int match_dimm(struct device *dev, void *data)
 
 static long nvdimm_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 {
-	int rc = -ENXIO, read_only;
+	int rc = -ENXIO, ro;
 	struct nvdimm_bus *nvdimm_bus;
 
-	read_only = (O_RDWR != (file->f_flags & O_ACCMODE));
+	ro = ((file->f_flags & O_ACCMODE) == O_RDONLY);
 	mutex_lock(&nvdimm_bus_list_mutex);
 	list_for_each_entry(nvdimm_bus, &nvdimm_bus_list, list) {
 		struct device *dev = device_find_child(&nvdimm_bus->dev,
@@ -681,7 +681,7 @@ static long nvdimm_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 			continue;
 
 		nvdimm = to_nvdimm(dev);
-		rc = __nd_ioctl(nvdimm_bus, nvdimm, read_only, cmd, arg);
+		rc = __nd_ioctl(nvdimm_bus, nvdimm, ro, cmd, arg);
 		put_device(dev);
 		break;
 	}
-- 
cgit v0.10.2


From 07accfa9d1a8bac8262f6d24a94a54d2d1f35149 Mon Sep 17 00:00:00 2001
From: Jerry Hoemann <jerry.hoemann@hpe.com>
Date: Wed, 6 Jan 2016 16:03:41 -0700
Subject: libnvdimm: Fix security issue with DSM IOCTL.

Code attempts to prevent certain IOCTL DSM from being called
when device is opened read only.  This security feature can
be trivially overcome by changing the size portion of the
ioctl_command which isn't used.

Check only the _IOC_NR (i.e. the command).

Cc: <stable@vger.kernel.org>
Signed-off-by: Jerry Hoemann <jerry.hoemann@hpe.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>

diff --git a/drivers/nvdimm/bus.c b/drivers/nvdimm/bus.c
index f85227a..2e9ac22 100644
--- a/drivers/nvdimm/bus.c
+++ b/drivers/nvdimm/bus.c
@@ -547,10 +547,10 @@ static int __nd_ioctl(struct nvdimm_bus *nvdimm_bus, struct nvdimm *nvdimm,
 
 	/* fail write commands (when read-only) */
 	if (read_only)
-		switch (ioctl_cmd) {
-		case ND_IOCTL_VENDOR:
-		case ND_IOCTL_SET_CONFIG_DATA:
-		case ND_IOCTL_ARS_START:
+		switch (cmd) {
+		case ND_CMD_VENDOR:
+		case ND_CMD_SET_CONFIG_DATA:
+		case ND_CMD_ARS_START:
 			dev_dbg(&nvdimm_bus->dev, "'%s' command while read-only.\n",
 					nvdimm ? nvdimm_cmd_name(cmd)
 					: nvdimm_bus_cmd_name(cmd));
-- 
cgit v0.10.2


From d9cbe09d39aa13f6924dc5fb88325de7ef01a72e Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Thu, 3 Mar 2016 09:14:36 -0800
Subject: libnvdimm, pmem: fix 'pfn' support for section-misaligned namespaces

The altmap for a section-misaligned namespace needs to arrange for the
base_pfn to be section-aligned.  As a result the 'reserve' region (pfns
from base that do not have a struct page) must be increased.  Otherwise
we trip the altmap validation check in __add_pages:

	if (altmap->base_pfn != phys_start_pfn
			|| vmem_altmap_offset(altmap) > nr_pages) {
		pr_warn_once("memory add fail, invalid altmap\n");
		return -EINVAL;
	}

Signed-off-by: Dan Williams <dan.j.williams@intel.com>

diff --git a/drivers/nvdimm/pfn.h b/drivers/nvdimm/pfn.h
index cc24375..6ee707e 100644
--- a/drivers/nvdimm/pfn.h
+++ b/drivers/nvdimm/pfn.h
@@ -15,6 +15,7 @@
 #define __NVDIMM_PFN_H
 
 #include <linux/types.h>
+#include <linux/mmzone.h>
 
 #define PFN_SIG_LEN 16
 #define PFN_SIG "NVDIMM_PFN_INFO\0"
@@ -32,4 +33,16 @@ struct nd_pfn_sb {
 	u8 padding[4012];
 	__le64 checksum;
 };
+
+#ifdef CONFIG_SPARSEMEM
+#define PFN_SECTION_ALIGN_DOWN(x) SECTION_ALIGN_DOWN(x)
+#define PFN_SECTION_ALIGN_UP(x) SECTION_ALIGN_UP(x)
+#else
+/*
+ * In this case ZONE_DEVICE=n and we will disable 'pfn' device support,
+ * but we still want pmem to compile.
+ */
+#define PFN_SECTION_ALIGN_DOWN(x) (x)
+#define PFN_SECTION_ALIGN_UP(x) (x)
+#endif
 #endif /* __NVDIMM_PFN_H */
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index 8d0b546..59d568a 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -356,6 +356,26 @@ static int nvdimm_namespace_detach_pfn(struct nd_namespace_common *ndns)
 	return 0;
 }
 
+/*
+ * We hotplug memory at section granularity, pad the reserved area from
+ * the previous section base to the namespace base address.
+ */
+static unsigned long init_altmap_base(resource_size_t base)
+{
+	unsigned long base_pfn = __phys_to_pfn(base);
+
+	return PFN_SECTION_ALIGN_DOWN(base_pfn);
+}
+
+static unsigned long init_altmap_reserve(resource_size_t base)
+{
+	unsigned long reserve = __phys_to_pfn(SZ_8K);
+	unsigned long base_pfn = __phys_to_pfn(base);
+
+	reserve += base_pfn - PFN_SECTION_ALIGN_DOWN(base_pfn);
+	return reserve;
+}
+
 static int nvdimm_namespace_attach_pfn(struct nd_namespace_common *ndns)
 {
 	struct nd_namespace_io *nsio = to_nd_namespace_io(&ndns->dev);
@@ -369,8 +389,8 @@ static int nvdimm_namespace_attach_pfn(struct nd_namespace_common *ndns)
 	phys_addr_t offset;
 	int rc;
 	struct vmem_altmap __altmap = {
-		.base_pfn = __phys_to_pfn(nsio->res.start),
-		.reserve = __phys_to_pfn(SZ_8K),
+		.base_pfn = init_altmap_base(nsio->res.start),
+		.reserve = init_altmap_reserve(nsio->res.start),
 	};
 
 	if (!nd_pfn->uuid || !nd_pfn->ndns)
-- 
cgit v0.10.2


From cfe30b872058f211630eda7f65fb19d83beaaa3c Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Thu, 3 Mar 2016 09:38:00 -0800
Subject: libnvdimm, pmem: adjust for section collisions with 'System RAM'

On a platform where 'Persistent Memory' and 'System RAM' are mixed
within a given sparsemem section, trim the namespace and notify about the
sub-optimal alignment.

Cc: Toshi Kani <toshi.kani@hpe.com>
Cc: Ross Zwisler <ross.zwisler@linux.intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>

diff --git a/drivers/nvdimm/namespace_devs.c b/drivers/nvdimm/namespace_devs.c
index 9edf7eb..f5cb886 100644
--- a/drivers/nvdimm/namespace_devs.c
+++ b/drivers/nvdimm/namespace_devs.c
@@ -133,6 +133,7 @@ bool nd_is_uuid_unique(struct device *dev, u8 *uuid)
 bool pmem_should_map_pages(struct device *dev)
 {
 	struct nd_region *nd_region = to_nd_region(dev->parent);
+	struct nd_namespace_io *nsio;
 
 	if (!IS_ENABLED(CONFIG_ZONE_DEVICE))
 		return false;
@@ -143,6 +144,12 @@ bool pmem_should_map_pages(struct device *dev)
 	if (is_nd_pfn(dev) || is_nd_btt(dev))
 		return false;
 
+	nsio = to_nd_namespace_io(dev);
+	if (region_intersects(nsio->res.start, resource_size(&nsio->res),
+				IORESOURCE_SYSTEM_RAM,
+				IORES_DESC_NONE) == REGION_MIXED)
+		return false;
+
 #ifdef ARCH_MEMREMAP_PMEM
 	return ARCH_MEMREMAP_PMEM == MEMREMAP_WB;
 #else
diff --git a/drivers/nvdimm/pfn.h b/drivers/nvdimm/pfn.h
index 6ee707e..8e343a3 100644
--- a/drivers/nvdimm/pfn.h
+++ b/drivers/nvdimm/pfn.h
@@ -27,10 +27,13 @@ struct nd_pfn_sb {
 	__le32 flags;
 	__le16 version_major;
 	__le16 version_minor;
-	__le64 dataoff;
+	__le64 dataoff; /* relative to namespace_base + start_pad */
 	__le64 npfns;
 	__le32 mode;
-	u8 padding[4012];
+	/* minor-version-1 additions for section alignment */
+	__le32 start_pad;
+	__le32 end_trunc;
+	u8 padding[4004];
 	__le64 checksum;
 };
 
@@ -45,4 +48,7 @@ struct nd_pfn_sb {
 #define PFN_SECTION_ALIGN_DOWN(x) (x)
 #define PFN_SECTION_ALIGN_UP(x) (x)
 #endif
+
+#define PHYS_SECTION_ALIGN_DOWN(x) PFN_PHYS(PFN_SECTION_ALIGN_DOWN(PHYS_PFN(x)))
+#define PHYS_SECTION_ALIGN_UP(x) PFN_PHYS(PFN_SECTION_ALIGN_UP(PHYS_PFN(x)))
 #endif /* __NVDIMM_PFN_H */
diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c
index ae81a2f..75a31a7 100644
--- a/drivers/nvdimm/pfn_devs.c
+++ b/drivers/nvdimm/pfn_devs.c
@@ -299,6 +299,11 @@ int nd_pfn_validate(struct nd_pfn *nd_pfn)
 	if (memcmp(pfn_sb->parent_uuid, parent_uuid, 16) != 0)
 		return -ENODEV;
 
+	if (__le16_to_cpu(pfn_sb->version_minor) < 1) {
+		pfn_sb->start_pad = 0;
+		pfn_sb->end_trunc = 0;
+	}
+
 	switch (le32_to_cpu(pfn_sb->mode)) {
 	case PFN_MODE_RAM:
 	case PFN_MODE_PMEM:
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index 59d568a..0cb450e 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -43,7 +43,10 @@ struct pmem_device {
 	phys_addr_t		data_offset;
 	u64			pfn_flags;
 	void __pmem		*virt_addr;
+	/* immutable base size of the namespace */
 	size_t			size;
+	/* trim size when namespace capacity has been section aligned */
+	u32			pfn_pad;
 	struct badblocks	bb;
 };
 
@@ -145,7 +148,7 @@ static long pmem_direct_access(struct block_device *bdev, sector_t sector,
 	*kaddr = pmem->virt_addr + offset;
 	*pfn = phys_to_pfn_t(pmem->phys_addr + offset, pmem->pfn_flags);
 
-	return pmem->size - offset;
+	return pmem->size - pmem->pfn_pad - offset;
 }
 
 static const struct block_device_operations pmem_fops = {
@@ -236,7 +239,8 @@ static int pmem_attach_disk(struct device *dev,
 	disk->flags		= GENHD_FL_EXT_DEVT;
 	nvdimm_namespace_disk_name(ndns, disk->disk_name);
 	disk->driverfs_dev = dev;
-	set_capacity(disk, (pmem->size - pmem->data_offset) / 512);
+	set_capacity(disk, (pmem->size - pmem->pfn_pad - pmem->data_offset)
+			/ 512);
 	pmem->pmem_disk = disk;
 	devm_exit_badblocks(dev, &pmem->bb);
 	if (devm_init_badblocks(dev, &pmem->bb))
@@ -279,6 +283,9 @@ static int nd_pfn_init(struct nd_pfn *nd_pfn)
 	struct nd_pfn_sb *pfn_sb = kzalloc(sizeof(*pfn_sb), GFP_KERNEL);
 	struct pmem_device *pmem = dev_get_drvdata(&nd_pfn->dev);
 	struct nd_namespace_common *ndns = nd_pfn->ndns;
+	u32 start_pad = 0, end_trunc = 0;
+	resource_size_t start, size;
+	struct nd_namespace_io *nsio;
 	struct nd_region *nd_region;
 	unsigned long npfns;
 	phys_addr_t offset;
@@ -304,21 +311,56 @@ static int nd_pfn_init(struct nd_pfn *nd_pfn)
 	}
 
 	memset(pfn_sb, 0, sizeof(*pfn_sb));
-	npfns = (pmem->size - SZ_8K) / SZ_4K;
+
+	/*
+	 * Check if pmem collides with 'System RAM' when section aligned and
+	 * trim it accordingly
+	 */
+	nsio = to_nd_namespace_io(&ndns->dev);
+	start = PHYS_SECTION_ALIGN_DOWN(nsio->res.start);
+	size = resource_size(&nsio->res);
+	if (region_intersects(start, size, IORESOURCE_SYSTEM_RAM,
+				IORES_DESC_NONE) == REGION_MIXED) {
+
+		start = nsio->res.start;
+		start_pad = PHYS_SECTION_ALIGN_UP(start) - start;
+	}
+
+	start = nsio->res.start;
+	size = PHYS_SECTION_ALIGN_UP(start + size) - start;
+	if (region_intersects(start, size, IORESOURCE_SYSTEM_RAM,
+				IORES_DESC_NONE) == REGION_MIXED) {
+		size = resource_size(&nsio->res);
+		end_trunc = start + size - PHYS_SECTION_ALIGN_DOWN(start + size);
+	}
+
+	if (start_pad + end_trunc)
+		dev_info(&nd_pfn->dev, "%s section collision, truncate %d bytes\n",
+				dev_name(&ndns->dev), start_pad + end_trunc);
+
 	/*
 	 * Note, we use 64 here for the standard size of struct page,
 	 * debugging options may cause it to be larger in which case the
 	 * implementation will limit the pfns advertised through
 	 * ->direct_access() to those that are included in the memmap.
 	 */
+	start += start_pad;
+	npfns = (pmem->size - start_pad - end_trunc - SZ_8K) / SZ_4K;
 	if (nd_pfn->mode == PFN_MODE_PMEM)
-		offset = ALIGN(SZ_8K + 64 * npfns, nd_pfn->align);
+		offset = ALIGN(start + SZ_8K + 64 * npfns, nd_pfn->align)
+			- start;
 	else if (nd_pfn->mode == PFN_MODE_RAM)
-		offset = ALIGN(SZ_8K, nd_pfn->align);
+		offset = ALIGN(start + SZ_8K, nd_pfn->align) - start;
 	else
 		goto err;
 
-	npfns = (pmem->size - offset) / SZ_4K;
+	if (offset + start_pad + end_trunc >= pmem->size) {
+		dev_err(&nd_pfn->dev, "%s unable to satisfy requested alignment\n",
+				dev_name(&ndns->dev));
+		goto err;
+	}
+
+	npfns = (pmem->size - offset - start_pad - end_trunc) / SZ_4K;
 	pfn_sb->mode = cpu_to_le32(nd_pfn->mode);
 	pfn_sb->dataoff = cpu_to_le64(offset);
 	pfn_sb->npfns = cpu_to_le64(npfns);
@@ -326,6 +368,9 @@ static int nd_pfn_init(struct nd_pfn *nd_pfn)
 	memcpy(pfn_sb->uuid, nd_pfn->uuid, 16);
 	memcpy(pfn_sb->parent_uuid, nd_dev_to_uuid(&ndns->dev), 16);
 	pfn_sb->version_major = cpu_to_le16(1);
+	pfn_sb->version_minor = cpu_to_le16(1);
+	pfn_sb->start_pad = cpu_to_le32(start_pad);
+	pfn_sb->end_trunc = cpu_to_le32(end_trunc);
 	checksum = nd_sb_checksum((struct nd_gen_sb *) pfn_sb);
 	pfn_sb->checksum = cpu_to_le64(checksum);
 
@@ -376,41 +421,36 @@ static unsigned long init_altmap_reserve(resource_size_t base)
 	return reserve;
 }
 
-static int nvdimm_namespace_attach_pfn(struct nd_namespace_common *ndns)
+static int __nvdimm_namespace_attach_pfn(struct nd_pfn *nd_pfn)
 {
-	struct nd_namespace_io *nsio = to_nd_namespace_io(&ndns->dev);
-	struct nd_pfn *nd_pfn = to_nd_pfn(ndns->claim);
-	struct device *dev = &nd_pfn->dev;
-	struct nd_region *nd_region;
-	struct vmem_altmap *altmap;
-	struct nd_pfn_sb *pfn_sb;
-	struct pmem_device *pmem;
-	struct request_queue *q;
-	phys_addr_t offset;
 	int rc;
+	struct resource res;
+	struct request_queue *q;
+	struct pmem_device *pmem;
+	struct vmem_altmap *altmap;
+	struct device *dev = &nd_pfn->dev;
+	struct nd_pfn_sb *pfn_sb = nd_pfn->pfn_sb;
+	struct nd_namespace_common *ndns = nd_pfn->ndns;
+	u32 start_pad = __le32_to_cpu(pfn_sb->start_pad);
+	u32 end_trunc = __le32_to_cpu(pfn_sb->end_trunc);
+	struct nd_namespace_io *nsio = to_nd_namespace_io(&ndns->dev);
+	resource_size_t base = nsio->res.start + start_pad;
 	struct vmem_altmap __altmap = {
-		.base_pfn = init_altmap_base(nsio->res.start),
-		.reserve = init_altmap_reserve(nsio->res.start),
+		.base_pfn = init_altmap_base(base),
+		.reserve = init_altmap_reserve(base),
 	};
 
-	if (!nd_pfn->uuid || !nd_pfn->ndns)
-		return -ENODEV;
-
-	nd_region = to_nd_region(dev->parent);
-	rc = nd_pfn_init(nd_pfn);
-	if (rc)
-		return rc;
-
-	pfn_sb = nd_pfn->pfn_sb;
-	offset = le64_to_cpu(pfn_sb->dataoff);
+	pmem = dev_get_drvdata(dev);
+	pmem->data_offset = le64_to_cpu(pfn_sb->dataoff);
+	pmem->pfn_pad = start_pad + end_trunc;
 	nd_pfn->mode = le32_to_cpu(nd_pfn->pfn_sb->mode);
 	if (nd_pfn->mode == PFN_MODE_RAM) {
-		if (offset < SZ_8K)
+		if (pmem->data_offset < SZ_8K)
 			return -EINVAL;
 		nd_pfn->npfns = le64_to_cpu(pfn_sb->npfns);
 		altmap = NULL;
 	} else if (nd_pfn->mode == PFN_MODE_PMEM) {
-		nd_pfn->npfns = (resource_size(&nsio->res) - offset)
+		nd_pfn->npfns = (pmem->size - pmem->pfn_pad - pmem->data_offset)
 			/ PAGE_SIZE;
 		if (le64_to_cpu(nd_pfn->pfn_sb->npfns) > nd_pfn->npfns)
 			dev_info(&nd_pfn->dev,
@@ -418,7 +458,7 @@ static int nvdimm_namespace_attach_pfn(struct nd_namespace_common *ndns)
 					le64_to_cpu(nd_pfn->pfn_sb->npfns),
 					nd_pfn->npfns);
 		altmap = & __altmap;
-		altmap->free = __phys_to_pfn(offset - SZ_8K);
+		altmap->free = __phys_to_pfn(pmem->data_offset - SZ_8K);
 		altmap->alloc = 0;
 	} else {
 		rc = -ENXIO;
@@ -426,10 +466,12 @@ static int nvdimm_namespace_attach_pfn(struct nd_namespace_common *ndns)
 	}
 
 	/* establish pfn range for lookup, and switch to direct map */
-	pmem = dev_get_drvdata(dev);
 	q = pmem->pmem_queue;
+	memcpy(&res, &nsio->res, sizeof(res));
+	res.start += start_pad;
+	res.end -= end_trunc;
 	devm_memunmap(dev, (void __force *) pmem->virt_addr);
-	pmem->virt_addr = (void __pmem *) devm_memremap_pages(dev, &nsio->res,
+	pmem->virt_addr = (void __pmem *) devm_memremap_pages(dev, &res,
 			&q->q_usage_counter, altmap);
 	pmem->pfn_flags |= PFN_MAP;
 	if (IS_ERR(pmem->virt_addr)) {
@@ -438,7 +480,6 @@ static int nvdimm_namespace_attach_pfn(struct nd_namespace_common *ndns)
 	}
 
 	/* attach pmem disk in "pfn-mode" */
-	pmem->data_offset = offset;
 	rc = pmem_attach_disk(dev, ndns, pmem);
 	if (rc)
 		goto err;
@@ -447,6 +488,22 @@ static int nvdimm_namespace_attach_pfn(struct nd_namespace_common *ndns)
  err:
 	nvdimm_namespace_detach_pfn(ndns);
 	return rc;
+
+}
+
+static int nvdimm_namespace_attach_pfn(struct nd_namespace_common *ndns)
+{
+	struct nd_pfn *nd_pfn = to_nd_pfn(ndns->claim);
+	int rc;
+
+	if (!nd_pfn->uuid || !nd_pfn->ndns)
+		return -ENODEV;
+
+	rc = nd_pfn_init(nd_pfn);
+	if (rc)
+		return rc;
+	/* we need a valid pfn_sb before we can init a vmem_altmap */
+	return __nvdimm_namespace_attach_pfn(nd_pfn);
 }
 
 static int nd_pmem_probe(struct device *dev)
-- 
cgit v0.10.2


From f6ed58c70d14572d0272ee129579dbfc97b97f50 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Thu, 3 Mar 2016 09:46:04 -0800
Subject: libnvdimm, pfn: 'resource'-address and 'size' attributes for pfn
 devices

Currenty with a raw mode pmem namespace the physical memory address range for
the device can be obtained via /sys/block/pmemX/device/{resource|size}.  Add
similar attributes for pfn instances that takes the struct page memmap and
section padding into account.

Reported-by: Haozhong Zhang <haozhong.zhang@intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>

diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c
index 75a31a7..254d3bc 100644
--- a/drivers/nvdimm/pfn_devs.c
+++ b/drivers/nvdimm/pfn_devs.c
@@ -205,11 +205,67 @@ static ssize_t namespace_store(struct device *dev,
 }
 static DEVICE_ATTR_RW(namespace);
 
+static ssize_t resource_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct nd_pfn *nd_pfn = to_nd_pfn(dev);
+	ssize_t rc;
+
+	device_lock(dev);
+	if (dev->driver) {
+		struct nd_pfn_sb *pfn_sb = nd_pfn->pfn_sb;
+		u64 offset = __le64_to_cpu(pfn_sb->dataoff);
+		struct nd_namespace_common *ndns = nd_pfn->ndns;
+		u32 start_pad = __le32_to_cpu(pfn_sb->start_pad);
+		struct nd_namespace_io *nsio = to_nd_namespace_io(&ndns->dev);
+
+		rc = sprintf(buf, "%#llx\n", (unsigned long long) nsio->res.start
+				+ start_pad + offset);
+	} else {
+		/* no address to convey if the pfn instance is disabled */
+		rc = -ENXIO;
+	}
+	device_unlock(dev);
+
+	return rc;
+}
+static DEVICE_ATTR_RO(resource);
+
+static ssize_t size_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct nd_pfn *nd_pfn = to_nd_pfn(dev);
+	ssize_t rc;
+
+	device_lock(dev);
+	if (dev->driver) {
+		struct nd_pfn_sb *pfn_sb = nd_pfn->pfn_sb;
+		u64 offset = __le64_to_cpu(pfn_sb->dataoff);
+		struct nd_namespace_common *ndns = nd_pfn->ndns;
+		u32 start_pad = __le32_to_cpu(pfn_sb->start_pad);
+		u32 end_trunc = __le32_to_cpu(pfn_sb->end_trunc);
+		struct nd_namespace_io *nsio = to_nd_namespace_io(&ndns->dev);
+
+		rc = sprintf(buf, "%llu\n", (unsigned long long)
+				resource_size(&nsio->res) - start_pad
+				- end_trunc - offset);
+	} else {
+		/* no size to convey if the pfn instance is disabled */
+		rc = -ENXIO;
+	}
+	device_unlock(dev);
+
+	return rc;
+}
+static DEVICE_ATTR_RO(size);
+
 static struct attribute *nd_pfn_attributes[] = {
 	&dev_attr_mode.attr,
 	&dev_attr_namespace.attr,
 	&dev_attr_uuid.attr,
 	&dev_attr_align.attr,
+	&dev_attr_resource.attr,
+	&dev_attr_size.attr,
 	NULL,
 };
 
-- 
cgit v0.10.2


From d4f323672aa63713b7ca26da418f66cc30d3a41a Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Thu, 3 Mar 2016 16:08:54 -0800
Subject: nfit, libnvdimm: clear poison command support

Add the boiler-plate for a 'clear error' command based on section
9.20.7.6 "Function Index 4 - Clear Uncorrectable Error" from the ACPI
6.1 specification, and add a reference implementation in nfit_test.

Reviewed-by: Vishal Verma <vishal.l.verma@intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>

diff --git a/drivers/acpi/nfit.c b/drivers/acpi/nfit.c
index 0def4eb..c067d74 100644
--- a/drivers/acpi/nfit.c
+++ b/drivers/acpi/nfit.c
@@ -87,6 +87,7 @@ static struct acpi_device *to_acpi_dev(struct acpi_nfit_desc *acpi_desc)
 
 static int xlat_status(void *buf, unsigned int cmd)
 {
+	struct nd_cmd_clear_error *clear_err;
 	struct nd_cmd_ars_status *ars_status;
 	struct nd_cmd_ars_start *ars_start;
 	struct nd_cmd_ars_cap *ars_cap;
@@ -149,6 +150,15 @@ static int xlat_status(void *buf, unsigned int cmd)
 		if (ars_status->status >> 16)
 			return -EIO;
 		break;
+	case ND_CMD_CLEAR_ERROR:
+		clear_err = buf;
+		if (clear_err->status & 0xffff)
+			return -EIO;
+		if (!clear_err->cleared)
+			return -EIO;
+		if (clear_err->length > clear_err->cleared)
+			return clear_err->cleared;
+		break;
 	default:
 		break;
 	}
@@ -1002,7 +1012,7 @@ static void acpi_nfit_init_dsms(struct acpi_nfit_desc *acpi_desc)
 	if (!adev)
 		return;
 
-	for (i = ND_CMD_ARS_CAP; i <= ND_CMD_ARS_STATUS; i++)
+	for (i = ND_CMD_ARS_CAP; i <= ND_CMD_CLEAR_ERROR; i++)
 		if (acpi_check_dsm(adev->handle, uuid, 1, 1ULL << i))
 			set_bit(i, &nd_desc->dsm_mask);
 }
diff --git a/drivers/nvdimm/bus.c b/drivers/nvdimm/bus.c
index 2e9ac22..cb6fd64 100644
--- a/drivers/nvdimm/bus.c
+++ b/drivers/nvdimm/bus.c
@@ -421,6 +421,12 @@ static const struct nd_cmd_desc __nd_cmd_bus_descs[] = {
 		.out_num = 3,
 		.out_sizes = { 4, 4, UINT_MAX, },
 	},
+	[ND_CMD_CLEAR_ERROR] = {
+		.in_num = 2,
+		.in_sizes = { 8, 8, },
+		.out_num = 3,
+		.out_sizes = { 4, 4, 8, },
+	},
 };
 
 const struct nd_cmd_desc *nd_cmd_bus_desc(int cmd)
@@ -489,6 +495,13 @@ void wait_nvdimm_bus_probe_idle(struct device *dev)
 	} while (true);
 }
 
+static int pmem_active(struct device *dev, void *data)
+{
+	if (is_nd_pmem(dev) && dev->driver)
+		return -EBUSY;
+	return 0;
+}
+
 /* set_config requires an idle interleave set */
 static int nd_cmd_clear_to_send(struct nvdimm_bus *nvdimm_bus,
 		struct nvdimm *nvdimm, unsigned int cmd)
@@ -503,6 +516,11 @@ static int nd_cmd_clear_to_send(struct nvdimm_bus *nvdimm_bus,
 			return rc;
 	}
 
+	/* require clear error to go through the pmem driver */
+	if (!nvdimm && cmd == ND_CMD_CLEAR_ERROR)
+		return device_for_each_child(&nvdimm_bus->dev, NULL,
+				pmem_active);
+
 	if (!nvdimm || cmd != ND_CMD_SET_CONFIG_DATA)
 		return 0;
 
@@ -551,6 +569,7 @@ static int __nd_ioctl(struct nvdimm_bus *nvdimm_bus, struct nvdimm *nvdimm,
 		case ND_CMD_VENDOR:
 		case ND_CMD_SET_CONFIG_DATA:
 		case ND_CMD_ARS_START:
+		case ND_CMD_CLEAR_ERROR:
 			dev_dbg(&nvdimm_bus->dev, "'%s' command while read-only.\n",
 					nvdimm ? nvdimm_cmd_name(cmd)
 					: nvdimm_bus_cmd_name(cmd));
diff --git a/include/uapi/linux/ndctl.h b/include/uapi/linux/ndctl.h
index cc68b921..7cc28ab 100644
--- a/include/uapi/linux/ndctl.h
+++ b/include/uapi/linux/ndctl.h
@@ -98,6 +98,14 @@ struct nd_cmd_ars_status {
 	} __packed records[0];
 } __packed;
 
+struct nd_cmd_clear_error {
+	__u64 address;
+	__u64 length;
+	__u32 status;
+	__u8 reserved[4];
+	__u64 cleared;
+} __packed;
+
 enum {
 	ND_CMD_IMPLEMENTED = 0,
 
@@ -105,6 +113,7 @@ enum {
 	ND_CMD_ARS_CAP = 1,
 	ND_CMD_ARS_START = 2,
 	ND_CMD_ARS_STATUS = 3,
+	ND_CMD_CLEAR_ERROR = 4,
 
 	/* per-dimm commands */
 	ND_CMD_SMART = 1,
@@ -129,6 +138,7 @@ static inline const char *nvdimm_bus_cmd_name(unsigned cmd)
 		[ND_CMD_ARS_CAP] = "ars_cap",
 		[ND_CMD_ARS_START] = "ars_start",
 		[ND_CMD_ARS_STATUS] = "ars_status",
+		[ND_CMD_CLEAR_ERROR] = "clear_error",
 	};
 
 	if (cmd < ARRAY_SIZE(names) && names[cmd])
@@ -187,6 +197,9 @@ static inline const char *nvdimm_cmd_name(unsigned cmd)
 #define ND_IOCTL_ARS_STATUS		_IOWR(ND_IOCTL, ND_CMD_ARS_STATUS,\
 					struct nd_cmd_ars_status)
 
+#define ND_IOCTL_CLEAR_ERROR		_IOWR(ND_IOCTL, ND_CMD_CLEAR_ERROR,\
+					struct nd_cmd_clear_error)
+
 #define ND_DEVICE_DIMM 1            /* nd_dimm: container for "config data" */
 #define ND_DEVICE_REGION_PMEM 2     /* nd_region: (parent of PMEM namespaces) */
 #define ND_DEVICE_REGION_BLK 3      /* nd_region: (parent of BLK namespaces) */
diff --git a/tools/testing/nvdimm/test/nfit.c b/tools/testing/nvdimm/test/nfit.c
index 1555c09..3187322 100644
--- a/tools/testing/nvdimm/test/nfit.c
+++ b/tools/testing/nvdimm/test/nfit.c
@@ -223,6 +223,7 @@ static int nfit_test_cmd_set_config_data(struct nd_cmd_set_config_hdr *nd_cmd,
 }
 
 #define NFIT_TEST_ARS_RECORDS 4
+#define NFIT_TEST_CLEAR_ERR_UNIT 256
 
 static int nfit_test_cmd_ars_cap(struct nd_cmd_ars_cap *nd_cmd,
 		unsigned int buf_len)
@@ -233,6 +234,7 @@ static int nfit_test_cmd_ars_cap(struct nd_cmd_ars_cap *nd_cmd,
 	nd_cmd->max_ars_out = sizeof(struct nd_cmd_ars_status)
 		+ NFIT_TEST_ARS_RECORDS * sizeof(struct nd_ars_record);
 	nd_cmd->status = (ND_ARS_PERSISTENT | ND_ARS_VOLATILE) << 16;
+	nd_cmd->clear_err_unit = NFIT_TEST_CLEAR_ERR_UNIT;
 
 	return 0;
 }
@@ -306,6 +308,28 @@ static int nfit_test_cmd_ars_status(struct ars_state *ars_state,
 	return 0;
 }
 
+static int nfit_test_cmd_clear_error(struct nd_cmd_clear_error *clear_err,
+		unsigned int buf_len, int *cmd_rc)
+{
+	const u64 mask = NFIT_TEST_CLEAR_ERR_UNIT - 1;
+	if (buf_len < sizeof(*clear_err))
+		return -EINVAL;
+
+	if ((clear_err->address & mask) || (clear_err->length & mask))
+		return -EINVAL;
+
+	/*
+	 * Report 'all clear' success for all commands even though a new
+	 * scrub will find errors again.  This is enough to have the
+	 * error removed from the 'badblocks' tracking in the pmem
+	 * driver.
+	 */
+	clear_err->status = 0;
+	clear_err->cleared = clear_err->length;
+	*cmd_rc = 0;
+	return 0;
+}
+
 static int nfit_test_ctl(struct nvdimm_bus_descriptor *nd_desc,
 		struct nvdimm *nvdimm, unsigned int cmd, void *buf,
 		unsigned int buf_len, int *cmd_rc)
@@ -365,6 +389,9 @@ static int nfit_test_ctl(struct nvdimm_bus_descriptor *nd_desc,
 			rc = nfit_test_cmd_ars_status(ars_state, buf, buf_len,
 					cmd_rc);
 			break;
+		case ND_CMD_CLEAR_ERROR:
+			rc = nfit_test_cmd_clear_error(buf, buf_len, cmd_rc);
+			break;
 		default:
 			return -ENOTTY;
 		}
@@ -1230,6 +1257,7 @@ static void nfit_test0_setup(struct nfit_test *t)
 	set_bit(ND_CMD_ARS_CAP, &acpi_desc->bus_dsm_force_en);
 	set_bit(ND_CMD_ARS_START, &acpi_desc->bus_dsm_force_en);
 	set_bit(ND_CMD_ARS_STATUS, &acpi_desc->bus_dsm_force_en);
+	set_bit(ND_CMD_CLEAR_ERROR, &acpi_desc->bus_dsm_force_en);
 }
 
 static void nfit_test1_setup(struct nfit_test *t)
@@ -1290,6 +1318,7 @@ static void nfit_test1_setup(struct nfit_test *t)
 	set_bit(ND_CMD_ARS_CAP, &acpi_desc->bus_dsm_force_en);
 	set_bit(ND_CMD_ARS_START, &acpi_desc->bus_dsm_force_en);
 	set_bit(ND_CMD_ARS_STATUS, &acpi_desc->bus_dsm_force_en);
+	set_bit(ND_CMD_CLEAR_ERROR, &acpi_desc->bus_dsm_force_en);
 }
 
 static int nfit_test_blk_do_io(struct nd_blk_region *ndbr, resource_size_t dpa,
-- 
cgit v0.10.2


From 45f68802f2542a6ad1550dab9c07004de6e0df40 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Sun, 6 Mar 2016 08:04:12 -0800
Subject: libnvdimm, pmem: fix ia64 build, use PHYS_PFN

   drivers/nvdimm/pmem.c: In function 'nvdimm_namespace_attach_pfn':
   drivers/nvdimm/pmem.c:367:3: error: implicit declaration of function
   	'__phys_to_pfn' [-Werror=implicit-function-declaration]
   .base_pfn = __phys_to_pfn(nsio->res.start),

ia64 does not provide __phys_to_pfn(), just use the PHYS_PFN() alias.

Cc: Guenter Roeck <linux@roeck-us.net>
Reported-by: kbuild test robot <fengguang.wu@intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>

diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index 0cb450e..74e2569 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -407,15 +407,15 @@ static int nvdimm_namespace_detach_pfn(struct nd_namespace_common *ndns)
  */
 static unsigned long init_altmap_base(resource_size_t base)
 {
-	unsigned long base_pfn = __phys_to_pfn(base);
+	unsigned long base_pfn = PHYS_PFN(base);
 
 	return PFN_SECTION_ALIGN_DOWN(base_pfn);
 }
 
 static unsigned long init_altmap_reserve(resource_size_t base)
 {
-	unsigned long reserve = __phys_to_pfn(SZ_8K);
-	unsigned long base_pfn = __phys_to_pfn(base);
+	unsigned long reserve = PHYS_PFN(SZ_8K);
+	unsigned long base_pfn = PHYS_PFN(base);
 
 	reserve += base_pfn - PFN_SECTION_ALIGN_DOWN(base_pfn);
 	return reserve;
@@ -458,7 +458,7 @@ static int __nvdimm_namespace_attach_pfn(struct nd_pfn *nd_pfn)
 					le64_to_cpu(nd_pfn->pfn_sb->npfns),
 					nd_pfn->npfns);
 		altmap = & __altmap;
-		altmap->free = __phys_to_pfn(pmem->data_offset - SZ_8K);
+		altmap->free = PHYS_PFN(pmem->data_offset - SZ_8K);
 		altmap->alloc = 0;
 	} else {
 		rc = -ENXIO;
-- 
cgit v0.10.2


From 4e0d8f7eff3fbfa3e3ac5782669c078f590dc9e2 Mon Sep 17 00:00:00 2001
From: Toshi Kani <toshi.kani@hpe.com>
Date: Wed, 9 Mar 2016 12:47:03 -0700
Subject: resource: Change __request_region to inherit from immediate parent

__request_region() sets 'flags' of a new resource from @parent
as it inherits the parent's attribute.  When a target resource
has a conflict, this function inserts the new resource entry
under the conflicted entry by updating @parent.  In this case,
the new resource entry needs to inherit attribute from the updated
parent.  This conflict is a typical case since __request_region()
is used to allocate a new resource from a specific resource range.

For instance, request_mem_region() calls __request_region() with
@parent set to &iomem_resource, which is the root entry of the
whole iomem range.  When this request results in inserting a new
entry "DEV-A" under "BUS-1", "DEV-A" needs to inherit from the
immediate parent "BUS-1" as it holds specific attribute for the
range.

root (&iomem_resource)
 :
 + "BUS-1"
    + "DEV-A"

Change __request_region() to set 'flags' and 'desc' of a new entry
from the immediate parent.

Signed-off-by: Toshi Kani <toshi.kani@hpe.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Borislav Petkov <bp@suse.de>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>

diff --git a/kernel/resource.c b/kernel/resource.c
index 4d46605..5a56e8f 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -1085,15 +1085,16 @@ struct resource * __request_region(struct resource *parent,
 	res->name = name;
 	res->start = start;
 	res->end = start + n - 1;
-	res->flags = resource_type(parent) | resource_ext_type(parent);
-	res->flags |= IORESOURCE_BUSY | flags;
-	res->desc = IORES_DESC_NONE;
 
 	write_lock(&resource_lock);
 
 	for (;;) {
 		struct resource *conflict;
 
+		res->flags = resource_type(parent) | resource_ext_type(parent);
+		res->flags |= IORESOURCE_BUSY | flags;
+		res->desc = parent->desc;
+
 		conflict = __request_resource(parent, res);
 		if (!conflict)
 			break;
-- 
cgit v0.10.2


From ff3cc952d3f009e6c376cc40651b87187ce364a6 Mon Sep 17 00:00:00 2001
From: Toshi Kani <toshi.kani@hpe.com>
Date: Wed, 9 Mar 2016 12:47:04 -0700
Subject: resource: Add remove_resource interface

insert_resource() and insert_resource_conflict() are called
by resource producers to insert a new resource.  When there
is any conflict, they move conflicting resources down to the
children of the new resource.  There is no destructor of these
interfaces, however.

Add remove_resource(), which removes a resource previously
inserted by insert_resource() or insert_resource_conflict(),
and moves the children up to where they were before.

__release_resource() is changed to have @release_child, so
that this function can be used for remove_resource() as well.

Also add comments to clarify that these functions are intended
for producers of resources to avoid any confusion with
request/release_resource() for consumers.

Signed-off-by: Toshi Kani <toshi.kani@hpe.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Borislav Petkov <bp@suse.de>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>

diff --git a/include/linux/ioport.h b/include/linux/ioport.h
index afb4559..8017b8b 100644
--- a/include/linux/ioport.h
+++ b/include/linux/ioport.h
@@ -174,6 +174,7 @@ extern void reserve_region_with_split(struct resource *root,
 extern struct resource *insert_resource_conflict(struct resource *parent, struct resource *new);
 extern int insert_resource(struct resource *parent, struct resource *new);
 extern void insert_resource_expand_to_fit(struct resource *root, struct resource *new);
+extern int remove_resource(struct resource *old);
 extern void arch_remove_reservations(struct resource *avail);
 extern int allocate_resource(struct resource *root, struct resource *new,
 			     resource_size_t size, resource_size_t min,
diff --git a/kernel/resource.c b/kernel/resource.c
index 5a56e8f..effb6ee 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -233,9 +233,9 @@ static struct resource * __request_resource(struct resource *root, struct resour
 	}
 }
 
-static int __release_resource(struct resource *old)
+static int __release_resource(struct resource *old, bool release_child)
 {
-	struct resource *tmp, **p;
+	struct resource *tmp, **p, *chd;
 
 	p = &old->parent->child;
 	for (;;) {
@@ -243,7 +243,17 @@ static int __release_resource(struct resource *old)
 		if (!tmp)
 			break;
 		if (tmp == old) {
-			*p = tmp->sibling;
+			if (release_child || !(tmp->child)) {
+				*p = tmp->sibling;
+			} else {
+				for (chd = tmp->child;; chd = chd->sibling) {
+					chd->parent = tmp->parent;
+					if (!(chd->sibling))
+						break;
+				}
+				*p = tmp->child;
+				chd->sibling = tmp->sibling;
+			}
 			old->parent = NULL;
 			return 0;
 		}
@@ -325,7 +335,7 @@ int release_resource(struct resource *old)
 	int retval;
 
 	write_lock(&resource_lock);
-	retval = __release_resource(old);
+	retval = __release_resource(old, true);
 	write_unlock(&resource_lock);
 	return retval;
 }
@@ -679,7 +689,7 @@ static int reallocate_resource(struct resource *root, struct resource *old,
 		old->start = new.start;
 		old->end = new.end;
 	} else {
-		__release_resource(old);
+		__release_resource(old, true);
 		*old = new;
 		conflict = __request_resource(root, old);
 		BUG_ON(conflict);
@@ -825,6 +835,9 @@ static struct resource * __insert_resource(struct resource *parent, struct resou
  * entirely fit within the range of the new resource, then the new
  * resource is inserted and the conflicting resources become children of
  * the new resource.
+ *
+ * This function is intended for producers of resources, such as FW modules
+ * and bus drivers.
  */
 struct resource *insert_resource_conflict(struct resource *parent, struct resource *new)
 {
@@ -842,6 +855,9 @@ struct resource *insert_resource_conflict(struct resource *parent, struct resour
  * @new: new resource to insert
  *
  * Returns 0 on success, -EBUSY if the resource can't be inserted.
+ *
+ * This function is intended for producers of resources, such as FW modules
+ * and bus drivers.
  */
 int insert_resource(struct resource *parent, struct resource *new)
 {
@@ -885,6 +901,31 @@ void insert_resource_expand_to_fit(struct resource *root, struct resource *new)
 	write_unlock(&resource_lock);
 }
 
+/**
+ * remove_resource - Remove a resource in the resource tree
+ * @old: resource to remove
+ *
+ * Returns 0 on success, -EINVAL if the resource is not valid.
+ *
+ * This function removes a resource previously inserted by insert_resource()
+ * or insert_resource_conflict(), and moves the children (if any) up to
+ * where they were before.  insert_resource() and insert_resource_conflict()
+ * insert a new resource, and move any conflicting resources down to the
+ * children of the new resource.
+ *
+ * insert_resource(), insert_resource_conflict() and remove_resource() are
+ * intended for producers of resources, such as FW modules and bus drivers.
+ */
+int remove_resource(struct resource *old)
+{
+	int retval;
+
+	write_lock(&resource_lock);
+	retval = __release_resource(old, false);
+	write_unlock(&resource_lock);
+	return retval;
+}
+
 static int __adjust_resource(struct resource *res, resource_size_t start,
 				resource_size_t size)
 {
-- 
cgit v0.10.2


From 8095d0f225fe31eaac4a013177b77ed5283278f8 Mon Sep 17 00:00:00 2001
From: Toshi Kani <toshi.kani@hpe.com>
Date: Wed, 9 Mar 2016 12:47:05 -0700
Subject: resource: Export insert_resource and remove_resource

insert_resource() and remove_resouce() are called by producers
of resources, such as FW modules and bus drivers.  These modules
may be implemented as loadable modules.

Export insert_resource() and remove_resouce() so that they can
be called from such modules.

link: https://lkml.org/lkml/2016/3/8/872
Signed-off-by: Toshi Kani <toshi.kani@hpe.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Borislav Petkov <bp@suse.de>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>

diff --git a/kernel/resource.c b/kernel/resource.c
index effb6ee..2e78ead 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -866,6 +866,7 @@ int insert_resource(struct resource *parent, struct resource *new)
 	conflict = insert_resource_conflict(parent, new);
 	return conflict ? -EBUSY : 0;
 }
+EXPORT_SYMBOL_GPL(insert_resource);
 
 /**
  * insert_resource_expand_to_fit - Insert a resource into the resource tree
@@ -925,6 +926,7 @@ int remove_resource(struct resource *old)
 	write_unlock(&resource_lock);
 	return retval;
 }
+EXPORT_SYMBOL_GPL(remove_resource);
 
 static int __adjust_resource(struct resource *res, resource_size_t start,
 				resource_size_t size)
-- 
cgit v0.10.2


From af1996ef59dbcb36fe4878df7c717a02eb89d07a Mon Sep 17 00:00:00 2001
From: Toshi Kani <toshi.kani@hpe.com>
Date: Wed, 9 Mar 2016 12:47:06 -0700
Subject: ACPI: Change NFIT driver to insert new resource

ACPI 6 defines persistent memory (PMEM) ranges in multiple
firmware interfaces, e820, EFI, and ACPI NFIT table.  This EFI
change, however, leads to hit a bug in the grub bootloader, which
treats EFI_PERSISTENT_MEMORY type as regular memory and corrupts
stored user data [1].

Therefore, BIOS may set generic reserved type in e820 and EFI to
cover PMEM ranges.  The kernel can initialize PMEM ranges from
ACPI NFIT table alone.

This scheme causes a problem in the iomem table, though.  On x86,
for instance, e820_reserve_resources() initializes top-level entries
(iomem_resource.child) from the e820 table at early boot-time.
This creates "reserved" entry for a PMEM range, which does not allow
region_intersects() to check with PMEM type.

Change acpi_nfit_register_region() to call acpi_nfit_insert_resource(),
which calls insert_resource() to insert a PMEM entry from NFIT when
the iomem table does not have a PMEM entry already.  That is, when
a PMEM range is marked as reserved type in e820, it inserts
"Persistent Memory" entry, which results as follows.

 + "Persistent Memory"
    + "reserved"

This allows the EINJ driver, which calls region_intersects() to check
PMEM ranges, to work continuously even if BIOS sets reserved type
(or sets nothing) to PMEM ranges in e820 and EFI.

[1]: https://lists.gnu.org/archive/html/grub-devel/2015-11/msg00209.html
Signed-off-by: Toshi Kani <toshi.kani@hpe.com>
Cc: Rafael J. Wysocki <rjw@rjwysocki.net>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Borislav Petkov <bp@suse.de>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>

diff --git a/drivers/acpi/nfit.c b/drivers/acpi/nfit.c
index fb53db1..269de5f 100644
--- a/drivers/acpi/nfit.c
+++ b/drivers/acpi/nfit.c
@@ -1571,6 +1571,48 @@ static int ars_status_process_records(struct nvdimm_bus *nvdimm_bus,
 	return 0;
 }
 
+static void acpi_nfit_remove_resource(void *data)
+{
+	struct resource *res = data;
+
+	remove_resource(res);
+}
+
+static int acpi_nfit_insert_resource(struct acpi_nfit_desc *acpi_desc,
+		struct nd_region_desc *ndr_desc)
+{
+	struct resource *res, *nd_res = ndr_desc->res;
+	int is_pmem, ret;
+
+	/* No operation if the region is already registered as PMEM */
+	is_pmem = region_intersects(nd_res->start, resource_size(nd_res),
+				IORESOURCE_MEM, IORES_DESC_PERSISTENT_MEMORY);
+	if (is_pmem == REGION_INTERSECTS)
+		return 0;
+
+	res = devm_kzalloc(acpi_desc->dev, sizeof(*res), GFP_KERNEL);
+	if (!res)
+		return -ENOMEM;
+
+	res->name = "Persistent Memory";
+	res->start = nd_res->start;
+	res->end = nd_res->end;
+	res->flags = IORESOURCE_MEM;
+	res->desc = IORES_DESC_PERSISTENT_MEMORY;
+
+	ret = insert_resource(&iomem_resource, res);
+	if (ret)
+		return ret;
+
+	ret = devm_add_action(acpi_desc->dev, acpi_nfit_remove_resource, res);
+	if (ret) {
+		remove_resource(res);
+		return ret;
+	}
+
+	return 0;
+}
+
 static int acpi_nfit_find_poison(struct acpi_nfit_desc *acpi_desc,
 		struct nd_region_desc *ndr_desc)
 {
@@ -1781,6 +1823,12 @@ static int acpi_nfit_register_region(struct acpi_nfit_desc *acpi_desc,
 
 	nvdimm_bus = acpi_desc->nvdimm_bus;
 	if (nfit_spa_type(spa) == NFIT_SPA_PM) {
+		rc = acpi_nfit_insert_resource(acpi_desc, ndr_desc);
+		if (rc)
+			dev_warn(acpi_desc->dev,
+				"failed to insert pmem resource to iomem: %d\n",
+				rc);
+
 		rc = acpi_nfit_find_poison(acpi_desc, ndr_desc);
 		if (rc) {
 			dev_err(acpi_desc->dev,
-- 
cgit v0.10.2


From 55155291b32d24371256adbcc67f9f53cf3f314f Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.com>
Date: Wed, 9 Mar 2016 09:21:54 +1100
Subject: pmem: don't allocate unused major device number

When alloc_disk(0) or alloc_disk-node(0, XX) is used, the ->major
number is completely ignored:  all devices are allocated with a
major of BLOCK_EXT_MAJOR.

So there is no point allocating pmem_major.

Signed-off-by: NeilBrown <neilb@suse.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>

diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index 74e2569..ba8d5b6 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -50,8 +50,6 @@ struct pmem_device {
 	struct badblocks	bb;
 };
 
-static int pmem_major;
-
 static bool is_bad_pmem(struct badblocks *bb, sector_t sector, unsigned int len)
 {
 	if (bb->count) {
@@ -231,8 +229,6 @@ static int pmem_attach_disk(struct device *dev,
 		return -ENOMEM;
 	}
 
-	disk->major		= pmem_major;
-	disk->first_minor	= 0;
 	disk->fops		= &pmem_fops;
 	disk->private_data	= pmem;
 	disk->queue		= pmem->pmem_queue;
@@ -579,26 +575,13 @@ static struct nd_device_driver nd_pmem_driver = {
 
 static int __init pmem_init(void)
 {
-	int error;
-
-	pmem_major = register_blkdev(0, "pmem");
-	if (pmem_major < 0)
-		return pmem_major;
-
-	error = nd_driver_register(&nd_pmem_driver);
-	if (error) {
-		unregister_blkdev(pmem_major, "pmem");
-		return error;
-	}
-
-	return 0;
+	return nd_driver_register(&nd_pmem_driver);
 }
 module_init(pmem_init);
 
 static void pmem_exit(void)
 {
 	driver_unregister(&nd_pmem_driver.drv);
-	unregister_blkdev(pmem_major, "pmem");
 }
 module_exit(pmem_exit);
 
-- 
cgit v0.10.2


From ec56151d382c2140851b4f25203af9016ba84fea Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.com>
Date: Thu, 10 Mar 2016 08:59:28 +1100
Subject: nvdimm/blk: don't allocate unused major device number

When alloc_disk(0) is used ->major is completely ignored, all devices
are allocated with a "major" of BLOCK_EXT_MAJOR.

So don't allocate nd_blk_major

Signed-off-by: NeilBrown <neilb@suse.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>

diff --git a/drivers/nvdimm/blk.c b/drivers/nvdimm/blk.c
index 91a336e..e9ff922 100644
--- a/drivers/nvdimm/blk.c
+++ b/drivers/nvdimm/blk.c
@@ -31,8 +31,6 @@ struct nd_blk_device {
 	u32 internal_lbasize;
 };
 
-static int nd_blk_major;
-
 static u32 nd_blk_meta_size(struct nd_blk_device *blk_dev)
 {
 	return blk_dev->nsblk->lbasize - blk_dev->sector_size;
@@ -264,7 +262,6 @@ static int nd_blk_attach_disk(struct nd_namespace_common *ndns,
 	}
 
 	disk->driverfs_dev	= &ndns->dev;
-	disk->major		= nd_blk_major;
 	disk->first_minor	= 0;
 	disk->fops		= &nd_blk_fops;
 	disk->private_data	= blk_dev;
@@ -358,25 +355,12 @@ static struct nd_device_driver nd_blk_driver = {
 
 static int __init nd_blk_init(void)
 {
-	int rc;
-
-	rc = register_blkdev(0, "nd_blk");
-	if (rc < 0)
-		return rc;
-
-	nd_blk_major = rc;
-	rc = nd_driver_register(&nd_blk_driver);
-
-	if (rc < 0)
-		unregister_blkdev(nd_blk_major, "nd_blk");
-
-	return rc;
+	return nd_driver_register(&nd_blk_driver);
 }
 
 static void __exit nd_blk_exit(void)
 {
 	driver_unregister(&nd_blk_driver.drv);
-	unregister_blkdev(nd_blk_major, "nd_blk");
 }
 
 MODULE_AUTHOR("Ross Zwisler <ross.zwisler@linux.intel.com>");
-- 
cgit v0.10.2


From ff8e92d5d94b99aab39f439d532cba435947dfc0 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.com>
Date: Thu, 10 Mar 2016 08:59:28 +1100
Subject: nvdimm/btt: don't allocate unused major device number

alloc_disk(0) does not require or use a ->major number,
all devices are allocated with a major of BLOCK_EXT_MAJOR.

So don't allocate btt_major.

Signed-off-by: NeilBrown <neilb@suse.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>

diff --git a/drivers/nvdimm/btt.c b/drivers/nvdimm/btt.c
index efb2c1c..c32cbb5 100644
--- a/drivers/nvdimm/btt.c
+++ b/drivers/nvdimm/btt.c
@@ -31,8 +31,6 @@ enum log_ent_request {
 	LOG_OLD_ENT
 };
 
-static int btt_major;
-
 static int arena_read_bytes(struct arena_info *arena, resource_size_t offset,
 		void *buf, size_t n)
 {
@@ -1246,7 +1244,6 @@ static int btt_blk_init(struct btt *btt)
 
 	nvdimm_namespace_disk_name(ndns, btt->btt_disk->disk_name);
 	btt->btt_disk->driverfs_dev = &btt->nd_btt->dev;
-	btt->btt_disk->major = btt_major;
 	btt->btt_disk->first_minor = 0;
 	btt->btt_disk->fops = &btt_fops;
 	btt->btt_disk->private_data = btt;
@@ -1423,22 +1420,11 @@ EXPORT_SYMBOL(nvdimm_namespace_detach_btt);
 
 static int __init nd_btt_init(void)
 {
-	int rc;
-
-	btt_major = register_blkdev(0, "btt");
-	if (btt_major < 0)
-		return btt_major;
+	int rc = 0;
 
 	debugfs_root = debugfs_create_dir("btt", NULL);
-	if (IS_ERR_OR_NULL(debugfs_root)) {
+	if (IS_ERR_OR_NULL(debugfs_root))
 		rc = -ENXIO;
-		goto err_debugfs;
-	}
-
-	return 0;
-
- err_debugfs:
-	unregister_blkdev(btt_major, "btt");
 
 	return rc;
 }
@@ -1446,7 +1432,6 @@ static int __init nd_btt_init(void)
 static void __exit nd_btt_exit(void)
 {
 	debugfs_remove_recursive(debugfs_root);
-	unregister_blkdev(btt_major, "btt");
 }
 
 MODULE_ALIAS_ND_DEVICE(ND_DEVICE_BTT);
-- 
cgit v0.10.2


From b5ebc8ec693281c3c1efff7459a069cbd8b9a149 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Sun, 6 Mar 2016 15:20:51 -0800
Subject: libnvdimm, pmem: fix kmap_atomic() leak in error path

When we enounter a bad block we need to kunmap_atomic() before
returning.

Cc: <stable@vger.kernel.org>
Cc: Ross Zwisler <ross.zwisler@linux.intel.com>
Reviewed-by: Vishal Verma <vishal.l.verma@intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>

diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index efc2a5e..e7b86a7 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -66,22 +66,25 @@ static int pmem_do_bvec(struct pmem_device *pmem, struct page *page,
 			unsigned int len, unsigned int off, int rw,
 			sector_t sector)
 {
+	int rc = 0;
 	void *mem = kmap_atomic(page);
 	phys_addr_t pmem_off = sector * 512 + pmem->data_offset;
 	void __pmem *pmem_addr = pmem->virt_addr + pmem_off;
 
 	if (rw == READ) {
 		if (unlikely(is_bad_pmem(&pmem->bb, sector, len)))
-			return -EIO;
-		memcpy_from_pmem(mem + off, pmem_addr, len);
-		flush_dcache_page(page);
+			rc = -EIO;
+		else {
+			memcpy_from_pmem(mem + off, pmem_addr, len);
+			flush_dcache_page(page);
+		}
 	} else {
 		flush_dcache_page(page);
 		memcpy_to_pmem(pmem_addr, mem + off, len);
 	}
 
 	kunmap_atomic(mem);
-	return 0;
+	return rc;
 }
 
 static blk_qc_t pmem_make_request(struct request_queue *q, struct bio *bio)
-- 
cgit v0.10.2


From 59e6473980f321c16299e12db69d1fabc2644a6f Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Tue, 8 Mar 2016 07:16:07 -0800
Subject: libnvdimm, pmem: clear poison on write

If a write is directed at a known bad block perform the following:

1/ write the data

2/ send a clear poison command

3/ invalidate the poison out of the cache hierarchy

Cc: <x86@kernel.org>
Cc: Ross Zwisler <ross.zwisler@linux.intel.com>
Reviewed-by: Vishal Verma <vishal.l.verma@intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>

diff --git a/arch/x86/include/asm/pmem.h b/arch/x86/include/asm/pmem.h
index c57fd1e..bf8b35d 100644
--- a/arch/x86/include/asm/pmem.h
+++ b/arch/x86/include/asm/pmem.h
@@ -137,6 +137,11 @@ static inline void arch_clear_pmem(void __pmem *addr, size_t size)
 	arch_wb_cache_pmem(addr, size);
 }
 
+static inline void arch_invalidate_pmem(void __pmem *addr, size_t size)
+{
+	clflush_cache_range((void __force *) addr, size);
+}
+
 static inline bool __arch_has_wmb_pmem(void)
 {
 	/*
diff --git a/drivers/nvdimm/bus.c b/drivers/nvdimm/bus.c
index cb6fd64..3355748 100644
--- a/drivers/nvdimm/bus.c
+++ b/drivers/nvdimm/bus.c
@@ -159,6 +159,52 @@ void nvdimm_region_notify(struct nd_region *nd_region, enum nvdimm_event event)
 }
 EXPORT_SYMBOL_GPL(nvdimm_region_notify);
 
+long nvdimm_clear_poison(struct device *dev, phys_addr_t phys,
+		unsigned int len)
+{
+	struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(dev);
+	struct nvdimm_bus_descriptor *nd_desc;
+	struct nd_cmd_clear_error clear_err;
+	struct nd_cmd_ars_cap ars_cap;
+	u32 clear_err_unit, mask;
+	int cmd_rc, rc;
+
+	if (!nvdimm_bus)
+		return -ENXIO;
+
+	nd_desc = nvdimm_bus->nd_desc;
+	if (!nd_desc->ndctl)
+		return -ENXIO;
+
+	memset(&ars_cap, 0, sizeof(ars_cap));
+	ars_cap.address = phys;
+	ars_cap.length = len;
+	rc = nd_desc->ndctl(nd_desc, NULL, ND_CMD_ARS_CAP, &ars_cap,
+			sizeof(ars_cap), &cmd_rc);
+	if (rc < 0)
+		return rc;
+	if (cmd_rc < 0)
+		return cmd_rc;
+	clear_err_unit = ars_cap.clear_err_unit;
+	if (!clear_err_unit || !is_power_of_2(clear_err_unit))
+		return -ENXIO;
+
+	mask = clear_err_unit - 1;
+	if ((phys | len) & mask)
+		return -ENXIO;
+	memset(&clear_err, 0, sizeof(clear_err));
+	clear_err.address = phys;
+	clear_err.length = len;
+	rc = nd_desc->ndctl(nd_desc, NULL, ND_CMD_CLEAR_ERROR, &clear_err,
+			sizeof(clear_err), &cmd_rc);
+	if (rc < 0)
+		return rc;
+	if (cmd_rc < 0)
+		return cmd_rc;
+	return clear_err.cleared;
+}
+EXPORT_SYMBOL_GPL(nvdimm_clear_poison);
+
 static struct bus_type nvdimm_bus_type = {
 	.name = "nd",
 	.uevent = nvdimm_bus_uevent,
diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h
index 78b82f6..1799bd9 100644
--- a/drivers/nvdimm/nd.h
+++ b/drivers/nvdimm/nd.h
@@ -186,6 +186,8 @@ int nvdimm_init_nsarea(struct nvdimm_drvdata *ndd);
 int nvdimm_init_config_data(struct nvdimm_drvdata *ndd);
 int nvdimm_set_config_data(struct nvdimm_drvdata *ndd, size_t offset,
 		void *buf, size_t len);
+long nvdimm_clear_poison(struct device *dev, phys_addr_t phys,
+		unsigned int len);
 struct nd_btt *to_nd_btt(struct device *dev);
 
 struct nd_gen_sb {
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index e7b86a7..adc3872 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -62,17 +62,40 @@ static bool is_bad_pmem(struct badblocks *bb, sector_t sector, unsigned int len)
 	return false;
 }
 
+static void pmem_clear_poison(struct pmem_device *pmem, phys_addr_t offset,
+		unsigned int len)
+{
+	struct device *dev = disk_to_dev(pmem->pmem_disk);
+	sector_t sector;
+	long cleared;
+
+	sector = (offset - pmem->data_offset) / 512;
+	cleared = nvdimm_clear_poison(dev, pmem->phys_addr + offset, len);
+
+	if (cleared > 0 && cleared / 512) {
+		dev_dbg(dev, "%s: %llx clear %ld sector%s\n",
+				__func__, (unsigned long long) sector,
+				cleared / 512, cleared / 512 > 1 ? "s" : "");
+		badblocks_clear(&pmem->bb, sector, cleared / 512);
+	}
+	invalidate_pmem(pmem->virt_addr + offset, len);
+}
+
 static int pmem_do_bvec(struct pmem_device *pmem, struct page *page,
 			unsigned int len, unsigned int off, int rw,
 			sector_t sector)
 {
 	int rc = 0;
+	bool bad_pmem = false;
 	void *mem = kmap_atomic(page);
 	phys_addr_t pmem_off = sector * 512 + pmem->data_offset;
 	void __pmem *pmem_addr = pmem->virt_addr + pmem_off;
 
+	if (unlikely(is_bad_pmem(&pmem->bb, sector, len)))
+		bad_pmem = true;
+
 	if (rw == READ) {
-		if (unlikely(is_bad_pmem(&pmem->bb, sector, len)))
+		if (unlikely(bad_pmem))
 			rc = -EIO;
 		else {
 			memcpy_from_pmem(mem + off, pmem_addr, len);
@@ -81,6 +104,10 @@ static int pmem_do_bvec(struct pmem_device *pmem, struct page *page,
 	} else {
 		flush_dcache_page(page);
 		memcpy_to_pmem(pmem_addr, mem + off, len);
+		if (unlikely(bad_pmem)) {
+			pmem_clear_poison(pmem, pmem_off, len);
+			memcpy_to_pmem(pmem_addr, mem + off, len);
+		}
 	}
 
 	kunmap_atomic(mem);
diff --git a/include/linux/pmem.h b/include/linux/pmem.h
index 7c3d11a..3ec5309 100644
--- a/include/linux/pmem.h
+++ b/include/linux/pmem.h
@@ -58,6 +58,11 @@ static inline void arch_wb_cache_pmem(void __pmem *addr, size_t size)
 {
 	BUG();
 }
+
+static inline void arch_invalidate_pmem(void __pmem *addr, size_t size)
+{
+	BUG();
+}
 #endif
 
 /*
@@ -186,6 +191,20 @@ static inline void clear_pmem(void __pmem *addr, size_t size)
 }
 
 /**
+ * invalidate_pmem - flush a pmem range from the cache hierarchy
+ * @addr:	virtual start address
+ * @size:	bytes to invalidate (internally aligned to cache line size)
+ *
+ * For platforms that support clearing poison this flushes any poisoned
+ * ranges out of the cache
+ */
+static inline void invalidate_pmem(void __pmem *addr, size_t size)
+{
+	if (arch_has_pmem_api())
+		arch_invalidate_pmem(addr, size);
+}
+
+/**
  * wb_cache_pmem - write back processor cache for PMEM memory range
  * @addr:	virtual start address
  * @size:	number of bytes to write back
-- 
cgit v0.10.2