From 8cc6ddfcafbb7e32ff025f7d9551ecf9649c12cd Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Tue, 5 Apr 2016 15:26:50 -0700
Subject: libnvdimm, nfit: report multiple interface codes per-dimm

Starting with ACPI 6.1 an NFIT table will report multiple 'NVDIMM
Control Region Structure' instances per-dimm, one for each supported
format interface.  Report those codes in sysfs using this layout:

    nmemX/nfit/formats
    nmemX/nfit/format
    nmemX/nfit/format1
    nmemX/nfit/format2
    ...
    nmemX/nfit/formatN

Where format2 - formatN are theoretical as there are no known DIMMs with
support for more than two interface formats.

This layout is compatible with existing libndctl binaries that only
expect one code per-dimm as they will ignore nmemX/nfit/formats and
nmemX/nfit/formatN.

Signed-off-by: Dan Williams <dan.j.williams@intel.com>

diff --git a/drivers/acpi/nfit.c b/drivers/acpi/nfit.c
index d0f35e6..db0f806 100644
--- a/drivers/acpi/nfit.c
+++ b/drivers/acpi/nfit.c
@@ -655,6 +655,7 @@ static int nfit_mem_dcr_init(struct acpi_nfit_desc *acpi_desc,
 			if (!nfit_mem)
 				return -ENOMEM;
 			INIT_LIST_HEAD(&nfit_mem->list);
+			nfit_mem->acpi_desc = acpi_desc;
 			list_add(&nfit_mem->list, &acpi_desc->dimms);
 		}
 
@@ -838,6 +839,18 @@ static ssize_t device_show(struct device *dev,
 }
 static DEVICE_ATTR_RO(device);
 
+/* Count how many of memdev_pmem / memdev_bdw are set for this DIMM. */
+static int num_nvdimm_formats(struct nvdimm *nvdimm)
+{
+	struct nfit_mem *mem = nvdimm_provider_data(nvdimm);
+	int count = 0;
+
+	/* each populated mapping contributes one format */
+	count += mem->memdev_pmem ? 1 : 0;
+	count += mem->memdev_bdw ? 1 : 0;
+	return count;
+}
+
 static ssize_t format_show(struct device *dev,
 		struct device_attribute *attr, char *buf)
 {
@@ -847,6 +860,55 @@ static ssize_t format_show(struct device *dev,
 }
 static DEVICE_ATTR_RO(format);
 
+static ssize_t format1_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	/* Emit an interface code for this DIMM that differs from dcr->code. */
+	ssize_t rc = -ENXIO;
+	struct nfit_mem *nfit_mem;
+	struct nfit_memdev *nfit_memdev;
+	struct acpi_nfit_desc *acpi_desc;
+	struct nvdimm *nvdimm = to_nvdimm(dev);
+	struct acpi_nfit_control_region *dcr = to_nfit_dcr(dev);
+	u32 handle = to_nfit_memdev(dev)->device_handle;
+
+	nfit_mem = nvdimm_provider_data(nvdimm);
+	acpi_desc = nfit_mem->acpi_desc;
+
+	/* assumes DIMMs have at most 2 published interface codes */
+	mutex_lock(&acpi_desc->init_mutex);
+	list_for_each_entry(nfit_memdev, &acpi_desc->memdevs, list) {
+		struct acpi_nfit_memory_map *memdev = nfit_memdev->memdev;
+		struct nfit_dcr *nfit_dcr;
+
+		if (memdev->device_handle != handle)
+			continue;
+
+		list_for_each_entry(nfit_dcr, &acpi_desc->dcrs, list) {
+			if (nfit_dcr->dcr->region_index != memdev->region_index)
+				continue;
+			if (nfit_dcr->dcr->code == dcr->code)
+				continue;
+			rc = sprintf(buf, "%#x\n", nfit_dcr->dcr->code);
+			break;
+		}
+		if (rc != -ENXIO)	/* stop once the -ENXIO sentinel is replaced */
+			break;
+	}
+	mutex_unlock(&acpi_desc->init_mutex);
+	return rc;
+}
+static DEVICE_ATTR_RO(format1);
+
+/* sysfs "formats": decimal count of interface formats for this DIMM */
+static ssize_t formats_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	return sprintf(buf, "%d\n",
+			num_nvdimm_formats(to_nvdimm(dev)));
+}
+static DEVICE_ATTR_RO(formats);
+
 static ssize_t serial_show(struct device *dev,
 		struct device_attribute *attr, char *buf)
 {
@@ -876,6 +938,8 @@ static struct attribute *acpi_nfit_dimm_attributes[] = {
 	&dev_attr_vendor.attr,
 	&dev_attr_device.attr,
 	&dev_attr_format.attr,
+	&dev_attr_formats.attr,
+	&dev_attr_format1.attr,
 	&dev_attr_serial.attr,
 	&dev_attr_rev_id.attr,
 	&dev_attr_flags.attr,
@@ -886,11 +950,13 @@ static umode_t acpi_nfit_dimm_attr_visible(struct kobject *kobj,
 		struct attribute *a, int n)
 {
 	struct device *dev = container_of(kobj, struct device, kobj);
+	struct nvdimm *nvdimm = to_nvdimm(dev);
 
-	if (to_nfit_dcr(dev))
-		return a->mode;
-	else
+	if (!to_nfit_dcr(dev))
+		return 0;
+	if (a == &dev_attr_format1.attr && num_nvdimm_formats(nvdimm) <= 1)
 		return 0;
+	return a->mode;
 }
 
 static struct attribute_group acpi_nfit_dimm_attribute_group = {
diff --git a/drivers/acpi/nfit.h b/drivers/acpi/nfit.h
index c75576b..5201840 100644
--- a/drivers/acpi/nfit.h
+++ b/drivers/acpi/nfit.h
@@ -109,6 +109,7 @@ struct nfit_mem {
 	struct nfit_flush *nfit_flush;
 	struct list_head list;
 	struct acpi_device *adev;
+	struct acpi_nfit_desc *acpi_desc;
 	unsigned long dsm_mask;
 };
 
-- 
cgit v0.10.2


From baa51277cf5dc844089ea2f6e0f78b1c5ca665d8 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Tue, 5 Apr 2016 17:40:52 -0700
Subject: libnvdimm, test: add mock SMART data payload

Provide simulated SMART data to enable the ndctl implementation of SMART
data retrieval and parsing.

The payload is defined here, "Section 4.1 SMART and Health Info
(Function Index 1)":

    http://pmem.io/documents/NVDIMM_DSM_Interface_Example.pdf

Signed-off-by: Dan Williams <dan.j.williams@intel.com>

diff --git a/drivers/nvdimm/bus.c b/drivers/nvdimm/bus.c
index 19f822d..8111b12 100644
--- a/drivers/nvdimm/bus.c
+++ b/drivers/nvdimm/bus.c
@@ -783,6 +783,9 @@ int __init nvdimm_bus_init(void)
 {
 	int rc;
 
+	BUILD_BUG_ON(sizeof(struct nd_smart_payload) != 128);
+	BUILD_BUG_ON(sizeof(struct nd_smart_threshold_payload) != 8);
+
 	rc = bus_register(&nvdimm_bus_type);
 	if (rc)
 		return rc;
diff --git a/include/uapi/linux/ndctl.h b/include/uapi/linux/ndctl.h
index 7cc28ab..59c61e0 100644
--- a/include/uapi/linux/ndctl.h
+++ b/include/uapi/linux/ndctl.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2014-2015, Intel Corporation.
+ * Copyright (c) 2014-2016, Intel Corporation.
  *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms and conditions of the GNU Lesser General Public License,
@@ -20,11 +20,45 @@ struct nd_cmd_smart {
 	__u8 data[128];
 } __packed;
 
+#define ND_SMART_HEALTH_VALID	(1 << 0) /* *_VALID bits: nd_smart_payload.flags */
+#define ND_SMART_TEMP_VALID 	(1 << 1)
+#define ND_SMART_SPARES_VALID	(1 << 2)
+#define ND_SMART_ALARM_VALID	(1 << 3)
+#define ND_SMART_USED_VALID	(1 << 4)
+#define ND_SMART_SHUTDOWN_VALID	(1 << 5)
+#define ND_SMART_VENDOR_VALID	(1 << 6)
+#define ND_SMART_TEMP_TRIP	(1 << 0) /* *_TRIP bits: alarm_flags, alarm_control */
+#define ND_SMART_SPARE_TRIP	(1 << 1)
+#define ND_SMART_NON_CRITICAL_HEALTH	(1 << 0) /* *_HEALTH bits: health */
+#define ND_SMART_CRITICAL_HEALTH	(1 << 1)
+#define ND_SMART_FATAL_HEALTH		(1 << 2)
+
+struct nd_smart_payload { /* layout of nd_cmd_smart.data; must stay 128 bytes */
+	__u32 flags; /* ND_SMART_*_VALID bits */
+	__u8 reserved0[4];
+	__u8 health; /* ND_SMART_*_HEALTH bits */
+	__u16 temperature; /* presumably 1/16 degree units - TODO confirm vs. spec */
+	__u8 spares;
+	__u8 alarm_flags; /* ND_SMART_*_TRIP bits */
+	__u8 life_used;
+	__u8 shutdown_state;
+	__u8 reserved1;
+	__u32 vendor_size; /* presumably valid bytes in vendor_data - TODO confirm */
+	__u8 vendor_data[108];
+} __packed;
+
 struct nd_cmd_smart_threshold {
 	__u32 status;
 	__u8 data[8];
 } __packed;
 
+struct nd_smart_threshold_payload { /* nd_cmd_smart_threshold.data; 8 bytes */
+	__u16 alarm_control; /* ND_SMART_*_TRIP bits */
+	__u16 temperature; /* presumably 1/16 degree units - TODO confirm vs. spec */
+	__u8 spares;
+	__u8 reserved[3];
+} __packed;
+
 struct nd_cmd_dimm_flags {
 	__u32 status;
 	__u32 flags;
diff --git a/tools/testing/nvdimm/test/nfit.c b/tools/testing/nvdimm/test/nfit.c
index 3187322..d1c98d4 100644
--- a/tools/testing/nvdimm/test/nfit.c
+++ b/tools/testing/nvdimm/test/nfit.c
@@ -330,6 +330,42 @@ static int nfit_test_cmd_clear_error(struct nd_cmd_clear_error *clear_err,
 	return 0;
 }
 
+/* Mock ND_CMD_SMART: copy a canned SMART payload into @smart->data. */
+static int nfit_test_cmd_smart(struct nd_cmd_smart *smart, unsigned int buf_len)
+{
+	/* fields not listed (shutdown_state, vendor_size, ...) stay zero */
+	static const struct nd_smart_payload mock = {
+		.flags = ND_SMART_HEALTH_VALID | ND_SMART_TEMP_VALID |
+			ND_SMART_SPARES_VALID | ND_SMART_ALARM_VALID |
+			ND_SMART_USED_VALID | ND_SMART_SHUTDOWN_VALID,
+		.health = ND_SMART_NON_CRITICAL_HEALTH,
+		.temperature = 23 * 16,
+		.spares = 75,
+		.alarm_flags = ND_SMART_TEMP_TRIP | ND_SMART_SPARE_TRIP,
+		.life_used = 5,
+	};
+
+	if (buf_len < sizeof(*smart))
+		return -EINVAL;
+	memcpy(smart->data, &mock, sizeof(mock));
+	return 0;
+}
+
+static int nfit_test_cmd_smart_threshold(struct nd_cmd_smart_threshold *smart_t,
+		unsigned int buf_len)
+{
+	static const struct nd_smart_threshold_payload mock = {
+		.spares = 5,
+		.temperature = 40 * 16,
+		.alarm_control = ND_SMART_TEMP_TRIP | ND_SMART_SPARE_TRIP,
+	};	/* canned reply for every ND_CMD_SMART_THRESHOLD */
+
+	if (buf_len < sizeof(*smart_t))	/* need room for the full command */
+		return -EINVAL;
+	memcpy(smart_t->data, &mock, sizeof(mock));
+	return 0;
+}
+
 static int nfit_test_ctl(struct nvdimm_bus_descriptor *nd_desc,
 		struct nvdimm *nvdimm, unsigned int cmd, void *buf,
 		unsigned int buf_len, int *cmd_rc)
@@ -368,6 +404,12 @@ static int nfit_test_ctl(struct nvdimm_bus_descriptor *nd_desc,
 			rc = nfit_test_cmd_set_config_data(buf, buf_len,
 				t->label[i]);
 			break;
+		case ND_CMD_SMART:
+			rc = nfit_test_cmd_smart(buf, buf_len);
+			break;
+		case ND_CMD_SMART_THRESHOLD:
+			rc = nfit_test_cmd_smart_threshold(buf, buf_len);
+			break;
 		default:
 			return -ENOTTY;
 		}
@@ -1254,10 +1296,12 @@ static void nfit_test0_setup(struct nfit_test *t)
 	set_bit(ND_CMD_GET_CONFIG_SIZE, &acpi_desc->dimm_dsm_force_en);
 	set_bit(ND_CMD_GET_CONFIG_DATA, &acpi_desc->dimm_dsm_force_en);
 	set_bit(ND_CMD_SET_CONFIG_DATA, &acpi_desc->dimm_dsm_force_en);
+	set_bit(ND_CMD_SMART, &acpi_desc->dimm_dsm_force_en);
 	set_bit(ND_CMD_ARS_CAP, &acpi_desc->bus_dsm_force_en);
 	set_bit(ND_CMD_ARS_START, &acpi_desc->bus_dsm_force_en);
 	set_bit(ND_CMD_ARS_STATUS, &acpi_desc->bus_dsm_force_en);
 	set_bit(ND_CMD_CLEAR_ERROR, &acpi_desc->bus_dsm_force_en);
+	set_bit(ND_CMD_SMART_THRESHOLD, &acpi_desc->dimm_dsm_force_en);
 }
 
 static void nfit_test1_setup(struct nfit_test *t)
-- 
cgit v0.10.2


From 8259542348d93da6a04eed979047b1fd1ca72abe Mon Sep 17 00:00:00 2001
From: "Lee, Chun-Yi" <joeyli.kernel@gmail.com>
Date: Thu, 21 Jan 2016 20:32:10 +0800
Subject: libnvdimm, nfit: Use ACPI_SIG_NFIT instead of hard coded string

It's minor, but it is still better to use ACPI_SIG_NFIT instead of a
hard-coded string.

Cc: Ross Zwisler <ross.zwisler@linux.intel.com>
Cc: "Rafael J. Wysocki" <rjw@rjwysocki.net>
Cc: Len Brown <lenb@kernel.org>
Signed-off-by: Lee, Chun-Yi <jlee@suse.com>
Acked-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>

diff --git a/drivers/acpi/nfit.c b/drivers/acpi/nfit.c
index db0f806..a434b58 100644
--- a/drivers/acpi/nfit.c
+++ b/drivers/acpi/nfit.c
@@ -2372,7 +2372,7 @@ static int acpi_nfit_add(struct acpi_device *adev)
 	acpi_size sz;
 	int rc;
 
-	status = acpi_get_table_with_size("NFIT", 0, &tbl, &sz);
+	status = acpi_get_table_with_size(ACPI_SIG_NFIT, 0, &tbl, &sz);
 	if (ACPI_FAILURE(status)) {
 		/* This is ok, we could have an nvdimm hotplugged later */
 		dev_dbg(dev, "failed to find NFIT at startup\n");
-- 
cgit v0.10.2


From c7e16e5257ec46530e3e874af38191746c137c83 Mon Sep 17 00:00:00 2001
From: Jerry Hoemann <jerry.hoemann@hpe.com>
Date: Mon, 11 Apr 2016 15:02:26 -0700
Subject: acpi: widen acpi_evaluate_dsm() revision and function-index arguments

The ACPI specification states that the arguments "Revision ID" and
"Function Index" to a _DSM are of type "Integer".  Integers are 64-bit
quantities.

acpi_evaluate_dsm() declares these arguments as plain "int", which is
32 bits.  Widen the types taken by acpi_evaluate_dsm(), its callers and
derived callers so that the correct type is passed.

acpi_check_dsm() and acpi_evaluate_dsm_typed() had a similar issue and
were corrected as well.

This is in preparation for libnvdimm implementing a generic _DSM
passthrough facility to have the capacity to pass 64-bit values as the
ACPI specification allows.

[djbw: clarify the changelog, add rationale]
Signed-off-by: Jerry Hoemann <jerry.hoemann@hpe.com>
Acked-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>

diff --git a/drivers/acpi/utils.c b/drivers/acpi/utils.c
index 050673f..e854dea 100644
--- a/drivers/acpi/utils.c
+++ b/drivers/acpi/utils.c
@@ -625,7 +625,7 @@ acpi_status acpi_evaluate_lck(acpi_handle handle, int lock)
  * some old BIOSes do expect a buffer or an integer etc.
  */
 union acpi_object *
-acpi_evaluate_dsm(acpi_handle handle, const u8 *uuid, int rev, int func,
+acpi_evaluate_dsm(acpi_handle handle, const u8 *uuid, u64 rev, u64 func,
 		  union acpi_object *argv4)
 {
 	acpi_status ret;
@@ -674,7 +674,7 @@ EXPORT_SYMBOL(acpi_evaluate_dsm);
  * functions. Currently only support 64 functions at maximum, should be
  * enough for now.
  */
-bool acpi_check_dsm(acpi_handle handle, const u8 *uuid, int rev, u64 funcs)
+bool acpi_check_dsm(acpi_handle handle, const u8 *uuid, u64 rev, u64 funcs)
 {
 	int i;
 	u64 mask = 0;
diff --git a/include/acpi/acpi_bus.h b/include/acpi/acpi_bus.h
index 14362a8..f092cc6 100644
--- a/include/acpi/acpi_bus.h
+++ b/include/acpi/acpi_bus.h
@@ -61,12 +61,12 @@ bool acpi_ata_match(acpi_handle handle);
 bool acpi_bay_match(acpi_handle handle);
 bool acpi_dock_match(acpi_handle handle);
 
-bool acpi_check_dsm(acpi_handle handle, const u8 *uuid, int rev, u64 funcs);
+bool acpi_check_dsm(acpi_handle handle, const u8 *uuid, u64 rev, u64 funcs);
 union acpi_object *acpi_evaluate_dsm(acpi_handle handle, const u8 *uuid,
-			int rev, int func, union acpi_object *argv4);
+			u64 rev, u64 func, union acpi_object *argv4);
 
 static inline union acpi_object *
-acpi_evaluate_dsm_typed(acpi_handle handle, const u8 *uuid, int rev, int func,
+acpi_evaluate_dsm_typed(acpi_handle handle, const u8 *uuid, u64 rev, u64 func,
 			union acpi_object *argv4, acpi_object_type type)
 {
 	union acpi_object *obj;
-- 
cgit v0.10.2


From 298f2bc5db3851cf2e839a0025425256ef852139 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Tue, 15 Mar 2016 16:41:04 -0700
Subject: libnvdimm, pmem: kill pmem->ndns

We can derive the common namespace from other information.  We also do
not need to cache it because all the usages are in slow paths.

Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>

diff --git a/drivers/nvdimm/blk.c b/drivers/nvdimm/blk.c
index e9ff922..2464939 100644
--- a/drivers/nvdimm/blk.c
+++ b/drivers/nvdimm/blk.c
@@ -336,7 +336,7 @@ static int nd_blk_remove(struct device *dev)
 	struct nd_blk_device *blk_dev = dev_get_drvdata(dev);
 
 	if (is_nd_btt(dev))
-		nvdimm_namespace_detach_btt(to_nd_btt(dev)->ndns);
+		nvdimm_namespace_detach_btt(to_nd_btt(dev));
 	else
 		nd_blk_detach_disk(blk_dev);
 	kfree(blk_dev);
diff --git a/drivers/nvdimm/btt.c b/drivers/nvdimm/btt.c
index f068b65..676c31a 100644
--- a/drivers/nvdimm/btt.c
+++ b/drivers/nvdimm/btt.c
@@ -1406,9 +1406,8 @@ int nvdimm_namespace_attach_btt(struct nd_namespace_common *ndns)
 }
 EXPORT_SYMBOL(nvdimm_namespace_attach_btt);
 
-int nvdimm_namespace_detach_btt(struct nd_namespace_common *ndns)
+int nvdimm_namespace_detach_btt(struct nd_btt *nd_btt)
 {
-	struct nd_btt *nd_btt = to_nd_btt(ndns->claim);
 	struct btt *btt = nd_btt->btt;
 
 	btt_fini(btt);
diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h
index 875c524..b0a4ab9 100644
--- a/drivers/nvdimm/nd.h
+++ b/drivers/nvdimm/nd.h
@@ -263,7 +263,7 @@ struct resource *nvdimm_allocate_dpa(struct nvdimm_drvdata *ndd,
 resource_size_t nvdimm_namespace_capacity(struct nd_namespace_common *ndns);
 struct nd_namespace_common *nvdimm_namespace_common_probe(struct device *dev);
 int nvdimm_namespace_attach_btt(struct nd_namespace_common *ndns);
-int nvdimm_namespace_detach_btt(struct nd_namespace_common *ndns);
+int nvdimm_namespace_detach_btt(struct nd_btt *nd_btt);
 const char *nvdimm_namespace_disk_name(struct nd_namespace_common *ndns,
 		char *name);
 void nvdimm_badblocks_populate(struct nd_region *nd_region,
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index f798899..2b51d4d 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -35,7 +35,6 @@
 struct pmem_device {
 	struct request_queue	*pmem_queue;
 	struct gendisk		*pmem_disk;
-	struct nd_namespace_common *ndns;
 
 	/* One contiguous memory region per device */
 	phys_addr_t		phys_addr;
@@ -436,9 +435,8 @@ static int nd_pfn_init(struct nd_pfn *nd_pfn)
 	return -ENXIO;
 }
 
-static int nvdimm_namespace_detach_pfn(struct nd_namespace_common *ndns)
+static int nvdimm_namespace_detach_pfn(struct nd_pfn *nd_pfn)
 {
-	struct nd_pfn *nd_pfn = to_nd_pfn(ndns->claim);
 	struct pmem_device *pmem;
 
 	/* free pmem disk */
@@ -537,7 +535,7 @@ static int __nvdimm_namespace_attach_pfn(struct nd_pfn *nd_pfn)
 
 	return rc;
  err:
-	nvdimm_namespace_detach_pfn(ndns);
+	nvdimm_namespace_detach_pfn(nd_pfn);
 	return rc;
 
 }
@@ -573,7 +571,6 @@ static int nd_pmem_probe(struct device *dev)
 	if (IS_ERR(pmem))
 		return PTR_ERR(pmem);
 
-	pmem->ndns = ndns;
 	dev_set_drvdata(dev, pmem);
 	ndns->rw_bytes = pmem_rw_bytes;
 	if (devm_init_badblocks(dev, &pmem->bb))
@@ -607,9 +604,9 @@ static int nd_pmem_remove(struct device *dev)
 	struct pmem_device *pmem = dev_get_drvdata(dev);
 
 	if (is_nd_btt(dev))
-		nvdimm_namespace_detach_btt(pmem->ndns);
+		nvdimm_namespace_detach_btt(to_nd_btt(dev));
 	else if (is_nd_pfn(dev))
-		nvdimm_namespace_detach_pfn(pmem->ndns);
+		nvdimm_namespace_detach_pfn(to_nd_pfn(dev));
 	else
 		pmem_detach_disk(pmem);
 
@@ -618,26 +615,33 @@ static int nd_pmem_remove(struct device *dev)
 
 static void nd_pmem_notify(struct device *dev, enum nvdimm_event event)
 {
-	struct pmem_device *pmem = dev_get_drvdata(dev);
-	struct nd_namespace_common *ndns = pmem->ndns;
 	struct nd_region *nd_region = to_nd_region(dev->parent);
-	struct nd_namespace_io *nsio = to_nd_namespace_io(&ndns->dev);
-	struct resource res = {
-		.start = nsio->res.start + pmem->data_offset,
-		.end = nsio->res.end,
-	};
+	struct pmem_device *pmem = dev_get_drvdata(dev);
+	resource_size_t offset = 0, end_trunc = 0; /* only changed for pfn */
+	struct nd_namespace_common *ndns;
+	struct nd_namespace_io *nsio;
+	struct resource res; /* NOTE(review): only start/end are set below */
 
 	if (event != NVDIMM_REVALIDATE_POISON)
 		return;
 
-	if (is_nd_pfn(dev)) {
+	if (is_nd_btt(dev)) {
+		struct nd_btt *nd_btt = to_nd_btt(dev);
+
+		ndns = nd_btt->ndns;
+	} else if (is_nd_pfn(dev)) {
 		struct nd_pfn *nd_pfn = to_nd_pfn(dev);
 		struct nd_pfn_sb *pfn_sb = nd_pfn->pfn_sb;
 
-		res.start += __le32_to_cpu(pfn_sb->start_pad);
-		res.end -= __le32_to_cpu(pfn_sb->end_trunc);
-	}
+		ndns = nd_pfn->ndns;
+		offset = pmem->data_offset + __le32_to_cpu(pfn_sb->start_pad);
+		end_trunc = __le32_to_cpu(pfn_sb->end_trunc);
+	} else
+		ndns = to_ndns(dev);
 
+	nsio = to_nd_namespace_io(&ndns->dev);
+	res.start = nsio->res.start + offset; /* pfn: skip data_offset + start_pad */
+	res.end = nsio->res.end - end_trunc; /* pfn: drop end_trunc */
 	nvdimm_badblocks_populate(nd_region, &pmem->bb, &res);
 }
 
-- 
cgit v0.10.2


From bd032943b5b2b336994171dcebc11531a38b45ba Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Thu, 17 Mar 2016 18:16:15 -0700
Subject: libnvdimm, pfn, convert nd_pfn_probe() to devm

Pass the device performing the probe so we can use a devm allocation for
the pfn superblock.

Cc: Ross Zwisler <ross.zwisler@linux.intel.com>
Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>

diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h
index b0a4ab9..c831caa 100644
--- a/drivers/nvdimm/nd.h
+++ b/drivers/nvdimm/nd.h
@@ -219,12 +219,14 @@ static inline struct device *nd_btt_create(struct nd_region *nd_region)
 
 struct nd_pfn *to_nd_pfn(struct device *dev);
 #if IS_ENABLED(CONFIG_NVDIMM_PFN)
-int nd_pfn_probe(struct nd_namespace_common *ndns, void *drvdata);
+int nd_pfn_probe(struct device *dev, struct nd_namespace_common *ndns,
+		void *drvdata);
 bool is_nd_pfn(struct device *dev);
 struct device *nd_pfn_create(struct nd_region *nd_region);
 int nd_pfn_validate(struct nd_pfn *nd_pfn);
 #else
-static inline int nd_pfn_probe(struct nd_namespace_common *ndns, void *drvdata)
+static inline int nd_pfn_probe(struct device *dev, struct nd_namespace_common *ndns,
+		void *drvdata)
 {
 	return -ENODEV;
 }
diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c
index e071e21..96aa549 100644
--- a/drivers/nvdimm/pfn_devs.c
+++ b/drivers/nvdimm/pfn_devs.c
@@ -410,11 +410,12 @@ int nd_pfn_validate(struct nd_pfn *nd_pfn)
 }
 EXPORT_SYMBOL(nd_pfn_validate);
 
-int nd_pfn_probe(struct nd_namespace_common *ndns, void *drvdata)
+int nd_pfn_probe(struct device *dev, struct nd_namespace_common *ndns,
+		void *drvdata)
 {
 	int rc;
-	struct device *dev;
 	struct nd_pfn *nd_pfn;
+	struct device *pfn_dev;
 	struct nd_pfn_sb *pfn_sb;
 	struct nd_region *nd_region = to_nd_region(ndns->dev.parent);
 
@@ -422,24 +423,22 @@ int nd_pfn_probe(struct nd_namespace_common *ndns, void *drvdata)
 		return -ENODEV;
 
 	nvdimm_bus_lock(&ndns->dev);
-	dev = __nd_pfn_create(nd_region, ndns);
+	pfn_dev = __nd_pfn_create(nd_region, ndns);
 	nvdimm_bus_unlock(&ndns->dev);
-	if (!dev)
+	if (!pfn_dev)
 		return -ENOMEM;
-	dev_set_drvdata(dev, drvdata);
-	pfn_sb = kzalloc(sizeof(*pfn_sb), GFP_KERNEL);
-	nd_pfn = to_nd_pfn(dev);
+	dev_set_drvdata(pfn_dev, drvdata);
+	pfn_sb = devm_kzalloc(dev, sizeof(*pfn_sb), GFP_KERNEL);
+	nd_pfn = to_nd_pfn(pfn_dev);
 	nd_pfn->pfn_sb = pfn_sb;
 	rc = nd_pfn_validate(nd_pfn);
-	nd_pfn->pfn_sb = NULL;
-	kfree(pfn_sb);
-	dev_dbg(&ndns->dev, "%s: pfn: %s\n", __func__,
-			rc == 0 ? dev_name(dev) : "<none>");
+	dev_dbg(dev, "%s: pfn: %s\n", __func__,
+			rc == 0 ? dev_name(pfn_dev) : "<none>");
 	if (rc < 0) {
-		__nd_detach_ndns(dev, &nd_pfn->ndns);
-		put_device(dev);
+		__nd_detach_ndns(pfn_dev, &nd_pfn->ndns);
+		put_device(pfn_dev);
 	} else
-		__nd_device_register(&nd_pfn->dev);
+		__nd_device_register(pfn_dev);
 
 	return rc;
 }
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index 2b51d4d..4d8f5c5 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -330,18 +330,19 @@ static int pmem_rw_bytes(struct nd_namespace_common *ndns,
 
 static int nd_pfn_init(struct nd_pfn *nd_pfn)
 {
-	struct nd_pfn_sb *pfn_sb = kzalloc(sizeof(*pfn_sb), GFP_KERNEL);
 	struct pmem_device *pmem = dev_get_drvdata(&nd_pfn->dev);
 	struct nd_namespace_common *ndns = nd_pfn->ndns;
 	u32 start_pad = 0, end_trunc = 0;
 	resource_size_t start, size;
 	struct nd_namespace_io *nsio;
 	struct nd_region *nd_region;
+	struct nd_pfn_sb *pfn_sb;
 	unsigned long npfns;
 	phys_addr_t offset;
 	u64 checksum;
 	int rc;
 
+	pfn_sb = devm_kzalloc(&nd_pfn->dev, sizeof(*pfn_sb), GFP_KERNEL);
 	if (!pfn_sb)
 		return -ENOMEM;
 
@@ -357,7 +358,7 @@ static int nd_pfn_init(struct nd_pfn *nd_pfn)
 		dev_info(&nd_pfn->dev,
 				"%s is read-only, unable to init metadata\n",
 				dev_name(&nd_region->dev));
-		goto err;
+		return -ENXIO;
 	}
 
 	memset(pfn_sb, 0, sizeof(*pfn_sb));
@@ -402,12 +403,12 @@ static int nd_pfn_init(struct nd_pfn *nd_pfn)
 	else if (nd_pfn->mode == PFN_MODE_RAM)
 		offset = ALIGN(start + SZ_8K, nd_pfn->align) - start;
 	else
-		goto err;
+		return -ENXIO;
 
 	if (offset + start_pad + end_trunc >= pmem->size) {
 		dev_err(&nd_pfn->dev, "%s unable to satisfy requested alignment\n",
 				dev_name(&ndns->dev));
-		goto err;
+		return -ENXIO;
 	}
 
 	npfns = (pmem->size - offset - start_pad - end_trunc) / SZ_4K;
@@ -424,30 +425,16 @@ static int nd_pfn_init(struct nd_pfn *nd_pfn)
 	checksum = nd_sb_checksum((struct nd_gen_sb *) pfn_sb);
 	pfn_sb->checksum = cpu_to_le64(checksum);
 
-	rc = nvdimm_write_bytes(ndns, SZ_4K, pfn_sb, sizeof(*pfn_sb));
-	if (rc)
-		goto err;
-
-	return 0;
- err:
-	nd_pfn->pfn_sb = NULL;
-	kfree(pfn_sb);
-	return -ENXIO;
+	return nvdimm_write_bytes(ndns, SZ_4K, pfn_sb, sizeof(*pfn_sb));
 }
 
-static int nvdimm_namespace_detach_pfn(struct nd_pfn *nd_pfn)
+static void nvdimm_namespace_detach_pfn(struct nd_pfn *nd_pfn)
 {
 	struct pmem_device *pmem;
 
 	/* free pmem disk */
 	pmem = dev_get_drvdata(&nd_pfn->dev);
 	pmem_detach_disk(pmem);
-
-	/* release nd_pfn resources */
-	kfree(nd_pfn->pfn_sb);
-	nd_pfn->pfn_sb = NULL;
-
-	return 0;
 }
 
 /*
@@ -587,7 +574,8 @@ static int nd_pmem_probe(struct device *dev)
 	if (is_nd_pfn(dev))
 		return nvdimm_namespace_attach_pfn(ndns);
 
-	if (nd_btt_probe(ndns, pmem) == 0 || nd_pfn_probe(ndns, pmem) == 0) {
+	if (nd_btt_probe(ndns, pmem) == 0
+			|| nd_pfn_probe(dev, ndns, pmem) == 0) {
 		/*
 		 * We'll come back as either btt-pmem, or pfn-pmem, so
 		 * drop the queue allocation for now.
-- 
cgit v0.10.2


From e32bc729a3a486e20443db3379ecf67240b20616 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Thu, 17 Mar 2016 18:23:09 -0700
Subject: libnvdimm, btt, convert nd_btt_probe() to devm

Pass the device performing the probe so we can use a devm allocation for
the btt superblock.

Cc: Vishal Verma <vishal.l.verma@intel.com>
Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>

diff --git a/drivers/nvdimm/blk.c b/drivers/nvdimm/blk.c
index 2464939..c8215dc 100644
--- a/drivers/nvdimm/blk.c
+++ b/drivers/nvdimm/blk.c
@@ -314,7 +314,7 @@ static int nd_blk_probe(struct device *dev)
 	ndns->rw_bytes = nd_blk_rw_bytes;
 	if (is_nd_btt(dev))
 		rc = nvdimm_namespace_attach_btt(ndns);
-	else if (nd_btt_probe(ndns, blk_dev) == 0) {
+	else if (nd_btt_probe(dev, ndns, blk_dev) == 0) {
 		/* we'll come back as btt-blk */
 		rc = -ENXIO;
 	} else
diff --git a/drivers/nvdimm/btt.c b/drivers/nvdimm/btt.c
index 676c31a..cc9fafe 100644
--- a/drivers/nvdimm/btt.c
+++ b/drivers/nvdimm/btt.c
@@ -1306,7 +1306,7 @@ static struct btt *btt_init(struct nd_btt *nd_btt, unsigned long long rawsize,
 	struct btt *btt;
 	struct device *dev = &nd_btt->dev;
 
-	btt = kzalloc(sizeof(struct btt), GFP_KERNEL);
+	btt = devm_kzalloc(dev, sizeof(struct btt), GFP_KERNEL);
 	if (!btt)
 		return NULL;
 
@@ -1321,13 +1321,13 @@ static struct btt *btt_init(struct nd_btt *nd_btt, unsigned long long rawsize,
 	ret = discover_arenas(btt);
 	if (ret) {
 		dev_err(dev, "init: error in arena_discover: %d\n", ret);
-		goto out_free;
+		return NULL;
 	}
 
 	if (btt->init_state != INIT_READY && nd_region->ro) {
 		dev_info(dev, "%s is read-only, unable to init btt metadata\n",
 				dev_name(&nd_region->dev));
-		goto out_free;
+		return NULL;
 	} else if (btt->init_state != INIT_READY) {
 		btt->num_arenas = (rawsize / ARENA_MAX_SIZE) +
 			((rawsize % ARENA_MAX_SIZE) ? 1 : 0);
@@ -1337,29 +1337,25 @@ static struct btt *btt_init(struct nd_btt *nd_btt, unsigned long long rawsize,
 		ret = create_arenas(btt);
 		if (ret) {
 			dev_info(dev, "init: create_arenas: %d\n", ret);
-			goto out_free;
+			return NULL;
 		}
 
 		ret = btt_meta_init(btt);
 		if (ret) {
 			dev_err(dev, "init: error in meta_init: %d\n", ret);
-			goto out_free;
+			return NULL;
 		}
 	}
 
 	ret = btt_blk_init(btt);
 	if (ret) {
 		dev_err(dev, "init: error in blk_init: %d\n", ret);
-		goto out_free;
+		return NULL;
 	}
 
 	btt_debugfs_init(btt);
 
 	return btt;
-
- out_free:
-	kfree(btt);
-	return NULL;
 }
 
 /**
@@ -1377,7 +1373,6 @@ static void btt_fini(struct btt *btt)
 		btt_blk_cleanup(btt);
 		free_arenas(btt);
 		debugfs_remove_recursive(btt->debugfs_dir);
-		kfree(btt);
 	}
 }
 
diff --git a/drivers/nvdimm/btt_devs.c b/drivers/nvdimm/btt_devs.c
index cb47751..1886171 100644
--- a/drivers/nvdimm/btt_devs.c
+++ b/drivers/nvdimm/btt_devs.c
@@ -273,10 +273,11 @@ static int __nd_btt_probe(struct nd_btt *nd_btt,
 	return 0;
 }
 
-int nd_btt_probe(struct nd_namespace_common *ndns, void *drvdata)
+int nd_btt_probe(struct device *dev, struct nd_namespace_common *ndns,
+		void *drvdata)
 {
 	int rc;
-	struct device *dev;
+	struct device *btt_dev;
 	struct btt_sb *btt_sb;
 	struct nd_region *nd_region = to_nd_region(ndns->dev.parent);
 
@@ -284,21 +285,20 @@ int nd_btt_probe(struct nd_namespace_common *ndns, void *drvdata)
 		return -ENODEV;
 
 	nvdimm_bus_lock(&ndns->dev);
-	dev = __nd_btt_create(nd_region, 0, NULL, ndns);
+	btt_dev = __nd_btt_create(nd_region, 0, NULL, ndns);
 	nvdimm_bus_unlock(&ndns->dev);
-	if (!dev)
+	if (!btt_dev)
 		return -ENOMEM;
-	dev_set_drvdata(dev, drvdata);
-	btt_sb = kzalloc(sizeof(*btt_sb), GFP_KERNEL);
-	rc = __nd_btt_probe(to_nd_btt(dev), ndns, btt_sb);
-	kfree(btt_sb);
-	dev_dbg(&ndns->dev, "%s: btt: %s\n", __func__,
-			rc == 0 ? dev_name(dev) : "<none>");
+	dev_set_drvdata(btt_dev, drvdata);
+	btt_sb = devm_kzalloc(dev, sizeof(*btt_sb), GFP_KERNEL);
+	rc = __nd_btt_probe(to_nd_btt(btt_dev), ndns, btt_sb);
+	dev_dbg(dev, "%s: btt: %s\n", __func__,
+			rc == 0 ? dev_name(btt_dev) : "<none>");
 	if (rc < 0) {
-		struct nd_btt *nd_btt = to_nd_btt(dev);
+		struct nd_btt *nd_btt = to_nd_btt(btt_dev);
 
-		__nd_detach_ndns(dev, &nd_btt->ndns);
-		put_device(dev);
+		__nd_detach_ndns(btt_dev, &nd_btt->ndns);
+		put_device(btt_dev);
 	}
 
 	return rc;
diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h
index c831caa..0fb1489 100644
--- a/drivers/nvdimm/nd.h
+++ b/drivers/nvdimm/nd.h
@@ -197,11 +197,13 @@ struct nd_gen_sb {
 
 u64 nd_sb_checksum(struct nd_gen_sb *sb);
 #if IS_ENABLED(CONFIG_BTT)
-int nd_btt_probe(struct nd_namespace_common *ndns, void *drvdata);
+int nd_btt_probe(struct device *dev, struct nd_namespace_common *ndns,
+		void *drvdata);
 bool is_nd_btt(struct device *dev);
 struct device *nd_btt_create(struct nd_region *nd_region);
 #else
-static inline int nd_btt_probe(struct nd_namespace_common *ndns, void *drvdata)
+static inline int nd_btt_probe(struct device *dev,
+		struct nd_namespace_common *ndns, void *drvdata)
 {
 	return -ENODEV;
 }
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index 4d8f5c5..6fa39f5 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -574,7 +574,7 @@ static int nd_pmem_probe(struct device *dev)
 	if (is_nd_pfn(dev))
 		return nvdimm_namespace_attach_pfn(ndns);
 
-	if (nd_btt_probe(ndns, pmem) == 0
+	if (nd_btt_probe(dev, ndns, pmem) == 0
 			|| nd_pfn_probe(dev, ndns, pmem) == 0) {
 		/*
 		 * We'll come back as either btt-pmem, or pfn-pmem, so
-- 
cgit v0.10.2


From 9dec4892ca9afd6aad3c9c9e6c17480ecbd04440 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Fri, 22 Apr 2016 12:26:05 -0700
Subject: libnvdimm, btt: add btt startup debug

Report the reason for btt probe failures when debug is enabled.

Signed-off-by: Dan Williams <dan.j.williams@intel.com>

diff --git a/drivers/nvdimm/btt.c b/drivers/nvdimm/btt.c
index f068b65..af09d6c 100644
--- a/drivers/nvdimm/btt.c
+++ b/drivers/nvdimm/btt.c
@@ -1388,11 +1388,15 @@ int nvdimm_namespace_attach_btt(struct nd_namespace_common *ndns)
 	struct btt *btt;
 	size_t rawsize;
 
-	if (!nd_btt->uuid || !nd_btt->ndns || !nd_btt->lbasize)
+	if (!nd_btt->uuid || !nd_btt->ndns || !nd_btt->lbasize) {
+		dev_dbg(&nd_btt->dev, "incomplete btt configuration\n");
 		return -ENODEV;
+	}
 
 	rawsize = nvdimm_namespace_capacity(ndns) - SZ_4K;
 	if (rawsize < ARENA_MIN_SIZE) {
+		dev_dbg(&nd_btt->dev, "%s must be at least %ld bytes\n",
+				dev_name(&ndns->dev), ARENA_MIN_SIZE + SZ_4K);
 		return -ENXIO;
 	}
 	nd_region = to_nd_region(nd_btt->dev.parent);
-- 
cgit v0.10.2


From d29cee120eb890027c69f5fe7cce8bd6a663900a Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Thu, 17 Mar 2016 20:08:28 -0700
Subject: libnvdimm, blk: use devm_add_action to release bdev resources

Register a callback to clean up the request_queue and put the gendisk at
driver disable time.

Cc: Ross Zwisler <ross.zwisler@linux.intel.com>
Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>

diff --git a/drivers/nvdimm/blk.c b/drivers/nvdimm/blk.c
index c8215dc..27ff32a 100644
--- a/drivers/nvdimm/blk.c
+++ b/drivers/nvdimm/blk.c
@@ -22,8 +22,6 @@
 #include "nd.h"
 
 struct nd_blk_device {
-	struct request_queue *queue;
-	struct gendisk *disk;
 	struct nd_namespace_blk *nsblk;
 	struct nd_blk_region *ndbr;
 	size_t disk_size;
@@ -235,29 +233,47 @@ static const struct block_device_operations nd_blk_fops = {
 	.revalidate_disk = nvdimm_revalidate_disk,
 };
 
-static int nd_blk_attach_disk(struct nd_namespace_common *ndns,
-		struct nd_blk_device *blk_dev)
+static void nd_blk_release_queue(void *q)
+{
+	blk_cleanup_queue(q);
+}
+
+static void nd_blk_release_disk(void *disk)
+{
+	del_gendisk(disk);
+	put_disk(disk);
+}
+
+static int nd_blk_attach_disk(struct device *dev,
+		struct nd_namespace_common *ndns, struct nd_blk_device *blk_dev)
 {
 	resource_size_t available_disk_size;
+	struct request_queue *q;
 	struct gendisk *disk;
 	u64 internal_nlba;
 
 	internal_nlba = div_u64(blk_dev->disk_size, blk_dev->internal_lbasize);
 	available_disk_size = internal_nlba * blk_dev->sector_size;
 
-	blk_dev->queue = blk_alloc_queue(GFP_KERNEL);
-	if (!blk_dev->queue)
+	q = blk_alloc_queue(GFP_KERNEL);
+	if (!q)
+		return -ENOMEM;
+	if (devm_add_action(dev, nd_blk_release_queue, q)) {
+		blk_cleanup_queue(q);
 		return -ENOMEM;
+	}
 
-	blk_queue_make_request(blk_dev->queue, nd_blk_make_request);
-	blk_queue_max_hw_sectors(blk_dev->queue, UINT_MAX);
-	blk_queue_bounce_limit(blk_dev->queue, BLK_BOUNCE_ANY);
-	blk_queue_logical_block_size(blk_dev->queue, blk_dev->sector_size);
-	queue_flag_set_unlocked(QUEUE_FLAG_NONROT, blk_dev->queue);
+	blk_queue_make_request(q, nd_blk_make_request);
+	blk_queue_max_hw_sectors(q, UINT_MAX);
+	blk_queue_bounce_limit(q, BLK_BOUNCE_ANY);
+	blk_queue_logical_block_size(q, blk_dev->sector_size);
+	queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q);
 
-	disk = blk_dev->disk = alloc_disk(0);
-	if (!disk) {
-		blk_cleanup_queue(blk_dev->queue);
+	disk = alloc_disk(0);
+	if (!disk)
+		return -ENOMEM;
+	if (devm_add_action(dev, nd_blk_release_disk, disk)) {
+		put_disk(disk);
 		return -ENOMEM;
 	}
 
@@ -265,7 +281,7 @@ static int nd_blk_attach_disk(struct nd_namespace_common *ndns,
 	disk->first_minor	= 0;
 	disk->fops		= &nd_blk_fops;
 	disk->private_data	= blk_dev;
-	disk->queue		= blk_dev->queue;
+	disk->queue		= q;
 	disk->flags		= GENHD_FL_EXT_DEVT;
 	nvdimm_namespace_disk_name(ndns, disk->disk_name);
 	set_capacity(disk, 0);
@@ -274,12 +290,8 @@ static int nd_blk_attach_disk(struct nd_namespace_common *ndns,
 	if (nd_blk_meta_size(blk_dev)) {
 		int rc = nd_integrity_init(disk, nd_blk_meta_size(blk_dev));
 
-		if (rc) {
-			del_gendisk(disk);
-			put_disk(disk);
-			blk_cleanup_queue(blk_dev->queue);
+		if (rc)
 			return rc;
-		}
 	}
 
 	set_capacity(disk, available_disk_size >> SECTOR_SHIFT);
@@ -292,13 +304,12 @@ static int nd_blk_probe(struct device *dev)
 	struct nd_namespace_common *ndns;
 	struct nd_namespace_blk *nsblk;
 	struct nd_blk_device *blk_dev;
-	int rc;
 
 	ndns = nvdimm_namespace_common_probe(dev);
 	if (IS_ERR(ndns))
 		return PTR_ERR(ndns);
 
-	blk_dev = kzalloc(sizeof(*blk_dev), GFP_KERNEL);
+	blk_dev = devm_kzalloc(dev, sizeof(*blk_dev), GFP_KERNEL);
 	if (!blk_dev)
 		return -ENOMEM;
 
@@ -313,34 +324,18 @@ static int nd_blk_probe(struct device *dev)
 
 	ndns->rw_bytes = nd_blk_rw_bytes;
 	if (is_nd_btt(dev))
-		rc = nvdimm_namespace_attach_btt(ndns);
+		return nvdimm_namespace_attach_btt(ndns);
 	else if (nd_btt_probe(dev, ndns, blk_dev) == 0) {
 		/* we'll come back as btt-blk */
-		rc = -ENXIO;
+		return -ENXIO;
 	} else
-		rc = nd_blk_attach_disk(ndns, blk_dev);
-	if (rc)
-		kfree(blk_dev);
-	return rc;
-}
-
-static void nd_blk_detach_disk(struct nd_blk_device *blk_dev)
-{
-	del_gendisk(blk_dev->disk);
-	put_disk(blk_dev->disk);
-	blk_cleanup_queue(blk_dev->queue);
+		return nd_blk_attach_disk(dev, ndns, blk_dev);
 }
 
 static int nd_blk_remove(struct device *dev)
 {
-	struct nd_blk_device *blk_dev = dev_get_drvdata(dev);
-
 	if (is_nd_btt(dev))
 		nvdimm_namespace_detach_btt(to_nd_btt(dev));
-	else
-		nd_blk_detach_disk(blk_dev);
-	kfree(blk_dev);
-
 	return 0;
 }
 
-- 
cgit v0.10.2


From d44077a7cddce18fc8d83194bb4c83a0225f0f40 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Fri, 18 Mar 2016 23:45:45 -0700
Subject: libnvdimm, blk: use ->queuedata for driver private data

Save a pointer chase by storing the driver private data in the
request_queue rather than the gendisk.

Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>

diff --git a/drivers/nvdimm/blk.c b/drivers/nvdimm/blk.c
index 27ff32a..c8635b3 100644
--- a/drivers/nvdimm/blk.c
+++ b/drivers/nvdimm/blk.c
@@ -159,8 +159,6 @@ static int nd_blk_do_bvec(struct nd_blk_device *blk_dev,
 
 static blk_qc_t nd_blk_make_request(struct request_queue *q, struct bio *bio)
 {
-	struct block_device *bdev = bio->bi_bdev;
-	struct gendisk *disk = bdev->bd_disk;
 	struct bio_integrity_payload *bip;
 	struct nd_blk_device *blk_dev;
 	struct bvec_iter iter;
@@ -181,7 +179,7 @@ static blk_qc_t nd_blk_make_request(struct request_queue *q, struct bio *bio)
 	}
 
 	bip = bio_integrity(bio);
-	blk_dev = disk->private_data;
+	blk_dev = q->queuedata;
 	rw = bio_data_dir(bio);
 	do_acct = nd_iostat_start(bio, &start);
 	bio_for_each_segment(bvec, bio, iter) {
@@ -268,6 +266,7 @@ static int nd_blk_attach_disk(struct device *dev,
 	blk_queue_bounce_limit(q, BLK_BOUNCE_ANY);
 	blk_queue_logical_block_size(q, blk_dev->sector_size);
 	queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q);
+	q->queuedata = blk_dev;
 
 	disk = alloc_disk(0);
 	if (!disk)
@@ -280,7 +279,6 @@ static int nd_blk_attach_disk(struct device *dev,
 	disk->driverfs_dev	= &ndns->dev;
 	disk->first_minor	= 0;
 	disk->fops		= &nd_blk_fops;
-	disk->private_data	= blk_dev;
 	disk->queue		= q;
 	disk->flags		= GENHD_FL_EXT_DEVT;
 	nvdimm_namespace_disk_name(ndns, disk->disk_name);
-- 
cgit v0.10.2


From bd842b8ca7f207b99a5476a8174e62c29a2ff80e Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Fri, 18 Mar 2016 23:47:43 -0700
Subject: libnvdimm, pmem: use ->queuedata for driver private data

Save a pointer chase by storing the driver private data in the
request_queue rather than the gendisk.

Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>

diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index 6fa39f5..2238e3a 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -135,8 +135,7 @@ static blk_qc_t pmem_make_request(struct request_queue *q, struct bio *bio)
 	unsigned long start;
 	struct bio_vec bvec;
 	struct bvec_iter iter;
-	struct block_device *bdev = bio->bi_bdev;
-	struct pmem_device *pmem = bdev->bd_disk->private_data;
+	struct pmem_device *pmem = q->queuedata;
 
 	do_acct = nd_iostat_start(bio, &start);
 	bio_for_each_segment(bvec, bio, iter) {
@@ -161,7 +160,7 @@ static blk_qc_t pmem_make_request(struct request_queue *q, struct bio *bio)
 static int pmem_rw_page(struct block_device *bdev, sector_t sector,
 		       struct page *page, int rw)
 {
-	struct pmem_device *pmem = bdev->bd_disk->private_data;
+	struct pmem_device *pmem = bdev->bd_queue->queuedata;
 	int rc;
 
 	rc = pmem_do_bvec(pmem, page, PAGE_SIZE, 0, rw, sector);
@@ -183,7 +182,7 @@ static int pmem_rw_page(struct block_device *bdev, sector_t sector,
 static long pmem_direct_access(struct block_device *bdev, sector_t sector,
 		      void __pmem **kaddr, pfn_t *pfn)
 {
-	struct pmem_device *pmem = bdev->bd_disk->private_data;
+	struct pmem_device *pmem = bdev->bd_queue->queuedata;
 	resource_size_t offset = sector * 512 + pmem->data_offset;
 
 	*kaddr = pmem->virt_addr + offset;
@@ -267,6 +266,7 @@ static int pmem_attach_disk(struct device *dev,
 	blk_queue_max_hw_sectors(pmem->pmem_queue, UINT_MAX);
 	blk_queue_bounce_limit(pmem->pmem_queue, BLK_BOUNCE_ANY);
 	queue_flag_set_unlocked(QUEUE_FLAG_NONROT, pmem->pmem_queue);
+	pmem->pmem_queue->queuedata = pmem;
 
 	disk = alloc_disk_node(0, nid);
 	if (!disk) {
@@ -275,7 +275,6 @@ static int pmem_attach_disk(struct device *dev,
 	}
 
 	disk->fops		= &pmem_fops;
-	disk->private_data	= pmem;
 	disk->queue		= pmem->pmem_queue;
 	disk->flags		= GENHD_FL_EXT_DEVT;
 	nvdimm_namespace_disk_name(ndns, disk->disk_name);
-- 
cgit v0.10.2


From 8378af17a4021f01b3bed20c1bd19c3921c1f5ac Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Thu, 24 Mar 2016 18:06:07 -0700
Subject: libnvdimm, blk: quiet i/o error reporting

I/O error events have the potential to be high frequency, and a log
message for each event can swamp the system.  This message is also
redundant with upper layer error reporting.

Cc: Ross Zwisler <ross.zwisler@linux.intel.com>
Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>

diff --git a/drivers/nvdimm/blk.c b/drivers/nvdimm/blk.c
index c8635b3..26d0398 100644
--- a/drivers/nvdimm/blk.c
+++ b/drivers/nvdimm/blk.c
@@ -189,7 +189,7 @@ static blk_qc_t nd_blk_make_request(struct request_queue *q, struct bio *bio)
 		err = nd_blk_do_bvec(blk_dev, bip, bvec.bv_page, len,
 					bvec.bv_offset, rw, iter.bi_sector);
 		if (err) {
-			dev_info(&blk_dev->nsblk->common.dev,
+			dev_dbg(&blk_dev->nsblk->common.dev,
 					"io error in %s sector %lld, len %d,\n",
 					(rw == READ) ? "READ" : "WRITE",
 					(unsigned long long) iter.bi_sector, len);
-- 
cgit v0.10.2


From 9d90725ddca347450c4ab177ad680ed76063afd4 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Fri, 18 Mar 2016 11:27:36 -0700
Subject: libnvdimm, blk: move i/o infrastructure to nd_namespace_blk

Consolidate the information for issuing i/o to a blk-namespace, and
eliminate some pointer chasing.

Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>

diff --git a/drivers/nvdimm/blk.c b/drivers/nvdimm/blk.c
index 26d0398..4c14ecd 100644
--- a/drivers/nvdimm/blk.c
+++ b/drivers/nvdimm/blk.c
@@ -21,17 +21,19 @@
 #include <linux/sizes.h>
 #include "nd.h"
 
-struct nd_blk_device {
-	struct nd_namespace_blk *nsblk;
-	struct nd_blk_region *ndbr;
-	size_t disk_size;
-	u32 sector_size;
-	u32 internal_lbasize;
-};
+static u32 nsblk_meta_size(struct nd_namespace_blk *nsblk)
+{
+	return nsblk->lbasize - ((nsblk->lbasize >= 4096) ? 4096 : 512);
+}
 
-static u32 nd_blk_meta_size(struct nd_blk_device *blk_dev)
+static u32 nsblk_internal_lbasize(struct nd_namespace_blk *nsblk)
 {
-	return blk_dev->nsblk->lbasize - blk_dev->sector_size;
+	return roundup(nsblk->lbasize, INT_LBASIZE_ALIGNMENT);
+}
+
+static u32 nsblk_sector_size(struct nd_namespace_blk *nsblk)
+{
+	return nsblk->lbasize - nsblk_meta_size(nsblk);
 }
 
 static resource_size_t to_dev_offset(struct nd_namespace_blk *nsblk,
@@ -55,20 +57,29 @@ static resource_size_t to_dev_offset(struct nd_namespace_blk *nsblk,
 	return SIZE_MAX;
 }
 
+static struct nd_blk_region *to_ndbr(struct nd_namespace_blk *nsblk)
+{
+	struct nd_region *nd_region;
+	struct device *parent;
+
+	parent = nsblk->common.dev.parent;
+	nd_region = container_of(parent, struct nd_region, dev);
+	return container_of(nd_region, struct nd_blk_region, nd_region);
+}
+
 #ifdef CONFIG_BLK_DEV_INTEGRITY
-static int nd_blk_rw_integrity(struct nd_blk_device *blk_dev,
-				struct bio_integrity_payload *bip, u64 lba,
-				int rw)
+static int nd_blk_rw_integrity(struct nd_namespace_blk *nsblk,
+		struct bio_integrity_payload *bip, u64 lba, int rw)
 {
-	unsigned int len = nd_blk_meta_size(blk_dev);
+	struct nd_blk_region *ndbr = to_ndbr(nsblk);
+	unsigned int len = nsblk_meta_size(nsblk);
 	resource_size_t	dev_offset, ns_offset;
-	struct nd_namespace_blk *nsblk;
-	struct nd_blk_region *ndbr;
+	u32 internal_lbasize, sector_size;
 	int err = 0;
 
-	nsblk = blk_dev->nsblk;
-	ndbr = blk_dev->ndbr;
-	ns_offset = lba * blk_dev->internal_lbasize + blk_dev->sector_size;
+	internal_lbasize = nsblk_internal_lbasize(nsblk);
+	sector_size = nsblk_sector_size(nsblk);
+	ns_offset = lba * internal_lbasize + sector_size;
 	dev_offset = to_dev_offset(nsblk, ns_offset, len);
 	if (dev_offset == SIZE_MAX)
 		return -EIO;
@@ -102,25 +113,26 @@ static int nd_blk_rw_integrity(struct nd_blk_device *blk_dev,
 }
 
 #else /* CONFIG_BLK_DEV_INTEGRITY */
-static int nd_blk_rw_integrity(struct nd_blk_device *blk_dev,
-				struct bio_integrity_payload *bip, u64 lba,
-				int rw)
+static int nd_blk_rw_integrity(struct nd_namespace_blk *nsblk,
+		struct bio_integrity_payload *bip, u64 lba, int rw)
 {
 	return 0;
 }
 #endif
 
-static int nd_blk_do_bvec(struct nd_blk_device *blk_dev,
-			struct bio_integrity_payload *bip, struct page *page,
-			unsigned int len, unsigned int off, int rw,
-			sector_t sector)
+static int nsblk_do_bvec(struct nd_namespace_blk *nsblk,
+		struct bio_integrity_payload *bip, struct page *page,
+		unsigned int len, unsigned int off, int rw, sector_t sector)
 {
-	struct nd_blk_region *ndbr = blk_dev->ndbr;
+	struct nd_blk_region *ndbr = to_ndbr(nsblk);
 	resource_size_t	dev_offset, ns_offset;
+	u32 internal_lbasize, sector_size;
 	int err = 0;
 	void *iobuf;
 	u64 lba;
 
+	internal_lbasize = nsblk_internal_lbasize(nsblk);
+	sector_size = nsblk_sector_size(nsblk);
 	while (len) {
 		unsigned int cur_len;
 
@@ -130,11 +142,11 @@ static int nd_blk_do_bvec(struct nd_blk_device *blk_dev,
 		 * Block Window setup/move steps. the do_io routine is capable
 		 * of handling len <= PAGE_SIZE.
 		 */
-		cur_len = bip ? min(len, blk_dev->sector_size) : len;
+		cur_len = bip ? min(len, sector_size) : len;
 
-		lba = div_u64(sector << SECTOR_SHIFT, blk_dev->sector_size);
-		ns_offset = lba * blk_dev->internal_lbasize;
-		dev_offset = to_dev_offset(blk_dev->nsblk, ns_offset, cur_len);
+		lba = div_u64(sector << SECTOR_SHIFT, sector_size);
+		ns_offset = lba * internal_lbasize;
+		dev_offset = to_dev_offset(nsblk, ns_offset, cur_len);
 		if (dev_offset == SIZE_MAX)
 			return -EIO;
 
@@ -145,13 +157,13 @@ static int nd_blk_do_bvec(struct nd_blk_device *blk_dev,
 			return err;
 
 		if (bip) {
-			err = nd_blk_rw_integrity(blk_dev, bip, lba, rw);
+			err = nd_blk_rw_integrity(nsblk, bip, lba, rw);
 			if (err)
 				return err;
 		}
 		len -= cur_len;
 		off += cur_len;
-		sector += blk_dev->sector_size >> SECTOR_SHIFT;
+		sector += sector_size >> SECTOR_SHIFT;
 	}
 
 	return err;
@@ -160,7 +172,7 @@ static int nd_blk_do_bvec(struct nd_blk_device *blk_dev,
 static blk_qc_t nd_blk_make_request(struct request_queue *q, struct bio *bio)
 {
 	struct bio_integrity_payload *bip;
-	struct nd_blk_device *blk_dev;
+	struct nd_namespace_blk *nsblk;
 	struct bvec_iter iter;
 	unsigned long start;
 	struct bio_vec bvec;
@@ -179,17 +191,17 @@ static blk_qc_t nd_blk_make_request(struct request_queue *q, struct bio *bio)
 	}
 
 	bip = bio_integrity(bio);
-	blk_dev = q->queuedata;
+	nsblk = q->queuedata;
 	rw = bio_data_dir(bio);
 	do_acct = nd_iostat_start(bio, &start);
 	bio_for_each_segment(bvec, bio, iter) {
 		unsigned int len = bvec.bv_len;
 
 		BUG_ON(len > PAGE_SIZE);
-		err = nd_blk_do_bvec(blk_dev, bip, bvec.bv_page, len,
-					bvec.bv_offset, rw, iter.bi_sector);
+		err = nsblk_do_bvec(nsblk, bip, bvec.bv_page, len,
+				bvec.bv_offset, rw, iter.bi_sector);
 		if (err) {
-			dev_dbg(&blk_dev->nsblk->common.dev,
+			dev_dbg(&nsblk->common.dev,
 					"io error in %s sector %lld, len %d,\n",
 					(rw == READ) ? "READ" : "WRITE",
 					(unsigned long long) iter.bi_sector, len);
@@ -205,17 +217,16 @@ static blk_qc_t nd_blk_make_request(struct request_queue *q, struct bio *bio)
 	return BLK_QC_T_NONE;
 }
 
-static int nd_blk_rw_bytes(struct nd_namespace_common *ndns,
+static int nsblk_rw_bytes(struct nd_namespace_common *ndns,
 		resource_size_t offset, void *iobuf, size_t n, int rw)
 {
-	struct nd_blk_device *blk_dev = dev_get_drvdata(ndns->claim);
-	struct nd_namespace_blk *nsblk = blk_dev->nsblk;
-	struct nd_blk_region *ndbr = blk_dev->ndbr;
+	struct nd_namespace_blk *nsblk = to_nd_namespace_blk(&ndns->dev);
+	struct nd_blk_region *ndbr = to_ndbr(nsblk);
 	resource_size_t	dev_offset;
 
 	dev_offset = to_dev_offset(nsblk, offset, n);
 
-	if (unlikely(offset + n > blk_dev->disk_size)) {
+	if (unlikely(offset + n > nsblk->size)) {
 		dev_WARN_ONCE(&ndns->dev, 1, "request out of range\n");
 		return -EFAULT;
 	}
@@ -242,16 +253,16 @@ static void nd_blk_release_disk(void *disk)
 	put_disk(disk);
 }
 
-static int nd_blk_attach_disk(struct device *dev,
-		struct nd_namespace_common *ndns, struct nd_blk_device *blk_dev)
+static int nsblk_attach_disk(struct nd_namespace_blk *nsblk)
 {
+	struct device *dev = &nsblk->common.dev;
 	resource_size_t available_disk_size;
 	struct request_queue *q;
 	struct gendisk *disk;
 	u64 internal_nlba;
 
-	internal_nlba = div_u64(blk_dev->disk_size, blk_dev->internal_lbasize);
-	available_disk_size = internal_nlba * blk_dev->sector_size;
+	internal_nlba = div_u64(nsblk->size, nsblk_internal_lbasize(nsblk));
+	available_disk_size = internal_nlba * nsblk_sector_size(nsblk);
 
 	q = blk_alloc_queue(GFP_KERNEL);
 	if (!q)
@@ -264,9 +275,9 @@ static int nd_blk_attach_disk(struct device *dev,
 	blk_queue_make_request(q, nd_blk_make_request);
 	blk_queue_max_hw_sectors(q, UINT_MAX);
 	blk_queue_bounce_limit(q, BLK_BOUNCE_ANY);
-	blk_queue_logical_block_size(q, blk_dev->sector_size);
+	blk_queue_logical_block_size(q, nsblk_sector_size(nsblk));
 	queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q);
-	q->queuedata = blk_dev;
+	q->queuedata = nsblk;
 
 	disk = alloc_disk(0);
 	if (!disk)
@@ -276,17 +287,17 @@ static int nd_blk_attach_disk(struct device *dev,
 		return -ENOMEM;
 	}
 
-	disk->driverfs_dev	= &ndns->dev;
+	disk->driverfs_dev	= dev;
 	disk->first_minor	= 0;
 	disk->fops		= &nd_blk_fops;
 	disk->queue		= q;
 	disk->flags		= GENHD_FL_EXT_DEVT;
-	nvdimm_namespace_disk_name(ndns, disk->disk_name);
+	nvdimm_namespace_disk_name(&nsblk->common, disk->disk_name);
 	set_capacity(disk, 0);
 	add_disk(disk);
 
-	if (nd_blk_meta_size(blk_dev)) {
-		int rc = nd_integrity_init(disk, nd_blk_meta_size(blk_dev));
+	if (nsblk_meta_size(nsblk)) {
+		int rc = nd_integrity_init(disk, nsblk_meta_size(nsblk));
 
 		if (rc)
 			return rc;
@@ -301,33 +312,23 @@ static int nd_blk_probe(struct device *dev)
 {
 	struct nd_namespace_common *ndns;
 	struct nd_namespace_blk *nsblk;
-	struct nd_blk_device *blk_dev;
 
 	ndns = nvdimm_namespace_common_probe(dev);
 	if (IS_ERR(ndns))
 		return PTR_ERR(ndns);
 
-	blk_dev = devm_kzalloc(dev, sizeof(*blk_dev), GFP_KERNEL);
-	if (!blk_dev)
-		return -ENOMEM;
-
 	nsblk = to_nd_namespace_blk(&ndns->dev);
-	blk_dev->disk_size = nvdimm_namespace_capacity(ndns);
-	blk_dev->ndbr = to_nd_blk_region(dev->parent);
-	blk_dev->nsblk = to_nd_namespace_blk(&ndns->dev);
-	blk_dev->internal_lbasize = roundup(nsblk->lbasize,
-						INT_LBASIZE_ALIGNMENT);
-	blk_dev->sector_size = ((nsblk->lbasize >= 4096) ? 4096 : 512);
-	dev_set_drvdata(dev, blk_dev);
-
-	ndns->rw_bytes = nd_blk_rw_bytes;
+	nsblk->size = nvdimm_namespace_capacity(ndns);
+	dev_set_drvdata(dev, nsblk);
+
+	ndns->rw_bytes = nsblk_rw_bytes;
 	if (is_nd_btt(dev))
 		return nvdimm_namespace_attach_btt(ndns);
-	else if (nd_btt_probe(dev, ndns, blk_dev) == 0) {
+	else if (nd_btt_probe(dev, ndns, nsblk) == 0) {
 		/* we'll come back as btt-blk */
 		return -ENXIO;
 	} else
-		return nd_blk_attach_disk(dev, ndns, blk_dev);
+		return nsblk_attach_disk(nsblk);
 }
 
 static int nd_blk_remove(struct device *dev)
diff --git a/include/linux/nd.h b/include/linux/nd.h
index 5489ab7..5ea4aec 100644
--- a/include/linux/nd.h
+++ b/include/linux/nd.h
@@ -82,6 +82,7 @@ struct nd_namespace_pmem {
  * @uuid: namespace name supplied in the dimm label
  * @id: ida allocated id
  * @lbasize: blk namespaces have a native sector size when btt not present
+ * @size: sum of all the resource ranges allocated to this namespace
  * @num_resources: number of dpa extents to claim
  * @res: discontiguous dpa extents for given dimm
  */
@@ -91,6 +92,7 @@ struct nd_namespace_blk {
 	u8 *uuid;
 	int id;
 	unsigned long lbasize;
+	resource_size_t size;
 	int num_resources;
 	struct resource **res;
 };
-- 
cgit v0.10.2


From 030b99e39cad33b104474fbe688e0eb23d8209b4 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Thu, 17 Mar 2016 20:24:31 -0700
Subject: libnvdimm, pmem: use devm_add_action to release bdev resources

Register a callback to clean up the request_queue and put the gendisk at
driver disable time.

Cc: Ross Zwisler <ross.zwisler@linux.intel.com>
Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>

diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index 2238e3a..d936def 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -198,6 +198,17 @@ static const struct block_device_operations pmem_fops = {
 	.revalidate_disk =	nvdimm_revalidate_disk,
 };
 
+static void pmem_release_queue(void *q)
+{
+	blk_cleanup_queue(q);
+}
+
+void pmem_release_disk(void *disk)
+{
+	del_gendisk(disk);
+	put_disk(disk);
+}
+
 static struct pmem_device *pmem_alloc(struct device *dev,
 		struct resource *res, int id)
 {
@@ -234,25 +245,22 @@ static struct pmem_device *pmem_alloc(struct device *dev,
 				pmem->phys_addr, pmem->size,
 				ARCH_MEMREMAP_PMEM);
 
-	if (IS_ERR(pmem->virt_addr)) {
+	/*
+	 * At release time the queue must be dead before
+	 * devm_memremap_pages is unwound
+	 */
+	if (devm_add_action(dev, pmem_release_queue, q)) {
 		blk_cleanup_queue(q);
-		return (void __force *) pmem->virt_addr;
+		return ERR_PTR(-ENOMEM);
 	}
 
+	if (IS_ERR(pmem->virt_addr))
+		return (void __force *) pmem->virt_addr;
+
 	pmem->pmem_queue = q;
 	return pmem;
 }
 
-static void pmem_detach_disk(struct pmem_device *pmem)
-{
-	if (!pmem->pmem_disk)
-		return;
-
-	del_gendisk(pmem->pmem_disk);
-	put_disk(pmem->pmem_disk);
-	blk_cleanup_queue(pmem->pmem_queue);
-}
-
 static int pmem_attach_disk(struct device *dev,
 		struct nd_namespace_common *ndns, struct pmem_device *pmem)
 {
@@ -269,8 +277,10 @@ static int pmem_attach_disk(struct device *dev,
 	pmem->pmem_queue->queuedata = pmem;
 
 	disk = alloc_disk_node(0, nid);
-	if (!disk) {
-		blk_cleanup_queue(pmem->pmem_queue);
+	if (!disk)
+		return -ENOMEM;
+	if (devm_add_action(dev, pmem_release_disk, disk)) {
+		put_disk(disk);
 		return -ENOMEM;
 	}
 
@@ -427,15 +437,6 @@ static int nd_pfn_init(struct nd_pfn *nd_pfn)
 	return nvdimm_write_bytes(ndns, SZ_4K, pfn_sb, sizeof(*pfn_sb));
 }
 
-static void nvdimm_namespace_detach_pfn(struct nd_pfn *nd_pfn)
-{
-	struct pmem_device *pmem;
-
-	/* free pmem disk */
-	pmem = dev_get_drvdata(&nd_pfn->dev);
-	pmem_detach_disk(pmem);
-}
-
 /*
  * We hotplug memory at section granularity, pad the reserved area from
  * the previous section base to the namespace base address.
@@ -458,7 +459,6 @@ static unsigned long init_altmap_reserve(resource_size_t base)
 
 static int __nvdimm_namespace_attach_pfn(struct nd_pfn *nd_pfn)
 {
-	int rc;
 	struct resource res;
 	struct request_queue *q;
 	struct pmem_device *pmem;
@@ -495,35 +495,33 @@ static int __nvdimm_namespace_attach_pfn(struct nd_pfn *nd_pfn)
 		altmap = & __altmap;
 		altmap->free = PHYS_PFN(pmem->data_offset - SZ_8K);
 		altmap->alloc = 0;
-	} else {
-		rc = -ENXIO;
-		goto err;
-	}
+	} else
+		return -ENXIO;
 
 	/* establish pfn range for lookup, and switch to direct map */
 	q = pmem->pmem_queue;
 	memcpy(&res, &nsio->res, sizeof(res));
 	res.start += start_pad;
 	res.end -= end_trunc;
+	devm_remove_action(dev, pmem_release_queue, q);
 	devm_memunmap(dev, (void __force *) pmem->virt_addr);
 	pmem->virt_addr = (void __pmem *) devm_memremap_pages(dev, &res,
 			&q->q_usage_counter, altmap);
 	pmem->pfn_flags |= PFN_MAP;
-	if (IS_ERR(pmem->virt_addr)) {
-		rc = PTR_ERR(pmem->virt_addr);
-		goto err;
+
+	/*
+	 * At release time the queue must be dead before
+	 * devm_memremap_pages is unwound
+	 */
+	if (devm_add_action(dev, pmem_release_queue, q)) {
+		blk_cleanup_queue(q);
+		return -ENOMEM;
 	}
+	if (IS_ERR(pmem->virt_addr))
+		return PTR_ERR(pmem->virt_addr);
 
 	/* attach pmem disk in "pfn-mode" */
-	rc = pmem_attach_disk(dev, ndns, pmem);
-	if (rc)
-		goto err;
-
-	return rc;
- err:
-	nvdimm_namespace_detach_pfn(nd_pfn);
-	return rc;
-
+	return pmem_attach_disk(dev, ndns, pmem);
 }
 
 static int nvdimm_namespace_attach_pfn(struct nd_namespace_common *ndns)
@@ -565,8 +563,8 @@ static int nd_pmem_probe(struct device *dev)
 
 	if (is_nd_btt(dev)) {
 		/* btt allocates its own request_queue */
+		devm_remove_action(dev, pmem_release_queue, pmem->pmem_queue);
 		blk_cleanup_queue(pmem->pmem_queue);
-		pmem->pmem_queue = NULL;
 		return nvdimm_namespace_attach_btt(ndns);
 	}
 
@@ -579,7 +577,6 @@ static int nd_pmem_probe(struct device *dev)
 		 * We'll come back as either btt-pmem, or pfn-pmem, so
 		 * drop the queue allocation for now.
 		 */
-		blk_cleanup_queue(pmem->pmem_queue);
 		return -ENXIO;
 	}
 
@@ -588,15 +585,8 @@ static int nd_pmem_probe(struct device *dev)
 
 static int nd_pmem_remove(struct device *dev)
 {
-	struct pmem_device *pmem = dev_get_drvdata(dev);
-
 	if (is_nd_btt(dev))
 		nvdimm_namespace_detach_btt(to_nd_btt(dev));
-	else if (is_nd_pfn(dev))
-		nvdimm_namespace_detach_pfn(to_nd_pfn(dev));
-	else
-		pmem_detach_disk(pmem);
-
 	return 0;
 }
 
-- 
cgit v0.10.2


From 947df02d255a6a81a3832e831c5ca02078cfd529 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Mon, 21 Mar 2016 22:28:40 -0700
Subject: libnvdimm, pmem: clean up resource print / request

The leading '0x' in front of %pa is redundant; we can also just use %pR
to simplify the print statement.  The request parameters can be taken
directly from the resource as well.

Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>

diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index d936def..67d48e2e 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -224,10 +224,9 @@ static struct pmem_device *pmem_alloc(struct device *dev,
 	if (!arch_has_wmb_pmem())
 		dev_warn(dev, "unable to guarantee persistence of writes\n");
 
-	if (!devm_request_mem_region(dev, pmem->phys_addr, pmem->size,
-			dev_name(dev))) {
-		dev_warn(dev, "could not reserve region [0x%pa:0x%zx]\n",
-				&pmem->phys_addr, pmem->size);
+	if (!devm_request_mem_region(dev, res->start, resource_size(res),
+				dev_name(dev))) {
+		dev_warn(dev, "could not reserve region %pR\n", res);
 		return ERR_PTR(-EBUSY);
 	}
 
-- 
cgit v0.10.2


From 200c79da824c978fcf6eec1dc9c0a1e521133267 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Tue, 22 Mar 2016 00:22:16 -0700
Subject: libnvdimm, pmem, pfn: make pmem_rw_bytes generic and refactor pfn
 setup

In preparation for providing an alternative (to block device) access
mechanism to persistent memory, convert pmem_rw_bytes() to
nsio_rw_bytes().  This allows ->rw_bytes() functionality without
requiring a 'struct pmem_device' to be instantiated.

In other words, when ->rw_bytes() is in use, i/o is driven through
'struct nd_namespace_io'; otherwise it is driven through 'struct
pmem_device' and the block layer.  This consolidates the disjoint calls
to devm_exit_badblocks() and devm_memunmap() into a common
devm_nsio_disable() and cleans up the init path to use a unified
pmem_attach_disk() implementation.

Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>

diff --git a/drivers/nvdimm/blk.c b/drivers/nvdimm/blk.c
index 4c14ecd..495e06d9 100644
--- a/drivers/nvdimm/blk.c
+++ b/drivers/nvdimm/blk.c
@@ -324,7 +324,7 @@ static int nd_blk_probe(struct device *dev)
 	ndns->rw_bytes = nsblk_rw_bytes;
 	if (is_nd_btt(dev))
 		return nvdimm_namespace_attach_btt(ndns);
-	else if (nd_btt_probe(dev, ndns, nsblk) == 0) {
+	else if (nd_btt_probe(dev, ndns) == 0) {
 		/* we'll come back as btt-blk */
 		return -ENXIO;
 	} else
diff --git a/drivers/nvdimm/btt_devs.c b/drivers/nvdimm/btt_devs.c
index 1886171..816d0da 100644
--- a/drivers/nvdimm/btt_devs.c
+++ b/drivers/nvdimm/btt_devs.c
@@ -273,8 +273,7 @@ static int __nd_btt_probe(struct nd_btt *nd_btt,
 	return 0;
 }
 
-int nd_btt_probe(struct device *dev, struct nd_namespace_common *ndns,
-		void *drvdata)
+int nd_btt_probe(struct device *dev, struct nd_namespace_common *ndns)
 {
 	int rc;
 	struct device *btt_dev;
@@ -289,7 +288,6 @@ int nd_btt_probe(struct device *dev, struct nd_namespace_common *ndns,
 	nvdimm_bus_unlock(&ndns->dev);
 	if (!btt_dev)
 		return -ENOMEM;
-	dev_set_drvdata(btt_dev, drvdata);
 	btt_sb = devm_kzalloc(dev, sizeof(*btt_sb), GFP_KERNEL);
 	rc = __nd_btt_probe(to_nd_btt(btt_dev), ndns, btt_sb);
 	dev_dbg(dev, "%s: btt: %s\n", __func__,
diff --git a/drivers/nvdimm/claim.c b/drivers/nvdimm/claim.c
index e8f03b0..6bbd0a3 100644
--- a/drivers/nvdimm/claim.c
+++ b/drivers/nvdimm/claim.c
@@ -12,6 +12,7 @@
  */
 #include <linux/device.h>
 #include <linux/sizes.h>
+#include <linux/pmem.h>
 #include "nd-core.h"
 #include "pfn.h"
 #include "btt.h"
@@ -199,3 +200,63 @@ u64 nd_sb_checksum(struct nd_gen_sb *nd_gen_sb)
 	return sum;
 }
 EXPORT_SYMBOL(nd_sb_checksum);
+
+static int nsio_rw_bytes(struct nd_namespace_common *ndns,
+		resource_size_t offset, void *buf, size_t size, int rw)
+{
+	struct nd_namespace_io *nsio = to_nd_namespace_io(&ndns->dev);
+
+	if (unlikely(offset + size > nsio->size)) {
+		dev_WARN_ONCE(&ndns->dev, 1, "request out of range\n");
+		return -EFAULT;
+	}
+
+	if (rw == READ) {
+		unsigned int sz_align = ALIGN(size + (offset & (512 - 1)), 512);
+
+		if (unlikely(is_bad_pmem(&nsio->bb, offset / 512, sz_align)))
+			return -EIO;
+		return memcpy_from_pmem(buf, nsio->addr + offset, size);
+	} else {
+		memcpy_to_pmem(nsio->addr + offset, buf, size);
+		wmb_pmem();
+	}
+
+	return 0;
+}
+
+int devm_nsio_enable(struct device *dev, struct nd_namespace_io *nsio)
+{
+	struct resource *res = &nsio->res;
+	struct nd_namespace_common *ndns = &nsio->common;
+
+	nsio->size = resource_size(res);
+	if (!devm_request_mem_region(dev, res->start, resource_size(res),
+				dev_name(dev))) {
+		dev_warn(dev, "could not reserve region %pR\n", res);
+		return -EBUSY;
+	}
+
+	ndns->rw_bytes = nsio_rw_bytes;
+	if (devm_init_badblocks(dev, &nsio->bb))
+		return -ENOMEM;
+	nvdimm_badblocks_populate(to_nd_region(ndns->dev.parent), &nsio->bb,
+			&nsio->res);
+
+	nsio->addr = devm_memremap(dev, res->start, resource_size(res),
+			ARCH_MEMREMAP_PMEM);
+	if (IS_ERR(nsio->addr))
+		return PTR_ERR(nsio->addr);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(devm_nsio_enable);
+
+void devm_nsio_disable(struct device *dev, struct nd_namespace_io *nsio)
+{
+	struct resource *res = &nsio->res;
+
+	devm_memunmap(dev, nsio->addr);
+	devm_exit_badblocks(dev, &nsio->bb);
+	devm_release_mem_region(dev, res->start, resource_size(res));
+}
+EXPORT_SYMBOL_GPL(devm_nsio_disable);
diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h
index 0fb1489..10e23fe 100644
--- a/drivers/nvdimm/nd.h
+++ b/drivers/nvdimm/nd.h
@@ -13,6 +13,7 @@
 #ifndef __ND_H__
 #define __ND_H__
 #include <linux/libnvdimm.h>
+#include <linux/badblocks.h>
 #include <linux/blkdev.h>
 #include <linux/device.h>
 #include <linux/mutex.h>
@@ -197,13 +198,12 @@ struct nd_gen_sb {
 
 u64 nd_sb_checksum(struct nd_gen_sb *sb);
 #if IS_ENABLED(CONFIG_BTT)
-int nd_btt_probe(struct device *dev, struct nd_namespace_common *ndns,
-		void *drvdata);
+int nd_btt_probe(struct device *dev, struct nd_namespace_common *ndns);
 bool is_nd_btt(struct device *dev);
 struct device *nd_btt_create(struct nd_region *nd_region);
 #else
 static inline int nd_btt_probe(struct device *dev,
-		struct nd_namespace_common *ndns, void *drvdata)
+		struct nd_namespace_common *ndns)
 {
 	return -ENODEV;
 }
@@ -221,14 +221,13 @@ static inline struct device *nd_btt_create(struct nd_region *nd_region)
 
 struct nd_pfn *to_nd_pfn(struct device *dev);
 #if IS_ENABLED(CONFIG_NVDIMM_PFN)
-int nd_pfn_probe(struct device *dev, struct nd_namespace_common *ndns,
-		void *drvdata);
+int nd_pfn_probe(struct device *dev, struct nd_namespace_common *ndns);
 bool is_nd_pfn(struct device *dev);
 struct device *nd_pfn_create(struct nd_region *nd_region);
 int nd_pfn_validate(struct nd_pfn *nd_pfn);
 #else
-static inline int nd_pfn_probe(struct device *dev, struct nd_namespace_common *ndns,
-		void *drvdata)
+static inline int nd_pfn_probe(struct device *dev,
+		struct nd_namespace_common *ndns)
 {
 	return -ENODEV;
 }
@@ -272,6 +271,20 @@ const char *nvdimm_namespace_disk_name(struct nd_namespace_common *ndns,
 		char *name);
 void nvdimm_badblocks_populate(struct nd_region *nd_region,
 		struct badblocks *bb, const struct resource *res);
+#if IS_ENABLED(CONFIG_ND_CLAIM)
+int devm_nsio_enable(struct device *dev, struct nd_namespace_io *nsio);
+void devm_nsio_disable(struct device *dev, struct nd_namespace_io *nsio);
+#else
+static inline int devm_nsio_enable(struct device *dev,
+		struct nd_namespace_io *nsio)
+{
+	return -ENXIO;
+}
+static inline void devm_nsio_disable(struct device *dev,
+		struct nd_namespace_io *nsio)
+{
+}
+#endif
 int nd_blk_region_init(struct nd_region *nd_region);
 void __nd_iostat_start(struct bio *bio, unsigned long *start);
 static inline bool nd_iostat_start(struct bio *bio, unsigned long *start)
@@ -285,6 +298,19 @@ static inline bool nd_iostat_start(struct bio *bio, unsigned long *start)
 	return true;
 }
 void nd_iostat_end(struct bio *bio, unsigned long start);
+static inline bool is_bad_pmem(struct badblocks *bb, sector_t sector,
+		unsigned int len)
+{
+	if (bb->count) {
+		sector_t first_bad;
+		int num_bad;
+
+		return !!badblocks_check(bb, sector, len / 512, &first_bad,
+				&num_bad);
+	}
+
+	return false;
+}
 resource_size_t nd_namespace_blk_validate(struct nd_namespace_blk *nsblk);
 const u8 *nd_dev_to_uuid(struct device *dev);
 bool pmem_should_map_pages(struct device *dev);
diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c
index 96aa549..9df081a 100644
--- a/drivers/nvdimm/pfn_devs.c
+++ b/drivers/nvdimm/pfn_devs.c
@@ -410,8 +410,7 @@ int nd_pfn_validate(struct nd_pfn *nd_pfn)
 }
 EXPORT_SYMBOL(nd_pfn_validate);
 
-int nd_pfn_probe(struct device *dev, struct nd_namespace_common *ndns,
-		void *drvdata)
+int nd_pfn_probe(struct device *dev, struct nd_namespace_common *ndns)
 {
 	int rc;
 	struct nd_pfn *nd_pfn;
@@ -427,7 +426,6 @@ int nd_pfn_probe(struct device *dev, struct nd_namespace_common *ndns,
 	nvdimm_bus_unlock(&ndns->dev);
 	if (!pfn_dev)
 		return -ENOMEM;
-	dev_set_drvdata(pfn_dev, drvdata);
 	pfn_sb = devm_kzalloc(dev, sizeof(*pfn_sb), GFP_KERNEL);
 	nd_pfn = to_nd_pfn(pfn_dev);
 	nd_pfn->pfn_sb = pfn_sb;
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index 67d48e2e..b5f81b0 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -49,19 +49,6 @@ struct pmem_device {
 	struct badblocks	bb;
 };
 
-static bool is_bad_pmem(struct badblocks *bb, sector_t sector, unsigned int len)
-{
-	if (bb->count) {
-		sector_t first_bad;
-		int num_bad;
-
-		return !!badblocks_check(bb, sector, len / 512, &first_bad,
-				&num_bad);
-	}
-
-	return false;
-}
-
 static void pmem_clear_poison(struct pmem_device *pmem, phys_addr_t offset,
 		unsigned int len)
 {
@@ -209,16 +196,40 @@ void pmem_release_disk(void *disk)
 	put_disk(disk);
 }
 
-static struct pmem_device *pmem_alloc(struct device *dev,
-		struct resource *res, int id)
+static struct vmem_altmap *nvdimm_setup_pfn(struct nd_pfn *nd_pfn,
+		struct resource *res, struct vmem_altmap *altmap);
+
+static int pmem_attach_disk(struct device *dev,
+		struct nd_namespace_common *ndns)
 {
+	struct nd_namespace_io *nsio = to_nd_namespace_io(&ndns->dev);
+	struct vmem_altmap __altmap, *altmap = NULL;
+	struct resource *res = &nsio->res;
+	struct nd_pfn *nd_pfn = NULL;
+	int nid = dev_to_node(dev);
+	struct nd_pfn_sb *pfn_sb;
 	struct pmem_device *pmem;
+	struct resource pfn_res;
 	struct request_queue *q;
+	struct gendisk *disk;
+	void *addr;
+
+	/* while nsio_rw_bytes is active, parse a pfn info block if present */
+	if (is_nd_pfn(dev)) {
+		nd_pfn = to_nd_pfn(dev);
+		altmap = nvdimm_setup_pfn(nd_pfn, &pfn_res, &__altmap);
+		if (IS_ERR(altmap))
+			return PTR_ERR(altmap);
+	}
+
+	/* we're attaching a block device, disable raw namespace access */
+	devm_nsio_disable(dev, nsio);
 
 	pmem = devm_kzalloc(dev, sizeof(*pmem), GFP_KERNEL);
 	if (!pmem)
-		return ERR_PTR(-ENOMEM);
+		return -ENOMEM;
 
+	dev_set_drvdata(dev, pmem);
 	pmem->phys_addr = res->start;
 	pmem->size = resource_size(res);
 	if (!arch_has_wmb_pmem())
@@ -227,22 +238,31 @@ static struct pmem_device *pmem_alloc(struct device *dev,
 	if (!devm_request_mem_region(dev, res->start, resource_size(res),
 				dev_name(dev))) {
 		dev_warn(dev, "could not reserve region %pR\n", res);
-		return ERR_PTR(-EBUSY);
+		return -EBUSY;
 	}
 
 	q = blk_alloc_queue_node(GFP_KERNEL, dev_to_node(dev));
 	if (!q)
-		return ERR_PTR(-ENOMEM);
+		return -ENOMEM;
+	pmem->pmem_queue = q;
 
 	pmem->pfn_flags = PFN_DEV;
-	if (pmem_should_map_pages(dev)) {
-		pmem->virt_addr = (void __pmem *) devm_memremap_pages(dev, res,
+	if (is_nd_pfn(dev)) {
+		addr = devm_memremap_pages(dev, &pfn_res, &q->q_usage_counter,
+				altmap);
+		pfn_sb = nd_pfn->pfn_sb;
+		pmem->data_offset = le64_to_cpu(pfn_sb->dataoff);
+		pmem->pfn_pad = resource_size(res) - resource_size(&pfn_res);
+		pmem->pfn_flags |= PFN_MAP;
+		res = &pfn_res; /* for badblocks populate */
+		res->start += pmem->data_offset;
+	} else if (pmem_should_map_pages(dev)) {
+		addr = devm_memremap_pages(dev, &nsio->res,
 				&q->q_usage_counter, NULL);
 		pmem->pfn_flags |= PFN_MAP;
 	} else
-		pmem->virt_addr = (void __pmem *) devm_memremap(dev,
-				pmem->phys_addr, pmem->size,
-				ARCH_MEMREMAP_PMEM);
+		addr = devm_memremap(dev, pmem->phys_addr,
+				pmem->size, ARCH_MEMREMAP_PMEM);
 
 	/*
 	 * At release time the queue must be dead before
@@ -250,23 +270,12 @@ static struct pmem_device *pmem_alloc(struct device *dev,
 	 */
 	if (devm_add_action(dev, pmem_release_queue, q)) {
 		blk_cleanup_queue(q);
-		return ERR_PTR(-ENOMEM);
+		return -ENOMEM;
 	}
 
-	if (IS_ERR(pmem->virt_addr))
-		return (void __force *) pmem->virt_addr;
-
-	pmem->pmem_queue = q;
-	return pmem;
-}
-
-static int pmem_attach_disk(struct device *dev,
-		struct nd_namespace_common *ndns, struct pmem_device *pmem)
-{
-	struct nd_namespace_io *nsio = to_nd_namespace_io(&ndns->dev);
-	int nid = dev_to_node(dev);
-	struct resource bb_res;
-	struct gendisk *disk;
+	if (IS_ERR(addr))
+		return PTR_ERR(addr);
+	pmem->virt_addr = (void __pmem *) addr;
 
 	blk_queue_make_request(pmem->pmem_queue, pmem_make_request);
 	blk_queue_physical_block_size(pmem->pmem_queue, PAGE_SIZE);
@@ -291,20 +300,9 @@ static int pmem_attach_disk(struct device *dev,
 	set_capacity(disk, (pmem->size - pmem->pfn_pad - pmem->data_offset)
 			/ 512);
 	pmem->pmem_disk = disk;
-	devm_exit_badblocks(dev, &pmem->bb);
 	if (devm_init_badblocks(dev, &pmem->bb))
 		return -ENOMEM;
-	bb_res.start = nsio->res.start + pmem->data_offset;
-	bb_res.end = nsio->res.end;
-	if (is_nd_pfn(dev)) {
-		struct nd_pfn *nd_pfn = to_nd_pfn(dev);
-		struct nd_pfn_sb *pfn_sb = nd_pfn->pfn_sb;
-
-		bb_res.start += __le32_to_cpu(pfn_sb->start_pad);
-		bb_res.end -= __le32_to_cpu(pfn_sb->end_trunc);
-	}
-	nvdimm_badblocks_populate(to_nd_region(dev->parent), &pmem->bb,
-			&bb_res);
+	nvdimm_badblocks_populate(to_nd_region(dev->parent), &pmem->bb, res);
 	disk->bb = &pmem->bb;
 	add_disk(disk);
 	revalidate_disk(disk);
@@ -312,33 +310,8 @@ static int pmem_attach_disk(struct device *dev,
 	return 0;
 }
 
-static int pmem_rw_bytes(struct nd_namespace_common *ndns,
-		resource_size_t offset, void *buf, size_t size, int rw)
-{
-	struct pmem_device *pmem = dev_get_drvdata(ndns->claim);
-
-	if (unlikely(offset + size > pmem->size)) {
-		dev_WARN_ONCE(&ndns->dev, 1, "request out of range\n");
-		return -EFAULT;
-	}
-
-	if (rw == READ) {
-		unsigned int sz_align = ALIGN(size + (offset & (512 - 1)), 512);
-
-		if (unlikely(is_bad_pmem(&pmem->bb, offset / 512, sz_align)))
-			return -EIO;
-		return memcpy_from_pmem(buf, pmem->virt_addr + offset, size);
-	} else {
-		memcpy_to_pmem(pmem->virt_addr + offset, buf, size);
-		wmb_pmem();
-	}
-
-	return 0;
-}
-
 static int nd_pfn_init(struct nd_pfn *nd_pfn)
 {
-	struct pmem_device *pmem = dev_get_drvdata(&nd_pfn->dev);
 	struct nd_namespace_common *ndns = nd_pfn->ndns;
 	u32 start_pad = 0, end_trunc = 0;
 	resource_size_t start, size;
@@ -404,7 +377,8 @@ static int nd_pfn_init(struct nd_pfn *nd_pfn)
 	 * ->direct_access() to those that are included in the memmap.
 	 */
 	start += start_pad;
-	npfns = (pmem->size - start_pad - end_trunc - SZ_8K) / SZ_4K;
+	size = resource_size(&nsio->res);
+	npfns = (size - start_pad - end_trunc - SZ_8K) / SZ_4K;
 	if (nd_pfn->mode == PFN_MODE_PMEM)
 		offset = ALIGN(start + SZ_8K + 64 * npfns, nd_pfn->align)
 			- start;
@@ -413,13 +387,13 @@ static int nd_pfn_init(struct nd_pfn *nd_pfn)
 	else
 		return -ENXIO;
 
-	if (offset + start_pad + end_trunc >= pmem->size) {
+	if (offset + start_pad + end_trunc >= size) {
 		dev_err(&nd_pfn->dev, "%s unable to satisfy requested alignment\n",
 				dev_name(&ndns->dev));
 		return -ENXIO;
 	}
 
-	npfns = (pmem->size - offset - start_pad - end_trunc) / SZ_4K;
+	npfns = (size - offset - start_pad - end_trunc) / SZ_4K;
 	pfn_sb->mode = cpu_to_le32(nd_pfn->mode);
 	pfn_sb->dataoff = cpu_to_le64(offset);
 	pfn_sb->npfns = cpu_to_le64(npfns);
@@ -456,17 +430,14 @@ static unsigned long init_altmap_reserve(resource_size_t base)
 	return reserve;
 }
 
-static int __nvdimm_namespace_attach_pfn(struct nd_pfn *nd_pfn)
+static struct vmem_altmap *__nvdimm_setup_pfn(struct nd_pfn *nd_pfn,
+		struct resource *res, struct vmem_altmap *altmap)
 {
-	struct resource res;
-	struct request_queue *q;
-	struct pmem_device *pmem;
-	struct vmem_altmap *altmap;
-	struct device *dev = &nd_pfn->dev;
 	struct nd_pfn_sb *pfn_sb = nd_pfn->pfn_sb;
-	struct nd_namespace_common *ndns = nd_pfn->ndns;
+	u64 offset = le64_to_cpu(pfn_sb->dataoff);
 	u32 start_pad = __le32_to_cpu(pfn_sb->start_pad);
 	u32 end_trunc = __le32_to_cpu(pfn_sb->end_trunc);
+	struct nd_namespace_common *ndns = nd_pfn->ndns;
 	struct nd_namespace_io *nsio = to_nd_namespace_io(&ndns->dev);
 	resource_size_t base = nsio->res.start + start_pad;
 	struct vmem_altmap __altmap = {
@@ -474,112 +445,75 @@ static int __nvdimm_namespace_attach_pfn(struct nd_pfn *nd_pfn)
 		.reserve = init_altmap_reserve(base),
 	};
 
-	pmem = dev_get_drvdata(dev);
-	pmem->data_offset = le64_to_cpu(pfn_sb->dataoff);
-	pmem->pfn_pad = start_pad + end_trunc;
+	memcpy(res, &nsio->res, sizeof(*res));
+	res->start += start_pad;
+	res->end -= end_trunc;
+
 	nd_pfn->mode = le32_to_cpu(nd_pfn->pfn_sb->mode);
 	if (nd_pfn->mode == PFN_MODE_RAM) {
-		if (pmem->data_offset < SZ_8K)
-			return -EINVAL;
+		if (offset < SZ_8K)
+			return ERR_PTR(-EINVAL);
 		nd_pfn->npfns = le64_to_cpu(pfn_sb->npfns);
 		altmap = NULL;
 	} else if (nd_pfn->mode == PFN_MODE_PMEM) {
-		nd_pfn->npfns = (pmem->size - pmem->pfn_pad - pmem->data_offset)
-			/ PAGE_SIZE;
+		nd_pfn->npfns = (resource_size(res) - offset) / PAGE_SIZE;
 		if (le64_to_cpu(nd_pfn->pfn_sb->npfns) > nd_pfn->npfns)
 			dev_info(&nd_pfn->dev,
 					"number of pfns truncated from %lld to %ld\n",
 					le64_to_cpu(nd_pfn->pfn_sb->npfns),
 					nd_pfn->npfns);
-		altmap = & __altmap;
-		altmap->free = PHYS_PFN(pmem->data_offset - SZ_8K);
+		memcpy(altmap, &__altmap, sizeof(*altmap));
+		altmap->free = PHYS_PFN(offset - SZ_8K);
 		altmap->alloc = 0;
 	} else
-		return -ENXIO;
+		return ERR_PTR(-ENXIO);
 
-	/* establish pfn range for lookup, and switch to direct map */
-	q = pmem->pmem_queue;
-	memcpy(&res, &nsio->res, sizeof(res));
-	res.start += start_pad;
-	res.end -= end_trunc;
-	devm_remove_action(dev, pmem_release_queue, q);
-	devm_memunmap(dev, (void __force *) pmem->virt_addr);
-	pmem->virt_addr = (void __pmem *) devm_memremap_pages(dev, &res,
-			&q->q_usage_counter, altmap);
-	pmem->pfn_flags |= PFN_MAP;
-
-	/*
-	 * At release time the queue must be dead before
-	 * devm_memremap_pages is unwound
-	 */
-	if (devm_add_action(dev, pmem_release_queue, q)) {
-		blk_cleanup_queue(q);
-		return -ENOMEM;
-	}
-	if (IS_ERR(pmem->virt_addr))
-		return PTR_ERR(pmem->virt_addr);
-
-	/* attach pmem disk in "pfn-mode" */
-	return pmem_attach_disk(dev, ndns, pmem);
+	return altmap;
 }
 
-static int nvdimm_namespace_attach_pfn(struct nd_namespace_common *ndns)
+/*
+ * Determine the effective resource range and vmem_altmap from an nd_pfn
+ * instance.
+ */
+static struct vmem_altmap *nvdimm_setup_pfn(struct nd_pfn *nd_pfn,
+		struct resource *res, struct vmem_altmap *altmap)
 {
-	struct nd_pfn *nd_pfn = to_nd_pfn(ndns->claim);
 	int rc;
 
 	if (!nd_pfn->uuid || !nd_pfn->ndns)
-		return -ENODEV;
+		return ERR_PTR(-ENODEV);
 
 	rc = nd_pfn_init(nd_pfn);
 	if (rc)
-		return rc;
+		return ERR_PTR(rc);
+
 	/* we need a valid pfn_sb before we can init a vmem_altmap */
-	return __nvdimm_namespace_attach_pfn(nd_pfn);
+	return __nvdimm_setup_pfn(nd_pfn, res, altmap);
 }
 
 static int nd_pmem_probe(struct device *dev)
 {
-	struct nd_region *nd_region = to_nd_region(dev->parent);
 	struct nd_namespace_common *ndns;
-	struct nd_namespace_io *nsio;
-	struct pmem_device *pmem;
 
 	ndns = nvdimm_namespace_common_probe(dev);
 	if (IS_ERR(ndns))
 		return PTR_ERR(ndns);
 
-	nsio = to_nd_namespace_io(&ndns->dev);
-	pmem = pmem_alloc(dev, &nsio->res, nd_region->id);
-	if (IS_ERR(pmem))
-		return PTR_ERR(pmem);
-
-	dev_set_drvdata(dev, pmem);
-	ndns->rw_bytes = pmem_rw_bytes;
-	if (devm_init_badblocks(dev, &pmem->bb))
-		return -ENOMEM;
-	nvdimm_badblocks_populate(nd_region, &pmem->bb, &nsio->res);
+	if (devm_nsio_enable(dev, to_nd_namespace_io(&ndns->dev)))
+		return -ENXIO;
 
-	if (is_nd_btt(dev)) {
-		/* btt allocates its own request_queue */
-		devm_remove_action(dev, pmem_release_queue, pmem->pmem_queue);
-		blk_cleanup_queue(pmem->pmem_queue);
+	if (is_nd_btt(dev))
 		return nvdimm_namespace_attach_btt(ndns);
-	}
 
 	if (is_nd_pfn(dev))
-		return nvdimm_namespace_attach_pfn(ndns);
+		return pmem_attach_disk(dev, ndns);
 
-	if (nd_btt_probe(dev, ndns, pmem) == 0
-			|| nd_pfn_probe(dev, ndns, pmem) == 0) {
-		/*
-		 * We'll come back as either btt-pmem, or pfn-pmem, so
-		 * drop the queue allocation for now.
-		 */
+	/* if we find a valid info-block we'll come back as that personality */
+	if (nd_btt_probe(dev, ndns) == 0 || nd_pfn_probe(dev, ndns) == 0)
 		return -ENXIO;
-	}
 
-	return pmem_attach_disk(dev, ndns, pmem);
+	/* ...otherwise we're just a raw pmem device */
+	return pmem_attach_disk(dev, ndns);
 }
 
 static int nd_pmem_remove(struct device *dev)
diff --git a/include/linux/nd.h b/include/linux/nd.h
index 5ea4aec..aee2761 100644
--- a/include/linux/nd.h
+++ b/include/linux/nd.h
@@ -15,6 +15,7 @@
 #include <linux/fs.h>
 #include <linux/ndctl.h>
 #include <linux/device.h>
+#include <linux/badblocks.h>
 
 enum nvdimm_event {
 	NVDIMM_REVALIDATE_POISON,
@@ -55,13 +56,19 @@ static inline struct nd_namespace_common *to_ndns(struct device *dev)
 }
 
 /**
- * struct nd_namespace_io - infrastructure for loading an nd_pmem instance
+ * struct nd_namespace_io - device representation of a persistent memory range
  * @dev: namespace device created by the nd region driver
  * @res: struct resource conversion of a NFIT SPA table
+ * @size: cached resource_size(@res) for fast path size checks
+ * @addr: virtual address to access the namespace range
+ * @bb: badblocks list for the namespace range
  */
 struct nd_namespace_io {
 	struct nd_namespace_common common;
 	struct resource res;
+	resource_size_t size;
+	void __pmem *addr;
+	struct badblocks bb;
 };
 
 /**
diff --git a/tools/testing/nvdimm/Kbuild b/tools/testing/nvdimm/Kbuild
index a34bfd0..d5bc8c0 100644
--- a/tools/testing/nvdimm/Kbuild
+++ b/tools/testing/nvdimm/Kbuild
@@ -7,6 +7,7 @@ ldflags-y += --wrap=ioremap_nocache
 ldflags-y += --wrap=iounmap
 ldflags-y += --wrap=memunmap
 ldflags-y += --wrap=__devm_request_region
+ldflags-y += --wrap=__devm_release_region
 ldflags-y += --wrap=__request_region
 ldflags-y += --wrap=__release_region
 ldflags-y += --wrap=devm_memremap_pages
diff --git a/tools/testing/nvdimm/test/iomap.c b/tools/testing/nvdimm/test/iomap.c
index 0c1a7e6..c842095 100644
--- a/tools/testing/nvdimm/test/iomap.c
+++ b/tools/testing/nvdimm/test/iomap.c
@@ -239,13 +239,11 @@ struct resource *__wrap___devm_request_region(struct device *dev,
 }
 EXPORT_SYMBOL(__wrap___devm_request_region);
 
-void __wrap___release_region(struct resource *parent, resource_size_t start,
-				resource_size_t n)
+static bool nfit_test_release_region(struct resource *parent,
+		resource_size_t start, resource_size_t n)
 {
-	struct nfit_test_resource *nfit_res;
-
 	if (parent == &iomem_resource) {
-		nfit_res = get_nfit_res(start);
+		struct nfit_test_resource *nfit_res = get_nfit_res(start);
 		if (nfit_res) {
 			struct resource *res = nfit_res->res + 1;
 
@@ -254,11 +252,26 @@ void __wrap___release_region(struct resource *parent, resource_size_t start,
 						__func__, start, n, res);
 			else
 				memset(res, 0, sizeof(*res));
-			return;
+			return true;
 		}
 	}
-	__release_region(parent, start, n);
+	return false;
+}
+
+void __wrap___release_region(struct resource *parent, resource_size_t start,
+		resource_size_t n)
+{
+	if (!nfit_test_release_region(parent, start, n))
+		__release_region(parent, start, n);
 }
 EXPORT_SYMBOL(__wrap___release_region);
 
+void __wrap___devm_release_region(struct device *dev, struct resource *parent,
+		resource_size_t start, resource_size_t n)
+{
+	if (!nfit_test_release_region(parent, start, n))
+		__devm_release_region(dev, parent, start, n);
+}
+EXPORT_SYMBOL(__wrap___devm_release_region);
+
 MODULE_LICENSE("GPL v2");
-- 
cgit v0.10.2


From ac515c084be9b3995f7aef0ae87797e75e0260f0 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Tue, 22 Mar 2016 00:29:43 -0700
Subject: libnvdimm, pmem, pfn: move pfn setup to the core

Now that pmem internals have been disentangled from pfn setup, that code
can move to the core.  This is in preparation for adding another user of
the pfn-device capabilities.

Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>

diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h
index 10e23fe..6c36509 100644
--- a/drivers/nvdimm/nd.h
+++ b/drivers/nvdimm/nd.h
@@ -272,9 +272,16 @@ const char *nvdimm_namespace_disk_name(struct nd_namespace_common *ndns,
 void nvdimm_badblocks_populate(struct nd_region *nd_region,
 		struct badblocks *bb, const struct resource *res);
 #if IS_ENABLED(CONFIG_ND_CLAIM)
+struct vmem_altmap *nvdimm_setup_pfn(struct nd_pfn *nd_pfn,
+		struct resource *res, struct vmem_altmap *altmap);
 int devm_nsio_enable(struct device *dev, struct nd_namespace_io *nsio);
 void devm_nsio_disable(struct device *dev, struct nd_namespace_io *nsio);
 #else
+static inline struct vmem_altmap *nvdimm_setup_pfn(struct nd_pfn *nd_pfn,
+		struct resource *res, struct vmem_altmap *altmap)
+{
+	return ERR_PTR(-ENXIO);
+}
 static inline int devm_nsio_enable(struct device *dev,
 		struct nd_namespace_io *nsio)
 {
diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c
index 9df081a..e8693fe 100644
--- a/drivers/nvdimm/pfn_devs.c
+++ b/drivers/nvdimm/pfn_devs.c
@@ -10,6 +10,7 @@
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * General Public License for more details.
  */
+#include <linux/memremap.h>
 #include <linux/blkdev.h>
 #include <linux/device.h>
 #include <linux/genhd.h>
@@ -441,3 +442,183 @@ int nd_pfn_probe(struct device *dev, struct nd_namespace_common *ndns)
 	return rc;
 }
 EXPORT_SYMBOL(nd_pfn_probe);
+
+/*
+ * We hotplug memory at section granularity, pad the reserved area from
+ * the previous section base to the namespace base address.
+ */
+static unsigned long init_altmap_base(resource_size_t base)
+{
+	unsigned long base_pfn = PHYS_PFN(base);
+
+	return PFN_SECTION_ALIGN_DOWN(base_pfn);
+}
+
+static unsigned long init_altmap_reserve(resource_size_t base)
+{
+	unsigned long reserve = PHYS_PFN(SZ_8K);
+	unsigned long base_pfn = PHYS_PFN(base);
+
+	reserve += base_pfn - PFN_SECTION_ALIGN_DOWN(base_pfn);
+	return reserve;
+}
+
+static struct vmem_altmap *__nvdimm_setup_pfn(struct nd_pfn *nd_pfn,
+		struct resource *res, struct vmem_altmap *altmap)
+{
+	struct nd_pfn_sb *pfn_sb = nd_pfn->pfn_sb;
+	u64 offset = le64_to_cpu(pfn_sb->dataoff);
+	u32 start_pad = __le32_to_cpu(pfn_sb->start_pad);
+	u32 end_trunc = __le32_to_cpu(pfn_sb->end_trunc);
+	struct nd_namespace_common *ndns = nd_pfn->ndns;
+	struct nd_namespace_io *nsio = to_nd_namespace_io(&ndns->dev);
+	resource_size_t base = nsio->res.start + start_pad;
+	struct vmem_altmap __altmap = {
+		.base_pfn = init_altmap_base(base),
+		.reserve = init_altmap_reserve(base),
+	};
+
+	memcpy(res, &nsio->res, sizeof(*res));
+	res->start += start_pad;
+	res->end -= end_trunc;
+
+	nd_pfn->mode = le32_to_cpu(nd_pfn->pfn_sb->mode);
+	if (nd_pfn->mode == PFN_MODE_RAM) {
+		if (offset < SZ_8K)
+			return ERR_PTR(-EINVAL);
+		nd_pfn->npfns = le64_to_cpu(pfn_sb->npfns);
+		altmap = NULL;
+	} else if (nd_pfn->mode == PFN_MODE_PMEM) {
+		nd_pfn->npfns = (resource_size(res) - offset) / PAGE_SIZE;
+		if (le64_to_cpu(nd_pfn->pfn_sb->npfns) > nd_pfn->npfns)
+			dev_info(&nd_pfn->dev,
+					"number of pfns truncated from %lld to %ld\n",
+					le64_to_cpu(nd_pfn->pfn_sb->npfns),
+					nd_pfn->npfns);
+		memcpy(altmap, &__altmap, sizeof(*altmap));
+		altmap->free = PHYS_PFN(offset - SZ_8K);
+		altmap->alloc = 0;
+	} else
+		return ERR_PTR(-ENXIO);
+
+	return altmap;
+}
+
+static int nd_pfn_init(struct nd_pfn *nd_pfn)
+{
+	struct nd_namespace_common *ndns = nd_pfn->ndns;
+	u32 start_pad = 0, end_trunc = 0;
+	resource_size_t start, size;
+	struct nd_namespace_io *nsio;
+	struct nd_region *nd_region;
+	struct nd_pfn_sb *pfn_sb;
+	unsigned long npfns;
+	phys_addr_t offset;
+	u64 checksum;
+	int rc;
+
+	pfn_sb = devm_kzalloc(&nd_pfn->dev, sizeof(*pfn_sb), GFP_KERNEL);
+	if (!pfn_sb)
+		return -ENOMEM;
+
+	nd_pfn->pfn_sb = pfn_sb;
+	rc = nd_pfn_validate(nd_pfn);
+	if (rc != -ENODEV)
+		return rc;
+
+	/* no info block, do init */
+	nd_region = to_nd_region(nd_pfn->dev.parent);
+	if (nd_region->ro) {
+		dev_info(&nd_pfn->dev,
+				"%s is read-only, unable to init metadata\n",
+				dev_name(&nd_region->dev));
+		return -ENXIO;
+	}
+
+	memset(pfn_sb, 0, sizeof(*pfn_sb));
+
+	/*
+	 * Check if pmem collides with 'System RAM' when section aligned and
+	 * trim it accordingly
+	 */
+	nsio = to_nd_namespace_io(&ndns->dev);
+	start = PHYS_SECTION_ALIGN_DOWN(nsio->res.start);
+	size = resource_size(&nsio->res);
+	if (region_intersects(start, size, IORESOURCE_SYSTEM_RAM,
+				IORES_DESC_NONE) == REGION_MIXED) {
+		start = nsio->res.start;
+		start_pad = PHYS_SECTION_ALIGN_UP(start) - start;
+	}
+
+	start = nsio->res.start;
+	size = PHYS_SECTION_ALIGN_UP(start + size) - start;
+	if (region_intersects(start, size, IORESOURCE_SYSTEM_RAM,
+				IORES_DESC_NONE) == REGION_MIXED) {
+		size = resource_size(&nsio->res);
+		end_trunc = start + size - PHYS_SECTION_ALIGN_DOWN(start + size);
+	}
+
+	if (start_pad + end_trunc)
+		dev_info(&nd_pfn->dev, "%s section collision, truncate %d bytes\n",
+				dev_name(&ndns->dev), start_pad + end_trunc);
+
+	/*
+	 * Note, we use 64 here for the standard size of struct page,
+	 * debugging options may cause it to be larger in which case the
+	 * implementation will limit the pfns advertised through
+	 * ->direct_access() to those that are included in the memmap.
+	 */
+	start += start_pad;
+	size = resource_size(&nsio->res);
+	npfns = (size - start_pad - end_trunc - SZ_8K) / SZ_4K;
+	if (nd_pfn->mode == PFN_MODE_PMEM)
+		offset = ALIGN(start + SZ_8K + 64 * npfns, nd_pfn->align)
+			- start;
+	else if (nd_pfn->mode == PFN_MODE_RAM)
+		offset = ALIGN(start + SZ_8K, nd_pfn->align) - start;
+	else
+		return -ENXIO;
+
+	if (offset + start_pad + end_trunc >= size) {
+		dev_err(&nd_pfn->dev, "%s unable to satisfy requested alignment\n",
+				dev_name(&ndns->dev));
+		return -ENXIO;
+	}
+
+	npfns = (size - offset - start_pad - end_trunc) / SZ_4K;
+	pfn_sb->mode = cpu_to_le32(nd_pfn->mode);
+	pfn_sb->dataoff = cpu_to_le64(offset);
+	pfn_sb->npfns = cpu_to_le64(npfns);
+	memcpy(pfn_sb->signature, PFN_SIG, PFN_SIG_LEN);
+	memcpy(pfn_sb->uuid, nd_pfn->uuid, 16);
+	memcpy(pfn_sb->parent_uuid, nd_dev_to_uuid(&ndns->dev), 16);
+	pfn_sb->version_major = cpu_to_le16(1);
+	pfn_sb->version_minor = cpu_to_le16(1);
+	pfn_sb->start_pad = cpu_to_le32(start_pad);
+	pfn_sb->end_trunc = cpu_to_le32(end_trunc);
+	checksum = nd_sb_checksum((struct nd_gen_sb *) pfn_sb);
+	pfn_sb->checksum = cpu_to_le64(checksum);
+
+	return nvdimm_write_bytes(ndns, SZ_4K, pfn_sb, sizeof(*pfn_sb));
+}
+
+/*
+ * Determine the effective resource range and vmem_altmap from an nd_pfn
+ * instance.
+ */
+struct vmem_altmap *nvdimm_setup_pfn(struct nd_pfn *nd_pfn,
+		struct resource *res, struct vmem_altmap *altmap)
+{
+	int rc;
+
+	if (!nd_pfn->uuid || !nd_pfn->ndns)
+		return ERR_PTR(-ENODEV);
+
+	rc = nd_pfn_init(nd_pfn);
+	if (rc)
+		return ERR_PTR(rc);
+
+	/* we need a valid pfn_sb before we can init a vmem_altmap */
+	return __nvdimm_setup_pfn(nd_pfn, res, altmap);
+}
+EXPORT_SYMBOL_GPL(nvdimm_setup_pfn);
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index b5f81b0..3fc6896 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -196,9 +196,6 @@ void pmem_release_disk(void *disk)
 	put_disk(disk);
 }
 
-static struct vmem_altmap *nvdimm_setup_pfn(struct nd_pfn *nd_pfn,
-		struct resource *res, struct vmem_altmap *altmap);
-
 static int pmem_attach_disk(struct device *dev,
 		struct nd_namespace_common *ndns)
 {
@@ -310,187 +307,6 @@ static int pmem_attach_disk(struct device *dev,
 	return 0;
 }
 
-static int nd_pfn_init(struct nd_pfn *nd_pfn)
-{
-	struct nd_namespace_common *ndns = nd_pfn->ndns;
-	u32 start_pad = 0, end_trunc = 0;
-	resource_size_t start, size;
-	struct nd_namespace_io *nsio;
-	struct nd_region *nd_region;
-	struct nd_pfn_sb *pfn_sb;
-	unsigned long npfns;
-	phys_addr_t offset;
-	u64 checksum;
-	int rc;
-
-	pfn_sb = devm_kzalloc(&nd_pfn->dev, sizeof(*pfn_sb), GFP_KERNEL);
-	if (!pfn_sb)
-		return -ENOMEM;
-
-	nd_pfn->pfn_sb = pfn_sb;
-	rc = nd_pfn_validate(nd_pfn);
-	if (rc == -ENODEV)
-		/* no info block, do init */;
-	else
-		return rc;
-
-	nd_region = to_nd_region(nd_pfn->dev.parent);
-	if (nd_region->ro) {
-		dev_info(&nd_pfn->dev,
-				"%s is read-only, unable to init metadata\n",
-				dev_name(&nd_region->dev));
-		return -ENXIO;
-	}
-
-	memset(pfn_sb, 0, sizeof(*pfn_sb));
-
-	/*
-	 * Check if pmem collides with 'System RAM' when section aligned and
-	 * trim it accordingly
-	 */
-	nsio = to_nd_namespace_io(&ndns->dev);
-	start = PHYS_SECTION_ALIGN_DOWN(nsio->res.start);
-	size = resource_size(&nsio->res);
-	if (region_intersects(start, size, IORESOURCE_SYSTEM_RAM,
-				IORES_DESC_NONE) == REGION_MIXED) {
-
-		start = nsio->res.start;
-		start_pad = PHYS_SECTION_ALIGN_UP(start) - start;
-	}
-
-	start = nsio->res.start;
-	size = PHYS_SECTION_ALIGN_UP(start + size) - start;
-	if (region_intersects(start, size, IORESOURCE_SYSTEM_RAM,
-				IORES_DESC_NONE) == REGION_MIXED) {
-		size = resource_size(&nsio->res);
-		end_trunc = start + size - PHYS_SECTION_ALIGN_DOWN(start + size);
-	}
-
-	if (start_pad + end_trunc)
-		dev_info(&nd_pfn->dev, "%s section collision, truncate %d bytes\n",
-				dev_name(&ndns->dev), start_pad + end_trunc);
-
-	/*
-	 * Note, we use 64 here for the standard size of struct page,
-	 * debugging options may cause it to be larger in which case the
-	 * implementation will limit the pfns advertised through
-	 * ->direct_access() to those that are included in the memmap.
-	 */
-	start += start_pad;
-	size = resource_size(&nsio->res);
-	npfns = (size - start_pad - end_trunc - SZ_8K) / SZ_4K;
-	if (nd_pfn->mode == PFN_MODE_PMEM)
-		offset = ALIGN(start + SZ_8K + 64 * npfns, nd_pfn->align)
-			- start;
-	else if (nd_pfn->mode == PFN_MODE_RAM)
-		offset = ALIGN(start + SZ_8K, nd_pfn->align) - start;
-	else
-		return -ENXIO;
-
-	if (offset + start_pad + end_trunc >= size) {
-		dev_err(&nd_pfn->dev, "%s unable to satisfy requested alignment\n",
-				dev_name(&ndns->dev));
-		return -ENXIO;
-	}
-
-	npfns = (size - offset - start_pad - end_trunc) / SZ_4K;
-	pfn_sb->mode = cpu_to_le32(nd_pfn->mode);
-	pfn_sb->dataoff = cpu_to_le64(offset);
-	pfn_sb->npfns = cpu_to_le64(npfns);
-	memcpy(pfn_sb->signature, PFN_SIG, PFN_SIG_LEN);
-	memcpy(pfn_sb->uuid, nd_pfn->uuid, 16);
-	memcpy(pfn_sb->parent_uuid, nd_dev_to_uuid(&ndns->dev), 16);
-	pfn_sb->version_major = cpu_to_le16(1);
-	pfn_sb->version_minor = cpu_to_le16(1);
-	pfn_sb->start_pad = cpu_to_le32(start_pad);
-	pfn_sb->end_trunc = cpu_to_le32(end_trunc);
-	checksum = nd_sb_checksum((struct nd_gen_sb *) pfn_sb);
-	pfn_sb->checksum = cpu_to_le64(checksum);
-
-	return nvdimm_write_bytes(ndns, SZ_4K, pfn_sb, sizeof(*pfn_sb));
-}
-
-/*
- * We hotplug memory at section granularity, pad the reserved area from
- * the previous section base to the namespace base address.
- */
-static unsigned long init_altmap_base(resource_size_t base)
-{
-	unsigned long base_pfn = PHYS_PFN(base);
-
-	return PFN_SECTION_ALIGN_DOWN(base_pfn);
-}
-
-static unsigned long init_altmap_reserve(resource_size_t base)
-{
-	unsigned long reserve = PHYS_PFN(SZ_8K);
-	unsigned long base_pfn = PHYS_PFN(base);
-
-	reserve += base_pfn - PFN_SECTION_ALIGN_DOWN(base_pfn);
-	return reserve;
-}
-
-static struct vmem_altmap *__nvdimm_setup_pfn(struct nd_pfn *nd_pfn,
-		struct resource *res, struct vmem_altmap *altmap)
-{
-	struct nd_pfn_sb *pfn_sb = nd_pfn->pfn_sb;
-	u64 offset = le64_to_cpu(pfn_sb->dataoff);
-	u32 start_pad = __le32_to_cpu(pfn_sb->start_pad);
-	u32 end_trunc = __le32_to_cpu(pfn_sb->end_trunc);
-	struct nd_namespace_common *ndns = nd_pfn->ndns;
-	struct nd_namespace_io *nsio = to_nd_namespace_io(&ndns->dev);
-	resource_size_t base = nsio->res.start + start_pad;
-	struct vmem_altmap __altmap = {
-		.base_pfn = init_altmap_base(base),
-		.reserve = init_altmap_reserve(base),
-	};
-
-	memcpy(res, &nsio->res, sizeof(*res));
-	res->start += start_pad;
-	res->end -= end_trunc;
-
-	nd_pfn->mode = le32_to_cpu(nd_pfn->pfn_sb->mode);
-	if (nd_pfn->mode == PFN_MODE_RAM) {
-		if (offset < SZ_8K)
-			return ERR_PTR(-EINVAL);
-		nd_pfn->npfns = le64_to_cpu(pfn_sb->npfns);
-		altmap = NULL;
-	} else if (nd_pfn->mode == PFN_MODE_PMEM) {
-		nd_pfn->npfns = (resource_size(res) - offset) / PAGE_SIZE;
-		if (le64_to_cpu(nd_pfn->pfn_sb->npfns) > nd_pfn->npfns)
-			dev_info(&nd_pfn->dev,
-					"number of pfns truncated from %lld to %ld\n",
-					le64_to_cpu(nd_pfn->pfn_sb->npfns),
-					nd_pfn->npfns);
-		memcpy(altmap, &__altmap, sizeof(*altmap));
-		altmap->free = PHYS_PFN(offset - SZ_8K);
-		altmap->alloc = 0;
-	} else
-		return ERR_PTR(-ENXIO);
-
-	return altmap;
-}
-
-/*
- * Determine the effective resource range and vmem_altmap from an nd_pfn
- * instance.
- */
-static struct vmem_altmap *nvdimm_setup_pfn(struct nd_pfn *nd_pfn,
-		struct resource *res, struct vmem_altmap *altmap)
-{
-	int rc;
-
-	if (!nd_pfn->uuid || !nd_pfn->ndns)
-		return ERR_PTR(-ENODEV);
-
-	rc = nd_pfn_init(nd_pfn);
-	if (rc)
-		return ERR_PTR(rc);
-
-	/* we need a valid pfn_sb before we can init a vmem_altmap */
-	return __nvdimm_setup_pfn(nd_pfn, res, altmap);
-}
-
 static int nd_pmem_probe(struct device *dev)
 {
 	struct nd_namespace_common *ndns;
-- 
cgit v0.10.2


From 5a92289f41311a54ededb5e4ed474cc38f5d85de Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Mon, 21 Mar 2016 15:43:53 -0700
Subject: libnvdimm, pmem: kill ->pmem_queue and ->pmem_disk

The devm conversion obviates the need to continue to remember the queue
and disk locally in the driver.

Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>

diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index 3fc6896..d9a0dbc 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -33,9 +33,6 @@
 #include "nd.h"
 
 struct pmem_device {
-	struct request_queue	*pmem_queue;
-	struct gendisk		*pmem_disk;
-
 	/* One contiguous memory region per device */
 	phys_addr_t		phys_addr;
 	/* when non-zero this device is hosting a 'pfn' instance */
@@ -52,7 +49,7 @@ struct pmem_device {
 static void pmem_clear_poison(struct pmem_device *pmem, phys_addr_t offset,
 		unsigned int len)
 {
-	struct device *dev = disk_to_dev(pmem->pmem_disk);
+	struct device *dev = pmem->bb.dev;
 	sector_t sector;
 	long cleared;
 
@@ -241,7 +238,6 @@ static int pmem_attach_disk(struct device *dev,
 	q = blk_alloc_queue_node(GFP_KERNEL, dev_to_node(dev));
 	if (!q)
 		return -ENOMEM;
-	pmem->pmem_queue = q;
 
 	pmem->pfn_flags = PFN_DEV;
 	if (is_nd_pfn(dev)) {
@@ -274,12 +270,12 @@ static int pmem_attach_disk(struct device *dev,
 		return PTR_ERR(addr);
 	pmem->virt_addr = (void __pmem *) addr;
 
-	blk_queue_make_request(pmem->pmem_queue, pmem_make_request);
-	blk_queue_physical_block_size(pmem->pmem_queue, PAGE_SIZE);
-	blk_queue_max_hw_sectors(pmem->pmem_queue, UINT_MAX);
-	blk_queue_bounce_limit(pmem->pmem_queue, BLK_BOUNCE_ANY);
-	queue_flag_set_unlocked(QUEUE_FLAG_NONROT, pmem->pmem_queue);
-	pmem->pmem_queue->queuedata = pmem;
+	blk_queue_make_request(q, pmem_make_request);
+	blk_queue_physical_block_size(q, PAGE_SIZE);
+	blk_queue_max_hw_sectors(q, UINT_MAX);
+	blk_queue_bounce_limit(q, BLK_BOUNCE_ANY);
+	queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q);
+	q->queuedata = pmem;
 
 	disk = alloc_disk_node(0, nid);
 	if (!disk)
@@ -290,13 +286,12 @@ static int pmem_attach_disk(struct device *dev,
 	}
 
 	disk->fops		= &pmem_fops;
-	disk->queue		= pmem->pmem_queue;
+	disk->queue		= q;
 	disk->flags		= GENHD_FL_EXT_DEVT;
 	nvdimm_namespace_disk_name(ndns, disk->disk_name);
 	disk->driverfs_dev = dev;
 	set_capacity(disk, (pmem->size - pmem->pfn_pad - pmem->data_offset)
 			/ 512);
-	pmem->pmem_disk = disk;
 	if (devm_init_badblocks(dev, &pmem->bb))
 		return -ENOMEM;
 	nvdimm_badblocks_populate(to_nd_region(dev->parent), &pmem->bb, res);
-- 
cgit v0.10.2


From 0bfb8dd3edd6e423b5053c86e10c97e92cf205ea Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Wed, 13 Apr 2016 17:06:48 -0700
Subject: libnvdimm: cleanup nvdimm_namespace_common_probe(), kill 'host'

The 'host' variable can be killed as it is always the same as the passed
in device.

Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>

diff --git a/drivers/nvdimm/namespace_devs.c b/drivers/nvdimm/namespace_devs.c
index f5cb886..e5ad516 100644
--- a/drivers/nvdimm/namespace_devs.c
+++ b/drivers/nvdimm/namespace_devs.c
@@ -1379,21 +1379,16 @@ struct nd_namespace_common *nvdimm_namespace_common_probe(struct device *dev)
 {
 	struct nd_btt *nd_btt = is_nd_btt(dev) ? to_nd_btt(dev) : NULL;
 	struct nd_pfn *nd_pfn = is_nd_pfn(dev) ? to_nd_pfn(dev) : NULL;
-	struct nd_namespace_common *ndns;
+	struct nd_namespace_common *ndns = NULL;
 	resource_size_t size;
 
 	if (nd_btt || nd_pfn) {
-		struct device *host = NULL;
-
-		if (nd_btt) {
-			host = &nd_btt->dev;
+		if (nd_btt)
 			ndns = nd_btt->ndns;
-		} else if (nd_pfn) {
-			host = &nd_pfn->dev;
+		else if (nd_pfn)
 			ndns = nd_pfn->ndns;
-		}
 
-		if (!ndns || !host)
+		if (!ndns)
 			return ERR_PTR(-ENODEV);
 
 		/*
@@ -1404,12 +1399,12 @@ struct nd_namespace_common *nvdimm_namespace_common_probe(struct device *dev)
 		device_unlock(&ndns->dev);
 		if (ndns->dev.driver) {
 			dev_dbg(&ndns->dev, "is active, can't bind %s\n",
-					dev_name(host));
+					dev_name(dev));
 			return ERR_PTR(-EBUSY);
 		}
-		if (dev_WARN_ONCE(&ndns->dev, ndns->claim != host,
+		if (dev_WARN_ONCE(&ndns->dev, ndns->claim != dev,
 					"host (%s) vs claim (%s) mismatch\n",
-					dev_name(host),
+					dev_name(dev),
 					dev_name(ndns->claim)))
 			return ERR_PTR(-ENXIO);
 	} else {
-- 
cgit v0.10.2


From 5ad9a7fde07a95b326da9e650b4f0a41b85e47b5 Mon Sep 17 00:00:00 2001
From: Toshi Kani <toshi.kani@hpe.com>
Date: Mon, 25 Apr 2016 15:34:58 -0600
Subject: acpi/nfit: Update nfit driver to comply with ACPI 6.1

ACPI 6.1, Table 5-133, updates NVDIMM Control Region Structure
as follows.
 - Valid Fields, Manufacturing Location, and Manufacturing Date
   are added from reserved range.  No change in the structure size.
 - IDs (SPD values) are stored as arrays of bytes (i.e. big-endian
   format).  The spec clarifies that they need to be represented
   as arrays of bytes as well.

This patch makes the following changes to support this update.
 - Change the NFIT driver to show SPD ID values in big-endian
   format.
 - Change sprintf format to use "0x" instead of "#" since "%#02x"
   does not prepend '0'.

link: http://www.uefi.org/sites/default/files/resources/ACPI_6_1.pdf
Signed-off-by: Toshi Kani <toshi.kani@hpe.com>
Cc: Rafael J. Wysocki <rjw@rjwysocki.net>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Robert Moore <robert.moore@intel.com>
Cc: Robert Elliott <elliott@hpe.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>

diff --git a/drivers/acpi/nfit.c b/drivers/acpi/nfit.c
index d0f35e6..5dc243c 100644
--- a/drivers/acpi/nfit.c
+++ b/drivers/acpi/nfit.c
@@ -816,7 +816,7 @@ static ssize_t vendor_show(struct device *dev,
 {
 	struct acpi_nfit_control_region *dcr = to_nfit_dcr(dev);
 
-	return sprintf(buf, "%#x\n", dcr->vendor_id);
+	return sprintf(buf, "0x%04x\n", be16_to_cpu(dcr->vendor_id));
 }
 static DEVICE_ATTR_RO(vendor);
 
@@ -825,7 +825,7 @@ static ssize_t rev_id_show(struct device *dev,
 {
 	struct acpi_nfit_control_region *dcr = to_nfit_dcr(dev);
 
-	return sprintf(buf, "%#x\n", dcr->revision_id);
+	return sprintf(buf, "0x%04x\n", be16_to_cpu(dcr->revision_id));
 }
 static DEVICE_ATTR_RO(rev_id);
 
@@ -834,7 +834,7 @@ static ssize_t device_show(struct device *dev,
 {
 	struct acpi_nfit_control_region *dcr = to_nfit_dcr(dev);
 
-	return sprintf(buf, "%#x\n", dcr->device_id);
+	return sprintf(buf, "0x%04x\n", be16_to_cpu(dcr->device_id));
 }
 static DEVICE_ATTR_RO(device);
 
@@ -843,7 +843,7 @@ static ssize_t format_show(struct device *dev,
 {
 	struct acpi_nfit_control_region *dcr = to_nfit_dcr(dev);
 
-	return sprintf(buf, "%#x\n", dcr->code);
+	return sprintf(buf, "0x%04x\n", be16_to_cpu(dcr->code));
 }
 static DEVICE_ATTR_RO(format);
 
@@ -852,7 +852,7 @@ static ssize_t serial_show(struct device *dev,
 {
 	struct acpi_nfit_control_region *dcr = to_nfit_dcr(dev);
 
-	return sprintf(buf, "%#x\n", dcr->serial_number);
+	return sprintf(buf, "0x%08x\n", be32_to_cpu(dcr->serial_number));
 }
 static DEVICE_ATTR_RO(serial);
 
-- 
cgit v0.10.2


From 38a879ba9c0a6849fe26c36e325f754a89848da7 Mon Sep 17 00:00:00 2001
From: Toshi Kani <toshi.kani@hpe.com>
Date: Mon, 25 Apr 2016 15:34:59 -0600
Subject: acpi/nfit: Add sysfs "id" for NVDIMM ID

ACPI 6.1, section 5.2.25.9, defines an identifier for an NVDIMM.

Change the NFIT driver to add a new sysfs file "id" under the nfit
directory.

Signed-off-by: Toshi Kani <toshi.kani@hpe.com>
Cc: Rafael J. Wysocki <rjw@rjwysocki.net>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Robert Moore <robert.moore@intel.com>
Cc: Robert Elliott <elliott@hpe.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>

diff --git a/drivers/acpi/nfit.c b/drivers/acpi/nfit.c
index 5dc243c..5a7199d 100644
--- a/drivers/acpi/nfit.c
+++ b/drivers/acpi/nfit.c
@@ -870,6 +870,24 @@ static ssize_t flags_show(struct device *dev,
 }
 static DEVICE_ATTR_RO(flags);
 
+static ssize_t id_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct acpi_nfit_control_region *dcr = to_nfit_dcr(dev);
+
+	if (dcr->valid_fields & ACPI_NFIT_CONTROL_MFG_INFO_VALID)
+		return sprintf(buf, "%04x-%02x-%04x-%08x\n",
+				be16_to_cpu(dcr->vendor_id),
+				dcr->manufacturing_location,
+				be16_to_cpu(dcr->manufacturing_date),
+				be32_to_cpu(dcr->serial_number));
+	else
+		return sprintf(buf, "%04x-%08x\n",
+				be16_to_cpu(dcr->vendor_id),
+				be32_to_cpu(dcr->serial_number));
+}
+static DEVICE_ATTR_RO(id);
+
 static struct attribute *acpi_nfit_dimm_attributes[] = {
 	&dev_attr_handle.attr,
 	&dev_attr_phys_id.attr,
@@ -879,6 +897,7 @@ static struct attribute *acpi_nfit_dimm_attributes[] = {
 	&dev_attr_serial.attr,
 	&dev_attr_rev_id.attr,
 	&dev_attr_flags.attr,
+	&dev_attr_id.attr,
 	NULL,
 };
 
-- 
cgit v0.10.2


From 40abf9be8f52d440e442206182916e3dcc68f722 Mon Sep 17 00:00:00 2001
From: Jerry Hoemann <jerry.hoemann@hpe.com>
Date: Mon, 11 Apr 2016 15:02:28 -0700
Subject: libnvdimm: increase max envelope size for ioctl

nd_ioctl() must first read in the fixed-size portion of an ioctl so
that it can then determine the size of the variable part.

Prepare for ND_CMD_CALL calls, which have a larger fixed-portion
envelope.

Signed-off-by: Jerry Hoemann <jerry.hoemann@hpe.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>

diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h
index 833867b..af31d1c 100644
--- a/include/linux/libnvdimm.h
+++ b/include/linux/libnvdimm.h
@@ -27,7 +27,7 @@ enum {
 	/* need to set a limit somewhere, but yes, this is likely overkill */
 	ND_IOCTL_MAX_BUFLEN = SZ_4M,
 	ND_CMD_MAX_ELEM = 5,
-	ND_CMD_MAX_ENVELOPE = 16,
+	ND_CMD_MAX_ENVELOPE = 256,
 	ND_MAX_MAPPINGS = 32,
 
 	/* region flag indicating to direct-map persistent memory by default */
-- 
cgit v0.10.2


From e3654eca70d63704c94a60a2aafc0b3c7b46a00b Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Thu, 28 Apr 2016 16:17:07 -0700
Subject: nfit, libnvdimm: clarify "commands" vs "_DSMs"

Clarify the distinction between "commands", the ioctls userspace calls
to request the kernel take some action on a given dimm device, and
"_DSMs", the actual function numbers used in the firmware interface to
the DIMM.  _DSMs are ACPI specific whereas commands are Linux kernel
generic.

This is in preparation for breaking the 1:1 implicit relationship
between the kernel ioctl number space and the firmware specific function
numbers.

Cc: Jerry Hoemann <jerry.hoemann@hpe.com>
Cc: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>

diff --git a/drivers/acpi/nfit.c b/drivers/acpi/nfit.c
index d0f35e6..1b98e9d 100644
--- a/drivers/acpi/nfit.c
+++ b/drivers/acpi/nfit.c
@@ -175,7 +175,7 @@ static int acpi_nfit_ctl(struct nvdimm_bus_descriptor *nd_desc,
 	union acpi_object in_obj, in_buf, *out_obj;
 	struct device *dev = acpi_desc->dev;
 	const char *cmd_name, *dimm_name;
-	unsigned long dsm_mask;
+	unsigned long cmd_mask;
 	acpi_handle handle;
 	const u8 *uuid;
 	u32 offset;
@@ -189,7 +189,7 @@ static int acpi_nfit_ctl(struct nvdimm_bus_descriptor *nd_desc,
 			return -ENOTTY;
 		dimm_name = nvdimm_name(nvdimm);
 		cmd_name = nvdimm_cmd_name(cmd);
-		dsm_mask = nfit_mem->dsm_mask;
+		cmd_mask = nvdimm_cmd_mask(nvdimm);
 		desc = nd_cmd_dimm_desc(cmd);
 		uuid = to_nfit_uuid(NFIT_DEV_DIMM);
 		handle = adev->handle;
@@ -197,7 +197,7 @@ static int acpi_nfit_ctl(struct nvdimm_bus_descriptor *nd_desc,
 		struct acpi_device *adev = to_acpi_dev(acpi_desc);
 
 		cmd_name = nvdimm_bus_cmd_name(cmd);
-		dsm_mask = nd_desc->dsm_mask;
+		cmd_mask = nd_desc->cmd_mask;
 		desc = nd_cmd_bus_desc(cmd);
 		uuid = to_nfit_uuid(NFIT_DEV_BUS);
 		handle = adev->handle;
@@ -207,7 +207,7 @@ static int acpi_nfit_ctl(struct nvdimm_bus_descriptor *nd_desc,
 	if (!desc || (cmd && (desc->out_num + desc->in_num == 0)))
 		return -ENOTTY;
 
-	if (!test_bit(cmd, &dsm_mask))
+	if (!test_bit(cmd, &cmd_mask))
 		return -ENOTTY;
 
 	in_obj.type = ACPI_TYPE_PACKAGE;
@@ -926,7 +926,8 @@ static int acpi_nfit_add_dimm(struct acpi_nfit_desc *acpi_desc,
 	const u8 *uuid = to_nfit_uuid(NFIT_DEV_DIMM);
 	int i;
 
-	nfit_mem->dsm_mask = acpi_desc->dimm_dsm_force_en;
+	/* nfit test assumes 1:1 relationship between commands and dsms */
+	nfit_mem->dsm_mask = acpi_desc->dimm_cmd_force_en;
 	adev = to_acpi_dev(acpi_desc);
 	if (!adev)
 		return 0;
@@ -976,9 +977,13 @@ static int acpi_nfit_register_dimms(struct acpi_nfit_desc *acpi_desc)
 		if (rc)
 			continue;
 
+		/*
+		 * For now there is 1:1 relationship between cmd_mask and
+		 * dsm_mask.
+		 */
 		nvdimm = nvdimm_create(acpi_desc->nvdimm_bus, nfit_mem,
 				acpi_nfit_dimm_attribute_groups,
-				flags, &nfit_mem->dsm_mask);
+				flags, nfit_mem->dsm_mask);
 		if (!nvdimm)
 			return -ENOMEM;
 
@@ -1007,14 +1012,14 @@ static void acpi_nfit_init_dsms(struct acpi_nfit_desc *acpi_desc)
 	struct acpi_device *adev;
 	int i;
 
-	nd_desc->dsm_mask = acpi_desc->bus_dsm_force_en;
+	nd_desc->cmd_mask = acpi_desc->bus_cmd_force_en;
 	adev = to_acpi_dev(acpi_desc);
 	if (!adev)
 		return;
 
 	for (i = ND_CMD_ARS_CAP; i <= ND_CMD_CLEAR_ERROR; i++)
 		if (acpi_check_dsm(adev->handle, uuid, 1, 1ULL << i))
-			set_bit(i, &nd_desc->dsm_mask);
+			set_bit(i, &nd_desc->cmd_mask);
 }
 
 static ssize_t range_index_show(struct device *dev,
diff --git a/drivers/acpi/nfit.h b/drivers/acpi/nfit.h
index c75576b..332ee6f 100644
--- a/drivers/acpi/nfit.h
+++ b/drivers/acpi/nfit.h
@@ -132,8 +132,8 @@ struct acpi_nfit_desc {
 	size_t ars_status_size;
 	struct work_struct work;
 	unsigned int cancel:1;
-	unsigned long dimm_dsm_force_en;
-	unsigned long bus_dsm_force_en;
+	unsigned long dimm_cmd_force_en;
+	unsigned long bus_cmd_force_en;
 	int (*blk_do_io)(struct nd_blk_region *ndbr, resource_size_t dpa,
 			void *iobuf, u64 len, int rw);
 };
diff --git a/drivers/nvdimm/bus.c b/drivers/nvdimm/bus.c
index 19f822d..cb2042a 100644
--- a/drivers/nvdimm/bus.c
+++ b/drivers/nvdimm/bus.c
@@ -589,24 +589,24 @@ static int __nd_ioctl(struct nvdimm_bus *nvdimm_bus, struct nvdimm *nvdimm,
 	void __user *p = (void __user *) arg;
 	struct device *dev = &nvdimm_bus->dev;
 	const char *cmd_name, *dimm_name;
-	unsigned long dsm_mask;
+	unsigned long cmd_mask;
 	void *buf;
 	int rc, i;
 
 	if (nvdimm) {
 		desc = nd_cmd_dimm_desc(cmd);
 		cmd_name = nvdimm_cmd_name(cmd);
-		dsm_mask = nvdimm->dsm_mask ? *(nvdimm->dsm_mask) : 0;
+		cmd_mask = nvdimm->cmd_mask;
 		dimm_name = dev_name(&nvdimm->dev);
 	} else {
 		desc = nd_cmd_bus_desc(cmd);
 		cmd_name = nvdimm_bus_cmd_name(cmd);
-		dsm_mask = nd_desc->dsm_mask;
+		cmd_mask = nd_desc->cmd_mask;
 		dimm_name = "bus";
 	}
 
 	if (!desc || (desc->out_num + desc->in_num == 0) ||
-			!test_bit(cmd, &dsm_mask))
+			!test_bit(cmd, &cmd_mask))
 		return -ENOTTY;
 
 	/* fail write commands (when read-only) */
diff --git a/drivers/nvdimm/core.c b/drivers/nvdimm/core.c
index 182a93f..e8688a1 100644
--- a/drivers/nvdimm/core.c
+++ b/drivers/nvdimm/core.c
@@ -251,7 +251,7 @@ static ssize_t commands_show(struct device *dev,
 	struct nvdimm_bus *nvdimm_bus = to_nvdimm_bus(dev);
 	struct nvdimm_bus_descriptor *nd_desc = nvdimm_bus->nd_desc;
 
-	for_each_set_bit(cmd, &nd_desc->dsm_mask, BITS_PER_LONG)
+	for_each_set_bit(cmd, &nd_desc->cmd_mask, BITS_PER_LONG)
 		len += sprintf(buf + len, "%s ", nvdimm_bus_cmd_name(cmd));
 	len += sprintf(buf + len, "\n");
 	return len;
diff --git a/drivers/nvdimm/dimm_devs.c b/drivers/nvdimm/dimm_devs.c
index c56f882..79a35a0 100644
--- a/drivers/nvdimm/dimm_devs.c
+++ b/drivers/nvdimm/dimm_devs.c
@@ -37,9 +37,9 @@ static int __validate_dimm(struct nvdimm_drvdata *ndd)
 
 	nvdimm = to_nvdimm(ndd->dev);
 
-	if (!nvdimm->dsm_mask)
+	if (!nvdimm->cmd_mask)
 		return -ENXIO;
-	if (!test_bit(ND_CMD_GET_CONFIG_DATA, nvdimm->dsm_mask))
+	if (!test_bit(ND_CMD_GET_CONFIG_DATA, &nvdimm->cmd_mask))
 		return -ENXIO;
 
 	return 0;
@@ -263,6 +263,12 @@ const char *nvdimm_name(struct nvdimm *nvdimm)
 }
 EXPORT_SYMBOL_GPL(nvdimm_name);
 
+unsigned long nvdimm_cmd_mask(struct nvdimm *nvdimm)
+{
+	return nvdimm->cmd_mask;
+}
+EXPORT_SYMBOL_GPL(nvdimm_cmd_mask);
+
 void *nvdimm_provider_data(struct nvdimm *nvdimm)
 {
 	if (nvdimm)
@@ -277,10 +283,10 @@ static ssize_t commands_show(struct device *dev,
 	struct nvdimm *nvdimm = to_nvdimm(dev);
 	int cmd, len = 0;
 
-	if (!nvdimm->dsm_mask)
+	if (!nvdimm->cmd_mask)
 		return sprintf(buf, "\n");
 
-	for_each_set_bit(cmd, nvdimm->dsm_mask, BITS_PER_LONG)
+	for_each_set_bit(cmd, &nvdimm->cmd_mask, BITS_PER_LONG)
 		len += sprintf(buf + len, "%s ", nvdimm_cmd_name(cmd));
 	len += sprintf(buf + len, "\n");
 	return len;
@@ -340,7 +346,7 @@ EXPORT_SYMBOL_GPL(nvdimm_attribute_group);
 
 struct nvdimm *nvdimm_create(struct nvdimm_bus *nvdimm_bus, void *provider_data,
 		const struct attribute_group **groups, unsigned long flags,
-		unsigned long *dsm_mask)
+		unsigned long cmd_mask)
 {
 	struct nvdimm *nvdimm = kzalloc(sizeof(*nvdimm), GFP_KERNEL);
 	struct device *dev;
@@ -355,7 +361,7 @@ struct nvdimm *nvdimm_create(struct nvdimm_bus *nvdimm_bus, void *provider_data,
 	}
 	nvdimm->provider_data = provider_data;
 	nvdimm->flags = flags;
-	nvdimm->dsm_mask = dsm_mask;
+	nvdimm->cmd_mask = cmd_mask;
 	atomic_set(&nvdimm->busy, 0);
 	dev = &nvdimm->dev;
 	dev_set_name(dev, "nmem%d", nvdimm->id);
diff --git a/drivers/nvdimm/nd-core.h b/drivers/nvdimm/nd-core.h
index 1d1500f..da0d322 100644
--- a/drivers/nvdimm/nd-core.h
+++ b/drivers/nvdimm/nd-core.h
@@ -37,7 +37,7 @@ struct nvdimm_bus {
 struct nvdimm {
 	unsigned long flags;
 	void *provider_data;
-	unsigned long *dsm_mask;
+	unsigned long cmd_mask;
 	struct device dev;
 	atomic_t busy;
 	int id;
diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h
index af31d1c..0c3c30c 100644
--- a/include/linux/libnvdimm.h
+++ b/include/linux/libnvdimm.h
@@ -68,7 +68,7 @@ struct nd_mapping {
 
 struct nvdimm_bus_descriptor {
 	const struct attribute_group **attr_groups;
-	unsigned long dsm_mask;
+	unsigned long cmd_mask;
 	char *provider_name;
 	ndctl_fn ndctl;
 	int (*flush_probe)(struct nvdimm_bus_descriptor *nd_desc);
@@ -130,10 +130,11 @@ struct nd_region *to_nd_region(struct device *dev);
 struct nd_blk_region *to_nd_blk_region(struct device *dev);
 struct nvdimm_bus_descriptor *to_nd_desc(struct nvdimm_bus *nvdimm_bus);
 const char *nvdimm_name(struct nvdimm *nvdimm);
+unsigned long nvdimm_cmd_mask(struct nvdimm *nvdimm);
 void *nvdimm_provider_data(struct nvdimm *nvdimm);
 struct nvdimm *nvdimm_create(struct nvdimm_bus *nvdimm_bus, void *provider_data,
 		const struct attribute_group **groups, unsigned long flags,
-		unsigned long *dsm_mask);
+		unsigned long cmd_mask);
 const struct nd_cmd_desc *nd_cmd_dimm_desc(int cmd);
 const struct nd_cmd_desc *nd_cmd_bus_desc(int cmd);
 u32 nd_cmd_in_size(struct nvdimm *nvdimm, int cmd,
diff --git a/tools/testing/nvdimm/test/nfit.c b/tools/testing/nvdimm/test/nfit.c
index 3187322..ed899a4 100644
--- a/tools/testing/nvdimm/test/nfit.c
+++ b/tools/testing/nvdimm/test/nfit.c
@@ -344,8 +344,9 @@ static int nfit_test_ctl(struct nvdimm_bus_descriptor *nd_desc,
 
 	if (nvdimm) {
 		struct nfit_mem *nfit_mem = nvdimm_provider_data(nvdimm);
+		unsigned long cmd_mask = nvdimm_cmd_mask(nvdimm);
 
-		if (!nfit_mem || !test_bit(cmd, &nfit_mem->dsm_mask))
+		if (!nfit_mem || !test_bit(cmd, &cmd_mask))
 			return -ENOTTY;
 
 		/* lookup label space for the given dimm */
@@ -374,7 +375,7 @@ static int nfit_test_ctl(struct nvdimm_bus_descriptor *nd_desc,
 	} else {
 		struct ars_state *ars_state = &t->ars_state;
 
-		if (!nd_desc || !test_bit(cmd, &nd_desc->dsm_mask))
+		if (!nd_desc || !test_bit(cmd, &nd_desc->cmd_mask))
 			return -ENOTTY;
 
 		switch (cmd) {
@@ -1251,13 +1252,13 @@ static void nfit_test0_setup(struct nfit_test *t)
 	post_ars_status(&t->ars_state, t->spa_set_dma[0], SPA0_SIZE);
 
 	acpi_desc = &t->acpi_desc;
-	set_bit(ND_CMD_GET_CONFIG_SIZE, &acpi_desc->dimm_dsm_force_en);
-	set_bit(ND_CMD_GET_CONFIG_DATA, &acpi_desc->dimm_dsm_force_en);
-	set_bit(ND_CMD_SET_CONFIG_DATA, &acpi_desc->dimm_dsm_force_en);
-	set_bit(ND_CMD_ARS_CAP, &acpi_desc->bus_dsm_force_en);
-	set_bit(ND_CMD_ARS_START, &acpi_desc->bus_dsm_force_en);
-	set_bit(ND_CMD_ARS_STATUS, &acpi_desc->bus_dsm_force_en);
-	set_bit(ND_CMD_CLEAR_ERROR, &acpi_desc->bus_dsm_force_en);
+	set_bit(ND_CMD_GET_CONFIG_SIZE, &acpi_desc->dimm_cmd_force_en);
+	set_bit(ND_CMD_GET_CONFIG_DATA, &acpi_desc->dimm_cmd_force_en);
+	set_bit(ND_CMD_SET_CONFIG_DATA, &acpi_desc->dimm_cmd_force_en);
+	set_bit(ND_CMD_ARS_CAP, &acpi_desc->bus_cmd_force_en);
+	set_bit(ND_CMD_ARS_START, &acpi_desc->bus_cmd_force_en);
+	set_bit(ND_CMD_ARS_STATUS, &acpi_desc->bus_cmd_force_en);
+	set_bit(ND_CMD_CLEAR_ERROR, &acpi_desc->bus_cmd_force_en);
 }
 
 static void nfit_test1_setup(struct nfit_test *t)
@@ -1315,10 +1316,10 @@ static void nfit_test1_setup(struct nfit_test *t)
 	post_ars_status(&t->ars_state, t->spa_set_dma[0], SPA2_SIZE);
 
 	acpi_desc = &t->acpi_desc;
-	set_bit(ND_CMD_ARS_CAP, &acpi_desc->bus_dsm_force_en);
-	set_bit(ND_CMD_ARS_START, &acpi_desc->bus_dsm_force_en);
-	set_bit(ND_CMD_ARS_STATUS, &acpi_desc->bus_dsm_force_en);
-	set_bit(ND_CMD_CLEAR_ERROR, &acpi_desc->bus_dsm_force_en);
+	set_bit(ND_CMD_ARS_CAP, &acpi_desc->bus_cmd_force_en);
+	set_bit(ND_CMD_ARS_START, &acpi_desc->bus_cmd_force_en);
+	set_bit(ND_CMD_ARS_STATUS, &acpi_desc->bus_cmd_force_en);
+	set_bit(ND_CMD_CLEAR_ERROR, &acpi_desc->bus_cmd_force_en);
 }
 
 static int nfit_test_blk_do_io(struct nd_blk_region *ndbr, resource_size_t dpa,
-- 
cgit v0.10.2


From 31eca76ba2fc988bf88f16fcf763a0ec4068cd30 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Thu, 28 Apr 2016 16:23:43 -0700
Subject: nfit, libnvdimm: limited/whitelisted dimm command marshaling
 mechanism

There are currently 4 known similar but incompatible definitions of the
command sets that can be sent to an NVDIMM through ACPI.  It is also
clear that future platform generations (ACPI or not) will continue to
revise and extend the DIMM command set as new devices and use cases
arrive.

It is obviously untenable to continue to proliferate divergence
of these command definitions, and to that end a standardization process
has begun to provide for a unified specification.  However, that leaves a
problem about what to do with this first generation where vendors are
already shipping divergence.

The Linux kernel can support these initial diverged platforms without
giving platform-firmware free rein to continue to diverge and compound
kernel maintenance overhead.  The kernel implementation can encourage
standardization in two ways:

1/ Require that any function code that userspace wants to send be
   explicitly white-listed in the implementation.  For ACPI this means
   function codes marked as supported by acpi_check_dsm() may
   only be invoked if they appear in the white-list.  A function must be
   publicly documented before it is added to the white-list.

2/ The above restrictions can be trivially bypassed by using the
   "vendor-specific" payload command.  However, since vendor-specific
   commands are by definition not publicly documented and have the
   potential to corrupt the kernel's view of the dimm state, we provide a
   toggle to disable vendor-specific operations.  Enabling undefined
   behavior is a policy decision that can be made by the platform owner
   and encourages firmware implementations to choose public over
   private command implementations.

Based on an initial patch from Jerry Hoemann
Cc: Jerry Hoemann <jerry.hoemann@hpe.com>
Cc: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>

diff --git a/drivers/acpi/nfit.c b/drivers/acpi/nfit.c
index 1b98e9d..b85a468 100644
--- a/drivers/acpi/nfit.c
+++ b/drivers/acpi/nfit.c
@@ -171,33 +171,46 @@ static int acpi_nfit_ctl(struct nvdimm_bus_descriptor *nd_desc,
 		unsigned int buf_len, int *cmd_rc)
 {
 	struct acpi_nfit_desc *acpi_desc = to_acpi_nfit_desc(nd_desc);
-	const struct nd_cmd_desc *desc = NULL;
 	union acpi_object in_obj, in_buf, *out_obj;
+	const struct nd_cmd_desc *desc = NULL;
 	struct device *dev = acpi_desc->dev;
+	struct nd_cmd_pkg *call_pkg = NULL;
 	const char *cmd_name, *dimm_name;
-	unsigned long cmd_mask;
+	unsigned long cmd_mask, dsm_mask;
 	acpi_handle handle;
+	unsigned int func;
 	const u8 *uuid;
 	u32 offset;
 	int rc, i;
 
+	func = cmd;
+	if (cmd == ND_CMD_CALL) {
+		call_pkg = buf;
+		func = call_pkg->nd_command;
+	}
+
 	if (nvdimm) {
 		struct nfit_mem *nfit_mem = nvdimm_provider_data(nvdimm);
 		struct acpi_device *adev = nfit_mem->adev;
 
 		if (!adev)
 			return -ENOTTY;
+		if (call_pkg && nfit_mem->family != call_pkg->nd_family)
+			return -ENOTTY;
+
 		dimm_name = nvdimm_name(nvdimm);
 		cmd_name = nvdimm_cmd_name(cmd);
 		cmd_mask = nvdimm_cmd_mask(nvdimm);
+		dsm_mask = nfit_mem->dsm_mask;
 		desc = nd_cmd_dimm_desc(cmd);
-		uuid = to_nfit_uuid(NFIT_DEV_DIMM);
+		uuid = to_nfit_uuid(nfit_mem->family);
 		handle = adev->handle;
 	} else {
 		struct acpi_device *adev = to_acpi_dev(acpi_desc);
 
 		cmd_name = nvdimm_bus_cmd_name(cmd);
 		cmd_mask = nd_desc->cmd_mask;
+		dsm_mask = cmd_mask;
 		desc = nd_cmd_bus_desc(cmd);
 		uuid = to_nfit_uuid(NFIT_DEV_BUS);
 		handle = adev->handle;
@@ -207,7 +220,7 @@ static int acpi_nfit_ctl(struct nvdimm_bus_descriptor *nd_desc,
 	if (!desc || (cmd && (desc->out_num + desc->in_num == 0)))
 		return -ENOTTY;
 
-	if (!test_bit(cmd, &cmd_mask))
+	if (!test_bit(cmd, &cmd_mask) || !test_bit(func, &dsm_mask))
 		return -ENOTTY;
 
 	in_obj.type = ACPI_TYPE_PACKAGE;
@@ -222,21 +235,44 @@ static int acpi_nfit_ctl(struct nvdimm_bus_descriptor *nd_desc,
 		in_buf.buffer.length += nd_cmd_in_size(nvdimm, cmd, desc,
 				i, buf);
 
+	if (call_pkg) {
+		/* skip over package wrapper */
+		in_buf.buffer.pointer = (void *) &call_pkg->nd_payload;
+		in_buf.buffer.length = call_pkg->nd_size_in;
+	}
+
 	if (IS_ENABLED(CONFIG_ACPI_NFIT_DEBUG)) {
-		dev_dbg(dev, "%s:%s cmd: %s input length: %d\n", __func__,
-				dimm_name, cmd_name, in_buf.buffer.length);
-		print_hex_dump_debug(cmd_name, DUMP_PREFIX_OFFSET, 4,
-				4, in_buf.buffer.pointer, min_t(u32, 128,
-					in_buf.buffer.length), true);
+		dev_dbg(dev, "%s:%s cmd: %d: func: %d input length: %d\n",
+				__func__, dimm_name, cmd, func,
+				in_buf.buffer.length);
+		print_hex_dump_debug("nvdimm in  ", DUMP_PREFIX_OFFSET, 4, 4,
+			in_buf.buffer.pointer,
+			min_t(u32, 256, in_buf.buffer.length), true);
 	}
 
-	out_obj = acpi_evaluate_dsm(handle, uuid, 1, cmd, &in_obj);
+	out_obj = acpi_evaluate_dsm(handle, uuid, 1, func, &in_obj);
 	if (!out_obj) {
 		dev_dbg(dev, "%s:%s _DSM failed cmd: %s\n", __func__, dimm_name,
 				cmd_name);
 		return -EINVAL;
 	}
 
+	if (call_pkg) {
+		call_pkg->nd_fw_size = out_obj->buffer.length;
+		memcpy(call_pkg->nd_payload + call_pkg->nd_size_in,
+			out_obj->buffer.pointer,
+			min(call_pkg->nd_fw_size, call_pkg->nd_size_out));
+
+		ACPI_FREE(out_obj);
+		/*
+		 * Need to support FW function w/o known size in advance.
+		 * Caller can determine required size based upon nd_fw_size.
+		 * If we return an error (like elsewhere) then caller wouldn't
+		 * be able to rely upon data returned to make calculation.
+		 */
+		return 0;
+	}
+
 	if (out_obj->package.type != ACPI_TYPE_BUFFER) {
 		dev_dbg(dev, "%s:%s unexpected output object type cmd: %s type: %d\n",
 				__func__, dimm_name, cmd_name, out_obj->type);
@@ -923,11 +959,13 @@ static int acpi_nfit_add_dimm(struct acpi_nfit_desc *acpi_desc,
 {
 	struct acpi_device *adev, *adev_dimm;
 	struct device *dev = acpi_desc->dev;
-	const u8 *uuid = to_nfit_uuid(NFIT_DEV_DIMM);
+	unsigned long dsm_mask;
+	const u8 *uuid;
 	int i;
 
 	/* nfit test assumes 1:1 relationship between commands and dsms */
 	nfit_mem->dsm_mask = acpi_desc->dimm_cmd_force_en;
+	nfit_mem->family = NVDIMM_FAMILY_INTEL;
 	adev = to_acpi_dev(acpi_desc);
 	if (!adev)
 		return 0;
@@ -940,7 +978,31 @@ static int acpi_nfit_add_dimm(struct acpi_nfit_desc *acpi_desc,
 		return force_enable_dimms ? 0 : -ENODEV;
 	}
 
-	for (i = ND_CMD_SMART; i <= ND_CMD_VENDOR; i++)
+	/*
+	 * Until standardization materializes we need to consider up to 3
+	 * different command sets.  Note, that checking for function0 (bit0)
+	 * tells us if any commands are reachable through this uuid.
+	 */
+	for (i = NVDIMM_FAMILY_INTEL; i <= NVDIMM_FAMILY_HPE2; i++)
+		if (acpi_check_dsm(adev_dimm->handle, to_nfit_uuid(i), 1, 1))
+			break;
+
+	/* limit the supported commands to those that are publicly documented */
+	nfit_mem->family = i;
+	if (nfit_mem->family == NVDIMM_FAMILY_INTEL)
+		dsm_mask = 0x3fe;
+	else if (nfit_mem->family == NVDIMM_FAMILY_HPE1)
+		dsm_mask = 0x1c3c76;
+	else if (nfit_mem->family == NVDIMM_FAMILY_HPE2)
+		dsm_mask = 0x1fe;
+	else {
+		dev_err(dev, "unknown dimm command family\n");
+		nfit_mem->family = -1;
+		return force_enable_dimms ? 0 : -ENODEV;
+	}
+
+	uuid = to_nfit_uuid(nfit_mem->family);
+	for_each_set_bit(i, &dsm_mask, BITS_PER_LONG)
 		if (acpi_check_dsm(adev_dimm->handle, uuid, 1, 1ULL << i))
 			set_bit(i, &nfit_mem->dsm_mask);
 
@@ -953,8 +1015,8 @@ static int acpi_nfit_register_dimms(struct acpi_nfit_desc *acpi_desc)
 	int dimm_count = 0;
 
 	list_for_each_entry(nfit_mem, &acpi_desc->dimms, list) {
+		unsigned long flags = 0, cmd_mask;
 		struct nvdimm *nvdimm;
-		unsigned long flags = 0;
 		u32 device_handle;
 		u16 mem_flags;
 		int rc;
@@ -978,12 +1040,17 @@ static int acpi_nfit_register_dimms(struct acpi_nfit_desc *acpi_desc)
 			continue;
 
 		/*
-		 * For now there is 1:1 relationship between cmd_mask and
-		 * dsm_mask.
+		 * TODO: provide translation for non-NVDIMM_FAMILY_INTEL
+		 * devices (i.e. from nd_cmd to acpi_dsm) to standardize the
+		 * userspace interface.
 		 */
+		cmd_mask = 1UL << ND_CMD_CALL;
+		if (nfit_mem->family == NVDIMM_FAMILY_INTEL)
+			cmd_mask |= nfit_mem->dsm_mask;
+
 		nvdimm = nvdimm_create(acpi_desc->nvdimm_bus, nfit_mem,
 				acpi_nfit_dimm_attribute_groups,
-				flags, nfit_mem->dsm_mask);
+				flags, cmd_mask);
 		if (!nvdimm)
 			return -ENOMEM;
 
@@ -2468,6 +2535,8 @@ static __init int nfit_init(void)
 	acpi_str_to_uuid(UUID_PERSISTENT_VIRTUAL_CD, nfit_uuid[NFIT_SPA_PCD]);
 	acpi_str_to_uuid(UUID_NFIT_BUS, nfit_uuid[NFIT_DEV_BUS]);
 	acpi_str_to_uuid(UUID_NFIT_DIMM, nfit_uuid[NFIT_DEV_DIMM]);
+	acpi_str_to_uuid(UUID_NFIT_DIMM_N_HPE1, nfit_uuid[NFIT_DEV_DIMM_N_HPE1]);
+	acpi_str_to_uuid(UUID_NFIT_DIMM_N_HPE2, nfit_uuid[NFIT_DEV_DIMM_N_HPE2]);
 
 	nfit_wq = create_singlethread_workqueue("nfit");
 	if (!nfit_wq)
diff --git a/drivers/acpi/nfit.h b/drivers/acpi/nfit.h
index 332ee6f..f82fda5 100644
--- a/drivers/acpi/nfit.h
+++ b/drivers/acpi/nfit.h
@@ -21,13 +21,25 @@
 #include <linux/acpi.h>
 #include <acpi/acuuid.h>
 
+/* ACPI 6.1 */
 #define UUID_NFIT_BUS "2f10e7a4-9e91-11e4-89d3-123b93f75cba"
+
+/* http://pmem.io/documents/NVDIMM_DSM_Interface_Example.pdf */
 #define UUID_NFIT_DIMM "4309ac30-0d11-11e4-9191-0800200c9a66"
+
+/* https://github.com/HewlettPackard/hpe-nvm/blob/master/Documentation/ */
+#define UUID_NFIT_DIMM_N_HPE1 "9002c334-acf3-4c0e-9642-a235f0d53bc6"
+#define UUID_NFIT_DIMM_N_HPE2 "5008664b-b758-41a0-a03c-27c2f2d04f7e"
+
 #define ACPI_NFIT_MEM_FAILED_MASK (ACPI_NFIT_MEM_SAVE_FAILED \
 		| ACPI_NFIT_MEM_RESTORE_FAILED | ACPI_NFIT_MEM_FLUSH_FAILED \
 		| ACPI_NFIT_MEM_NOT_ARMED)
 
 enum nfit_uuids {
+	/* for simplicity alias the uuid index with the family id */
+	NFIT_DEV_DIMM = NVDIMM_FAMILY_INTEL,
+	NFIT_DEV_DIMM_N_HPE1 = NVDIMM_FAMILY_HPE1,
+	NFIT_DEV_DIMM_N_HPE2 = NVDIMM_FAMILY_HPE2,
 	NFIT_SPA_VOLATILE,
 	NFIT_SPA_PM,
 	NFIT_SPA_DCR,
@@ -37,7 +49,6 @@ enum nfit_uuids {
 	NFIT_SPA_PDISK,
 	NFIT_SPA_PCD,
 	NFIT_DEV_BUS,
-	NFIT_DEV_DIMM,
 	NFIT_UUID_MAX,
 };
 
@@ -110,6 +121,7 @@ struct nfit_mem {
 	struct list_head list;
 	struct acpi_device *adev;
 	unsigned long dsm_mask;
+	int family;
 };
 
 struct acpi_nfit_desc {
diff --git a/drivers/nvdimm/bus.c b/drivers/nvdimm/bus.c
index cb2042a..395a9fb 100644
--- a/drivers/nvdimm/bus.c
+++ b/drivers/nvdimm/bus.c
@@ -439,6 +439,12 @@ static const struct nd_cmd_desc __nd_cmd_dimm_descs[] = {
 		.out_num = 3,
 		.out_sizes = { 4, 4, UINT_MAX, },
 	},
+	[ND_CMD_CALL] = {
+		.in_num = 2,
+		.in_sizes = { sizeof(struct nd_cmd_pkg), UINT_MAX, },
+		.out_num = 1,
+		.out_sizes = { UINT_MAX, },
+	},
 };
 
 const struct nd_cmd_desc *nd_cmd_dimm_desc(int cmd)
@@ -473,6 +479,12 @@ static const struct nd_cmd_desc __nd_cmd_bus_descs[] = {
 		.out_num = 3,
 		.out_sizes = { 4, 4, 8, },
 	},
+	[ND_CMD_CALL] = {
+		.in_num = 2,
+		.in_sizes = { sizeof(struct nd_cmd_pkg), UINT_MAX, },
+		.out_num = 1,
+		.out_sizes = { UINT_MAX, },
+	},
 };
 
 const struct nd_cmd_desc *nd_cmd_bus_desc(int cmd)
@@ -500,6 +512,10 @@ u32 nd_cmd_in_size(struct nvdimm *nvdimm, int cmd,
 		struct nd_cmd_vendor_hdr *hdr = buf;
 
 		return hdr->in_length;
+	} else if (cmd == ND_CMD_CALL) {
+		struct nd_cmd_pkg *pkg = buf;
+
+		return pkg->nd_size_in;
 	}
 
 	return UINT_MAX;
@@ -522,6 +538,12 @@ u32 nd_cmd_out_size(struct nvdimm *nvdimm, int cmd,
 		return out_field[1];
 	else if (!nvdimm && cmd == ND_CMD_ARS_STATUS && idx == 2)
 		return out_field[1] - 8;
+	else if (cmd == ND_CMD_CALL) {
+		struct nd_cmd_pkg *pkg = (struct nd_cmd_pkg *) in_field;
+
+		return pkg->nd_size_out;
+	}
+
 
 	return UINT_MAX;
 }
@@ -588,6 +610,7 @@ static int __nd_ioctl(struct nvdimm_bus *nvdimm_bus, struct nvdimm *nvdimm,
 	unsigned int cmd = _IOC_NR(ioctl_cmd);
 	void __user *p = (void __user *) arg;
 	struct device *dev = &nvdimm_bus->dev;
+	struct nd_cmd_pkg pkg;
 	const char *cmd_name, *dimm_name;
 	unsigned long cmd_mask;
 	void *buf;
@@ -605,6 +628,11 @@ static int __nd_ioctl(struct nvdimm_bus *nvdimm_bus, struct nvdimm *nvdimm,
 		dimm_name = "bus";
 	}
 
+	if (cmd == ND_CMD_CALL) {
+		if (copy_from_user(&pkg, p, sizeof(pkg)))
+			return -EFAULT;
+	}
+
 	if (!desc || (desc->out_num + desc->in_num == 0) ||
 			!test_bit(cmd, &cmd_mask))
 		return -ENOTTY;
@@ -616,6 +644,7 @@ static int __nd_ioctl(struct nvdimm_bus *nvdimm_bus, struct nvdimm *nvdimm,
 		case ND_CMD_SET_CONFIG_DATA:
 		case ND_CMD_ARS_START:
 		case ND_CMD_CLEAR_ERROR:
+		case ND_CMD_CALL:
 			dev_dbg(&nvdimm_bus->dev, "'%s' command while read-only.\n",
 					nvdimm ? nvdimm_cmd_name(cmd)
 					: nvdimm_bus_cmd_name(cmd));
@@ -643,6 +672,16 @@ static int __nd_ioctl(struct nvdimm_bus *nvdimm_bus, struct nvdimm *nvdimm,
 		in_len += in_size;
 	}
 
+	if (cmd == ND_CMD_CALL) {
+		dev_dbg(dev, "%s:%s, idx: %llu, in: %zu, out: %zu, len %zu\n",
+				__func__, dimm_name, pkg.nd_command,
+				in_len, out_len, buf_len);
+
+		for (i = 0; i < ARRAY_SIZE(pkg.nd_reserved2); i++)
+			if (pkg.nd_reserved2[i])
+				return -EINVAL;
+	}
+
 	/* process an output envelope */
 	for (i = 0; i < desc->out_num; i++) {
 		u32 out_size = nd_cmd_out_size(nvdimm, cmd, desc, i,
diff --git a/include/uapi/linux/ndctl.h b/include/uapi/linux/ndctl.h
index 7cc28ab..45daa0b 100644
--- a/include/uapi/linux/ndctl.h
+++ b/include/uapi/linux/ndctl.h
@@ -125,6 +125,7 @@ enum {
 	ND_CMD_VENDOR_EFFECT_LOG_SIZE = 7,
 	ND_CMD_VENDOR_EFFECT_LOG = 8,
 	ND_CMD_VENDOR = 9,
+	ND_CMD_CALL = 10,
 };
 
 enum {
@@ -158,6 +159,7 @@ static inline const char *nvdimm_cmd_name(unsigned cmd)
 		[ND_CMD_VENDOR_EFFECT_LOG_SIZE] = "effect_size",
 		[ND_CMD_VENDOR_EFFECT_LOG] = "effect_log",
 		[ND_CMD_VENDOR] = "vendor",
+		[ND_CMD_CALL] = "cmd_call",
 	};
 
 	if (cmd < ARRAY_SIZE(names) && names[cmd])
@@ -224,4 +226,44 @@ enum ars_masks {
 	ARS_STATUS_MASK = 0x0000FFFF,
 	ARS_EXT_STATUS_SHIFT = 16,
 };
+
+/*
+ * struct nd_cmd_pkg
+ *
+ * is a wrapper to a quasi pass thru interface for invoking firmware
+ * associated with nvdimms.
+ *
+ * INPUT PARAMETERS
+ *
+ * nd_family corresponds to the firmware (e.g. DSM) interface.
+ *
+ * nd_command is the function index advertised by the firmware.
+ *
+ * nd_size_in is the size of the input parameters being passed to firmware
+ *
+ * OUTPUT PARAMETERS
+ *
+ * nd_fw_size is the size of the data firmware wants to return for
+ * the call.  If nd_fw_size is greater than nd_size_out, only
+ * the first nd_size_out bytes are returned.
+ */
+
+struct nd_cmd_pkg {
+	__u64   nd_family;		/* family of commands */
+	__u64   nd_command;
+	__u32   nd_size_in;		/* INPUT: size of input args */
+	__u32   nd_size_out;		/* INPUT: size of payload */
+	__u32   nd_reserved2[9];	/* reserved must be zero */
+	__u32   nd_fw_size;		/* OUTPUT: size fw wants to return */
+	unsigned char nd_payload[];	/* Contents of call      */
+};
+
+/* These NVDIMM families represent pre-standardization command sets */
+#define NVDIMM_FAMILY_INTEL 0
+#define NVDIMM_FAMILY_HPE1 1
+#define NVDIMM_FAMILY_HPE2 2
+
+#define ND_IOCTL_CALL			_IOWR(ND_IOCTL, ND_CMD_CALL,\
+					struct nd_cmd_pkg)
+
 #endif /* __NDCTL_H__ */
-- 
cgit v0.10.2


From 30ec5fd464d51876247302da276db082e5675c35 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Thu, 28 Apr 2016 18:35:23 -0700
Subject: nfit: fix format interface code byte order per ACPI6.1

ACPI6.1 clarifies that DCR fields are stored as an array of bytes,
update the format interface code constants to match.

Reviewed-by: Toshi Kani <toshi.kani@hpe.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>

diff --git a/drivers/acpi/nfit.h b/drivers/acpi/nfit.h
index c75576b..c1b4eb2 100644
--- a/drivers/acpi/nfit.h
+++ b/drivers/acpi/nfit.h
@@ -41,11 +41,13 @@ enum nfit_uuids {
 	NFIT_UUID_MAX,
 };
 
-enum nfit_fic {
-	NFIT_FIC_BYTE = 0x101, /* byte-addressable energy backed */
-	NFIT_FIC_BLK = 0x201, /* block-addressable non-energy backed */
-	NFIT_FIC_BYTEN = 0x301, /* byte-addressable non-energy backed */
-};
+/*
+ * Region format interface codes are stored as an array of bytes in the
+ * NFIT DIMM Control Region structure
+ */
+#define NFIT_FIC_BYTE cpu_to_be16(0x101) /* byte-addressable energy backed */
+#define NFIT_FIC_BLK cpu_to_be16(0x201) /* block-addressable non-energy backed */
+#define NFIT_FIC_BYTEN cpu_to_be16(0x301) /* byte-addressable non-energy backed */
 
 enum {
 	NFIT_BLK_READ_FLUSH = 1,
-- 
cgit v0.10.2


From 6ca7208569550de43d64db6cf873706c371284a5 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Fri, 29 Apr 2016 10:33:23 -0700
Subject: nfit: export subsystem ids as attributes

Similar to pci-sysfs, export the subsystem information available in the
NFIT.  ACPI 6.1 clarifies that this data is copied as an array of bytes
from the DIMM SPD data.

Reported-by: Ryon Jensen <ryon.jensen@intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>

diff --git a/drivers/acpi/nfit.c b/drivers/acpi/nfit.c
index 5a7199d..0a1ba3d 100644
--- a/drivers/acpi/nfit.c
+++ b/drivers/acpi/nfit.c
@@ -847,6 +847,34 @@ static ssize_t format_show(struct device *dev,
 }
 static DEVICE_ATTR_RO(format);
 
+static ssize_t subsystem_vendor_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct acpi_nfit_control_region *dcr = to_nfit_dcr(dev);
+
+	return sprintf(buf, "0x%04x\n", be16_to_cpu(dcr->subsystem_vendor_id));
+}
+static DEVICE_ATTR_RO(subsystem_vendor);
+
+static ssize_t subsystem_rev_id_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct acpi_nfit_control_region *dcr = to_nfit_dcr(dev);
+
+	return sprintf(buf, "0x%04x\n",
+			be16_to_cpu(dcr->subsystem_revision_id));
+}
+static DEVICE_ATTR_RO(subsystem_rev_id);
+
+static ssize_t subsystem_device_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct acpi_nfit_control_region *dcr = to_nfit_dcr(dev);
+
+	return sprintf(buf, "0x%04x\n", be16_to_cpu(dcr->subsystem_device_id));
+}
+static DEVICE_ATTR_RO(subsystem_device);
+
 static ssize_t serial_show(struct device *dev,
 		struct device_attribute *attr, char *buf)
 {
@@ -893,9 +921,12 @@ static struct attribute *acpi_nfit_dimm_attributes[] = {
 	&dev_attr_phys_id.attr,
 	&dev_attr_vendor.attr,
 	&dev_attr_device.attr,
+	&dev_attr_rev_id.attr,
+	&dev_attr_subsystem_vendor.attr,
+	&dev_attr_subsystem_device.attr,
+	&dev_attr_subsystem_rev_id.attr,
 	&dev_attr_format.attr,
 	&dev_attr_serial.attr,
-	&dev_attr_rev_id.attr,
 	&dev_attr_flags.attr,
 	&dev_attr_id.attr,
 	NULL,
-- 
cgit v0.10.2


From 87554098fec74a6c4a8cbea0d9adea2e8868e9e4 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Thu, 28 Apr 2016 18:01:20 -0700
Subject: nfit: disable vendor specific commands

Module option to limit userspace to the publicly defined command set.
For cases where private DIMM commands may be interfering with the
kernel's handling of DIMM state this option can be set to block vendor
specific commands.

Cc: Jerry Hoemann <jerry.hoemann@hpe.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>

diff --git a/drivers/acpi/nfit.c b/drivers/acpi/nfit.c
index b85a468..ad4fc86 100644
--- a/drivers/acpi/nfit.c
+++ b/drivers/acpi/nfit.c
@@ -45,6 +45,11 @@ module_param(scrub_overflow_abort, uint, S_IRUGO|S_IWUSR);
 MODULE_PARM_DESC(scrub_overflow_abort,
 		"Number of times we overflow ARS results before abort");
 
+static bool disable_vendor_specific;
+module_param(disable_vendor_specific, bool, S_IRUGO);
+MODULE_PARM_DESC(disable_vendor_specific,
+		"Limit commands to the publicly specified set\n");
+
 static struct workqueue_struct *nfit_wq;
 
 struct nfit_table_prev {
@@ -989,13 +994,17 @@ static int acpi_nfit_add_dimm(struct acpi_nfit_desc *acpi_desc,
 
 	/* limit the supported commands to those that are publicly documented */
 	nfit_mem->family = i;
-	if (nfit_mem->family == NVDIMM_FAMILY_INTEL)
+	if (nfit_mem->family == NVDIMM_FAMILY_INTEL) {
 		dsm_mask = 0x3fe;
-	else if (nfit_mem->family == NVDIMM_FAMILY_HPE1)
+		if (disable_vendor_specific)
+			dsm_mask &= ~(1 << ND_CMD_VENDOR);
+	} else if (nfit_mem->family == NVDIMM_FAMILY_HPE1)
 		dsm_mask = 0x1c3c76;
-	else if (nfit_mem->family == NVDIMM_FAMILY_HPE2)
+	else if (nfit_mem->family == NVDIMM_FAMILY_HPE2) {
 		dsm_mask = 0x1fe;
-	else {
+		if (disable_vendor_specific)
+			dsm_mask &= ~(1 << 8);
+	} else {
 		dev_err(dev, "unknown dimm command family\n");
 		nfit_mem->family = -1;
 		return force_enable_dimms ? 0 : -ENODEV;
-- 
cgit v0.10.2


From 6634fb06906f52a3a3125e88681a7fa6e353f31d Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Wed, 27 Apr 2016 16:46:15 -0600
Subject: tools/testing/nvdimm: ND_CMD_CALL support

Enable nfit_test to use nd_cmd_pkg marshaling.

Signed-off-by: Dan Williams <dan.j.williams@intel.com>

diff --git a/tools/testing/nvdimm/test/nfit.c b/tools/testing/nvdimm/test/nfit.c
index ed899a4..e09a300 100644
--- a/tools/testing/nvdimm/test/nfit.c
+++ b/tools/testing/nvdimm/test/nfit.c
@@ -336,6 +336,7 @@ static int nfit_test_ctl(struct nvdimm_bus_descriptor *nd_desc,
 {
 	struct acpi_nfit_desc *acpi_desc = to_acpi_desc(nd_desc);
 	struct nfit_test *t = container_of(acpi_desc, typeof(*t), acpi_desc);
+	unsigned int func = cmd;
 	int i, rc = 0, __cmd_rc;
 
 	if (!cmd_rc)
@@ -346,7 +347,21 @@ static int nfit_test_ctl(struct nvdimm_bus_descriptor *nd_desc,
 		struct nfit_mem *nfit_mem = nvdimm_provider_data(nvdimm);
 		unsigned long cmd_mask = nvdimm_cmd_mask(nvdimm);
 
-		if (!nfit_mem || !test_bit(cmd, &cmd_mask))
+		if (!nfit_mem)
+			return -ENOTTY;
+
+		if (cmd == ND_CMD_CALL) {
+			struct nd_cmd_pkg *call_pkg = buf;
+
+			buf_len = call_pkg->nd_size_in + call_pkg->nd_size_out;
+			buf = (void *) call_pkg->nd_payload;
+			func = call_pkg->nd_command;
+			if (call_pkg->nd_family != nfit_mem->family)
+				return -ENOTTY;
+		}
+
+		if (!test_bit(cmd, &cmd_mask)
+				|| !test_bit(func, &nfit_mem->dsm_mask))
 			return -ENOTTY;
 
 		/* lookup label space for the given dimm */
@@ -357,7 +372,7 @@ static int nfit_test_ctl(struct nvdimm_bus_descriptor *nd_desc,
 		if (i >= ARRAY_SIZE(handle))
 			return -ENXIO;
 
-		switch (cmd) {
+		switch (func) {
 		case ND_CMD_GET_CONFIG_SIZE:
 			rc = nfit_test_cmd_get_config_size(buf, buf_len);
 			break;
@@ -378,7 +393,7 @@ static int nfit_test_ctl(struct nvdimm_bus_descriptor *nd_desc,
 		if (!nd_desc || !test_bit(cmd, &nd_desc->cmd_mask))
 			return -ENOTTY;
 
-		switch (cmd) {
+		switch (func) {
 		case ND_CMD_ARS_CAP:
 			rc = nfit_test_cmd_ars_cap(buf, buf_len);
 			break;
-- 
cgit v0.10.2


From a94e3fbe4d53d4e512c4ea88a475e605b8d8dccb Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Thu, 28 Apr 2016 18:18:05 -0700
Subject: nfit: add sysfs dimm 'family' and 'dsm_mask' attributes

Communicate the command format and supported functions to userspace
tooling.

Signed-off-by: Dan Williams <dan.j.williams@intel.com>

diff --git a/drivers/acpi/nfit.c b/drivers/acpi/nfit.c
index ad4fc86..bf2d7a2 100644
--- a/drivers/acpi/nfit.c
+++ b/drivers/acpi/nfit.c
@@ -897,6 +897,30 @@ static ssize_t serial_show(struct device *dev,
 }
 static DEVICE_ATTR_RO(serial);
 
+static ssize_t family_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct nvdimm *nvdimm = to_nvdimm(dev);
+	struct nfit_mem *nfit_mem = nvdimm_provider_data(nvdimm);
+
+	if (nfit_mem->family < 0)
+		return -ENXIO;
+	return sprintf(buf, "%d\n", nfit_mem->family);
+}
+static DEVICE_ATTR_RO(family);
+
+static ssize_t dsm_mask_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct nvdimm *nvdimm = to_nvdimm(dev);
+	struct nfit_mem *nfit_mem = nvdimm_provider_data(nvdimm);
+
+	if (nfit_mem->family < 0)
+		return -ENXIO;
+	return sprintf(buf, "%#lx\n", nfit_mem->dsm_mask);
+}
+static DEVICE_ATTR_RO(dsm_mask);
+
 static ssize_t flags_show(struct device *dev,
 		struct device_attribute *attr, char *buf)
 {
@@ -920,6 +944,8 @@ static struct attribute *acpi_nfit_dimm_attributes[] = {
 	&dev_attr_serial.attr,
 	&dev_attr_rev_id.attr,
 	&dev_attr_flags.attr,
+	&dev_attr_family.attr,
+	&dev_attr_dsm_mask.attr,
 	NULL,
 };
 
-- 
cgit v0.10.2


From cd03412a51ac4cb3001a8cdfae4560c9602f3387 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Fri, 11 Mar 2016 10:15:36 -0800
Subject: libnvdimm, dax: introduce device-dax infrastructure

Device DAX is the device-centric analogue of Filesystem DAX
(CONFIG_FS_DAX).  It allows persistent memory ranges to be allocated and
mapped without need of an intervening file system.  This initial
infrastructure arranges for a libnvdimm pfn-device to be represented as
a different device-type so that it can be attached to a driver other
than the pmem driver.

Signed-off-by: Dan Williams <dan.j.williams@intel.com>

diff --git a/drivers/nvdimm/Kconfig b/drivers/nvdimm/Kconfig
index 53c1162..7c8a3bf 100644
--- a/drivers/nvdimm/Kconfig
+++ b/drivers/nvdimm/Kconfig
@@ -88,4 +88,17 @@ config NVDIMM_PFN
 
 	  Select Y if unsure
 
+config NVDIMM_DAX
+	bool "NVDIMM DAX: Raw access to persistent memory"
+	default LIBNVDIMM
+	depends on NVDIMM_PFN
+	help
+	  Support raw device dax access to a persistent memory
+	  namespace.  For environments that want to hard partition
+	  persistent memory, this capability provides a mechanism to
+	  sub-divide a namespace into character devices that can only be
+	  accessed via DAX (mmap(2)).
+
+	  Select Y if unsure
+
 endif
diff --git a/drivers/nvdimm/Makefile b/drivers/nvdimm/Makefile
index ea84d3c..909554c 100644
--- a/drivers/nvdimm/Makefile
+++ b/drivers/nvdimm/Makefile
@@ -23,3 +23,4 @@ libnvdimm-y += label.o
 libnvdimm-$(CONFIG_ND_CLAIM) += claim.o
 libnvdimm-$(CONFIG_BTT) += btt_devs.o
 libnvdimm-$(CONFIG_NVDIMM_PFN) += pfn_devs.o
+libnvdimm-$(CONFIG_NVDIMM_DAX) += dax_devs.o
diff --git a/drivers/nvdimm/bus.c b/drivers/nvdimm/bus.c
index 19f822d..97589e3 100644
--- a/drivers/nvdimm/bus.c
+++ b/drivers/nvdimm/bus.c
@@ -40,6 +40,8 @@ static int to_nd_device_type(struct device *dev)
 		return ND_DEVICE_REGION_PMEM;
 	else if (is_nd_blk(dev))
 		return ND_DEVICE_REGION_BLK;
+	else if (is_nd_dax(dev))
+		return ND_DEVICE_DAX_PMEM;
 	else if (is_nd_pmem(dev->parent) || is_nd_blk(dev->parent))
 		return nd_region_to_nstype(to_nd_region(dev->parent));
 
@@ -246,6 +248,8 @@ static void nd_async_device_unregister(void *d, async_cookie_t cookie)
 
 void __nd_device_register(struct device *dev)
 {
+	if (!dev)
+		return;
 	dev->bus = &nvdimm_bus_type;
 	get_device(dev);
 	async_schedule_domain(nd_async_device_register, dev,
diff --git a/drivers/nvdimm/claim.c b/drivers/nvdimm/claim.c
index 6bbd0a3..5f53db5 100644
--- a/drivers/nvdimm/claim.c
+++ b/drivers/nvdimm/claim.c
@@ -85,6 +85,8 @@ static bool is_idle(struct device *dev, struct nd_namespace_common *ndns)
 		seed = nd_region->btt_seed;
 	else if (is_nd_pfn(dev))
 		seed = nd_region->pfn_seed;
+	else if (is_nd_dax(dev))
+		seed = nd_region->dax_seed;
 
 	if (seed == dev || ndns || dev->driver)
 		return false;
diff --git a/drivers/nvdimm/dax_devs.c b/drivers/nvdimm/dax_devs.c
new file mode 100644
index 0000000..f90f754
--- /dev/null
+++ b/drivers/nvdimm/dax_devs.c
@@ -0,0 +1,99 @@
+/*
+ * Copyright(c) 2013-2016 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+#include <linux/device.h>
+#include <linux/sizes.h>
+#include <linux/slab.h>
+#include <linux/mm.h>
+#include "nd-core.h"
+#include "nd.h"
+
+static void nd_dax_release(struct device *dev)
+{
+	struct nd_region *nd_region = to_nd_region(dev->parent);
+	struct nd_dax *nd_dax = to_nd_dax(dev);
+	struct nd_pfn *nd_pfn = &nd_dax->nd_pfn;
+
+	dev_dbg(dev, "%s\n", __func__);
+	nd_detach_ndns(dev, &nd_pfn->ndns);
+	ida_simple_remove(&nd_region->dax_ida, nd_pfn->id);
+	kfree(nd_pfn->uuid);
+	kfree(nd_dax);
+}
+
+static struct device_type nd_dax_device_type = {
+	.name = "nd_dax",
+	.release = nd_dax_release,
+};
+
+bool is_nd_dax(struct device *dev)
+{
+	return dev ? dev->type == &nd_dax_device_type : false;
+}
+EXPORT_SYMBOL(is_nd_dax);
+
+struct nd_dax *to_nd_dax(struct device *dev)
+{
+	struct nd_dax *nd_dax = container_of(dev, struct nd_dax, nd_pfn.dev);
+
+	WARN_ON(!is_nd_dax(dev));
+	return nd_dax;
+}
+EXPORT_SYMBOL(to_nd_dax);
+
+static const struct attribute_group *nd_dax_attribute_groups[] = {
+	&nd_pfn_attribute_group,
+	&nd_device_attribute_group,
+	&nd_numa_attribute_group,
+	NULL,
+};
+
+static struct nd_dax *nd_dax_alloc(struct nd_region *nd_region)
+{
+	struct nd_pfn *nd_pfn;
+	struct nd_dax *nd_dax;
+	struct device *dev;
+
+	nd_dax = kzalloc(sizeof(*nd_dax), GFP_KERNEL);
+	if (!nd_dax)
+		return NULL;
+
+	nd_pfn = &nd_dax->nd_pfn;
+	nd_pfn->id = ida_simple_get(&nd_region->dax_ida, 0, 0, GFP_KERNEL);
+	if (nd_pfn->id < 0) {
+		kfree(nd_dax);
+		return NULL;
+	}
+
+	dev = &nd_pfn->dev;
+	dev_set_name(dev, "dax%d.%d", nd_region->id, nd_pfn->id);
+	dev->groups = nd_dax_attribute_groups;
+	dev->type = &nd_dax_device_type;
+	dev->parent = &nd_region->dev;
+
+	return nd_dax;
+}
+
+struct device *nd_dax_create(struct nd_region *nd_region)
+{
+	struct device *dev = NULL;
+	struct nd_dax *nd_dax;
+
+	if (!is_nd_pmem(&nd_region->dev))
+		return NULL;
+
+	nd_dax = nd_dax_alloc(nd_region);
+	if (nd_dax)
+		dev = nd_pfn_devinit(&nd_dax->nd_pfn, NULL);
+	__nd_device_register(dev);
+	return dev;
+}
diff --git a/drivers/nvdimm/namespace_devs.c b/drivers/nvdimm/namespace_devs.c
index e5ad516..c5e3196 100644
--- a/drivers/nvdimm/namespace_devs.c
+++ b/drivers/nvdimm/namespace_devs.c
@@ -1288,6 +1288,8 @@ static ssize_t mode_show(struct device *dev,
 		mode = "safe";
 	else if (claim && is_nd_pfn(claim))
 		mode = "memory";
+	else if (claim && is_nd_dax(claim))
+		mode = "dax";
 	else if (!claim && pmem_should_map_pages(dev))
 		mode = "memory";
 	else
@@ -1379,14 +1381,17 @@ struct nd_namespace_common *nvdimm_namespace_common_probe(struct device *dev)
 {
 	struct nd_btt *nd_btt = is_nd_btt(dev) ? to_nd_btt(dev) : NULL;
 	struct nd_pfn *nd_pfn = is_nd_pfn(dev) ? to_nd_pfn(dev) : NULL;
+	struct nd_dax *nd_dax = is_nd_dax(dev) ? to_nd_dax(dev) : NULL;
 	struct nd_namespace_common *ndns = NULL;
 	resource_size_t size;
 
-	if (nd_btt || nd_pfn) {
+	if (nd_btt || nd_pfn || nd_dax) {
 		if (nd_btt)
 			ndns = nd_btt->ndns;
 		else if (nd_pfn)
 			ndns = nd_pfn->ndns;
+		else if (nd_dax)
+			ndns = nd_dax->nd_pfn.ndns;
 
 		if (!ndns)
 			return ERR_PTR(-ENODEV);
@@ -1779,6 +1784,18 @@ void nd_region_create_blk_seed(struct nd_region *nd_region)
 		nd_device_register(nd_region->ns_seed);
 }
 
+void nd_region_create_dax_seed(struct nd_region *nd_region)
+{
+	WARN_ON(!is_nvdimm_bus_locked(&nd_region->dev));
+	nd_region->dax_seed = nd_dax_create(nd_region);
+	/*
+	 * Seed creation failures are not fatal, provisioning is simply
+	 * disabled until memory becomes available
+	 */
+	if (!nd_region->dax_seed)
+		dev_err(&nd_region->dev, "failed to create dax namespace\n");
+}
+
 void nd_region_create_pfn_seed(struct nd_region *nd_region)
 {
 	WARN_ON(!is_nvdimm_bus_locked(&nd_region->dev));
diff --git a/drivers/nvdimm/nd-core.h b/drivers/nvdimm/nd-core.h
index 1d1500f..cb65308 100644
--- a/drivers/nvdimm/nd-core.h
+++ b/drivers/nvdimm/nd-core.h
@@ -54,6 +54,7 @@ struct nd_region;
 void nd_region_create_blk_seed(struct nd_region *nd_region);
 void nd_region_create_btt_seed(struct nd_region *nd_region);
 void nd_region_create_pfn_seed(struct nd_region *nd_region);
+void nd_region_create_dax_seed(struct nd_region *nd_region);
 void nd_region_disable(struct nvdimm_bus *nvdimm_bus, struct device *dev);
 int nvdimm_bus_create_ndctl(struct nvdimm_bus *nvdimm_bus);
 void nvdimm_bus_destroy_ndctl(struct nvdimm_bus *nvdimm_bus);
diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h
index 6c36509..46910b8 100644
--- a/drivers/nvdimm/nd.h
+++ b/drivers/nvdimm/nd.h
@@ -101,10 +101,12 @@ struct nd_region {
 	struct ida ns_ida;
 	struct ida btt_ida;
 	struct ida pfn_ida;
+	struct ida dax_ida;
 	unsigned long flags;
 	struct device *ns_seed;
 	struct device *btt_seed;
 	struct device *pfn_seed;
+	struct device *dax_seed;
 	u16 ndr_mappings;
 	u64 ndr_size;
 	u64 ndr_start;
@@ -161,6 +163,10 @@ struct nd_pfn {
 	struct nd_namespace_common *ndns;
 };
 
+struct nd_dax {
+	struct nd_pfn nd_pfn;
+};
+
 enum nd_async_mode {
 	ND_SYNC,
 	ND_ASYNC,
@@ -224,7 +230,10 @@ struct nd_pfn *to_nd_pfn(struct device *dev);
 int nd_pfn_probe(struct device *dev, struct nd_namespace_common *ndns);
 bool is_nd_pfn(struct device *dev);
 struct device *nd_pfn_create(struct nd_region *nd_region);
+struct device *nd_pfn_devinit(struct nd_pfn *nd_pfn,
+		struct nd_namespace_common *ndns);
 int nd_pfn_validate(struct nd_pfn *nd_pfn);
+extern struct attribute_group nd_pfn_attribute_group;
 #else
 static inline int nd_pfn_probe(struct device *dev,
 		struct nd_namespace_common *ndns)
@@ -248,6 +257,22 @@ static inline int nd_pfn_validate(struct nd_pfn *nd_pfn)
 }
 #endif
 
+struct nd_dax *to_nd_dax(struct device *dev);
+#if IS_ENABLED(CONFIG_NVDIMM_DAX)
+bool is_nd_dax(struct device *dev);
+struct device *nd_dax_create(struct nd_region *nd_region);
+#else
+static inline bool is_nd_dax(struct device *dev)
+{
+	return false;
+}
+
+static inline struct device *nd_dax_create(struct nd_region *nd_region)
+{
+	return NULL;
+}
+#endif
+
 struct nd_region *to_nd_region(struct device *dev);
 int nd_region_to_nstype(struct nd_region *nd_region);
 int nd_region_register_namespaces(struct nd_region *nd_region, int *err);
diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c
index e8693fe..6ade2eb 100644
--- a/drivers/nvdimm/pfn_devs.c
+++ b/drivers/nvdimm/pfn_devs.c
@@ -1,5 +1,5 @@
 /*
- * Copyright(c) 2013-2015 Intel Corporation. All rights reserved.
+ * Copyright(c) 2013-2016 Intel Corporation. All rights reserved.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of version 2 of the GNU General Public License as
@@ -54,10 +54,29 @@ struct nd_pfn *to_nd_pfn(struct device *dev)
 }
 EXPORT_SYMBOL(to_nd_pfn);
 
+static struct nd_pfn *to_nd_pfn_safe(struct device *dev)
+{
+	/*
+	 * pfn device attributes are re-used by dax device instances, so we
+	 * must take care to use the correct device-to-nd_pfn conversion.
+	 */
+	if (is_nd_pfn(dev))
+		return to_nd_pfn(dev);
+
+	if (is_nd_dax(dev)) {
+		struct nd_dax *nd_dax = to_nd_dax(dev);
+
+		return &nd_dax->nd_pfn;
+	}
+
+	WARN_ON(1);
+	return NULL;
+}
+
 static ssize_t mode_show(struct device *dev,
 		struct device_attribute *attr, char *buf)
 {
-	struct nd_pfn *nd_pfn = to_nd_pfn(dev);
+	struct nd_pfn *nd_pfn = to_nd_pfn_safe(dev);
 
 	switch (nd_pfn->mode) {
 	case PFN_MODE_RAM:
@@ -72,7 +91,7 @@ static ssize_t mode_show(struct device *dev,
 static ssize_t mode_store(struct device *dev,
 		struct device_attribute *attr, const char *buf, size_t len)
 {
-	struct nd_pfn *nd_pfn = to_nd_pfn(dev);
+	struct nd_pfn *nd_pfn = to_nd_pfn_safe(dev);
 	ssize_t rc = 0;
 
 	device_lock(dev);
@@ -106,7 +125,7 @@ static DEVICE_ATTR_RW(mode);
 static ssize_t align_show(struct device *dev,
 		struct device_attribute *attr, char *buf)
 {
-	struct nd_pfn *nd_pfn = to_nd_pfn(dev);
+	struct nd_pfn *nd_pfn = to_nd_pfn_safe(dev);
 
 	return sprintf(buf, "%lx\n", nd_pfn->align);
 }
@@ -134,7 +153,7 @@ static ssize_t __align_store(struct nd_pfn *nd_pfn, const char *buf)
 static ssize_t align_store(struct device *dev,
 		struct device_attribute *attr, const char *buf, size_t len)
 {
-	struct nd_pfn *nd_pfn = to_nd_pfn(dev);
+	struct nd_pfn *nd_pfn = to_nd_pfn_safe(dev);
 	ssize_t rc;
 
 	device_lock(dev);
@@ -152,7 +171,7 @@ static DEVICE_ATTR_RW(align);
 static ssize_t uuid_show(struct device *dev,
 		struct device_attribute *attr, char *buf)
 {
-	struct nd_pfn *nd_pfn = to_nd_pfn(dev);
+	struct nd_pfn *nd_pfn = to_nd_pfn_safe(dev);
 
 	if (nd_pfn->uuid)
 		return sprintf(buf, "%pUb\n", nd_pfn->uuid);
@@ -162,7 +181,7 @@ static ssize_t uuid_show(struct device *dev,
 static ssize_t uuid_store(struct device *dev,
 		struct device_attribute *attr, const char *buf, size_t len)
 {
-	struct nd_pfn *nd_pfn = to_nd_pfn(dev);
+	struct nd_pfn *nd_pfn = to_nd_pfn_safe(dev);
 	ssize_t rc;
 
 	device_lock(dev);
@@ -178,7 +197,7 @@ static DEVICE_ATTR_RW(uuid);
 static ssize_t namespace_show(struct device *dev,
 		struct device_attribute *attr, char *buf)
 {
-	struct nd_pfn *nd_pfn = to_nd_pfn(dev);
+	struct nd_pfn *nd_pfn = to_nd_pfn_safe(dev);
 	ssize_t rc;
 
 	nvdimm_bus_lock(dev);
@@ -191,7 +210,7 @@ static ssize_t namespace_show(struct device *dev,
 static ssize_t namespace_store(struct device *dev,
 		struct device_attribute *attr, const char *buf, size_t len)
 {
-	struct nd_pfn *nd_pfn = to_nd_pfn(dev);
+	struct nd_pfn *nd_pfn = to_nd_pfn_safe(dev);
 	ssize_t rc;
 
 	device_lock(dev);
@@ -209,7 +228,7 @@ static DEVICE_ATTR_RW(namespace);
 static ssize_t resource_show(struct device *dev,
 		struct device_attribute *attr, char *buf)
 {
-	struct nd_pfn *nd_pfn = to_nd_pfn(dev);
+	struct nd_pfn *nd_pfn = to_nd_pfn_safe(dev);
 	ssize_t rc;
 
 	device_lock(dev);
@@ -235,7 +254,7 @@ static DEVICE_ATTR_RO(resource);
 static ssize_t size_show(struct device *dev,
 		struct device_attribute *attr, char *buf)
 {
-	struct nd_pfn *nd_pfn = to_nd_pfn(dev);
+	struct nd_pfn *nd_pfn = to_nd_pfn_safe(dev);
 	ssize_t rc;
 
 	device_lock(dev);
@@ -270,7 +289,7 @@ static struct attribute *nd_pfn_attributes[] = {
 	NULL,
 };
 
-static struct attribute_group nd_pfn_attribute_group = {
+struct attribute_group nd_pfn_attribute_group = {
 	.attrs = nd_pfn_attributes,
 };
 
@@ -281,15 +300,31 @@ static const struct attribute_group *nd_pfn_attribute_groups[] = {
 	NULL,
 };
 
-static struct device *__nd_pfn_create(struct nd_region *nd_region,
+struct device *nd_pfn_devinit(struct nd_pfn *nd_pfn,
 		struct nd_namespace_common *ndns)
 {
-	struct nd_pfn *nd_pfn;
-	struct device *dev;
+	struct device *dev = &nd_pfn->dev;
 
-	/* we can only create pages for contiguous ranged of pmem */
-	if (!is_nd_pmem(&nd_region->dev))
+	if (!nd_pfn)
+		return NULL;
+
+	nd_pfn->mode = PFN_MODE_NONE;
+	nd_pfn->align = HPAGE_SIZE;
+	dev = &nd_pfn->dev;
+	device_initialize(&nd_pfn->dev);
+	if (ndns && !__nd_attach_ndns(&nd_pfn->dev, ndns, &nd_pfn->ndns)) {
+		dev_dbg(&ndns->dev, "%s failed, already claimed by %s\n",
+				__func__, dev_name(ndns->claim));
+		put_device(dev);
 		return NULL;
+	}
+	return dev;
+}
+
+static struct nd_pfn *nd_pfn_alloc(struct nd_region *nd_region)
+{
+	struct nd_pfn *nd_pfn;
+	struct device *dev;
 
 	nd_pfn = kzalloc(sizeof(*nd_pfn), GFP_KERNEL);
 	if (!nd_pfn)
@@ -301,29 +336,27 @@ static struct device *__nd_pfn_create(struct nd_region *nd_region,
 		return NULL;
 	}
 
-	nd_pfn->mode = PFN_MODE_NONE;
-	nd_pfn->align = HPAGE_SIZE;
 	dev = &nd_pfn->dev;
 	dev_set_name(dev, "pfn%d.%d", nd_region->id, nd_pfn->id);
-	dev->parent = &nd_region->dev;
-	dev->type = &nd_pfn_device_type;
 	dev->groups = nd_pfn_attribute_groups;
-	device_initialize(&nd_pfn->dev);
-	if (ndns && !__nd_attach_ndns(&nd_pfn->dev, ndns, &nd_pfn->ndns)) {
-		dev_dbg(&ndns->dev, "%s failed, already claimed by %s\n",
-				__func__, dev_name(ndns->claim));
-		put_device(dev);
-		return NULL;
-	}
-	return dev;
+	dev->type = &nd_pfn_device_type;
+	dev->parent = &nd_region->dev;
+
+	return nd_pfn;
 }
 
 struct device *nd_pfn_create(struct nd_region *nd_region)
 {
-	struct device *dev = __nd_pfn_create(nd_region, NULL);
+	struct nd_pfn *nd_pfn;
+	struct device *dev;
+
+	if (!is_nd_pmem(&nd_region->dev))
+		return NULL;
+
+	nd_pfn = nd_pfn_alloc(nd_region);
+	dev = nd_pfn_devinit(nd_pfn, NULL);
 
-	if (dev)
-		__nd_device_register(dev);
+	__nd_device_register(dev);
 	return dev;
 }
 
@@ -423,7 +456,8 @@ int nd_pfn_probe(struct device *dev, struct nd_namespace_common *ndns)
 		return -ENODEV;
 
 	nvdimm_bus_lock(&ndns->dev);
-	pfn_dev = __nd_pfn_create(nd_region, ndns);
+	nd_pfn = nd_pfn_alloc(nd_region);
+	pfn_dev = nd_pfn_devinit(nd_pfn, ndns);
 	nvdimm_bus_unlock(&ndns->dev);
 	if (!pfn_dev)
 		return -ENOMEM;
diff --git a/drivers/nvdimm/region.c b/drivers/nvdimm/region.c
index 4b7715e..05a9123 100644
--- a/drivers/nvdimm/region.c
+++ b/drivers/nvdimm/region.c
@@ -54,6 +54,7 @@ static int nd_region_probe(struct device *dev)
 
 	nd_region->btt_seed = nd_btt_create(nd_region);
 	nd_region->pfn_seed = nd_pfn_create(nd_region);
+	nd_region->dax_seed = nd_dax_create(nd_region);
 	if (err == 0)
 		return 0;
 
@@ -86,6 +87,7 @@ static int nd_region_remove(struct device *dev)
 	nd_region->ns_seed = NULL;
 	nd_region->btt_seed = NULL;
 	nd_region->pfn_seed = NULL;
+	nd_region->dax_seed = NULL;
 	dev_set_drvdata(dev, NULL);
 	nvdimm_bus_unlock(dev);
 
diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c
index 139bf71..9e1b054 100644
--- a/drivers/nvdimm/region_devs.c
+++ b/drivers/nvdimm/region_devs.c
@@ -306,6 +306,23 @@ static ssize_t pfn_seed_show(struct device *dev,
 }
 static DEVICE_ATTR_RO(pfn_seed);
 
+static ssize_t dax_seed_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct nd_region *nd_region = to_nd_region(dev);
+	ssize_t rc;
+
+	nvdimm_bus_lock(dev);
+	if (nd_region->dax_seed)
+		rc = sprintf(buf, "%s\n", dev_name(nd_region->dax_seed));
+	else
+		rc = sprintf(buf, "\n");
+	nvdimm_bus_unlock(dev);
+
+	return rc;
+}
+static DEVICE_ATTR_RO(dax_seed);
+
 static ssize_t read_only_show(struct device *dev,
 		struct device_attribute *attr, char *buf)
 {
@@ -335,6 +352,7 @@ static struct attribute *nd_region_attributes[] = {
 	&dev_attr_mappings.attr,
 	&dev_attr_btt_seed.attr,
 	&dev_attr_pfn_seed.attr,
+	&dev_attr_dax_seed.attr,
 	&dev_attr_read_only.attr,
 	&dev_attr_set_cookie.attr,
 	&dev_attr_available_size.attr,
@@ -353,6 +371,9 @@ static umode_t region_visible(struct kobject *kobj, struct attribute *a, int n)
 	if (!is_nd_pmem(dev) && a == &dev_attr_pfn_seed.attr)
 		return 0;
 
+	if (!is_nd_pmem(dev) && a == &dev_attr_dax_seed.attr)
+		return 0;
+
 	if (a != &dev_attr_set_cookie.attr
 			&& a != &dev_attr_available_size.attr)
 		return a->mode;
@@ -441,6 +462,13 @@ static void nd_region_notify_driver_action(struct nvdimm_bus *nvdimm_bus,
 			nd_region_create_pfn_seed(nd_region);
 		nvdimm_bus_unlock(dev);
 	}
+	if (is_nd_dax(dev) && probe) {
+		nd_region = to_nd_region(dev->parent);
+		nvdimm_bus_lock(dev);
+		if (nd_region->dax_seed == dev)
+			nd_region_create_dax_seed(nd_region);
+		nvdimm_bus_unlock(dev);
+	}
 }
 
 void nd_region_probe_success(struct nvdimm_bus *nvdimm_bus, struct device *dev)
@@ -718,6 +746,7 @@ static struct nd_region *nd_region_create(struct nvdimm_bus *nvdimm_bus,
 	ida_init(&nd_region->ns_ida);
 	ida_init(&nd_region->btt_ida);
 	ida_init(&nd_region->pfn_ida);
+	ida_init(&nd_region->dax_ida);
 	dev = &nd_region->dev;
 	dev_set_name(dev, "region%d", nd_region->id);
 	dev->parent = &nvdimm_bus->dev;
diff --git a/include/uapi/linux/ndctl.h b/include/uapi/linux/ndctl.h
index 7cc28ab..4f29d24 100644
--- a/include/uapi/linux/ndctl.h
+++ b/include/uapi/linux/ndctl.h
@@ -206,6 +206,7 @@ static inline const char *nvdimm_cmd_name(unsigned cmd)
 #define ND_DEVICE_NAMESPACE_IO 4    /* legacy persistent memory */
 #define ND_DEVICE_NAMESPACE_PMEM 5  /* PMEM namespace (may alias with BLK) */
 #define ND_DEVICE_NAMESPACE_BLK 6   /* BLK namespace (may alias with PMEM) */
+#define ND_DEVICE_DAX_PMEM 7        /* Device DAX interface to pmem */
 
 enum nd_driver_flags {
 	ND_DRIVER_DIMM            = 1 << ND_DEVICE_DIMM,
@@ -214,6 +215,7 @@ enum nd_driver_flags {
 	ND_DRIVER_NAMESPACE_IO    = 1 << ND_DEVICE_NAMESPACE_IO,
 	ND_DRIVER_NAMESPACE_PMEM  = 1 << ND_DEVICE_NAMESPACE_PMEM,
 	ND_DRIVER_NAMESPACE_BLK   = 1 << ND_DEVICE_NAMESPACE_BLK,
+	ND_DRIVER_DAX_PMEM	  = 1 << ND_DEVICE_DAX_PMEM,
 };
 
 enum {
diff --git a/tools/testing/nvdimm/Kbuild b/tools/testing/nvdimm/Kbuild
index d5bc8c0..5ff6d3c 100644
--- a/tools/testing/nvdimm/Kbuild
+++ b/tools/testing/nvdimm/Kbuild
@@ -50,6 +50,7 @@ libnvdimm-y += $(NVDIMM_SRC)/label.o
 libnvdimm-$(CONFIG_ND_CLAIM) += $(NVDIMM_SRC)/claim.o
 libnvdimm-$(CONFIG_BTT) += $(NVDIMM_SRC)/btt_devs.o
 libnvdimm-$(CONFIG_NVDIMM_PFN) += $(NVDIMM_SRC)/pfn_devs.o
+libnvdimm-$(CONFIG_NVDIMM_DAX) += $(NVDIMM_SRC)/dax_devs.o
 libnvdimm-y += config_check.o
 
 obj-m += test/
-- 
cgit v0.10.2


From 52ac23b25eb26511f8dea2382534eeada2fa8244 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Thu, 31 Mar 2016 09:37:11 -0700
Subject: libnvdimm, dax: reserve space to store labels for device-dax

We may want to subdivide a device-dax range into multiple devices so
that each can have separate permissions or naming.  Reserve 128K of
label space by default so we have the capability of making allocation
decisions persistent.  This reservation is not something we can add
later since it would result in the default size of a device-dax range
changing between kernel versions.

Signed-off-by: Dan Williams <dan.j.williams@intel.com>

diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c
index 6ade2eb..ca396c8 100644
--- a/drivers/nvdimm/pfn_devs.c
+++ b/drivers/nvdimm/pfn_devs.c
@@ -540,6 +540,7 @@ static struct vmem_altmap *__nvdimm_setup_pfn(struct nd_pfn *nd_pfn,
 
 static int nd_pfn_init(struct nd_pfn *nd_pfn)
 {
+	u32 dax_label_reserve = is_nd_dax(&nd_pfn->dev) ? SZ_128K : 0;
 	struct nd_namespace_common *ndns = nd_pfn->ndns;
 	u32 start_pad = 0, end_trunc = 0;
 	resource_size_t start, size;
@@ -606,10 +607,11 @@ static int nd_pfn_init(struct nd_pfn *nd_pfn)
 	size = resource_size(&nsio->res);
 	npfns = (size - start_pad - end_trunc - SZ_8K) / SZ_4K;
 	if (nd_pfn->mode == PFN_MODE_PMEM)
-		offset = ALIGN(start + SZ_8K + 64 * npfns, nd_pfn->align)
-			- start;
+		offset = ALIGN(start + SZ_8K + 64 * npfns + dax_label_reserve,
+				nd_pfn->align) - start;
 	else if (nd_pfn->mode == PFN_MODE_RAM)
-		offset = ALIGN(start + SZ_8K, nd_pfn->align) - start;
+		offset = ALIGN(start + SZ_8K + dax_label_reserve,
+				nd_pfn->align) - start;
 	else
 		return -ENXIO;
 
-- 
cgit v0.10.2


From 45a0dac0451136fa7ae34a6fea53ef6a136287ce Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Thu, 31 Mar 2016 15:41:18 -0700
Subject: libnvdimm, dax: record the specified alignment of a dax-device
 instance

We want to use the alignment as the allocation and mapping unit.
Previously this information was only useful for establishing the data
offset, but now it is important to remember the granularity for
later use.

Signed-off-by: Dan Williams <dan.j.williams@intel.com>

diff --git a/drivers/nvdimm/pfn.h b/drivers/nvdimm/pfn.h
index 8e343a3..9d2704c 100644
--- a/drivers/nvdimm/pfn.h
+++ b/drivers/nvdimm/pfn.h
@@ -33,7 +33,9 @@ struct nd_pfn_sb {
 	/* minor-version-1 additions for section alignment */
 	__le32 start_pad;
 	__le32 end_trunc;
-	u8 padding[4004];
+	/* minor-version-2 record the base alignment of the mapping */
+	__le32 align;
+	u8 padding[4000];
 	__le64 checksum;
 };
 
diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c
index ca396c8..58740d7 100644
--- a/drivers/nvdimm/pfn_devs.c
+++ b/drivers/nvdimm/pfn_devs.c
@@ -394,6 +394,9 @@ int nd_pfn_validate(struct nd_pfn *nd_pfn)
 		pfn_sb->end_trunc = 0;
 	}
 
+	if (__le16_to_cpu(pfn_sb->version_minor) < 2)
+		pfn_sb->align = 0;
+
 	switch (le32_to_cpu(pfn_sb->mode)) {
 	case PFN_MODE_RAM:
 	case PFN_MODE_PMEM:
@@ -433,7 +436,7 @@ int nd_pfn_validate(struct nd_pfn *nd_pfn)
 		return -EBUSY;
 	}
 
-	nd_pfn->align = 1UL << ilog2(offset);
+	nd_pfn->align = le32_to_cpu(pfn_sb->align);
 	if (!is_power_of_2(offset) || offset < PAGE_SIZE) {
 		dev_err(&nd_pfn->dev, "bad offset: %#llx dax disabled\n",
 				offset);
@@ -629,9 +632,10 @@ static int nd_pfn_init(struct nd_pfn *nd_pfn)
 	memcpy(pfn_sb->uuid, nd_pfn->uuid, 16);
 	memcpy(pfn_sb->parent_uuid, nd_dev_to_uuid(&ndns->dev), 16);
 	pfn_sb->version_major = cpu_to_le16(1);
-	pfn_sb->version_minor = cpu_to_le16(1);
+	pfn_sb->version_minor = cpu_to_le16(2);
 	pfn_sb->start_pad = cpu_to_le32(start_pad);
 	pfn_sb->end_trunc = cpu_to_le32(end_trunc);
+	pfn_sb->align = cpu_to_le32(nd_pfn->align);
 	checksum = nd_sb_checksum((struct nd_gen_sb *) pfn_sb);
 	pfn_sb->checksum = cpu_to_le64(checksum);
 
-- 
cgit v0.10.2


From 6cf9c5babd980ec1959e0dd45e3036474c6a294f Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Wed, 18 May 2016 09:13:13 -0700
Subject: libnvdimm: stop requiring a driver ->remove() method

The dax_pmem driver was implementing an empty ->remove() method to
satisfy the nvdimm bus driver that unconditionally calls ->remove().
Teach the core bus driver to check if ->remove() is NULL to remove that
requirement.

Reported-by: Johannes Thumshirn <jthumshirn@suse.de>
Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>

diff --git a/drivers/nvdimm/bus.c b/drivers/nvdimm/bus.c
index 97589e3..7cbc3d5 100644
--- a/drivers/nvdimm/bus.c
+++ b/drivers/nvdimm/bus.c
@@ -124,9 +124,10 @@ static int nvdimm_bus_remove(struct device *dev)
 	struct nd_device_driver *nd_drv = to_nd_device_driver(dev->driver);
 	struct module *provider = to_bus_provider(dev);
 	struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(dev);
-	int rc;
+	int rc = 0;
 
-	rc = nd_drv->remove(dev);
+	if (nd_drv->remove)
+		rc = nd_drv->remove(dev);
 	nd_region_disable(nvdimm_bus, dev);
 
 	dev_dbg(&nvdimm_bus->dev, "%s.remove(%s) = %d\n", dev->driver->name,
@@ -296,8 +297,8 @@ int __nd_driver_register(struct nd_device_driver *nd_drv, struct module *owner,
 		return -EINVAL;
 	}
 
-	if (!nd_drv->probe || !nd_drv->remove) {
-		pr_debug("->probe() and ->remove() must be specified\n");
+	if (!nd_drv->probe) {
+		pr_debug("%s ->probe() must be specified\n", mod_name);
 		return -EINVAL;
 	}
 
-- 
cgit v0.10.2


From ab68f26221366f92611650e8470e6a926801c7d4 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Wed, 18 May 2016 09:15:08 -0700
Subject: /dev/dax, pmem: direct access to persistent memory

Device DAX is the device-centric analogue of Filesystem DAX
(CONFIG_FS_DAX).  It allows memory ranges to be allocated and mapped
without need of an intervening file system.  Device DAX is strict,
precise and predictable.  Specifically this interface:

1/ Guarantees fault granularity with respect to a given page size (pte,
pmd, or pud) set at configuration time.

2/ Enforces deterministic behavior by being strict about what fault
scenarios are supported.

For example, by forcing MADV_DONTFORK semantics and omitting MAP_PRIVATE
support device-dax guarantees that a mapping always behaves/performs the
same once established.  It is the "what you see is what you get" access
mechanism to differentiated memory vs filesystem DAX which has
filesystem specific implementation semantics.

Persistent memory is the first target, but the mechanism is also
targeted for exclusive allocations of performance differentiated memory
ranges.

This commit is limited to the base device driver infrastructure to
associate a dax device with pmem range.

Cc: Jeff Moyer <jmoyer@redhat.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Ross Zwisler <ross.zwisler@linux.intel.com>
Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>

diff --git a/drivers/Kconfig b/drivers/Kconfig
index d2ac339..8298eab 100644
--- a/drivers/Kconfig
+++ b/drivers/Kconfig
@@ -190,6 +190,8 @@ source "drivers/android/Kconfig"
 
 source "drivers/nvdimm/Kconfig"
 
+source "drivers/dax/Kconfig"
+
 source "drivers/nvmem/Kconfig"
 
 source "drivers/hwtracing/stm/Kconfig"
diff --git a/drivers/Makefile b/drivers/Makefile
index 8f5d076..0b6f3d6 100644
--- a/drivers/Makefile
+++ b/drivers/Makefile
@@ -66,6 +66,7 @@ obj-$(CONFIG_PARPORT)		+= parport/
 obj-$(CONFIG_NVM)		+= lightnvm/
 obj-y				+= base/ block/ misc/ mfd/ nfc/
 obj-$(CONFIG_LIBNVDIMM)		+= nvdimm/
+obj-$(CONFIG_DEV_DAX)		+= dax/
 obj-$(CONFIG_DMA_SHARED_BUFFER) += dma-buf/
 obj-$(CONFIG_NUBUS)		+= nubus/
 obj-y				+= macintosh/
diff --git a/drivers/dax/Kconfig b/drivers/dax/Kconfig
new file mode 100644
index 0000000..86ffbaa
--- /dev/null
+++ b/drivers/dax/Kconfig
@@ -0,0 +1,25 @@
+menuconfig DEV_DAX
+	tristate "DAX: direct access to differentiated memory"
+	default m if NVDIMM_DAX
+	help
+	  Support raw access to differentiated (persistence, bandwidth,
+	  latency...) memory via an mmap(2) capable character
+	  device.  Platform firmware or a device driver may identify a
+	  platform memory resource that is differentiated from the
+	  baseline memory pool.  Mappings of a /dev/daxX.Y device impose
+	  restrictions that make the mapping behavior deterministic.
+
+if DEV_DAX
+
+config DEV_DAX_PMEM
+	tristate "PMEM DAX: direct access to persistent memory"
+	depends on NVDIMM_DAX
+	default DEV_DAX
+	help
+	  Support raw access to persistent memory.  Note that this
+	  driver consumes memory ranges allocated and exported by the
+	  libnvdimm sub-system.
+
+	  Say Y if unsure
+
+endif
diff --git a/drivers/dax/Makefile b/drivers/dax/Makefile
new file mode 100644
index 0000000..27c54e3
--- /dev/null
+++ b/drivers/dax/Makefile
@@ -0,0 +1,4 @@
+obj-$(CONFIG_DEV_DAX) += dax.o
+obj-$(CONFIG_DEV_DAX_PMEM) += dax_pmem.o
+
+dax_pmem-y := pmem.o
diff --git a/drivers/dax/dax.c b/drivers/dax/dax.c
new file mode 100644
index 0000000..4c22a40
--- /dev/null
+++ b/drivers/dax/dax.c
@@ -0,0 +1,253 @@
+/*
+ * Copyright(c) 2016 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+#include <linux/pagemap.h>
+#include <linux/module.h>
+#include <linux/device.h>
+#include <linux/pfn_t.h>
+#include <linux/slab.h>
+#include <linux/dax.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+
+static int dax_major;
+static struct class *dax_class;
+static DEFINE_IDA(dax_minor_ida);
+
+/**
+ * struct dax_region - mapping infrastructure for dax devices
+ * @id: kernel-wide unique region for a memory range
+ * @base: linear address corresponding to @res
+ * @kref: to pin while other agents have a need to do lookups
+ * @dev: parent device backing this region
+ * @align: allocation and mapping alignment for child dax devices
+ * @res: physical address range of the region
+ * @pfn_flags: identify whether the pfns are page-backed or not
+ */
+struct dax_region {
+	int id;
+	struct ida ida; /* allocates the ids of child dax devices */
+	void *base;
+	struct kref kref;
+	struct device *dev;
+	unsigned int align;
+	struct resource res;
+	unsigned long pfn_flags;
+};
+
+/**
+ * struct dax_dev - subdivision of a dax region
+ * @region - parent region
+ * @dev - device backing the character device
+ * @kref - enable this data to be tracked in filp->private_data
+ * @id - child id in the region
+ * @num_resources - number of physical address extents in this device
+ * @res - array of physical address ranges
+ */
+struct dax_dev {
+	struct dax_region *region;
+	struct device *dev;
+	struct kref kref;
+	int id;
+	int num_resources;
+	struct resource res[0];
+};
+
+/* kref release callback: free the dax_region that embeds @kref */
+static void dax_region_free(struct kref *kref)
+{
+	struct dax_region *region = container_of(kref, struct dax_region, kref);
+
+	kfree(region);
+}
+
+void dax_region_put(struct dax_region *dax_region)
+{
+	kref_put(&dax_region->kref, dax_region_free);
+}
+EXPORT_SYMBOL_GPL(dax_region_put);
+
+/* kref release callback: drop the region reference, then free the dax_dev */
+static void dax_dev_free(struct kref *kref)
+{
+	struct dax_dev *victim = container_of(kref, struct dax_dev, kref);
+
+	dax_region_put(victim->region);
+	kfree(victim);
+}
+
+static void dax_dev_put(struct dax_dev *dax_dev)
+{
+	kref_put(&dax_dev->kref, dax_dev_free);
+}
+
+/* allocate a dax_region for @res; returns NULL if the allocation fails */
+struct dax_region *alloc_dax_region(struct device *parent, int region_id,
+		struct resource *res, unsigned int align, void *addr,
+		unsigned long pfn_flags)
+{
+	struct dax_region *region = kzalloc(sizeof(*region), GFP_KERNEL);
+
+	if (region) {
+		/* plain attributes, copied from the caller's arguments */
+		region->id = region_id;
+		region->dev = parent;
+		region->base = addr;
+		region->align = align;
+		region->pfn_flags = pfn_flags;
+		region->res = *res;
+
+		/* the initial reference is handed to the caller */
+		kref_init(&region->kref);
+		ida_init(&region->ida);
+	}
+	return region;
+}
+EXPORT_SYMBOL_GPL(alloc_dax_region);
+
+/* sysfs 'size': sum of the sizes of all extents backing this dax device */
+static ssize_t size_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct dax_dev *dax_dev = dev_get_drvdata(dev);
+	int idx = dax_dev->num_resources;
+	unsigned long long total = 0;
+
+	while (--idx >= 0)
+		total += resource_size(&dax_dev->res[idx]);
+	return sprintf(buf, "%llu\n", total);
+}
+static DEVICE_ATTR_RO(size);
+
+static struct attribute *dax_device_attributes[] = {
+	&dev_attr_size.attr,
+	NULL,
+};
+
+static const struct attribute_group dax_device_attribute_group = {
+	.attrs = dax_device_attributes,
+};
+
+static const struct attribute_group *dax_attribute_groups[] = {
+	&dax_device_attribute_group,
+	NULL,
+};
+
+static void unregister_dax_dev(void *_dev)
+{
+	struct device *dev = _dev;
+	struct dax_dev *dax_dev = dev_get_drvdata(dev);
+	struct dax_region *dax_region = dax_dev->region;
+
+	dev_dbg(dev, "%s\n", __func__);
+
+	get_device(dev);
+	device_unregister(dev);
+	ida_simple_remove(&dax_region->ida, dax_dev->id);
+	ida_simple_remove(&dax_minor_ida, MINOR(dev->devt));
+	put_device(dev);
+	dax_dev_put(dax_dev);
+}
+
+int devm_create_dax_dev(struct dax_region *dax_region, struct resource *res,
+		int count)
+{
+	struct device *parent = dax_region->dev;
+	struct dax_dev *dax_dev;
+	struct device *dev;
+	int rc, minor;
+	dev_t dev_t;
+
+	dax_dev = kzalloc(sizeof(*dax_dev) + sizeof(*res) * count, GFP_KERNEL);
+	if (!dax_dev)
+		return -ENOMEM;
+	memcpy(dax_dev->res, res, sizeof(*res) * count);
+	dax_dev->num_resources = count;
+	kref_init(&dax_dev->kref);
+	dax_dev->region = dax_region;
+	kref_get(&dax_region->kref);
+
+	dax_dev->id = ida_simple_get(&dax_region->ida, 0, 0, GFP_KERNEL);
+	if (dax_dev->id < 0) {
+		rc = dax_dev->id;
+		goto err_id;
+	}
+
+	minor = ida_simple_get(&dax_minor_ida, 0, 0, GFP_KERNEL);
+	if (minor < 0) {
+		rc = minor;
+		goto err_minor;
+	}
+
+	dev_t = MKDEV(dax_major, minor);
+	dev = device_create_with_groups(dax_class, parent, dev_t, dax_dev,
+			dax_attribute_groups, "dax%d.%d", dax_region->id,
+			dax_dev->id);
+	if (IS_ERR(dev)) {
+		rc = PTR_ERR(dev);
+		goto err_create;
+	}
+	dax_dev->dev = dev;
+
+	rc = devm_add_action(dax_region->dev, unregister_dax_dev, dev);
+	if (rc) {
+		unregister_dax_dev(dev);
+		return rc;
+	}
+
+	return 0;
+
+ err_create:
+	ida_simple_remove(&dax_minor_ida, minor);
+ err_minor:
+	ida_simple_remove(&dax_region->ida, dax_dev->id);
+ err_id:
+	dax_dev_put(dax_dev);
+
+	return rc;
+}
+EXPORT_SYMBOL_GPL(devm_create_dax_dev);
+
+static const struct file_operations dax_fops = {
+	.llseek = noop_llseek,
+	.owner = THIS_MODULE,
+};
+
+static int __init dax_init(void)
+{
+	int rc;
+
+	rc = register_chrdev(0, "dax", &dax_fops);
+	if (rc < 0)
+		return rc;
+	dax_major = rc;
+
+	dax_class = class_create(THIS_MODULE, "dax");
+	if (IS_ERR(dax_class)) {
+		unregister_chrdev(dax_major, "dax");
+		return PTR_ERR(dax_class);
+	}
+
+	return 0;
+}
+
+static void __exit dax_exit(void)
+{
+	class_destroy(dax_class);
+	unregister_chrdev(dax_major, "dax");
+	ida_destroy(&dax_minor_ida);
+}
+
+MODULE_AUTHOR("Intel Corporation");
+MODULE_LICENSE("GPL v2");
+subsys_initcall(dax_init);
+module_exit(dax_exit);
diff --git a/drivers/dax/dax.h b/drivers/dax/dax.h
new file mode 100644
index 0000000..d8b8f1f
--- /dev/null
+++ b/drivers/dax/dax.h
@@ -0,0 +1,24 @@
+/*
+ * Copyright(c) 2016 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+#ifndef __DAX_H__
+#define __DAX_H__
+struct device;
+struct resource;
+struct dax_region;
+void dax_region_put(struct dax_region *dax_region);
+struct dax_region *alloc_dax_region(struct device *parent,
+		int region_id, struct resource *res, unsigned int align,
+		void *addr, unsigned long flags);
+int devm_create_dax_dev(struct dax_region *dax_region, struct resource *res,
+		int count);
+#endif /* __DAX_H__ */
diff --git a/drivers/dax/pmem.c b/drivers/dax/pmem.c
new file mode 100644
index 0000000..55d510e
--- /dev/null
+++ b/drivers/dax/pmem.c
@@ -0,0 +1,158 @@
+/*
+ * Copyright(c) 2016 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+#include <linux/percpu-refcount.h>
+#include <linux/memremap.h>
+#include <linux/module.h>
+#include <linux/pfn_t.h>
+#include "../nvdimm/pfn.h"
+#include "../nvdimm/nd.h"
+#include "dax.h"
+
+struct dax_pmem {
+	struct device *dev;
+	struct percpu_ref ref;
+	struct completion cmp;
+};
+
+static struct dax_pmem *to_dax_pmem(struct percpu_ref *ref)
+{
+	return container_of(ref, struct dax_pmem, ref); /* ref is embedded */
+}
+
+static void dax_pmem_percpu_release(struct percpu_ref *ref)
+{
+	struct dax_pmem *dax_pmem = to_dax_pmem(ref);
+
+	dev_dbg(dax_pmem->dev, "%s\n", __func__);
+	complete(&dax_pmem->cmp);
+}
+
+static void dax_pmem_percpu_exit(void *data)
+{
+	struct dax_pmem *dax_pmem = to_dax_pmem(data);
+
+	dev_dbg(dax_pmem->dev, "%s\n", __func__);
+	/* the release callback must finish before the ref is torn down */
+	wait_for_completion(&dax_pmem->cmp);
+	percpu_ref_exit(&dax_pmem->ref);
+}
+
+static void dax_pmem_percpu_kill(void *data)
+{
+	struct percpu_ref *ref = data;
+	struct dax_pmem *dax_pmem = to_dax_pmem(ref);
+
+	dev_dbg(dax_pmem->dev, "%s\n", __func__);
+	percpu_ref_kill(ref);
+}
+
+/* attach a device-dax instance to the pmem range described by @dev */
+static int dax_pmem_probe(struct device *dev)
+{
+	int rc;
+	void *addr;
+	struct resource res;
+	struct dax_pmem *dax_pmem;
+	struct nd_namespace_io *nsio;
+	struct dax_region *dax_region;
+	struct nd_namespace_common *ndns;
+	struct nd_pfn *nd_pfn = &to_nd_dax(dev)->nd_pfn;
+	struct vmem_altmap __altmap, *altmap = NULL;
+
+	ndns = nvdimm_namespace_common_probe(dev);
+	if (IS_ERR(ndns))
+		return PTR_ERR(ndns);
+	nsio = to_nd_namespace_io(&ndns->dev);
+
+	/* parse the 'pfn' info block via ->rw_bytes */
+	rc = devm_nsio_enable(dev, nsio);
+	if (rc)
+		return rc;
+	altmap = nvdimm_setup_pfn(nd_pfn, &res, &__altmap);
+	if (IS_ERR(altmap))
+		return PTR_ERR(altmap);
+	devm_nsio_disable(dev, nsio);
+
+	if (!devm_request_mem_region(dev, nsio->res.start,
+				resource_size(&nsio->res), dev_name(dev))) {
+		dev_warn(dev, "could not reserve region %pR\n", &nsio->res);
+		return -EBUSY;
+	}
+
+	dax_pmem = devm_kzalloc(dev, sizeof(*dax_pmem), GFP_KERNEL);
+	if (!dax_pmem)
+		return -ENOMEM;
+
+	dax_pmem->dev = dev;
+	init_completion(&dax_pmem->cmp);
+	rc = percpu_ref_init(&dax_pmem->ref, dax_pmem_percpu_release, 0,
+			GFP_KERNEL);
+	if (rc)
+		return rc;
+
+	rc = devm_add_action(dev, dax_pmem_percpu_exit, &dax_pmem->ref);
+	if (rc) {
+		/* never killed: ->cmp cannot fire, so do not wait on it */
+		percpu_ref_exit(&dax_pmem->ref);
+		return rc;
+	}
+
+	addr = devm_memremap_pages(dev, &res, &dax_pmem->ref, altmap);
+	if (IS_ERR(addr)) {
+		/* let the queued dax_pmem_percpu_exit() see ->cmp complete */
+		percpu_ref_kill(&dax_pmem->ref);
+		return PTR_ERR(addr);
+	}
+
+	rc = devm_add_action(dev, dax_pmem_percpu_kill, &dax_pmem->ref);
+	if (rc) {
+		dax_pmem_percpu_kill(&dax_pmem->ref);
+		return rc;
+	}
+
+	dax_region = alloc_dax_region(dev, to_nd_region(dev->parent)->id,
+			&res, le32_to_cpu(nd_pfn->pfn_sb->align), addr,
+			PFN_DEV|PFN_MAP);
+	if (!dax_region)
+		return -ENOMEM;
+
+	/* TODO: support for subdividing a dax region... */
+	rc = devm_create_dax_dev(dax_region, &res, 1);
+	/* child dax_dev instances now own the lifetime of the dax_region */
+	dax_region_put(dax_region);
+	return rc;
+}
+
+static struct nd_device_driver dax_pmem_driver = {
+	.probe = dax_pmem_probe,
+	.drv = {
+		.name = "dax_pmem",
+	},
+	.type = ND_DRIVER_DAX_PMEM,
+};
+
+static int __init dax_pmem_init(void)
+{
+	return nd_driver_register(&dax_pmem_driver);
+}
+module_init(dax_pmem_init);
+
+static void __exit dax_pmem_exit(void)
+{
+	driver_unregister(&dax_pmem_driver.drv);
+}
+module_exit(dax_pmem_exit);
+
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Intel Corporation");
+MODULE_ALIAS_ND_DEVICE(ND_DEVICE_DAX_PMEM);
diff --git a/tools/testing/nvdimm/Kbuild b/tools/testing/nvdimm/Kbuild
index 5ff6d3c..7859856 100644
--- a/tools/testing/nvdimm/Kbuild
+++ b/tools/testing/nvdimm/Kbuild
@@ -16,6 +16,7 @@ ldflags-y += --wrap=phys_to_pfn_t
 DRIVERS := ../../../drivers
 NVDIMM_SRC := $(DRIVERS)/nvdimm
 ACPI_SRC := $(DRIVERS)/acpi
+DAX_SRC := $(DRIVERS)/dax
 
 obj-$(CONFIG_LIBNVDIMM) += libnvdimm.o
 obj-$(CONFIG_BLK_DEV_PMEM) += nd_pmem.o
@@ -23,6 +24,8 @@ obj-$(CONFIG_ND_BTT) += nd_btt.o
 obj-$(CONFIG_ND_BLK) += nd_blk.o
 obj-$(CONFIG_X86_PMEM_LEGACY) += nd_e820.o
 obj-$(CONFIG_ACPI_NFIT) += nfit.o
+obj-$(CONFIG_DEV_DAX) += dax.o
+obj-$(CONFIG_DEV_DAX_PMEM) += dax_pmem.o
 
 nfit-y := $(ACPI_SRC)/nfit.o
 nfit-y += config_check.o
@@ -39,6 +42,12 @@ nd_blk-y += config_check.o
 nd_e820-y := $(NVDIMM_SRC)/e820.o
 nd_e820-y += config_check.o
 
+dax-y := $(DAX_SRC)/dax.o
+dax-y += config_check.o
+
+dax_pmem-y := $(DAX_SRC)/pmem.o
+dax_pmem-y += config_check.o
+
 libnvdimm-y := $(NVDIMM_SRC)/core.o
 libnvdimm-y += $(NVDIMM_SRC)/bus.o
 libnvdimm-y += $(NVDIMM_SRC)/dimm_devs.o
diff --git a/tools/testing/nvdimm/config_check.c b/tools/testing/nvdimm/config_check.c
index f2c7615..adf18bf 100644
--- a/tools/testing/nvdimm/config_check.c
+++ b/tools/testing/nvdimm/config_check.c
@@ -12,4 +12,6 @@ void check(void)
 	BUILD_BUG_ON(!IS_MODULE(CONFIG_ND_BTT));
 	BUILD_BUG_ON(!IS_MODULE(CONFIG_ND_BLK));
 	BUILD_BUG_ON(!IS_MODULE(CONFIG_ACPI_NFIT));
+	BUILD_BUG_ON(!IS_MODULE(CONFIG_DEV_DAX));
+	BUILD_BUG_ON(!IS_MODULE(CONFIG_DEV_DAX_PMEM));
 }
-- 
cgit v0.10.2


From dee410792419aaa8bc3e3b35d2ccb6515835916d Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Sat, 14 May 2016 12:20:44 -0700
Subject: /dev/dax, core: file operations and dax-mmap

The "Device DAX" core enables dax mappings of performance / feature
differentiated memory.  An open mapping or file handle keeps the backing
struct device live, but new mappings are only possible while the device
is enabled.   Faults are handled under rcu_read_lock to synchronize
with the enabled state of the device.

Similar to the filesystem-dax case the backing memory may optionally
have struct page entries.  However, unlike fs-dax there is no support
for private mappings, or mappings that are not backed by media (see
use of zero-page in fs-dax).

Mappings are always guaranteed to match the alignment of the dax_region.
If the dax_region is configured to have a 2MB alignment, all mappings
are guaranteed to be backed by a pmd entry.  Contrast this determinism
with the fs-dax case where pmd mappings are opportunistic.  If userspace
attempts to force a misaligned mapping, the driver will fail the mmap
attempt.  See dax_dev_check_vma() for other scenarios that are rejected,
like MAP_PRIVATE mappings.

Cc: Hannes Reinecke <hare@suse.de>
Cc: Jeff Moyer <jmoyer@redhat.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Ross Zwisler <ross.zwisler@linux.intel.com>
Acked-by: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>

diff --git a/drivers/dax/Kconfig b/drivers/dax/Kconfig
index 86ffbaa..cedab75 100644
--- a/drivers/dax/Kconfig
+++ b/drivers/dax/Kconfig
@@ -1,6 +1,7 @@
 menuconfig DEV_DAX
 	tristate "DAX: direct access to differentiated memory"
 	default m if NVDIMM_DAX
+	depends on TRANSPARENT_HUGEPAGE
 	help
 	  Support raw access to differentiated (persistence, bandwidth,
 	  latency...) memory via an mmap(2) capable character
diff --git a/drivers/dax/dax.c b/drivers/dax/dax.c
index 4c22a40..b891a12 100644
--- a/drivers/dax/dax.c
+++ b/drivers/dax/dax.c
@@ -49,6 +49,7 @@ struct dax_region {
  * @region - parent region
  * @dev - device backing the character device
  * @kref - enable this data to be tracked in filp->private_data
+ * @alive - !alive + rcu grace period == no new mappings can be established
  * @id - child id in the region
  * @num_resources - number of physical address extents in this device
  * @res - array of physical address ranges
@@ -57,6 +58,7 @@ struct dax_dev {
 	struct dax_region *region;
 	struct device *dev;
 	struct kref kref;
+	bool alive;
 	int id;
 	int num_resources;
 	struct resource res[0];
@@ -150,6 +152,16 @@ static void unregister_dax_dev(void *_dev)
 
 	dev_dbg(dev, "%s\n", __func__);
 
+	/*
+	 * Note, rcu is not protecting the liveness of dax_dev, rcu is
+	 * ensuring that any fault handlers that might have seen
+	 * dax_dev->alive == true, have completed.  Any fault handlers
+	 * that start after synchronize_rcu() has started will abort
+	 * upon seeing dax_dev->alive == false.
+	 */
+	dax_dev->alive = false;
+	synchronize_rcu();
+
 	get_device(dev);
 	device_unregister(dev);
 	ida_simple_remove(&dax_region->ida, dax_dev->id);
@@ -173,6 +185,7 @@ int devm_create_dax_dev(struct dax_region *dax_region, struct resource *res,
 	memcpy(dax_dev->res, res, sizeof(*res) * count);
 	dax_dev->num_resources = count;
 	kref_init(&dax_dev->kref);
+	dax_dev->alive = true;
 	dax_dev->region = dax_region;
 	kref_get(&dax_region->kref);
 
@@ -217,9 +230,318 @@ int devm_create_dax_dev(struct dax_region *dax_region, struct resource *res,
 }
 EXPORT_SYMBOL_GPL(devm_create_dax_dev);
 
+/* return an unmapped area aligned to the dax region specified alignment */
+static unsigned long dax_dev_get_unmapped_area(struct file *filp,
+		unsigned long addr, unsigned long len, unsigned long pgoff,
+		unsigned long flags)
+{
+	unsigned long off, off_end, off_align, len_align, addr_align, align;
+	struct dax_dev *dax_dev = filp ? filp->private_data : NULL;
+	struct dax_region *dax_region;
+
+	if (!dax_dev || addr)
+		goto out;
+
+	dax_region = dax_dev->region;
+	align = dax_region->align;
+	off = pgoff << PAGE_SHIFT;
+	off_end = off + len;
+	off_align = round_up(off, align);
+
+	if ((off_end <= off_align) || ((off_end - off_align) < align))
+		goto out;
+
+	len_align = len + align;
+	if ((off + len_align) < off)
+		goto out;
+
+	addr_align = current->mm->get_unmapped_area(filp, addr, len_align,
+			pgoff, flags);
+	if (!IS_ERR_VALUE(addr_align)) {
+		addr_align += (off - addr_align) & (align - 1);
+		return addr_align;
+	}
+ out:
+	return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags);
+}
+
+static int __match_devt(struct device *dev, const void *data)
+{
+	const dev_t *devt = data;
+
+	return dev->devt == *devt;
+}
+
+static struct device *dax_dev_find(dev_t dev_t)
+{
+	return class_find_device(dax_class, NULL, &dev_t, __match_devt);
+}
+
+static int dax_dev_open(struct inode *inode, struct file *filp)
+{
+	struct dax_dev *dax_dev = NULL;
+	struct device *dev;
+
+	dev = dax_dev_find(inode->i_rdev);
+	if (!dev)
+		return -ENXIO;
+
+	device_lock(dev);
+	dax_dev = dev_get_drvdata(dev);
+	if (dax_dev) {
+		dev_dbg(dev, "%s\n", __func__);
+		filp->private_data = dax_dev;
+		kref_get(&dax_dev->kref);
+		inode->i_flags = S_DAX;
+	}
+	device_unlock(dev);
+
+	if (!dax_dev) {
+		put_device(dev);
+		return -ENXIO;
+	}
+	return 0;
+}
+
+static int dax_dev_release(struct inode *inode, struct file *filp)
+{
+	struct dax_dev *dax_dev = filp->private_data;
+	struct device *dev = dax_dev->dev;
+
+	dev_dbg(dax_dev->dev, "%s\n", __func__);
+	dax_dev_put(dax_dev);
+	put_device(dev);
+
+	return 0;
+}
+
+/* returns 0 if @vma is an acceptable mapping of @dax_dev, else -errno */
+static int check_vma(struct dax_dev *dax_dev, struct vm_area_struct *vma,
+		const char *func)
+{
+	struct dax_region *dax_region = dax_dev->region;
+	unsigned long mask = dax_region->align - 1;
+	struct device *dev = dax_dev->dev;
+
+	if (!dax_dev->alive)
+		return -ENXIO;
+
+	/* prevent private mappings from being established */
+	if ((vma->vm_flags & VM_MAYSHARE) != VM_MAYSHARE) {
+		dev_info(dev, "%s: %s: fail, attempted private mapping\n",
+				current->comm, func);
+		return -EINVAL;
+	}
+
+	if (vma->vm_start & mask || vma->vm_end & mask) {
+		dev_info(dev, "%s: %s: fail, unaligned vma (%#lx - %#lx, %#lx)\n",
+				current->comm, func, vma->vm_start, vma->vm_end,
+				mask);
+		return -EINVAL;
+	}
+
+	if ((dax_region->pfn_flags & (PFN_DEV|PFN_MAP)) == PFN_DEV
+			&& (vma->vm_flags & VM_DONTCOPY) == 0) {
+		dev_info(dev, "%s: %s: fail, dax range requires MADV_DONTFORK\n",
+				current->comm, func);
+		return -EINVAL;
+	}
+
+	if (!vma_is_dax(vma)) {
+		dev_info(dev, "%s: %s: fail, vma is not DAX capable\n",
+				current->comm, func);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+/* map @pgoff to a physical address, or -1 if @size bytes do not fit there */
+static phys_addr_t pgoff_to_phys(struct dax_dev *dax_dev, pgoff_t pgoff,
+		unsigned long size)
+{
+	int idx;
+
+	for (idx = 0; idx < dax_dev->num_resources; idx++) {
+		struct resource *extent = &dax_dev->res[idx];
+		phys_addr_t phys = pgoff * PAGE_SIZE + extent->start;
+
+		if (phys < extent->start || phys > extent->end) {
+			/* not in this extent, rebase @pgoff onto the next */
+			pgoff -= PHYS_PFN(resource_size(extent));
+			continue;
+		}
+		/* first extent containing @phys decides: the span must fit */
+		if (phys + size - 1 <= extent->end)
+			return phys;
+		break;
+	}
+
+	return -1;
+}
+
+/* pte fault: map the page at vmf->pgoff; caller holds rcu_read_lock() */
+static int __dax_dev_fault(struct dax_dev *dax_dev, struct vm_area_struct *vma,
+		struct vm_fault *vmf)
+{
+	unsigned long vaddr = (unsigned long) vmf->virtual_address;
+	struct dax_region *dax_region = dax_dev->region;
+	struct device *dev = dax_dev->dev;
+	phys_addr_t phys;
+	pfn_t pfn;
+	int rc;
+
+	if (check_vma(dax_dev, vma, __func__))
+		return VM_FAULT_SIGBUS;
+
+	if (dax_region->align > PAGE_SIZE) {
+		dev_dbg(dev, "%s: alignment > fault size\n", __func__);
+		return VM_FAULT_SIGBUS;
+	}
+
+	phys = pgoff_to_phys(dax_dev, vmf->pgoff, PAGE_SIZE);
+	if (phys == -1) {
+		dev_dbg(dev, "%s: pgoff_to_phys(%#lx) failed\n", __func__,
+				vmf->pgoff);
+		return VM_FAULT_SIGBUS;
+	}
+
+	pfn = phys_to_pfn_t(phys, dax_region->pfn_flags);
+
+	rc = vm_insert_mixed(vma, vaddr, pfn);
+
+	if (rc == -ENOMEM)
+		return VM_FAULT_OOM;
+	if (rc < 0 && rc != -EBUSY)
+		return VM_FAULT_SIGBUS;
+
+	return VM_FAULT_NOPAGE;
+}
+
+static int dax_dev_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+	int rc;
+	struct file *filp = vma->vm_file;
+	struct dax_dev *dax_dev = filp->private_data;
+
+	dev_dbg(dax_dev->dev, "%s: %s: %s (%#lx - %#lx)\n", __func__,
+			current->comm, (vmf->flags & FAULT_FLAG_WRITE)
+			? "write" : "read", vma->vm_start, vma->vm_end);
+	rcu_read_lock();
+	rc = __dax_dev_fault(dax_dev, vma, vmf);
+	rcu_read_unlock();
+
+	return rc;
+}
+
+static int __dax_dev_pmd_fault(struct dax_dev *dax_dev,
+		struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd,
+		unsigned int flags)
+{
+	unsigned long pmd_addr = addr & PMD_MASK;
+	struct device *dev = dax_dev->dev;
+	struct dax_region *dax_region;
+	phys_addr_t phys;
+	pgoff_t pgoff;
+	pfn_t pfn;
+
+	if (check_vma(dax_dev, vma, __func__))
+		return VM_FAULT_SIGBUS;
+
+	dax_region = dax_dev->region;
+	if (dax_region->align > PMD_SIZE) {
+		dev_dbg(dev, "%s: alignment > fault size\n", __func__);
+		return VM_FAULT_SIGBUS;
+	}
+
+	/* dax pmd mappings require pfn_t_devmap() */
+	if ((dax_region->pfn_flags & (PFN_DEV|PFN_MAP)) != (PFN_DEV|PFN_MAP)) {
+		dev_dbg(dev, "%s: region lacks devmap flags\n", __func__);
+		return VM_FAULT_SIGBUS;
+	}
+
+	pgoff = linear_page_index(vma, pmd_addr);
+	phys = pgoff_to_phys(dax_dev, pgoff, PMD_SIZE);
+	if (phys == -1) {
+		dev_dbg(dev, "%s: pgoff_to_phys(%#lx) failed\n", __func__,
+				pgoff);
+		return VM_FAULT_SIGBUS;
+	}
+
+	pfn = phys_to_pfn_t(phys, dax_region->pfn_flags);
+
+	return vmf_insert_pfn_pmd(vma, addr, pmd, pfn,
+			flags & FAULT_FLAG_WRITE);
+}
+
+static int dax_dev_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
+		pmd_t *pmd, unsigned int flags)
+{
+	int rc;
+	struct file *filp = vma->vm_file;
+	struct dax_dev *dax_dev = filp->private_data;
+
+	dev_dbg(dax_dev->dev, "%s: %s: %s (%#lx - %#lx)\n", __func__,
+			current->comm, (flags & FAULT_FLAG_WRITE)
+			? "write" : "read", vma->vm_start, vma->vm_end);
+
+	rcu_read_lock();
+	rc = __dax_dev_pmd_fault(dax_dev, vma, addr, pmd, flags);
+	rcu_read_unlock();
+
+	return rc;
+}
+
+static void dax_dev_vm_open(struct vm_area_struct *vma)
+{
+	struct file *filp = vma->vm_file;
+	struct dax_dev *dax_dev = filp->private_data;
+
+	dev_dbg(dax_dev->dev, "%s\n", __func__);
+	kref_get(&dax_dev->kref);
+}
+
+static void dax_dev_vm_close(struct vm_area_struct *vma)
+{
+	struct file *filp = vma->vm_file;
+	struct dax_dev *dax_dev = filp->private_data;
+
+	dev_dbg(dax_dev->dev, "%s\n", __func__);
+	dax_dev_put(dax_dev);
+}
+
+static const struct vm_operations_struct dax_dev_vm_ops = {
+	.fault = dax_dev_fault,
+	.pmd_fault = dax_dev_pmd_fault,
+	.open = dax_dev_vm_open,
+	.close = dax_dev_vm_close,
+};
+
+static int dax_dev_mmap(struct file *filp, struct vm_area_struct *vma)
+{
+	struct dax_dev *dax_dev = filp->private_data;
+	int rc;
+
+	dev_dbg(dax_dev->dev, "%s\n", __func__);
+
+	rc = check_vma(dax_dev, vma, __func__);
+	if (rc)
+		return rc;
+
+	kref_get(&dax_dev->kref);
+	vma->vm_ops = &dax_dev_vm_ops;
+	vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE;
+	return 0;
+
+}
+
 static const struct file_operations dax_fops = {
 	.llseek = noop_llseek,
 	.owner = THIS_MODULE,
+	.open = dax_dev_open,
+	.release = dax_dev_release,
+	.get_unmapped_area = dax_dev_get_unmapped_area,
+	.mmap = dax_dev_mmap,
 };
 
 static int __init dax_init(void)
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 86f9f8b..52ea012 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1013,6 +1013,7 @@ int vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
 	insert_pfn_pmd(vma, addr, pmd, pfn, pgprot, write);
 	return VM_FAULT_NOPAGE;
 }
+EXPORT_SYMBOL_GPL(vmf_insert_pfn_pmd);
 
 static void touch_pmd(struct vm_area_struct *vma, unsigned long addr,
 		pmd_t *pmd)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 19d0d08..b14e981 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -624,6 +624,7 @@ pgoff_t linear_hugepage_index(struct vm_area_struct *vma,
 {
 	return vma_hugecache_offset(hstate_vma(vma), vma, address);
 }
+EXPORT_SYMBOL_GPL(linear_hugepage_index);
 
 /*
  * Return the size of the pages allocated when backing a VMA. In the majority
-- 
cgit v0.10.2


From acc93d30d7d43f428272c20a047389c4cbca82ba Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Sat, 7 May 2016 11:40:28 -0700
Subject: Revert "block: enable dax for raw block devices"

This reverts commit 5a023cdba50c5f5f2bc351783b3131699deb3937.

The functionality is superseded by the new "Device DAX" facility.

Cc: Jeff Moyer <jmoyer@redhat.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Ross Zwisler <ross.zwisler@linux.intel.com>
Cc: Jan Kara <jack@suse.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>

diff --git a/block/ioctl.c b/block/ioctl.c
index 4ff1f92..698c793 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -407,35 +407,6 @@ static inline int is_unrecognized_ioctl(int ret)
 		ret == -ENOIOCTLCMD;
 }
 
-#ifdef CONFIG_FS_DAX
-bool blkdev_dax_capable(struct block_device *bdev)
-{
-	struct gendisk *disk = bdev->bd_disk;
-
-	if (!disk->fops->direct_access)
-		return false;
-
-	/*
-	 * If the partition is not aligned on a page boundary, we can't
-	 * do dax I/O to it.
-	 */
-	if ((bdev->bd_part->start_sect % (PAGE_SIZE / 512))
-			|| (bdev->bd_part->nr_sects % (PAGE_SIZE / 512)))
-		return false;
-
-	/*
-	 * If the device has known bad blocks, force all I/O through the
-	 * driver / page cache.
-	 *
-	 * TODO: support finer grained dax error handling
-	 */
-	if (disk->bb && disk->bb->count)
-		return false;
-
-	return true;
-}
-#endif
-
 static int blkdev_flushbuf(struct block_device *bdev, fmode_t mode,
 		unsigned cmd, unsigned long arg)
 {
@@ -598,9 +569,6 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
 	case BLKTRACESETUP:
 	case BLKTRACETEARDOWN:
 		return blk_trace_ioctl(bdev, cmd, argp);
-	case BLKDAXGET:
-		return put_int(arg, !!(bdev->bd_inode->i_flags & S_DAX));
-		break;
 	case IOC_PR_REGISTER:
 		return blkdev_pr_register(bdev, argp);
 	case IOC_PR_RESERVE:
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 20a2c02..36ee10c 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -29,6 +29,7 @@
 #include <linux/log2.h>
 #include <linux/cleancache.h>
 #include <linux/dax.h>
+#include <linux/badblocks.h>
 #include <asm/uaccess.h>
 #include "internal.h"
 
@@ -1159,6 +1160,33 @@ void bd_set_size(struct block_device *bdev, loff_t size)
 }
 EXPORT_SYMBOL(bd_set_size);
 
+static bool blkdev_dax_capable(struct block_device *bdev)
+{
+	struct gendisk *disk = bdev->bd_disk;
+
+	if (!disk->fops->direct_access || !IS_ENABLED(CONFIG_FS_DAX))
+		return false;
+
+	/*
+	 * If the partition is not aligned on a page boundary, we can't
+	 * do dax I/O to it.
+	 */
+	if ((bdev->bd_part->start_sect % (PAGE_SIZE / 512))
+			|| (bdev->bd_part->nr_sects % (PAGE_SIZE / 512)))
+		return false;
+
+	/*
+	 * If the device has known bad blocks, force all I/O through the
+	 * driver / page cache.
+	 *
+	 * TODO: support finer grained dax error handling
+	 */
+	if (disk->bb && disk->bb->count)
+		return false;
+
+	return true;
+}
+
 static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part);
 
 /*
@@ -1724,79 +1752,13 @@ static const struct address_space_operations def_blk_aops = {
 	.is_dirty_writeback = buffer_check_dirty_writeback,
 };
 
-#ifdef CONFIG_FS_DAX
-/*
- * In the raw block case we do not need to contend with truncation nor
- * unwritten file extents.  Without those concerns there is no need for
- * additional locking beyond the mmap_sem context that these routines
- * are already executing under.
- *
- * Note, there is no protection if the block device is dynamically
- * resized (partition grow/shrink) during a fault. A stable block device
- * size is already not enforced in the blkdev_direct_IO path.
- *
- * For DAX, it is the responsibility of the block device driver to
- * ensure the whole-disk device size is stable while requests are in
- * flight.
- *
- * Finally, unlike the filemap_page_mkwrite() case there is no
- * filesystem superblock to sync against freezing.  We still include a
- * pfn_mkwrite callback for dax drivers to receive write fault
- * notifications.
- */
-static int blkdev_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
-{
-	return __dax_fault(vma, vmf, blkdev_get_block, NULL);
-}
-
-static int blkdev_dax_pfn_mkwrite(struct vm_area_struct *vma,
-		struct vm_fault *vmf)
-{
-	return dax_pfn_mkwrite(vma, vmf);
-}
-
-static int blkdev_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
-		pmd_t *pmd, unsigned int flags)
-{
-	return __dax_pmd_fault(vma, addr, pmd, flags, blkdev_get_block, NULL);
-}
-
-static const struct vm_operations_struct blkdev_dax_vm_ops = {
-	.fault		= blkdev_dax_fault,
-	.pmd_fault	= blkdev_dax_pmd_fault,
-	.pfn_mkwrite	= blkdev_dax_pfn_mkwrite,
-};
-
-static const struct vm_operations_struct blkdev_default_vm_ops = {
-	.fault		= filemap_fault,
-	.map_pages	= filemap_map_pages,
-};
-
-static int blkdev_mmap(struct file *file, struct vm_area_struct *vma)
-{
-	struct inode *bd_inode = bdev_file_inode(file);
-
-	file_accessed(file);
-	if (IS_DAX(bd_inode)) {
-		vma->vm_ops = &blkdev_dax_vm_ops;
-		vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE;
-	} else {
-		vma->vm_ops = &blkdev_default_vm_ops;
-	}
-
-	return 0;
-}
-#else
-#define blkdev_mmap generic_file_mmap
-#endif
-
 const struct file_operations def_blk_fops = {
 	.open		= blkdev_open,
 	.release	= blkdev_close,
 	.llseek		= block_llseek,
 	.read_iter	= blkdev_read_iter,
 	.write_iter	= blkdev_write_iter,
-	.mmap		= blkdev_mmap,
+	.mmap		= generic_file_mmap,
 	.fsync		= blkdev_fsync,
 	.unlocked_ioctl	= block_ioctl,
 #ifdef CONFIG_COMPAT
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 70e61b5..8363a10 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2320,14 +2320,6 @@ extern struct super_block *freeze_bdev(struct block_device *);
 extern void emergency_thaw_all(void);
 extern int thaw_bdev(struct block_device *bdev, struct super_block *sb);
 extern int fsync_bdev(struct block_device *);
-#ifdef CONFIG_FS_DAX
-extern bool blkdev_dax_capable(struct block_device *bdev);
-#else
-static inline bool blkdev_dax_capable(struct block_device *bdev)
-{
-	return false;
-}
-#endif
 
 extern struct super_block *blockdev_superblock;
 
diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
index a079d50..fbff8b2 100644
--- a/include/uapi/linux/fs.h
+++ b/include/uapi/linux/fs.h
@@ -222,7 +222,6 @@ struct fsxattr {
 #define BLKSECDISCARD _IO(0x12,125)
 #define BLKROTATIONAL _IO(0x12,126)
 #define BLKZEROOUT _IO(0x12,127)
-#define BLKDAXGET _IO(0x12,129)
 
 #define BMAP_IOCTL 1		/* obsolete - kept for compatibility */
 #define FIBMAP	   _IO(0x00,1)	/* bmap access */
-- 
cgit v0.10.2


From b354aba0165519a74f540f2ba89d7ec78efca21d Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Tue, 17 May 2016 20:24:16 -0700
Subject: libnvdimm: release ida resources

ida instances allocate some internal memory for ->free_bitmap in
addition to the base 'struct ida'.  Use ida_destroy() to release that
memory at module_exit().

Reported-by: Johannes Thumshirn <jthumshirn@suse.de>
Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>

diff --git a/drivers/nvdimm/core.c b/drivers/nvdimm/core.c
index 182a93f..847532d 100644
--- a/drivers/nvdimm/core.c
+++ b/drivers/nvdimm/core.c
@@ -648,6 +648,9 @@ static __exit void libnvdimm_exit(void)
 	nd_region_exit();
 	nvdimm_exit();
 	nvdimm_bus_exit();
+	nd_region_devs_exit();
+	nvdimm_devs_exit();
+	ida_destroy(&nd_ida);
 }
 
 MODULE_LICENSE("GPL v2");
diff --git a/drivers/nvdimm/dimm_devs.c b/drivers/nvdimm/dimm_devs.c
index c56f882..6cca03e 100644
--- a/drivers/nvdimm/dimm_devs.c
+++ b/drivers/nvdimm/dimm_devs.c
@@ -546,3 +546,8 @@ int nvdimm_bus_check_dimm_count(struct nvdimm_bus *nvdimm_bus, int dimm_count)
 	return 0;
 }
 EXPORT_SYMBOL_GPL(nvdimm_bus_check_dimm_count);
+
+void __exit nvdimm_devs_exit(void)
+{
+	ida_destroy(&dimm_ida);
+}
diff --git a/drivers/nvdimm/nd-core.h b/drivers/nvdimm/nd-core.h
index cb65308..4136c1a 100644
--- a/drivers/nvdimm/nd-core.h
+++ b/drivers/nvdimm/nd-core.h
@@ -49,6 +49,8 @@ bool is_nd_blk(struct device *dev);
 struct nvdimm_bus *walk_to_nvdimm_bus(struct device *nd_dev);
 int __init nvdimm_bus_init(void);
 void nvdimm_bus_exit(void);
+void nvdimm_devs_exit(void);
+void nd_region_devs_exit(void);
 void nd_region_probe_success(struct nvdimm_bus *nvdimm_bus, struct device *dev);
 struct nd_region;
 void nd_region_create_blk_seed(struct nd_region *nd_region);
diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c
index 9e1b054..40fcfea 100644
--- a/drivers/nvdimm/region_devs.c
+++ b/drivers/nvdimm/region_devs.c
@@ -793,3 +793,8 @@ struct nd_region *nvdimm_volatile_region_create(struct nvdimm_bus *nvdimm_bus,
 			__func__);
 }
 EXPORT_SYMBOL_GPL(nvdimm_volatile_region_create);
+
+void __exit nd_region_devs_exit(void)
+{
+	ida_destroy(&region_ida);
+}
-- 
cgit v0.10.2


From c5ed9268643c7c4c9f2aaa0fd4c936095e6480ef Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Wed, 18 May 2016 14:50:12 -0700
Subject: libnvdimm, dax: autodetect support

For autodetecting a previously established dax configuration we need the
info block to indicate block-device vs device-dax mode, and we need to
have the default namespace probe hand-off the configuration to the
dax_pmem driver.

Signed-off-by: Dan Williams <dan.j.williams@intel.com>

diff --git a/drivers/nvdimm/dax_devs.c b/drivers/nvdimm/dax_devs.c
index f90f754..45fa82c 100644
--- a/drivers/nvdimm/dax_devs.c
+++ b/drivers/nvdimm/dax_devs.c
@@ -15,6 +15,7 @@
 #include <linux/slab.h>
 #include <linux/mm.h>
 #include "nd-core.h"
+#include "pfn.h"
 #include "nd.h"
 
 static void nd_dax_release(struct device *dev)
@@ -97,3 +98,37 @@ struct device *nd_dax_create(struct nd_region *nd_region)
 	__nd_device_register(dev);
 	return dev;
 }
+
+int nd_dax_probe(struct device *dev, struct nd_namespace_common *ndns)
+{
+	int rc;
+	struct nd_dax *nd_dax;
+	struct device *dax_dev;
+	struct nd_pfn *nd_pfn;
+	struct nd_pfn_sb *pfn_sb;
+	struct nd_region *nd_region = to_nd_region(ndns->dev.parent);
+
+	if (ndns->force_raw)
+		return -ENODEV;
+
+	nvdimm_bus_lock(&ndns->dev);
+	nd_dax = nd_dax_alloc(nd_region);
+	nd_pfn = &nd_dax->nd_pfn;
+	dax_dev = nd_pfn_devinit(nd_pfn, ndns);
+	nvdimm_bus_unlock(&ndns->dev);
+	if (!dax_dev)
+		return -ENOMEM;
+	pfn_sb = devm_kzalloc(dev, sizeof(*pfn_sb), GFP_KERNEL);
+	nd_pfn->pfn_sb = pfn_sb;
+	rc = nd_pfn_validate(nd_pfn, DAX_SIG);
+	dev_dbg(dev, "%s: dax: %s\n", __func__,
+			rc == 0 ? dev_name(dax_dev) : "<none>");
+	if (rc < 0) {
+		__nd_detach_ndns(dax_dev, &nd_pfn->ndns);
+		put_device(dax_dev);
+	} else
+		__nd_device_register(dax_dev);
+
+	return rc;
+}
+EXPORT_SYMBOL(nd_dax_probe);
diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h
index 46910b8..d0ac93c 100644
--- a/drivers/nvdimm/nd.h
+++ b/drivers/nvdimm/nd.h
@@ -232,7 +232,7 @@ bool is_nd_pfn(struct device *dev);
 struct device *nd_pfn_create(struct nd_region *nd_region);
 struct device *nd_pfn_devinit(struct nd_pfn *nd_pfn,
 		struct nd_namespace_common *ndns);
-int nd_pfn_validate(struct nd_pfn *nd_pfn);
+int nd_pfn_validate(struct nd_pfn *nd_pfn, const char *sig);
 extern struct attribute_group nd_pfn_attribute_group;
 #else
 static inline int nd_pfn_probe(struct device *dev,
@@ -251,7 +251,7 @@ static inline struct device *nd_pfn_create(struct nd_region *nd_region)
 	return NULL;
 }
 
-static inline int nd_pfn_validate(struct nd_pfn *nd_pfn)
+static inline int nd_pfn_validate(struct nd_pfn *nd_pfn, const char *sig)
 {
 	return -ENODEV;
 }
@@ -259,9 +259,16 @@ static inline int nd_pfn_validate(struct nd_pfn *nd_pfn)
 
 struct nd_dax *to_nd_dax(struct device *dev);
 #if IS_ENABLED(CONFIG_NVDIMM_DAX)
+int nd_dax_probe(struct device *dev, struct nd_namespace_common *ndns);
 bool is_nd_dax(struct device *dev);
 struct device *nd_dax_create(struct nd_region *nd_region);
 #else
+static inline int nd_dax_probe(struct device *dev,
+		struct nd_namespace_common *ndns)
+{
+	return -ENODEV;
+}
+
 static inline bool is_nd_dax(struct device *dev)
 {
 	return false;
diff --git a/drivers/nvdimm/pfn.h b/drivers/nvdimm/pfn.h
index 9d2704c..dde9853 100644
--- a/drivers/nvdimm/pfn.h
+++ b/drivers/nvdimm/pfn.h
@@ -19,6 +19,7 @@
 
 #define PFN_SIG_LEN 16
 #define PFN_SIG "NVDIMM_PFN_INFO\0"
+#define DAX_SIG "NVDIMM_DAX_INFO\0"
 
 struct nd_pfn_sb {
 	u8 signature[PFN_SIG_LEN];
diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c
index 58740d7..816cd98 100644
--- a/drivers/nvdimm/pfn_devs.c
+++ b/drivers/nvdimm/pfn_devs.c
@@ -360,7 +360,7 @@ struct device *nd_pfn_create(struct nd_region *nd_region)
 	return dev;
 }
 
-int nd_pfn_validate(struct nd_pfn *nd_pfn)
+int nd_pfn_validate(struct nd_pfn *nd_pfn, const char *sig)
 {
 	u64 checksum, offset;
 	struct nd_namespace_io *nsio;
@@ -377,7 +377,7 @@ int nd_pfn_validate(struct nd_pfn *nd_pfn)
 	if (nvdimm_read_bytes(ndns, SZ_4K, pfn_sb, sizeof(*pfn_sb)))
 		return -ENXIO;
 
-	if (memcmp(pfn_sb->signature, PFN_SIG, PFN_SIG_LEN) != 0)
+	if (memcmp(pfn_sb->signature, sig, PFN_SIG_LEN) != 0)
 		return -ENODEV;
 
 	checksum = le64_to_cpu(pfn_sb->checksum);
@@ -467,7 +467,7 @@ int nd_pfn_probe(struct device *dev, struct nd_namespace_common *ndns)
 	pfn_sb = devm_kzalloc(dev, sizeof(*pfn_sb), GFP_KERNEL);
 	nd_pfn = to_nd_pfn(pfn_dev);
 	nd_pfn->pfn_sb = pfn_sb;
-	rc = nd_pfn_validate(nd_pfn);
+	rc = nd_pfn_validate(nd_pfn, PFN_SIG);
 	dev_dbg(dev, "%s: pfn: %s\n", __func__,
 			rc == 0 ? dev_name(pfn_dev) : "<none>");
 	if (rc < 0) {
@@ -552,6 +552,7 @@ static int nd_pfn_init(struct nd_pfn *nd_pfn)
 	struct nd_pfn_sb *pfn_sb;
 	unsigned long npfns;
 	phys_addr_t offset;
+	const char *sig;
 	u64 checksum;
 	int rc;
 
@@ -560,7 +561,11 @@ static int nd_pfn_init(struct nd_pfn *nd_pfn)
 		return -ENOMEM;
 
 	nd_pfn->pfn_sb = pfn_sb;
-	rc = nd_pfn_validate(nd_pfn);
+	if (is_nd_dax(&nd_pfn->dev))
+		sig = DAX_SIG;
+	else
+		sig = PFN_SIG;
+	rc = nd_pfn_validate(nd_pfn, sig);
 	if (rc != -ENODEV)
 		return rc;
 
@@ -628,7 +633,7 @@ static int nd_pfn_init(struct nd_pfn *nd_pfn)
 	pfn_sb->mode = cpu_to_le32(nd_pfn->mode);
 	pfn_sb->dataoff = cpu_to_le64(offset);
 	pfn_sb->npfns = cpu_to_le64(npfns);
-	memcpy(pfn_sb->signature, PFN_SIG, PFN_SIG_LEN);
+	memcpy(pfn_sb->signature, sig, PFN_SIG_LEN);
 	memcpy(pfn_sb->uuid, nd_pfn->uuid, 16);
 	memcpy(pfn_sb->parent_uuid, nd_dev_to_uuid(&ndns->dev), 16);
 	pfn_sb->version_major = cpu_to_le16(1);
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index d9a0dbc..042baec 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -320,7 +320,8 @@ static int nd_pmem_probe(struct device *dev)
 		return pmem_attach_disk(dev, ndns);
 
 	/* if we find a valid info-block we'll come back as that personality */
-	if (nd_btt_probe(dev, ndns) == 0 || nd_pfn_probe(dev, ndns) == 0)
+	if (nd_btt_probe(dev, ndns) == 0 || nd_pfn_probe(dev, ndns) == 0
+			|| nd_dax_probe(dev, ndns) == 0)
 		return -ENXIO;
 
 	/* ...otherwise we're just a raw pmem device */
-- 
cgit v0.10.2


From 5e24c9fd36285535c704e84748d6c890be870fb6 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Sat, 21 May 2016 11:01:41 -0700
Subject: libnvdimm, dax: fix alignment validation

Testing the dax-device autodetect support revealed a probe failure with
the following result:

    dax0.1: bad offset: 0x8200000 dax disabled

The original pfn-device implementation inferred the alignment from
ilog2(offset).  Now that the alignment is explicit, the is_power_of_2()
check needs replacing with a real sanity check against the recorded
alignment.  Otherwise the alignment check is useless in the implicit case
and only the minimum size of the offset matters.

This self-consistency check is further validated by the probe path that
will re-check that the offset is large enough to contain all the
metadata required to enable the device.

Cc: <stable@vger.kernel.org>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>

diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c
index 816cd98..04f71d6 100644
--- a/drivers/nvdimm/pfn_devs.c
+++ b/drivers/nvdimm/pfn_devs.c
@@ -416,6 +416,8 @@ int nd_pfn_validate(struct nd_pfn *nd_pfn, const char *sig)
 			return -ENODEV;
 	}
 
+	if (nd_pfn->align == 0)
+		nd_pfn->align = le32_to_cpu(pfn_sb->align);
 	if (nd_pfn->align > nvdimm_namespace_capacity(ndns)) {
 		dev_err(&nd_pfn->dev, "alignment: %lx exceeds capacity %llx\n",
 				nd_pfn->align, nvdimm_namespace_capacity(ndns));
@@ -436,8 +438,8 @@ int nd_pfn_validate(struct nd_pfn *nd_pfn, const char *sig)
 		return -EBUSY;
 	}
 
-	nd_pfn->align = le32_to_cpu(pfn_sb->align);
-	if (!is_power_of_2(offset) || offset < PAGE_SIZE) {
+	if ((nd_pfn->align && !IS_ALIGNED(offset, nd_pfn->align))
+			|| !IS_ALIGNED(offset, PAGE_SIZE)) {
 		dev_err(&nd_pfn->dev, "bad offset: %#llx dax disabled\n",
 				offset);
 		return -ENXIO;
-- 
cgit v0.10.2


From 03dca343afe080968d90c4d9196404b5bbbc8461 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Sat, 21 May 2016 12:22:41 -0700
Subject: libnvdimm, dax: fix deletion

The ndctl unit tests discovered that the dax enabling omitted updates to
nd_detach_and_reset().  This routine clears the device configuration
when the namespace is detached.  Without this clearing userspace may
assume that the device is in the process of being configured by another
agent in the system.

Signed-off-by: Dan Williams <dan.j.williams@intel.com>

diff --git a/drivers/nvdimm/claim.c b/drivers/nvdimm/claim.c
index 5f53db5..8b2e3c4 100644
--- a/drivers/nvdimm/claim.c
+++ b/drivers/nvdimm/claim.c
@@ -93,6 +93,25 @@ static bool is_idle(struct device *dev, struct nd_namespace_common *ndns)
 	return true;
 }
 
+struct nd_pfn *to_nd_pfn_safe(struct device *dev)
+{
+	/*
+	 * pfn device attributes are re-used by dax device instances, so we
+	 * need to be careful to correct device-to-nd_pfn conversion.
+	 */
+	if (is_nd_pfn(dev))
+		return to_nd_pfn(dev);
+
+	if (is_nd_dax(dev)) {
+		struct nd_dax *nd_dax = to_nd_dax(dev);
+
+		return &nd_dax->nd_pfn;
+	}
+
+	WARN_ON(1);
+	return NULL;
+}
+
 static void nd_detach_and_reset(struct device *dev,
 		struct nd_namespace_common **_ndns)
 {
@@ -106,8 +125,8 @@ static void nd_detach_and_reset(struct device *dev,
 		nd_btt->lbasize = 0;
 		kfree(nd_btt->uuid);
 		nd_btt->uuid = NULL;
-	} else if (is_nd_pfn(dev)) {
-		struct nd_pfn *nd_pfn = to_nd_pfn(dev);
+	} else if (is_nd_pfn(dev) || is_nd_dax(dev)) {
+		struct nd_pfn *nd_pfn = to_nd_pfn_safe(dev);
 
 		kfree(nd_pfn->uuid);
 		nd_pfn->uuid = NULL;
diff --git a/drivers/nvdimm/nd-core.h b/drivers/nvdimm/nd-core.h
index 4136c1a..6c42eda 100644
--- a/drivers/nvdimm/nd-core.h
+++ b/drivers/nvdimm/nd-core.h
@@ -94,4 +94,5 @@ bool __nd_attach_ndns(struct device *dev, struct nd_namespace_common *attach,
 ssize_t nd_namespace_store(struct device *dev,
 		struct nd_namespace_common **_ndns, const char *buf,
 		size_t len);
+struct nd_pfn *to_nd_pfn_safe(struct device *dev);
 #endif /* __ND_CORE_H__ */
diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c
index 04f71d6..436191c 100644
--- a/drivers/nvdimm/pfn_devs.c
+++ b/drivers/nvdimm/pfn_devs.c
@@ -54,25 +54,6 @@ struct nd_pfn *to_nd_pfn(struct device *dev)
 }
 EXPORT_SYMBOL(to_nd_pfn);
 
-static struct nd_pfn *to_nd_pfn_safe(struct device *dev)
-{
-	/*
-	 * pfn device attributes are re-used by dax device instances, so we
-	 * need to be careful to correct device-to-nd_pfn conversion.
-	 */
-	if (is_nd_pfn(dev))
-		return to_nd_pfn(dev);
-
-	if (is_nd_dax(dev)) {
-		struct nd_dax *nd_dax = to_nd_dax(dev);
-
-		return &nd_dax->nd_pfn;
-	}
-
-	WARN_ON(1);
-	return NULL;
-}
-
 static ssize_t mode_show(struct device *dev,
 		struct device_attribute *attr, char *buf)
 {
-- 
cgit v0.10.2