Add a new bus method to fetch device-specific CPU sets.

bus_get_cpus() returns a specified set of CPUs for a device.  It accepts
an enum for the second parameter that indicates the type of cpuset to
request.  Currently two values are supported:

 - LOCAL_CPUS (on x86 this returns all the CPUs in the package closest to
   the device when DEVICE_NUMA is enabled)
 - INTR_CPUS (like LOCAL_CPUS but only returns 1 SMT thread for each core)

For systems that do not support NUMA (or if it is not enabled in the kernel
config), LOCAL_CPUS fails with EINVAL.  INTR_CPUS is mapped to 'all_cpus'
by default.  The idea is that INTR_CPUS should always return a valid set.

Device drivers which want to use per-CPU interrupts should start using
INTR_CPUS instead of simply assigning interrupts to all available CPUs.
In the future we may wish to add tunables to control the policy of
INTR_CPUS (e.g. should it be local-only or global, should it ignore
SMT threads or not).

The x86 nexus driver exposes the internal set of interrupt CPUs from the
x86 interrupt code via INTR_CPUS.

The ACPI bus driver and PCI bridge drivers use _PXM to return a suitable
LOCAL_CPUS set when _PXM exists and DEVICE_NUMA is enabled.  They also
intersect (bitwise AND) the global INTR_CPUS set from the nexus driver with
the per-domain set from _PXM to generate a local INTR_CPUS set for child
devices.

Compared to r298933, this version uses 'struct _cpuset' in
<sys/bus.h> instead of 'cpuset_t' to avoid requiring <sys/param.h>
(<sys/_cpuset.h> still requires <sys/param.h> for MAXCPU even though
<sys/_bitset.h> does not after recent changes).
This commit is contained in:
John Baldwin 2016-05-09 20:50:21 +00:00
parent 8dd5aa946f
commit 8d791e5af1
16 changed files with 288 additions and 31 deletions

View File

@ -0,0 +1,101 @@
.\" -*- nroff -*-
.\"
.\" Copyright (c) 2016 John H. Baldwin <jhb@FreeBSD.org>
.\" All rights reserved.
.\"
.\" Redistribution and use in source and binary forms, with or without
.\" modification, are permitted provided that the following conditions
.\" are met:
.\" 1. Redistributions of source code must retain the above copyright
.\" notice, this list of conditions and the following disclaimer.
.\" 2. Redistributions in binary form must reproduce the above copyright
.\" notice, this list of conditions and the following disclaimer in the
.\" documentation and/or other materials provided with the distribution.
.\"
.\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
.\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
.\" SUCH DAMAGE.
.\"
.\" $FreeBSD$
.\"
.Dd March 1, 2016
.Dt BUS_GET_CPUS 9
.Os
.Sh NAME
.Nm BUS_GET_CPUS ,
.Nm bus_get_cpus
.Nd "request a set of device-specific CPUs"
.Sh SYNOPSIS
.In sys/param.h
.In sys/bus.h
.In sys/cpuset.h
.Ft int
.Fo BUS_GET_CPUS
.Fa "device_t dev" "device_t child" "enum cpu_sets op" "size_t setsize"
.Fa "cpuset_t *cpuset"
.Fc
.Ft int
.Fo bus_get_cpus
.Fa "device_t dev" "enum cpu_sets op" "size_t setsize" "cpuset_t *cpuset"
.Fc
.Sh DESCRIPTION
The
.Fn BUS_GET_CPUS
method queries the parent bus device for a set of device-specific CPUs.
The
.Fa op
argument specifies which set of CPUs to retrieve.
If successful,
the requested set of CPUs is returned in
.Fa cpuset .
The
.Fa setsize
argument specifies the size in bytes of the set passed in
.Fa cpuset .
.Pp
.Fn BUS_GET_CPUS
supports querying different types of CPU sets via the
.Fa op
argument.
Not all set types are supported for every device.
If a set type is not supported,
.Fn BUS_GET_CPUS
fails with
.Er EINVAL .
These set types are supported:
.Bl -tag -width ".Dv LOCAL_CPUS"
.It Dv LOCAL_CPUS
The set of CPUs that are local to the device.
If a device is closer to a specific memory domain in a non-uniform memory
architecture system
.Pq NUMA ,
this will return the set of CPUs in that memory domain.
.It Dv INTR_CPUS
The preferred set of CPUs that this device should use for device interrupts.
This set type must be supported by all bus drivers.
.El
.Pp
The
.Fn bus_get_cpus
function is a simple wrapper around
.Fn BUS_GET_CPUS .
.Sh RETURN VALUES
Zero is returned on success, otherwise an appropriate error is returned.
.Sh SEE ALSO
.Xr cpuset 2 ,
.Xr BUS_BIND_INTR 9 ,
.Xr device 9
.Sh HISTORY
The
.Fn BUS_GET_CPUS
method and
.Fn bus_get_cpus
function first appeared in
.Fx 11.0 .

View File

@ -42,6 +42,7 @@ MAN= accept_filter.9 \
bus_generic_print_child.9 \
bus_generic_read_ivar.9 \
bus_generic_shutdown.9 \
BUS_GET_CPUS.9 \
bus_get_resource.9 \
BUS_NEW_PASS.9 \
BUS_PRINT_CHILD.9 \
@ -502,6 +503,7 @@ MLINKS+=bus_dma.9 busdma.9 \
bus_dma.9 bus_dma_tag_create.9 \
bus_dma.9 bus_dma_tag_destroy.9
MLINKS+=bus_generic_read_ivar.9 bus_generic_write_ivar.9
MLINKS+=BUS_GET_CPUS.9 bus_get_cpus.9
MLINKS+=BUS_READ_IVAR.9 BUS_WRITE_IVAR.9
MLINKS+=BUS_SETUP_INTR.9 bus_setup_intr.9 \
BUS_SETUP_INTR.9 BUS_TEARDOWN_INTR.9 \

View File

@ -143,6 +143,9 @@ struct nmi_pcpu {
register_t __padding; /* pad to 16 bytes */
};
#ifdef SMP
extern cpuset_t intr_cpus;
#endif
extern struct mtx icu_lock;
extern int elcr_found;

View File

@ -211,6 +211,7 @@ static device_method_t acpi_methods[] = {
DEVMETHOD(bus_setup_intr, bus_generic_setup_intr),
DEVMETHOD(bus_teardown_intr, bus_generic_teardown_intr),
DEVMETHOD(bus_hint_device_unit, acpi_hint_device_unit),
DEVMETHOD(bus_get_cpus, acpi_get_cpus),
DEVMETHOD(bus_get_domain, acpi_get_domain),
/* ACPI bus */
@ -1077,52 +1078,79 @@ acpi_hint_device_unit(device_t acdev, device_t child, const char *name,
}
/*
* Fetch the VM domain for the given device 'dev'.
*
* Return 1 + domain if there's a domain, 0 if not found;
* -1 upon an error.
* Fetch the NUMA domain for a device by mapping the value returned by
* _PXM to a NUMA domain. If the device does not have a _PXM method,
* -2 is returned. If any other error occurs, -1 is returned.
*/
int
acpi_parse_pxm(device_t dev, int *domain)
static int
acpi_parse_pxm(device_t dev)
{
#ifdef DEVICE_NUMA
ACPI_HANDLE h;
int d, pxm;
ACPI_HANDLE handle;
ACPI_STATUS status;
int pxm;
h = acpi_get_handle(dev);
if ((h != NULL) &&
ACPI_SUCCESS(acpi_GetInteger(h, "_PXM", &pxm))) {
d = acpi_map_pxm_to_vm_domainid(pxm);
if (d < 0)
return (-1);
*domain = d;
return (1);
}
handle = acpi_get_handle(dev);
if (handle == NULL)
return (-2);
status = acpi_GetInteger(handle, "_PXM", &pxm);
if (ACPI_SUCCESS(status))
return (acpi_map_pxm_to_vm_domainid(pxm));
if (status == AE_NOT_FOUND)
return (-2);
#endif
return (-1);
}
return (0);
/*
 * BUS_GET_CPUS() implementation shared by the ACPI bus drivers.
 *
 * The NUMA domain of 'child' is looked up via acpi_parse_pxm().  When no
 * domain is available (negative result) the request is simply passed to
 * bus_generic_get_cpus().  Otherwise:
 *
 *  - LOCAL_CPUS copies cpuset_domain[domain] into *cpuset.
 *  - INTR_CPUS first obtains the set from bus_generic_get_cpus() and then
 *    ANDs it with cpuset_domain[domain].
 *  - any other op is passed to bus_generic_get_cpus() unchanged.
 *
 * Returns 0 on success, EINVAL if 'setsize' is not sizeof(cpuset_t) on a
 * path that writes the set here, or whatever error the generic method
 * reports.
 *
 * NOTE(review): the INTR_CPUS intersection is returned as-is; whether it
 * can ever be empty is not visible from here -- confirm with callers.
 */
int
acpi_get_cpus(device_t dev, device_t child, enum cpu_sets op, size_t setsize,
    cpuset_t *cpuset)
{
	int domain, rv;

	domain = acpi_parse_pxm(child);

	/* The domain-local set can be answered without asking the parent. */
	if (domain >= 0 && op == LOCAL_CPUS) {
		if (setsize != sizeof(cpuset_t))
			return (EINVAL);
		*cpuset = cpuset_domain[domain];
		return (0);
	}

	/* Every remaining path starts with the generic (parent) answer. */
	rv = bus_generic_get_cpus(dev, child, op, setsize, cpuset);
	if (domain < 0 || op != INTR_CPUS || rv != 0)
		return (rv);

	/* Narrow the parent's interrupt set down to the child's domain. */
	if (setsize != sizeof(cpuset_t))
		return (EINVAL);
	CPU_AND(cpuset, &cpuset_domain[domain]);
	return (0);
}
/*
* Fetch the NUMA domain for the given device.
* Fetch the NUMA domain for the given device 'dev'.
*
* If a device has a _PXM method, map that to a NUMA domain.
*
* If none is found, then it'll call the parent method.
* If there's no domain, return ENOENT.
* Otherwise, pass the request up to the parent.
* If there's no matching domain or the domain cannot be
* determined, return ENOENT.
*/
int
acpi_get_domain(device_t dev, device_t child, int *domain)
{
int ret;
int d;
ret = acpi_parse_pxm(child, domain);
/* Error */
if (ret == -1)
return (ENOENT);
/* Found */
if (ret == 1)
d = acpi_parse_pxm(child);
if (d >= 0) {
*domain = d;
return (0);
}
if (d == -1)
return (ENOENT);
/* No _PXM node; go up a level */
return (bus_generic_get_domain(dev, child, domain));

View File

@ -95,6 +95,7 @@ static device_method_t acpi_pci_methods[] = {
DEVMETHOD(bus_write_ivar, acpi_pci_write_ivar),
DEVMETHOD(bus_child_deleted, acpi_pci_child_deleted),
DEVMETHOD(bus_child_location_str, acpi_pci_child_location_str_method),
DEVMETHOD(bus_get_cpus, acpi_get_cpus),
DEVMETHOD(bus_get_dma_tag, acpi_pci_get_dma_tag),
DEVMETHOD(bus_get_domain, acpi_get_domain),

View File

@ -265,3 +265,11 @@ acpi_pcib_power_for_sleep(device_t pcib, device_t dev, int *pstate)
acpi_device_pwr_for_sleep(acpi_dev, dev, pstate);
return (0);
}
/*
 * BUS_GET_CPUS() implementation for ACPI PCI bridges.
 *
 * The request made on behalf of 'dev' is answered by asking for the
 * bridge's own CPU set: bus_get_cpus() is invoked on 'pcib', not on 'dev'.
 * The result of that call is returned unchanged.
 */
int
acpi_pcib_get_cpus(device_t pcib, device_t dev, enum cpu_sets op,
    size_t setsize, cpuset_t *cpuset)
{
	int rv;

	rv = bus_get_cpus(pcib, op, setsize, cpuset);
	return (rv);
}

View File

@ -132,6 +132,7 @@ static device_method_t acpi_pcib_acpi_methods[] = {
DEVMETHOD(bus_deactivate_resource, bus_generic_deactivate_resource),
DEVMETHOD(bus_setup_intr, bus_generic_setup_intr),
DEVMETHOD(bus_teardown_intr, bus_generic_teardown_intr),
DEVMETHOD(bus_get_cpus, acpi_pcib_get_cpus),
/* pcib interface */
DEVMETHOD(pcib_maxslots, pcib_maxslots),

View File

@ -78,6 +78,7 @@ static device_method_t acpi_pcib_pci_methods[] = {
/* Bus interface */
DEVMETHOD(bus_read_ivar, acpi_pcib_read_ivar),
DEVMETHOD(bus_get_cpus, acpi_pcib_get_cpus),
/* pcib interface */
DEVMETHOD(pcib_route_interrupt, acpi_pcib_pci_route_interrupt),

View File

@ -36,6 +36,8 @@ void acpi_pci_link_add_reference(device_t dev, int index, device_t pcib,
int slot, int pin);
int acpi_pci_link_route_interrupt(device_t dev, int index);
void acpi_pcib_fetch_prt(device_t bus, ACPI_BUFFER *prt);
int acpi_pcib_get_cpus(device_t pcib, device_t dev, enum cpu_sets op,
size_t setsize, cpuset_t *cpuset);
int acpi_pcib_route_interrupt(device_t pcib, device_t dev, int pin,
ACPI_BUFFER *prtbuf);
int acpi_pcib_power_for_sleep(device_t pcib, device_t dev,

View File

@ -506,8 +506,9 @@ SYSCTL_DECL(_debug_acpi);
* Returns the VM domain ID if found, or -1 if not found / invalid.
*/
int acpi_map_pxm_to_vm_domainid(int pxm);
int acpi_get_cpus(device_t dev, device_t child, enum cpu_sets op,
size_t setsize, cpuset_t *cpuset);
int acpi_get_domain(device_t dev, device_t child, int *domain);
int acpi_parse_pxm(device_t dev, int *domain);
#endif /* _KERNEL */
#endif /* !_ACPIVAR_H_ */

View File

@ -134,6 +134,9 @@ struct intsrc {
struct trapframe;
#ifdef SMP
extern cpuset_t intr_cpus;
#endif
extern struct mtx icu_lock;
extern int elcr_found;

View File

@ -731,3 +731,21 @@ METHOD int get_domain {
device_t _child;
int *_domain;
} DEFAULT bus_generic_get_domain;
/**
 * @brief Request a set of CPUs
 *
 * Ask the bus @p _dev for a device-specific set of CPUs on behalf of
 * @p _child.  Buses that do not override this method get
 * bus_generic_get_cpus().
 *
 * @param _dev the bus device
 * @param _child the child device
 * @param _op type of CPUs to request (a value of enum cpu_sets)
 * @param _setsize the size of the set passed in _cpuset
 * @param _cpuset a pointer to a cpuset to receive the requested
 * set of CPUs
 *
 * @returns zero on success, otherwise an error code
 */
METHOD int get_cpus {
device_t _dev;
device_t _child;
enum cpu_sets _op;
size_t _setsize;
cpuset_t *_cpuset;
} DEFAULT bus_generic_get_cpus;

View File

@ -49,6 +49,7 @@ __FBSDID("$FreeBSD$");
#include <sys/rman.h>
#include <sys/selinfo.h>
#include <sys/signalvar.h>
#include <sys/smp.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/uio.h>
@ -4110,6 +4111,23 @@ bus_generic_describe_intr(device_t dev, device_t child, struct resource *irq,
return (EINVAL);
}
/**
 * @brief Helper function for implementing BUS_GET_CPUS().
 *
 * Forwards the request, unchanged and still on behalf of @p child, to
 * the BUS_GET_CPUS() method of the parent of @p dev.
 *
 * @returns the parent's result, or EINVAL when @p dev has no parent
 */
int
bus_generic_get_cpus(device_t dev, device_t child, enum cpu_sets op,
    size_t setsize, cpuset_t *cpuset)
{
	device_t bus = dev->parent;

	/* Top of the hierarchy reached and nobody handled the request. */
	if (bus == NULL)
		return (EINVAL);
	return (BUS_GET_CPUS(bus, child, op, setsize, cpuset));
}
/**
* @brief Helper function for implementing BUS_GET_DMA_TAG().
*
@ -4619,6 +4637,23 @@ bus_child_location_str(device_t child, char *buf, size_t buflen)
return (BUS_CHILD_LOCATION_STR(parent, child, buf, buflen));
}
/**
 * @brief Wrapper function for BUS_GET_CPUS().
 *
 * Looks up the parent of @p dev and invokes its BUS_GET_CPUS() method
 * with @p dev as the child.
 *
 * @returns the method's result, or EINVAL when @p dev has no parent
 */
int
bus_get_cpus(device_t dev, enum cpu_sets op, size_t setsize, cpuset_t *cpuset)
{
	device_t bus;

	bus = device_get_parent(dev);
	return (bus != NULL ?
	    BUS_GET_CPUS(bus, dev, op, setsize, cpuset) : EINVAL);
}
/**
* @brief Wrapper function for BUS_GET_DMA_TAG().
*
@ -4711,6 +4746,23 @@ root_child_present(device_t dev, device_t child)
return (-1);
}
/*
 * BUS_GET_CPUS() implementation for the root bus.
 *
 * Only INTR_CPUS is answered here, and the answer is the set of all CPUs
 * (all_cpus).  Any other op, or a 'setsize' that is not sizeof(cpuset_t),
 * fails with EINVAL.
 */
static int
root_get_cpus(device_t dev, device_t child, enum cpu_sets op, size_t setsize,
    cpuset_t *cpuset)
{
	if (op != INTR_CPUS || setsize != sizeof(cpuset_t))
		return (EINVAL);

	/* Default to returning the set of all CPUs. */
	*cpuset = all_cpus;
	return (0);
}
static kobj_method_t root_methods[] = {
/* Device interface */
KOBJMETHOD(device_shutdown, bus_generic_shutdown),
@ -4723,6 +4775,7 @@ static kobj_method_t root_methods[] = {
KOBJMETHOD(bus_write_ivar, bus_generic_write_ivar),
KOBJMETHOD(bus_setup_intr, root_setup_intr),
KOBJMETHOD(bus_child_present, root_child_present),
KOBJMETHOD(bus_get_cpus, root_get_cpus),
KOBJMETHOD_END
};

View File

@ -272,6 +272,16 @@ enum intr_polarity {
INTR_POLARITY_LOW = 2
};
/**
 * Kinds of CPU set that can be requested through bus_get_cpus().  A
 * given device (or its parents) need not support every kind; an
 * unsupported request makes bus_get_cpus() fail with EINVAL.
 */
enum cpu_sets {
	LOCAL_CPUS = 0,		/* CPUs that are local to the device */
	INTR_CPUS = 1		/* preferred CPUs for device interrupts */
};
typedef int (*devop_t)(void);
/**
@ -388,6 +398,8 @@ int bus_generic_deactivate_resource(device_t dev, device_t child, int type,
int rid, struct resource *r);
int bus_generic_detach(device_t dev);
void bus_generic_driver_added(device_t dev, driver_t *driver);
int bus_generic_get_cpus(device_t dev, device_t child, enum cpu_sets op,
size_t setsize, struct _cpuset *cpuset);
bus_dma_tag_t
bus_generic_get_dma_tag(device_t dev, device_t child);
bus_space_tag_t
@ -457,6 +469,8 @@ int bus_activate_resource(device_t dev, int type, int rid,
struct resource *r);
int bus_deactivate_resource(device_t dev, int type, int rid,
struct resource *r);
int bus_get_cpus(device_t dev, enum cpu_sets op, size_t setsize,
struct _cpuset *cpuset);
bus_dma_tag_t bus_get_dma_tag(device_t dev);
bus_space_tag_t bus_get_bus_tag(device_t dev);
int bus_get_domain(device_t dev, int *domain);

View File

@ -490,7 +490,7 @@ DB_SHOW_COMMAND(irqs, db_show_irqs)
* allocate CPUs round-robin.
*/
static cpuset_t intr_cpus = CPUSET_T_INITIALIZER(0x1);
cpuset_t intr_cpus = CPUSET_T_INITIALIZER(0x1);
static int current_cpu;
/*

View File

@ -127,6 +127,8 @@ static int nexus_set_resource(device_t, device_t, int, int,
static int nexus_get_resource(device_t, device_t, int, int,
rman_res_t *, rman_res_t *);
static void nexus_delete_resource(device_t, device_t, int, int);
static int nexus_get_cpus(device_t, device_t, enum cpu_sets, size_t,
cpuset_t *);
#ifdef DEV_APIC
static int nexus_alloc_msi(device_t pcib, device_t dev, int count, int maxcount, int *irqs);
static int nexus_release_msi(device_t pcib, device_t dev, int count, int *irqs);
@ -163,6 +165,7 @@ static device_method_t nexus_methods[] = {
DEVMETHOD(bus_set_resource, nexus_set_resource),
DEVMETHOD(bus_get_resource, nexus_get_resource),
DEVMETHOD(bus_delete_resource, nexus_delete_resource),
DEVMETHOD(bus_get_cpus, nexus_get_cpus),
/* pcib interface */
#ifdef DEV_APIC
@ -619,6 +622,24 @@ nexus_delete_resource(device_t dev, device_t child, int type, int rid)
resource_list_delete(rl, type, rid);
}
/*
 * BUS_GET_CPUS() implementation for the x86 nexus.
 *
 * On SMP kernels an INTR_CPUS request is answered with a copy of the
 * interrupt code's intr_cpus set (EINVAL if 'setsize' is not
 * sizeof(cpuset_t)).  Every other request, and every request on non-SMP
 * kernels, is handed to bus_generic_get_cpus().
 */
static int
nexus_get_cpus(device_t dev, device_t child, enum cpu_sets op, size_t setsize,
    cpuset_t *cpuset)
{
#ifdef SMP
	if (op == INTR_CPUS) {
		if (setsize != sizeof(cpuset_t))
			return (EINVAL);
		*cpuset = intr_cpus;
		return (0);
	}
#endif
	return (bus_generic_get_cpus(dev, child, op, setsize, cpuset));
}
/* Called from the MSI code to add new IRQs to the IRQ rman. */
void
nexus_add_irq(u_long irq)