Message ID | 20230807205755.29579-7-brett.creeley@amd.com
---|---
State | New, archived
Series | pds-vfio-pci driver
On Mon, 7 Aug 2023 13:57:53 -0700 Brett Creeley <brett.creeley@amd.com> wrote: ... > +static int pds_vfio_dirty_enable(struct pds_vfio_pci_device *pds_vfio, > + struct rb_root_cached *ranges, u32 nnodes, > + u64 *page_size) > +{ > + struct pci_dev *pdev = pds_vfio->vfio_coredev.pdev; > + struct device *pdsc_dev = &pci_physfn(pdev)->dev; > + struct pds_vfio_dirty *dirty = &pds_vfio->dirty; > + u64 region_start, region_size, region_page_size; > + struct pds_lm_dirty_region_info *region_info; > + struct interval_tree_node *node = NULL; > + u8 max_regions = 0, num_regions; > + dma_addr_t regions_dma = 0; > + u32 num_ranges = nnodes; > + u32 page_count; > + u16 len; > + int err; > + > + dev_dbg(&pdev->dev, "vf%u: Start dirty page tracking\n", > + pds_vfio->vf_id); > + > + if (pds_vfio_dirty_is_enabled(pds_vfio)) > + return -EINVAL; > + > + /* find if dirty tracking is disabled, i.e. num_regions == 0 */ > + err = pds_vfio_dirty_status_cmd(pds_vfio, 0, &max_regions, > + &num_regions); > + if (err < 0) { > + dev_err(&pdev->dev, "Failed to get dirty status, err %pe\n", > + ERR_PTR(err)); > + return err; > + } else if (num_regions) { > + dev_err(&pdev->dev, > + "Dirty tracking already enabled for %d regions\n", > + num_regions); > + return -EEXIST; > + } else if (!max_regions) { > + dev_err(&pdev->dev, > + "Device doesn't support dirty tracking, max_regions %d\n", > + max_regions); > + return -EOPNOTSUPP; > + } > + > + /* > + * Only support 1 region for now. If there are any large gaps in the > + * VM's address regions, then this would be a waste of memory as we are > + * generating 2 bitmaps (ack/seq) from the min address to the max > + * address of the VM's address regions. In the future, if we support > + * more than one region in the device/driver we can split the bitmaps > + * on the largest address region gaps. We can do this split up to the > + * max_regions times returned from the dirty_status command. > + */ Isn't this a pretty unfortunately limitation given QEMU makes a 1TB hole on AMD hosts? Or maybe I misunderstand. https://gitlab.com/qemu-project/qemu/-/commit/8504f129450b909c88e199ca44facd35d38ba4de Thanks, Alex
On 8/8/2023 3:27 PM, Alex Williamson wrote: > Caution: This message originated from an External Source. Use proper caution when opening attachments, clicking links, or responding. > > > On Mon, 7 Aug 2023 13:57:53 -0700 > Brett Creeley <brett.creeley@amd.com> wrote: > ... >> +static int pds_vfio_dirty_enable(struct pds_vfio_pci_device *pds_vfio, >> + struct rb_root_cached *ranges, u32 nnodes, >> + u64 *page_size) >> +{ >> + struct pci_dev *pdev = pds_vfio->vfio_coredev.pdev; >> + struct device *pdsc_dev = &pci_physfn(pdev)->dev; >> + struct pds_vfio_dirty *dirty = &pds_vfio->dirty; >> + u64 region_start, region_size, region_page_size; >> + struct pds_lm_dirty_region_info *region_info; >> + struct interval_tree_node *node = NULL; >> + u8 max_regions = 0, num_regions; >> + dma_addr_t regions_dma = 0; >> + u32 num_ranges = nnodes; >> + u32 page_count; >> + u16 len; >> + int err; >> + >> + dev_dbg(&pdev->dev, "vf%u: Start dirty page tracking\n", >> + pds_vfio->vf_id); >> + >> + if (pds_vfio_dirty_is_enabled(pds_vfio)) >> + return -EINVAL; >> + >> + /* find if dirty tracking is disabled, i.e. num_regions == 0 */ >> + err = pds_vfio_dirty_status_cmd(pds_vfio, 0, &max_regions, >> + &num_regions); >> + if (err < 0) { >> + dev_err(&pdev->dev, "Failed to get dirty status, err %pe\n", >> + ERR_PTR(err)); >> + return err; >> + } else if (num_regions) { >> + dev_err(&pdev->dev, >> + "Dirty tracking already enabled for %d regions\n", >> + num_regions); >> + return -EEXIST; >> + } else if (!max_regions) { >> + dev_err(&pdev->dev, >> + "Device doesn't support dirty tracking, max_regions %d\n", >> + max_regions); >> + return -EOPNOTSUPP; >> + } >> + >> + /* >> + * Only support 1 region for now. If there are any large gaps in the >> + * VM's address regions, then this would be a waste of memory as we are >> + * generating 2 bitmaps (ack/seq) from the min address to the max >> + * address of the VM's address regions. In the future, if we support >> + * more than one region in the device/driver we can split the bitmaps >> + * on the largest address region gaps. We can do this split up to the >> + * max_regions times returned from the dirty_status command. >> + */ > > Isn't this a pretty unfortunately limitation given QEMU makes a 1TB > hole on AMD hosts? Or maybe I misunderstand. > > https://gitlab.com/qemu-project/qemu/-/commit/8504f129450b909c88e199ca44facd35d38ba4de > > Thanks, > Alex > Yes, this is currently an unfortunate limitation. However, our device is flexible enough to support >1 regions. There has been some work in this area, but we aren't quite there yet. The goal was to get this initial support accepted and submit follow on work to support >1 regions. Thanks, Brett
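To put a rough number on the limitation Brett acknowledges above: with a single region, the seq/ack bitmaps span from the lowest to the highest tracked address, holes included. A back-of-the-envelope sketch of the cost (the ~1 TiB span and 4 KiB tracking page size are illustrative assumptions, not values taken from the patch):

```c
#include <stdio.h>

int main(void)
{
	/* illustrative figures: a min-to-max span of ~1 TiB, tracked at an
	 * assumed 4 KiB page granularity */
	unsigned long long span = 1ULL << 40;
	unsigned long long page_size = 4096;
	unsigned long long pages = span / page_size;
	/* one bit per page, and the driver keeps two bitmaps (seq + ack) */
	unsigned long long bitmap_bytes = 2 * (pages / 8);

	printf("%llu pages -> %llu MiB of host bitmap memory\n",
	       pages, bitmap_bytes >> 20);
	return 0;
}
```

That works out to roughly 64 MiB of host memory for a single region spanning a 1 TiB hole, most of it tracking addresses the guest never maps, which is the waste the driver comment describes and the follow-on multi-region work would avoid.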
On Wed, 9 Aug 2023 08:44:44 -0700 Brett Creeley <bcreeley@amd.com> wrote: > On 8/8/2023 3:27 PM, Alex Williamson wrote: > > Caution: This message originated from an External Source. Use proper caution when opening attachments, clicking links, or responding. > > > > > > On Mon, 7 Aug 2023 13:57:53 -0700 > > Brett Creeley <brett.creeley@amd.com> wrote: > > ... > >> +static int pds_vfio_dirty_enable(struct pds_vfio_pci_device *pds_vfio, > >> + struct rb_root_cached *ranges, u32 nnodes, > >> + u64 *page_size) > >> +{ > >> + struct pci_dev *pdev = pds_vfio->vfio_coredev.pdev; > >> + struct device *pdsc_dev = &pci_physfn(pdev)->dev; > >> + struct pds_vfio_dirty *dirty = &pds_vfio->dirty; > >> + u64 region_start, region_size, region_page_size; > >> + struct pds_lm_dirty_region_info *region_info; > >> + struct interval_tree_node *node = NULL; > >> + u8 max_regions = 0, num_regions; > >> + dma_addr_t regions_dma = 0; > >> + u32 num_ranges = nnodes; > >> + u32 page_count; > >> + u16 len; > >> + int err; > >> + > >> + dev_dbg(&pdev->dev, "vf%u: Start dirty page tracking\n", > >> + pds_vfio->vf_id); > >> + > >> + if (pds_vfio_dirty_is_enabled(pds_vfio)) > >> + return -EINVAL; > >> + > >> + /* find if dirty tracking is disabled, i.e. num_regions == 0 */ > >> + err = pds_vfio_dirty_status_cmd(pds_vfio, 0, &max_regions, > >> + &num_regions); > >> + if (err < 0) { > >> + dev_err(&pdev->dev, "Failed to get dirty status, err %pe\n", > >> + ERR_PTR(err)); > >> + return err; > >> + } else if (num_regions) { > >> + dev_err(&pdev->dev, > >> + "Dirty tracking already enabled for %d regions\n", > >> + num_regions); > >> + return -EEXIST; > >> + } else if (!max_regions) { > >> + dev_err(&pdev->dev, > >> + "Device doesn't support dirty tracking, max_regions %d\n", > >> + max_regions); > >> + return -EOPNOTSUPP; > >> + } > >> + > >> + /* > >> + * Only support 1 region for now. If there are any large gaps in the > >> + * VM's address regions, then this would be a waste of memory as we are > >> + * generating 2 bitmaps (ack/seq) from the min address to the max > >> + * address of the VM's address regions. In the future, if we support > >> + * more than one region in the device/driver we can split the bitmaps > >> + * on the largest address region gaps. We can do this split up to the > >> + * max_regions times returned from the dirty_status command. > >> + */ > > > > Isn't this a pretty unfortunately limitation given QEMU makes a 1TB > > hole on AMD hosts? Or maybe I misunderstand. > > > > https://gitlab.com/qemu-project/qemu/-/commit/8504f129450b909c88e199ca44facd35d38ba4de > > > > Thanks, > > Alex > > > > Yes, this is currently an unfortunate limitation. However, our device is > flexible enough to support >1 regions. There has been some work in this > area, but we aren't quite there yet. The goal was to get this initial > support accepted and submit follow on work to support >1 regions. Ok, good that this is temporary. Shameer, Kevin, Jason, Yishai, I'm hoping one or more of you can approve this series as well. Thanks, Alex
On Wed, Aug 09, 2023 at 11:33:00AM -0600, Alex Williamson wrote: > Shameer, Kevin, Jason, Yishai, I'm hoping one or more of you can > approve this series as well. Thanks, I've looked at it a few times now; I think it is OK, aside from the NVMe issue. Jason
> From: Jason Gunthorpe <jgg@nvidia.com> > Sent: Thursday, August 10, 2023 2:06 AM > > On Wed, Aug 09, 2023 at 11:33:00AM -0600, Alex Williamson wrote: > > > Shameer, Kevin, Jason, Yishai, I'm hoping one or more of you can > > approve this series as well. Thanks, > > I've looked at it a few times now, I think it is OK, aside from the > nvme issue. > My only concern is the duplication of backing storage management of the migration file which I didn't take time to review. If all others are fine to leave it as is then I will not insist.
On Thu, 10 Aug 2023 02:47:15 +0000 "Tian, Kevin" <kevin.tian@intel.com> wrote: > > From: Jason Gunthorpe <jgg@nvidia.com> > > Sent: Thursday, August 10, 2023 2:06 AM > > > > On Wed, Aug 09, 2023 at 11:33:00AM -0600, Alex Williamson wrote: > > > > > Shameer, Kevin, Jason, Yishai, I'm hoping one or more of you can > > > approve this series as well. Thanks, > > > > I've looked at it a few times now, I think it is OK, aside from the > > nvme issue. > > > > My only concern is the duplication of backing storage management > of the migration file which I didn't take time to review. > > If all others are fine to leave it as is then I will not insist. There's leverage now if you feel strongly about it, but code consolidation could certainly come later. Are either of you willing to provide a R-b? What are we looking for relative to NVMe? AIUI, the first couple revisions of this series specified an NVMe device ID, then switched to a wildcard, then settled on an Ethernet device ID, all with no obvious changes that would suggest support is limited to a specific device type. I think we're therefore concerned that migration of an NVMe VF could be enabled by overriding/adding device IDs, whereas we'd like to standardize NVMe migration to avoid incompatible implementations. It's somewhat a strange requirement since we have no expectation of compatibility between vendors for any other device type, but how far are we going to take it? Is it enough that the device table here only includes the Ethernet VF ID or do we want to actively prevent what might be a trivial enabling of migration for another device type because we envision it happening through an industry standard that currently doesn't exist? Sorry if I'm not familiar with the dynamics of the NVMe working group or previous agreements. Thanks, Alex
On Thu, Aug 10, 2023 at 10:47:34AM -0600, Alex Williamson wrote: > On Thu, 10 Aug 2023 02:47:15 +0000 > "Tian, Kevin" <kevin.tian@intel.com> wrote: > > > > From: Jason Gunthorpe <jgg@nvidia.com> > > > Sent: Thursday, August 10, 2023 2:06 AM > > > > > > On Wed, Aug 09, 2023 at 11:33:00AM -0600, Alex Williamson wrote: > > > > > > > Shameer, Kevin, Jason, Yishai, I'm hoping one or more of you can > > > > approve this series as well. Thanks, > > > > > > I've looked at it a few times now, I think it is OK, aside from the > > > nvme issue. > > > > > > > My only concern is the duplication of backing storage management > > of the migration file which I didn't take time to review. > > > > If all others are fine to leave it as is then I will not insist. > > There's leverage now if you feel strongly about it, but code > consolidation could certainly come later. > > Are either of you willing to provide a R-b? The code structure is good enough (though I agree with Kevin), so sure: Reviewed-by: Jason Gunthorpe <jgg@nvidia.com> > What are we looking for relative to NVMe? AIUI, the first couple > revisions of this series specified an NVMe device ID, then switched to > a wildcard, then settled on an Ethernet device ID, all with no obvious > changes that would suggest support is limited to a specific device > type. I think we're therefore concerned that migration of an NVMe VF > could be enabled by overriding/adding device IDs, whereas we'd like to > standardize NVMe migration to avoid avoid incompatible implementations. Yeah > It's somewhat a strange requirement since we have no expectation of > compatibility between vendors for any other device type, but how far > are we going to take it? Is it enough that the device table here only > includes the Ethernet VF ID or do we want to actively prevent what > might be a trivial enabling of migration for another device type > because we envision it happening through an industry standard that > currently doesn't exist? Sorry if I'm not familiar with the dynamics > of the NVMe working group or previous agreements. Thanks, I don't really have a solid answer. Christoph and others in the NVMe space are very firm that NVMe related things must go through standards, I think that is their right. It does not seem good to allow undermining that approach. On the flip side, if we are going to allow this driver, why are we not letting them enable their full device functionality with all their non-compliant VF/PF combinations? They shouldn't have to hide what they are actually doing just to get merged. If we want to block anything it should be to block the PCI spec non-compliance of having PF/VF IDs that are different. Jason
On Thu, 10 Aug 2023 14:19:40 -0300 Jason Gunthorpe <jgg@nvidia.com> wrote: > On Thu, Aug 10, 2023 at 10:47:34AM -0600, Alex Williamson wrote: > > On Thu, 10 Aug 2023 02:47:15 +0000 > > "Tian, Kevin" <kevin.tian@intel.com> wrote: > > > > > > From: Jason Gunthorpe <jgg@nvidia.com> > > > > Sent: Thursday, August 10, 2023 2:06 AM > > > > > > > > On Wed, Aug 09, 2023 at 11:33:00AM -0600, Alex Williamson wrote: > > > > > > > > > Shameer, Kevin, Jason, Yishai, I'm hoping one or more of you can > > > > > approve this series as well. Thanks, > > > > > > > > I've looked at it a few times now, I think it is OK, aside from the > > > > nvme issue. > > > > > > > > > > My only concern is the duplication of backing storage management > > > of the migration file which I didn't take time to review. > > > > > > If all others are fine to leave it as is then I will not insist. > > > > There's leverage now if you feel strongly about it, but code > > consolidation could certainly come later. > > > > Are either of you willing to provide a R-b? > > The code structure is good enough (though I agree with Kevin), so sure: > > Reviewed-by: Jason Gunthorpe <jgg@nvidia.com> > > > What are we looking for relative to NVMe? AIUI, the first couple > > revisions of this series specified an NVMe device ID, then switched to > > a wildcard, then settled on an Ethernet device ID, all with no obvious > > changes that would suggest support is limited to a specific device > > type. I think we're therefore concerned that migration of an NVMe VF > > could be enabled by overriding/adding device IDs, whereas we'd like to > > standardize NVMe migration to avoid avoid incompatible implementations. > > Yeah > > > It's somewhat a strange requirement since we have no expectation of > > compatibility between vendors for any other device type, but how far > > are we going to take it? Is it enough that the device table here only > > includes the Ethernet VF ID or do we want to actively prevent what > > might be a trivial enabling of migration for another device type > > because we envision it happening through an industry standard that > > currently doesn't exist? Sorry if I'm not familiar with the dynamics > > of the NVMe working group or previous agreements. Thanks, > > I don't really have a solid answer. Christoph and others in the NVMe > space are very firm that NVMe related things must go through > standards, I think that is their right. > > It does not seem good to allow undermining that approach. If we wanted to enforce something like this the probe function could reject NVMe class devices, but... > On the flip side, if we are going to allow this driver, why are we not > letting them enable their full device functionality with all their > non-compliant VF/PF combinations? They shouldn't have to hide what > they are actually doing just to get merged. This. Is it enough that this appears to implement device type agnostic migration support for devices hosted by this distributed services card and NVMe happens to be one of those device types? Is that a high enough bar that this is not simply a vendor specific NVMe migration implementation? > If we want to block anything it should be to block the PCI spec > non-compliance of having PF/VF IDs that are different. PCI Express® Base Specification Revision 6.0.1, pg 1461: 9.3.3.11 VF Device ID (Offset 1Ah) This field contains the Device ID that should be presented for every VF to the SI. VF Device ID may be different from the PF Device ID... That? Thanks, Alex
On Thu, Aug 10, 2023 at 11:40:08AM -0600, Alex Williamson wrote: > PCI Express® Base Specification Revision 6.0.1, pg 1461: > > 9.3.3.11 VF Device ID (Offset 1Ah) > > This field contains the Device ID that should be presented for every VF to the SI. > > VF Device ID may be different from the PF Device ID... > > That? Thanks, NVMe matches using the class code, IIRC there is language requiring the class code to be the same. Jason
On Thu, 10 Aug 2023 14:43:04 -0300 Jason Gunthorpe <jgg@nvidia.com> wrote: > On Thu, Aug 10, 2023 at 11:40:08AM -0600, Alex Williamson wrote: > > > PCI Express® Base Specification Revision 6.0.1, pg 1461: > > > > 9.3.3.11 VF Device ID (Offset 1Ah) > > > > This field contains the Device ID that should be presented for every VF to the SI. > > > > VF Device ID may be different from the PF Device ID... > > > > That? Thanks, > > NVMe matches using the class code, IIRC there is language requiring > the class code to be the same. Ok, yes: 7.5.1.1.6 Class Code Register (Offset 09h) ... The field in a PF and its associated VFs must return the same value when read. Seems limiting, but it's indeed there. We've got a lot of cleanup to do if we're going to start rejecting drivers for devices with PCI spec violations though ;) Thanks, Alex
On Thu, Aug 10, 2023 at 11:54:44AM -0600, Alex Williamson wrote: > On Thu, 10 Aug 2023 14:43:04 -0300 > Jason Gunthorpe <jgg@nvidia.com> wrote: > > > On Thu, Aug 10, 2023 at 11:40:08AM -0600, Alex Williamson wrote: > > > > > PCI Express® Base Specification Revision 6.0.1, pg 1461: > > > > > > 9.3.3.11 VF Device ID (Offset 1Ah) > > > > > > This field contains the Device ID that should be presented for every VF to the SI. > > > > > > VF Device ID may be different from the PF Device ID... > > > > > > That? Thanks, > > > > NVMe matches using the class code, IIRC there is language requiring > > the class code to be the same. > > Ok, yes: > > 7.5.1.1.6 Class Code Register (Offset 09h) > ... > The field in a PF and its associated VFs must return the same value > when read. > > Seems limiting, but it's indeed there. We've got a lot of cleanup to > do if we're going to start rejecting drivers for devices with PCI > spec violations though ;) Thanks, Well.. If we defacto say that Linux is endorsing ignoring this part of the spec then I predict we will see more vendors follow this approach. Jason
> From: Jason Gunthorpe <jgg@nvidia.com> > Sent: Friday, August 11, 2023 2:12 AM > > On Thu, Aug 10, 2023 at 11:54:44AM -0600, Alex Williamson wrote: > > On Thu, 10 Aug 2023 14:43:04 -0300 > > Jason Gunthorpe <jgg@nvidia.com> wrote: > > > > > On Thu, Aug 10, 2023 at 11:40:08AM -0600, Alex Williamson wrote: > > > > > > > PCI Express® Base Specification Revision 6.0.1, pg 1461: > > > > > > > > 9.3.3.11 VF Device ID (Offset 1Ah) > > > > > > > > This field contains the Device ID that should be presented for every VF > to the SI. > > > > > > > > VF Device ID may be different from the PF Device ID... > > > > > > > > That? Thanks, > > > > > > NVMe matches using the class code, IIRC there is language requiring > > > the class code to be the same. > > > > Ok, yes: > > > > 7.5.1.1.6 Class Code Register (Offset 09h) > > ... > > The field in a PF and its associated VFs must return the same value > > when read. > > > > Seems limiting, but it's indeed there. We've got a lot of cleanup to > > do if we're going to start rejecting drivers for devices with PCI > > spec violations though ;) Thanks, > > Well.. If we defacto say that Linux is endorsing ignoring this part of > the spec then I predict we will see more vendors follow this approach. > Looks PCI core assumes the class code must be same across VFs (though not cross PF/VF). And it even violates the spec to require Revision ID and Subsystem ID must be same too: static void pci_read_vf_config_common(struct pci_dev *virtfn) { struct pci_dev *physfn = virtfn->physfn; /* * Some config registers are the same across all associated VFs. * Read them once from VF0 so we can skip reading them from the * other VFs. * * PCIe r4.0, sec 9.3.4.1, technically doesn't require all VFs to * have the same Revision ID and Subsystem ID, but we assume they * do. */ pci_read_config_dword(virtfn, PCI_CLASS_REVISION, &physfn->sriov->class); pci_read_config_byte(virtfn, PCI_HEADER_TYPE, &physfn->sriov->hdr_type); pci_read_config_word(virtfn, PCI_SUBSYSTEM_VENDOR_ID, &physfn->sriov->subsystem_vendor); pci_read_config_word(virtfn, PCI_SUBSYSTEM_ID, &physfn->sriov->subsystem_device); } Does AMD distributed card provide multiple PF's each for a class of VF's or a single PF for all VF's?
On Thu, 10 Aug 2023 15:11:43 -0300 Jason Gunthorpe <jgg@nvidia.com> wrote: > On Thu, Aug 10, 2023 at 11:54:44AM -0600, Alex Williamson wrote: > > On Thu, 10 Aug 2023 14:43:04 -0300 > > Jason Gunthorpe <jgg@nvidia.com> wrote: > > > > > On Thu, Aug 10, 2023 at 11:40:08AM -0600, Alex Williamson wrote: > > > > > > > PCI Express® Base Specification Revision 6.0.1, pg 1461: > > > > > > > > 9.3.3.11 VF Device ID (Offset 1Ah) > > > > > > > > This field contains the Device ID that should be presented for every VF to the SI. > > > > > > > > VF Device ID may be different from the PF Device ID... > > > > > > > > That? Thanks, > > > > > > NVMe matches using the class code, IIRC there is language requiring > > > the class code to be the same. > > > > Ok, yes: > > > > 7.5.1.1.6 Class Code Register (Offset 09h) > > ... > > The field in a PF and its associated VFs must return the same value > > when read. > > > > Seems limiting, but it's indeed there. We've got a lot of cleanup to > > do if we're going to start rejecting drivers for devices with PCI > > spec violations though ;) Thanks, > > Well.. If we defacto say that Linux is endorsing ignoring this part of > the spec then I predict we will see more vendors follow this approach. The NVMe driver will claim PCI_CLASS_STORAGE_EXPRESS devices, but there are also various vendor/device IDs in the table, some for the purpose of setting driver data with quirks, some not. So I think the spec compliant behavior here would be that the VF replicates the PF class code and we'd simply need to add the vendor/device explicitly to the id table. TBH, I can see why this spec requirement might get overlooked, it's a rather arbitrary restriction of the VF device. Thanks, Alex
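For readers following the id-table mechanics Alex describes, here is a minimal sketch of the two matching styles in question; the vendor/device numbers are placeholders, not the actual pds-vfio-pci or nvme table entries:

```c
#include <linux/module.h>
#include <linux/pci.h>

static const struct pci_device_id example_ids[] = {
	/* class-code match: claims anything advertising the NVMe class
	 * code, the style the NVMe core driver relies on */
	{ PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xffffff) },
	/* explicit vendor/device match: claims one specific function only
	 * (placeholder IDs for illustration) */
	{ PCI_DEVICE(0x1dd8, 0x1003) },
	{ }	/* terminating entry */
};
MODULE_DEVICE_TABLE(pci, example_ids);
```

A spec-compliant VF that keeps the PF's NVMe class code would fall under the first style of match, while a variant driver that only lists explicit IDs, as discussed here, stays scoped to the functions its vendor actually intends it to drive.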
On Thu, Aug 10, 2023 at 02:19:40PM -0300, Jason Gunthorpe wrote: > > It's somewhat a strange requirement since we have no expectation of > > compatibility between vendors for any other device type, but how far > > are we going to take it? Is it enough that the device table here only > > includes the Ethernet VF ID or do we want to actively prevent what > > might be a trivial enabling of migration for another device type > > because we envision it happening through an industry standard that > > currently doesn't exist? Sorry if I'm not familiar with the dynamics > > of the NVMe working group or previous agreements. Thanks, > > I don't really have a solid answer. Christoph and others in the NVMe > space are very firm that NVMe related things must go through > standards, I think that is their right. Yes, anything that uses a class code needs a standardized way of being managed. That is very different from say mlx5 which is obviously controlled by Mellanox. So I don't think any vfio driver except for the plain passthrough ones should bind anything but very specific PCI IDs. And AMD really needs to join the NVMe working group where the passthrough work is happening right now. If you need help finding the right persons at AMD to work with NVMe send me a mail offline, I can point you to them.
On 8/10/2023 8:25 PM, Tian, Kevin wrote: > Caution: This message originated from an External Source. Use proper caution when opening attachments, clicking links, or responding. > > >> From: Jason Gunthorpe <jgg@nvidia.com> >> Sent: Friday, August 11, 2023 2:12 AM >> >> On Thu, Aug 10, 2023 at 11:54:44AM -0600, Alex Williamson wrote: >>> On Thu, 10 Aug 2023 14:43:04 -0300 >>> Jason Gunthorpe <jgg@nvidia.com> wrote: >>> >>>> On Thu, Aug 10, 2023 at 11:40:08AM -0600, Alex Williamson wrote: >>>> >>>>> PCI Express® Base Specification Revision 6.0.1, pg 1461: >>>>> >>>>> 9.3.3.11 VF Device ID (Offset 1Ah) >>>>> >>>>> This field contains the Device ID that should be presented for every VF >> to the SI. >>>>> >>>>> VF Device ID may be different from the PF Device ID... >>>>> >>>>> That? Thanks, >>>> >>>> NVMe matches using the class code, IIRC there is language requiring >>>> the class code to be the same. >>> >>> Ok, yes: >>> >>> 7.5.1.1.6 Class Code Register (Offset 09h) >>> ... >>> The field in a PF and its associated VFs must return the same value >>> when read. >>> >>> Seems limiting, but it's indeed there. We've got a lot of cleanup to >>> do if we're going to start rejecting drivers for devices with PCI >>> spec violations though ;) Thanks, >> >> Well.. If we defacto say that Linux is endorsing ignoring this part of >> the spec then I predict we will see more vendors follow this approach. >> > > Looks PCI core assumes the class code must be same across VFs (though > not cross PF/VF). And it even violates the spec to require Revision ID > and Subsystem ID must be same too: > > static void pci_read_vf_config_common(struct pci_dev *virtfn) > { > struct pci_dev *physfn = virtfn->physfn; > > /* > * Some config registers are the same across all associated VFs. > * Read them once from VF0 so we can skip reading them from the > * other VFs. > * > * PCIe r4.0, sec 9.3.4.1, technically doesn't require all VFs to > * have the same Revision ID and Subsystem ID, but we assume they > * do. > */ > pci_read_config_dword(virtfn, PCI_CLASS_REVISION, > &physfn->sriov->class); > pci_read_config_byte(virtfn, PCI_HEADER_TYPE, > &physfn->sriov->hdr_type); > pci_read_config_word(virtfn, PCI_SUBSYSTEM_VENDOR_ID, > &physfn->sriov->subsystem_vendor); > pci_read_config_word(virtfn, PCI_SUBSYSTEM_ID, > &physfn->sriov->subsystem_device); > } > > Does AMD distributed card provide multiple PF's each for a class of > VF's or a single PF for all VF's? Hey Kevin, The AMD Pensando DSC provides multiple PFs for each class of VFs. All of our production devices will meet the assumptions of the pci core function above that all VFs match VF0 for those common fields. I've been out for a few days so apologies for the delayed response. Thanks, Brett
On 8/12/2023 3:49 AM, Christoph Hellwig wrote: > Caution: This message originated from an External Source. Use proper caution when opening attachments, clicking links, or responding. > > > On Thu, Aug 10, 2023 at 02:19:40PM -0300, Jason Gunthorpe wrote: >>> It's somewhat a strange requirement since we have no expectation of >>> compatibility between vendors for any other device type, but how far >>> are we going to take it? Is it enough that the device table here only >>> includes the Ethernet VF ID or do we want to actively prevent what >>> might be a trivial enabling of migration for another device type >>> because we envision it happening through an industry standard that >>> currently doesn't exist? Sorry if I'm not familiar with the dynamics >>> of the NVMe working group or previous agreements. Thanks, >> >> I don't really have a solid answer. Christoph and others in the NVMe >> space are very firm that NVMe related things must go through >> standards, I think that is their right. > > Yes, anything that uses a class code needs a standardized way of > being managed. That is very different from say mlx5 which is obviously > controlled by Mellanox. > > So I don't think any vfio driver except for the plain passthrough ones > should bind anything but very specific PCI IDs. > > And AMD really needs to join the NVMe working group where the passthrough > work is happening right now. If you need help finding the right persons > at AMD to work with NVMe send me a mail offline, I can point you to them. > Hi Christoph, We have folks at AMD participating in NVMe working groups and are aware of TPARs related to NVMe live migration. We’re checking to be sure they are up to speed on the discussions and will reach out to you if they need help getting further involved. As I mentioned in another response, I've been out for a few days so apologies for the delayed response. Thanks for your help, Brett
> From: Brett Creeley <bcreeley@amd.com> > Sent: Tuesday, August 15, 2023 2:42 AM > > On 8/10/2023 8:25 PM, Tian, Kevin wrote: > > Caution: This message originated from an External Source. Use proper > caution when opening attachments, clicking links, or responding. > > > > > >> From: Jason Gunthorpe <jgg@nvidia.com> > >> Sent: Friday, August 11, 2023 2:12 AM > >> > >> On Thu, Aug 10, 2023 at 11:54:44AM -0600, Alex Williamson wrote: > >>> On Thu, 10 Aug 2023 14:43:04 -0300 > >>> Jason Gunthorpe <jgg@nvidia.com> wrote: > >>> > >>>> On Thu, Aug 10, 2023 at 11:40:08AM -0600, Alex Williamson wrote: > >>>> > >>>>> PCI Express® Base Specification Revision 6.0.1, pg 1461: > >>>>> > >>>>> 9.3.3.11 VF Device ID (Offset 1Ah) > >>>>> > >>>>> This field contains the Device ID that should be presented for every > VF > >> to the SI. > >>>>> > >>>>> VF Device ID may be different from the PF Device ID... > >>>>> > >>>>> That? Thanks, > >>>> > >>>> NVMe matches using the class code, IIRC there is language requiring > >>>> the class code to be the same. > >>> > >>> Ok, yes: > >>> > >>> 7.5.1.1.6 Class Code Register (Offset 09h) > >>> ... > >>> The field in a PF and its associated VFs must return the same value > >>> when read. > >>> > >>> Seems limiting, but it's indeed there. We've got a lot of cleanup to > >>> do if we're going to start rejecting drivers for devices with PCI > >>> spec violations though ;) Thanks, > >> > >> Well.. If we defacto say that Linux is endorsing ignoring this part of > >> the spec then I predict we will see more vendors follow this approach. > >> > > > > Looks PCI core assumes the class code must be same across VFs (though > > not cross PF/VF). And it even violates the spec to require Revision ID > > and Subsystem ID must be same too: > > > > static void pci_read_vf_config_common(struct pci_dev *virtfn) > > { > > struct pci_dev *physfn = virtfn->physfn; > > > > /* > > * Some config registers are the same across all associated VFs. > > * Read them once from VF0 so we can skip reading them from the > > * other VFs. > > * > > * PCIe r4.0, sec 9.3.4.1, technically doesn't require all VFs to > > * have the same Revision ID and Subsystem ID, but we assume they > > * do. > > */ > > pci_read_config_dword(virtfn, PCI_CLASS_REVISION, > > &physfn->sriov->class); > > pci_read_config_byte(virtfn, PCI_HEADER_TYPE, > > &physfn->sriov->hdr_type); > > pci_read_config_word(virtfn, PCI_SUBSYSTEM_VENDOR_ID, > > &physfn->sriov->subsystem_vendor); > > pci_read_config_word(virtfn, PCI_SUBSYSTEM_ID, > > &physfn->sriov->subsystem_device); > > } > > > > Does AMD distributed card provide multiple PF's each for a class of > > VF's or a single PF for all VF's? > > Hey Kevin, > > The AMD Pensando DSC provides multiple PFs for each class of VFs. > All of our production devices will meet the assumptions of the pci core > function above that all VFs match VF0 for those common fields. > > I've been out for a few days so apologies for the delayed response. > Sounds good. No more open then.
diff --git a/drivers/vfio/pci/pds/Makefile b/drivers/vfio/pci/pds/Makefile index 47750bb31ea2..d5a06d81634f 100644 --- a/drivers/vfio/pci/pds/Makefile +++ b/drivers/vfio/pci/pds/Makefile @@ -5,6 +5,7 @@ obj-$(CONFIG_PDS_VFIO_PCI) += pds-vfio-pci.o pds-vfio-pci-y := \ cmds.o \ + dirty.o \ lm.o \ pci_drv.o \ vfio_dev.o diff --git a/drivers/vfio/pci/pds/cmds.c b/drivers/vfio/pci/pds/cmds.c index b1c5f103500f..b0d88442b091 100644 --- a/drivers/vfio/pci/pds/cmds.c +++ b/drivers/vfio/pci/pds/cmds.c @@ -382,3 +382,128 @@ void pds_vfio_send_host_vf_lm_status_cmd(struct pds_vfio_pci_device *pds_vfio, dev_warn(dev, "failed to send host VF migration status: %pe\n", ERR_PTR(err)); } + +int pds_vfio_dirty_status_cmd(struct pds_vfio_pci_device *pds_vfio, + u64 regions_dma, u8 *max_regions, u8 *num_regions) +{ + union pds_core_adminq_cmd cmd = { + .lm_dirty_status = { + .opcode = PDS_LM_CMD_DIRTY_STATUS, + .vf_id = cpu_to_le16(pds_vfio->vf_id), + }, + }; + struct device *dev = pds_vfio_to_dev(pds_vfio); + union pds_core_adminq_comp comp = {}; + int err; + + dev_dbg(dev, "vf%u: Dirty status\n", pds_vfio->vf_id); + + cmd.lm_dirty_status.regions_dma = cpu_to_le64(regions_dma); + cmd.lm_dirty_status.max_regions = *max_regions; + + err = pds_vfio_client_adminq_cmd(pds_vfio, &cmd, &comp, false); + if (err) { + dev_err(dev, "failed to get dirty status: %pe\n", ERR_PTR(err)); + return err; + } + + /* only support seq_ack approach for now */ + if (!(le32_to_cpu(comp.lm_dirty_status.bmp_type_mask) & + BIT(PDS_LM_DIRTY_BMP_TYPE_SEQ_ACK))) { + dev_err(dev, "Dirty bitmap tracking SEQ_ACK not supported\n"); + return -EOPNOTSUPP; + } + + *num_regions = comp.lm_dirty_status.num_regions; + *max_regions = comp.lm_dirty_status.max_regions; + + dev_dbg(dev, + "Page Tracking Status command successful, max_regions: %d, num_regions: %d, bmp_type: %s\n", + *max_regions, *num_regions, "PDS_LM_DIRTY_BMP_TYPE_SEQ_ACK"); + + return 0; +} + +int pds_vfio_dirty_enable_cmd(struct pds_vfio_pci_device *pds_vfio, + u64 regions_dma, u8 num_regions) +{ + union pds_core_adminq_cmd cmd = { + .lm_dirty_enable = { + .opcode = PDS_LM_CMD_DIRTY_ENABLE, + .vf_id = cpu_to_le16(pds_vfio->vf_id), + .regions_dma = cpu_to_le64(regions_dma), + .bmp_type = PDS_LM_DIRTY_BMP_TYPE_SEQ_ACK, + .num_regions = num_regions, + }, + }; + struct device *dev = pds_vfio_to_dev(pds_vfio); + union pds_core_adminq_comp comp = {}; + int err; + + err = pds_vfio_client_adminq_cmd(pds_vfio, &cmd, &comp, false); + if (err) { + dev_err(dev, "failed dirty tracking enable: %pe\n", + ERR_PTR(err)); + return err; + } + + return 0; +} + +int pds_vfio_dirty_disable_cmd(struct pds_vfio_pci_device *pds_vfio) +{ + union pds_core_adminq_cmd cmd = { + .lm_dirty_disable = { + .opcode = PDS_LM_CMD_DIRTY_DISABLE, + .vf_id = cpu_to_le16(pds_vfio->vf_id), + }, + }; + struct device *dev = pds_vfio_to_dev(pds_vfio); + union pds_core_adminq_comp comp = {}; + int err; + + err = pds_vfio_client_adminq_cmd(pds_vfio, &cmd, &comp, false); + if (err || comp.lm_dirty_status.num_regions != 0) { + /* in case num_regions is still non-zero after disable */ + err = err ? err : -EIO; + dev_err(dev, + "failed dirty tracking disable: %pe, num_regions %d\n", + ERR_PTR(err), comp.lm_dirty_status.num_regions); + return err; + } + + return 0; +} + +int pds_vfio_dirty_seq_ack_cmd(struct pds_vfio_pci_device *pds_vfio, + u64 sgl_dma, u16 num_sge, u32 offset, + u32 total_len, bool read_seq) +{ + const char *cmd_type_str = read_seq ? 
"read_seq" : "write_ack"; + union pds_core_adminq_cmd cmd = { + .lm_dirty_seq_ack = { + .vf_id = cpu_to_le16(pds_vfio->vf_id), + .len_bytes = cpu_to_le32(total_len), + .off_bytes = cpu_to_le32(offset), + .sgl_addr = cpu_to_le64(sgl_dma), + .num_sge = cpu_to_le16(num_sge), + }, + }; + struct device *dev = pds_vfio_to_dev(pds_vfio); + union pds_core_adminq_comp comp = {}; + int err; + + if (read_seq) + cmd.lm_dirty_seq_ack.opcode = PDS_LM_CMD_DIRTY_READ_SEQ; + else + cmd.lm_dirty_seq_ack.opcode = PDS_LM_CMD_DIRTY_WRITE_ACK; + + err = pds_vfio_client_adminq_cmd(pds_vfio, &cmd, &comp, false); + if (err) { + dev_err(dev, "failed cmd Page Tracking %s: %pe\n", cmd_type_str, + ERR_PTR(err)); + return err; + } + + return 0; +} diff --git a/drivers/vfio/pci/pds/cmds.h b/drivers/vfio/pci/pds/cmds.h index a0ec169f18a1..95221100b954 100644 --- a/drivers/vfio/pci/pds/cmds.h +++ b/drivers/vfio/pci/pds/cmds.h @@ -13,4 +13,13 @@ int pds_vfio_get_lm_state_cmd(struct pds_vfio_pci_device *pds_vfio); int pds_vfio_set_lm_state_cmd(struct pds_vfio_pci_device *pds_vfio); void pds_vfio_send_host_vf_lm_status_cmd(struct pds_vfio_pci_device *pds_vfio, enum pds_lm_host_vf_status vf_status); +int pds_vfio_dirty_status_cmd(struct pds_vfio_pci_device *pds_vfio, + u64 regions_dma, u8 *max_regions, + u8 *num_regions); +int pds_vfio_dirty_enable_cmd(struct pds_vfio_pci_device *pds_vfio, + u64 regions_dma, u8 num_regions); +int pds_vfio_dirty_disable_cmd(struct pds_vfio_pci_device *pds_vfio); +int pds_vfio_dirty_seq_ack_cmd(struct pds_vfio_pci_device *pds_vfio, + u64 sgl_dma, u16 num_sge, u32 offset, + u32 total_len, bool read_seq); #endif /* _CMDS_H_ */ diff --git a/drivers/vfio/pci/pds/dirty.c b/drivers/vfio/pci/pds/dirty.c new file mode 100644 index 000000000000..c937aa6f3954 --- /dev/null +++ b/drivers/vfio/pci/pds/dirty.c @@ -0,0 +1,564 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright(c) 2023 Advanced Micro Devices, Inc. 
*/ + +#include <linux/interval_tree.h> +#include <linux/vfio.h> + +#include <linux/pds/pds_common.h> +#include <linux/pds/pds_core_if.h> +#include <linux/pds/pds_adminq.h> + +#include "vfio_dev.h" +#include "cmds.h" +#include "dirty.h" + +#define READ_SEQ true +#define WRITE_ACK false + +bool pds_vfio_dirty_is_enabled(struct pds_vfio_pci_device *pds_vfio) +{ + return pds_vfio->dirty.is_enabled; +} + +void pds_vfio_dirty_set_enabled(struct pds_vfio_pci_device *pds_vfio) +{ + pds_vfio->dirty.is_enabled = true; +} + +void pds_vfio_dirty_set_disabled(struct pds_vfio_pci_device *pds_vfio) +{ + pds_vfio->dirty.is_enabled = false; +} + +static void +pds_vfio_print_guest_region_info(struct pds_vfio_pci_device *pds_vfio, + u8 max_regions) +{ + int len = max_regions * sizeof(struct pds_lm_dirty_region_info); + struct pci_dev *pdev = pds_vfio->vfio_coredev.pdev; + struct device *pdsc_dev = &pci_physfn(pdev)->dev; + struct pds_lm_dirty_region_info *region_info; + dma_addr_t regions_dma; + u8 num_regions; + int err; + + region_info = kcalloc(max_regions, + sizeof(struct pds_lm_dirty_region_info), + GFP_KERNEL); + if (!region_info) + return; + + regions_dma = + dma_map_single(pdsc_dev, region_info, len, DMA_FROM_DEVICE); + if (dma_mapping_error(pdsc_dev, regions_dma)) + goto out_free_region_info; + + err = pds_vfio_dirty_status_cmd(pds_vfio, regions_dma, &max_regions, + &num_regions); + dma_unmap_single(pdsc_dev, regions_dma, len, DMA_FROM_DEVICE); + if (err) + goto out_free_region_info; + + for (unsigned int i = 0; i < num_regions; i++) + dev_dbg(&pdev->dev, + "region_info[%d]: dma_base 0x%llx page_count %u page_size_log2 %u\n", + i, le64_to_cpu(region_info[i].dma_base), + le32_to_cpu(region_info[i].page_count), + region_info[i].page_size_log2); + +out_free_region_info: + kfree(region_info); +} + +static int pds_vfio_dirty_alloc_bitmaps(struct pds_vfio_dirty *dirty, + unsigned long bytes) +{ + unsigned long *host_seq_bmp, *host_ack_bmp; + + host_seq_bmp = vzalloc(bytes); + if (!host_seq_bmp) + return -ENOMEM; + + host_ack_bmp = vzalloc(bytes); + if (!host_ack_bmp) { + bitmap_free(host_seq_bmp); + return -ENOMEM; + } + + dirty->host_seq.bmp = host_seq_bmp; + dirty->host_ack.bmp = host_ack_bmp; + + return 0; +} + +static void pds_vfio_dirty_free_bitmaps(struct pds_vfio_dirty *dirty) +{ + vfree(dirty->host_seq.bmp); + vfree(dirty->host_ack.bmp); + dirty->host_seq.bmp = NULL; + dirty->host_ack.bmp = NULL; +} + +static void __pds_vfio_dirty_free_sgl(struct pds_vfio_pci_device *pds_vfio, + struct pds_vfio_bmp_info *bmp_info) +{ + struct pci_dev *pdev = pds_vfio->vfio_coredev.pdev; + struct device *pdsc_dev = &pci_physfn(pdev)->dev; + + dma_unmap_single(pdsc_dev, bmp_info->sgl_addr, + bmp_info->num_sge * sizeof(struct pds_lm_sg_elem), + DMA_BIDIRECTIONAL); + kfree(bmp_info->sgl); + + bmp_info->num_sge = 0; + bmp_info->sgl = NULL; + bmp_info->sgl_addr = 0; +} + +static void pds_vfio_dirty_free_sgl(struct pds_vfio_pci_device *pds_vfio) +{ + if (pds_vfio->dirty.host_seq.sgl) + __pds_vfio_dirty_free_sgl(pds_vfio, &pds_vfio->dirty.host_seq); + if (pds_vfio->dirty.host_ack.sgl) + __pds_vfio_dirty_free_sgl(pds_vfio, &pds_vfio->dirty.host_ack); +} + +static int __pds_vfio_dirty_alloc_sgl(struct pds_vfio_pci_device *pds_vfio, + struct pds_vfio_bmp_info *bmp_info, + u32 page_count) +{ + struct pci_dev *pdev = pds_vfio->vfio_coredev.pdev; + struct device *pdsc_dev = &pci_physfn(pdev)->dev; + struct pds_lm_sg_elem *sgl; + dma_addr_t sgl_addr; + size_t sgl_size; + u32 max_sge; + + max_sge = DIV_ROUND_UP(page_count, 
PAGE_SIZE * 8); + sgl_size = max_sge * sizeof(struct pds_lm_sg_elem); + + sgl = kzalloc(sgl_size, GFP_KERNEL); + if (!sgl) + return -ENOMEM; + + sgl_addr = dma_map_single(pdsc_dev, sgl, sgl_size, DMA_BIDIRECTIONAL); + if (dma_mapping_error(pdsc_dev, sgl_addr)) { + kfree(sgl); + return -EIO; + } + + bmp_info->sgl = sgl; + bmp_info->num_sge = max_sge; + bmp_info->sgl_addr = sgl_addr; + + return 0; +} + +static int pds_vfio_dirty_alloc_sgl(struct pds_vfio_pci_device *pds_vfio, + u32 page_count) +{ + struct pds_vfio_dirty *dirty = &pds_vfio->dirty; + int err; + + err = __pds_vfio_dirty_alloc_sgl(pds_vfio, &dirty->host_seq, + page_count); + if (err) + return err; + + err = __pds_vfio_dirty_alloc_sgl(pds_vfio, &dirty->host_ack, + page_count); + if (err) { + __pds_vfio_dirty_free_sgl(pds_vfio, &dirty->host_seq); + return err; + } + + return 0; +} + +static int pds_vfio_dirty_enable(struct pds_vfio_pci_device *pds_vfio, + struct rb_root_cached *ranges, u32 nnodes, + u64 *page_size) +{ + struct pci_dev *pdev = pds_vfio->vfio_coredev.pdev; + struct device *pdsc_dev = &pci_physfn(pdev)->dev; + struct pds_vfio_dirty *dirty = &pds_vfio->dirty; + u64 region_start, region_size, region_page_size; + struct pds_lm_dirty_region_info *region_info; + struct interval_tree_node *node = NULL; + u8 max_regions = 0, num_regions; + dma_addr_t regions_dma = 0; + u32 num_ranges = nnodes; + u32 page_count; + u16 len; + int err; + + dev_dbg(&pdev->dev, "vf%u: Start dirty page tracking\n", + pds_vfio->vf_id); + + if (pds_vfio_dirty_is_enabled(pds_vfio)) + return -EINVAL; + + /* find if dirty tracking is disabled, i.e. num_regions == 0 */ + err = pds_vfio_dirty_status_cmd(pds_vfio, 0, &max_regions, + &num_regions); + if (err < 0) { + dev_err(&pdev->dev, "Failed to get dirty status, err %pe\n", + ERR_PTR(err)); + return err; + } else if (num_regions) { + dev_err(&pdev->dev, + "Dirty tracking already enabled for %d regions\n", + num_regions); + return -EEXIST; + } else if (!max_regions) { + dev_err(&pdev->dev, + "Device doesn't support dirty tracking, max_regions %d\n", + max_regions); + return -EOPNOTSUPP; + } + + /* + * Only support 1 region for now. If there are any large gaps in the + * VM's address regions, then this would be a waste of memory as we are + * generating 2 bitmaps (ack/seq) from the min address to the max + * address of the VM's address regions. In the future, if we support + * more than one region in the device/driver we can split the bitmaps + * on the largest address region gaps. We can do this split up to the + * max_regions times returned from the dirty_status command. 
+ */ + max_regions = 1; + if (num_ranges > max_regions) { + vfio_combine_iova_ranges(ranges, nnodes, max_regions); + num_ranges = max_regions; + } + + node = interval_tree_iter_first(ranges, 0, ULONG_MAX); + if (!node) + return -EINVAL; + + region_size = node->last - node->start + 1; + region_start = node->start; + region_page_size = *page_size; + + len = sizeof(*region_info); + region_info = kzalloc(len, GFP_KERNEL); + if (!region_info) + return -ENOMEM; + + page_count = DIV_ROUND_UP(region_size, region_page_size); + + region_info->dma_base = cpu_to_le64(region_start); + region_info->page_count = cpu_to_le32(page_count); + region_info->page_size_log2 = ilog2(region_page_size); + + regions_dma = dma_map_single(pdsc_dev, (void *)region_info, len, + DMA_BIDIRECTIONAL); + if (dma_mapping_error(pdsc_dev, regions_dma)) { + err = -ENOMEM; + goto out_free_region_info; + } + + err = pds_vfio_dirty_enable_cmd(pds_vfio, regions_dma, max_regions); + dma_unmap_single(pdsc_dev, regions_dma, len, DMA_BIDIRECTIONAL); + if (err) + goto out_free_region_info; + + /* + * page_count might be adjusted by the device, + * update it before freeing region_info DMA + */ + page_count = le32_to_cpu(region_info->page_count); + + dev_dbg(&pdev->dev, + "region_info: regions_dma 0x%llx dma_base 0x%llx page_count %u page_size_log2 %u\n", + regions_dma, region_start, page_count, + (u8)ilog2(region_page_size)); + + err = pds_vfio_dirty_alloc_bitmaps(dirty, page_count / BITS_PER_BYTE); + if (err) { + dev_err(&pdev->dev, "Failed to alloc dirty bitmaps: %pe\n", + ERR_PTR(err)); + goto out_free_region_info; + } + + err = pds_vfio_dirty_alloc_sgl(pds_vfio, page_count); + if (err) { + dev_err(&pdev->dev, "Failed to alloc dirty sg lists: %pe\n", + ERR_PTR(err)); + goto out_free_bitmaps; + } + + dirty->region_start = region_start; + dirty->region_size = region_size; + dirty->region_page_size = region_page_size; + pds_vfio_dirty_set_enabled(pds_vfio); + + pds_vfio_print_guest_region_info(pds_vfio, max_regions); + + kfree(region_info); + + return 0; + +out_free_bitmaps: + pds_vfio_dirty_free_bitmaps(dirty); +out_free_region_info: + kfree(region_info); + return err; +} + +void pds_vfio_dirty_disable(struct pds_vfio_pci_device *pds_vfio, bool send_cmd) +{ + if (pds_vfio_dirty_is_enabled(pds_vfio)) { + pds_vfio_dirty_set_disabled(pds_vfio); + if (send_cmd) + pds_vfio_dirty_disable_cmd(pds_vfio); + pds_vfio_dirty_free_sgl(pds_vfio); + pds_vfio_dirty_free_bitmaps(&pds_vfio->dirty); + } + + if (send_cmd) + pds_vfio_send_host_vf_lm_status_cmd(pds_vfio, PDS_LM_STA_NONE); +} + +static int pds_vfio_dirty_seq_ack(struct pds_vfio_pci_device *pds_vfio, + struct pds_vfio_bmp_info *bmp_info, + u32 offset, u32 bmp_bytes, bool read_seq) +{ + const char *bmp_type_str = read_seq ? "read_seq" : "write_ack"; + u8 dma_dir = read_seq ? 
DMA_FROM_DEVICE : DMA_TO_DEVICE; + struct pci_dev *pdev = pds_vfio->vfio_coredev.pdev; + struct device *pdsc_dev = &pci_physfn(pdev)->dev; + unsigned long long npages; + struct sg_table sg_table; + struct scatterlist *sg; + struct page **pages; + u32 page_offset; + const void *bmp; + size_t size; + u16 num_sge; + int err; + int i; + + bmp = (void *)((u64)bmp_info->bmp + offset); + page_offset = offset_in_page(bmp); + bmp -= page_offset; + + /* + * Start and end of bitmap section to seq/ack might not be page + * aligned, so use the page_offset to account for that so there + * will be enough pages to represent the bmp_bytes + */ + npages = DIV_ROUND_UP_ULL(bmp_bytes + page_offset, PAGE_SIZE); + pages = kmalloc_array(npages, sizeof(*pages), GFP_KERNEL); + if (!pages) + return -ENOMEM; + + for (unsigned long long i = 0; i < npages; i++) { + struct page *page = vmalloc_to_page(bmp); + + if (!page) { + err = -EFAULT; + goto out_free_pages; + } + + pages[i] = page; + bmp += PAGE_SIZE; + } + + err = sg_alloc_table_from_pages(&sg_table, pages, npages, page_offset, + bmp_bytes, GFP_KERNEL); + if (err) + goto out_free_pages; + + err = dma_map_sgtable(pdsc_dev, &sg_table, dma_dir, 0); + if (err) + goto out_free_sg_table; + + for_each_sgtable_dma_sg(&sg_table, sg, i) { + struct pds_lm_sg_elem *sg_elem = &bmp_info->sgl[i]; + + sg_elem->addr = cpu_to_le64(sg_dma_address(sg)); + sg_elem->len = cpu_to_le32(sg_dma_len(sg)); + } + + num_sge = sg_table.nents; + size = num_sge * sizeof(struct pds_lm_sg_elem); + dma_sync_single_for_device(pdsc_dev, bmp_info->sgl_addr, size, dma_dir); + err = pds_vfio_dirty_seq_ack_cmd(pds_vfio, bmp_info->sgl_addr, num_sge, + offset, bmp_bytes, read_seq); + if (err) + dev_err(&pdev->dev, + "Dirty bitmap %s failed offset %u bmp_bytes %u num_sge %u DMA 0x%llx: %pe\n", + bmp_type_str, offset, bmp_bytes, + num_sge, bmp_info->sgl_addr, ERR_PTR(err)); + dma_sync_single_for_cpu(pdsc_dev, bmp_info->sgl_addr, size, dma_dir); + + dma_unmap_sgtable(pdsc_dev, &sg_table, dma_dir, 0); +out_free_sg_table: + sg_free_table(&sg_table); +out_free_pages: + kfree(pages); + + return err; +} + +static int pds_vfio_dirty_write_ack(struct pds_vfio_pci_device *pds_vfio, + u32 offset, u32 len) +{ + return pds_vfio_dirty_seq_ack(pds_vfio, &pds_vfio->dirty.host_ack, + offset, len, WRITE_ACK); +} + +static int pds_vfio_dirty_read_seq(struct pds_vfio_pci_device *pds_vfio, + u32 offset, u32 len) +{ + return pds_vfio_dirty_seq_ack(pds_vfio, &pds_vfio->dirty.host_seq, + offset, len, READ_SEQ); +} + +static int pds_vfio_dirty_process_bitmaps(struct pds_vfio_pci_device *pds_vfio, + struct iova_bitmap *dirty_bitmap, + u32 bmp_offset, u32 len_bytes) +{ + u64 page_size = pds_vfio->dirty.region_page_size; + u64 region_start = pds_vfio->dirty.region_start; + u32 bmp_offset_bit; + __le64 *seq, *ack; + int dword_count; + + dword_count = len_bytes / sizeof(u64); + seq = (__le64 *)((u64)pds_vfio->dirty.host_seq.bmp + bmp_offset); + ack = (__le64 *)((u64)pds_vfio->dirty.host_ack.bmp + bmp_offset); + bmp_offset_bit = bmp_offset * 8; + + for (int i = 0; i < dword_count; i++) { + u64 xor = le64_to_cpu(seq[i]) ^ le64_to_cpu(ack[i]); + + /* prepare for next write_ack call */ + ack[i] = seq[i]; + + for (u8 bit_i = 0; bit_i < BITS_PER_TYPE(u64); ++bit_i) { + if (xor & BIT(bit_i)) { + u64 abs_bit_i = bmp_offset_bit + + i * BITS_PER_TYPE(u64) + bit_i; + u64 addr = abs_bit_i * page_size + region_start; + + iova_bitmap_set(dirty_bitmap, addr, page_size); + } + } + } + + return 0; +} + +static int pds_vfio_dirty_sync(struct 
pds_vfio_pci_device *pds_vfio,
+                               struct iova_bitmap *dirty_bitmap,
+                               unsigned long iova, unsigned long length)
+{
+        struct device *dev = &pds_vfio->vfio_coredev.pdev->dev;
+        struct pds_vfio_dirty *dirty = &pds_vfio->dirty;
+        u64 bmp_offset, bmp_bytes;
+        u64 bitmap_size, pages;
+        int err;
+
+        dev_dbg(dev, "vf%u: Get dirty page bitmap\n", pds_vfio->vf_id);
+
+        if (!pds_vfio_dirty_is_enabled(pds_vfio)) {
+                dev_err(dev, "vf%u: Sync failed, dirty tracking is disabled\n",
+                        pds_vfio->vf_id);
+                return -EINVAL;
+        }
+
+        pages = DIV_ROUND_UP(length, pds_vfio->dirty.region_page_size);
+        bitmap_size =
+                round_up(pages, sizeof(u64) * BITS_PER_BYTE) / BITS_PER_BYTE;
+
+        dev_dbg(dev,
+                "vf%u: iova 0x%lx length %lu page_size %llu pages %llu bitmap_size %llu\n",
+                pds_vfio->vf_id, iova, length, pds_vfio->dirty.region_page_size,
+                pages, bitmap_size);
+
+        if (!length || ((dirty->region_start + iova + length) >
+                        (dirty->region_start + dirty->region_size))) {
+                dev_err(dev, "Invalid iova 0x%lx and/or length 0x%lx to sync\n",
+                        iova, length);
+                return -EINVAL;
+        }
+
+        /* bitmap is modified in 64 bit chunks */
+        bmp_bytes = ALIGN(DIV_ROUND_UP(length / dirty->region_page_size,
+                                       sizeof(u64)),
+                          sizeof(u64));
+        if (bmp_bytes != bitmap_size) {
+                dev_err(dev,
+                        "Calculated bitmap bytes %llu not equal to bitmap size %llu\n",
+                        bmp_bytes, bitmap_size);
+                return -EINVAL;
+        }
+
+        bmp_offset = DIV_ROUND_UP(iova / dirty->region_page_size, sizeof(u64));
+
+        dev_dbg(dev,
+                "Syncing dirty bitmap, iova 0x%lx length 0x%lx, bmp_offset %llu bmp_bytes %llu\n",
+                iova, length, bmp_offset, bmp_bytes);
+
+        err = pds_vfio_dirty_read_seq(pds_vfio, bmp_offset, bmp_bytes);
+        if (err)
+                return err;
+
+        err = pds_vfio_dirty_process_bitmaps(pds_vfio, dirty_bitmap, bmp_offset,
+                                             bmp_bytes);
+        if (err)
+                return err;
+
+        err = pds_vfio_dirty_write_ack(pds_vfio, bmp_offset, bmp_bytes);
+        if (err)
+                return err;
+
+        return 0;
+}
+
+int pds_vfio_dma_logging_report(struct vfio_device *vdev, unsigned long iova,
+                                unsigned long length, struct iova_bitmap *dirty)
+{
+        struct pds_vfio_pci_device *pds_vfio =
+                container_of(vdev, struct pds_vfio_pci_device,
+                             vfio_coredev.vdev);
+        int err;
+
+        mutex_lock(&pds_vfio->state_mutex);
+        err = pds_vfio_dirty_sync(pds_vfio, dirty, iova, length);
+        pds_vfio_state_mutex_unlock(pds_vfio);
+
+        return err;
+}
+
+int pds_vfio_dma_logging_start(struct vfio_device *vdev,
+                               struct rb_root_cached *ranges, u32 nnodes,
+                               u64 *page_size)
+{
+        struct pds_vfio_pci_device *pds_vfio =
+                container_of(vdev, struct pds_vfio_pci_device,
+                             vfio_coredev.vdev);
+        int err;
+
+        mutex_lock(&pds_vfio->state_mutex);
+        pds_vfio_send_host_vf_lm_status_cmd(pds_vfio, PDS_LM_STA_IN_PROGRESS);
+        err = pds_vfio_dirty_enable(pds_vfio, ranges, nnodes, page_size);
+        pds_vfio_state_mutex_unlock(pds_vfio);
+
+        return err;
+}
+
+int pds_vfio_dma_logging_stop(struct vfio_device *vdev)
+{
+        struct pds_vfio_pci_device *pds_vfio =
+                container_of(vdev, struct pds_vfio_pci_device,
+                             vfio_coredev.vdev);
+
+        mutex_lock(&pds_vfio->state_mutex);
+        pds_vfio_dirty_disable(pds_vfio, true);
+        pds_vfio_state_mutex_unlock(pds_vfio);
+
+        return 0;
+}
diff --git a/drivers/vfio/pci/pds/dirty.h b/drivers/vfio/pci/pds/dirty.h
new file mode 100644
index 000000000000..f78da25d75ca
--- /dev/null
+++ b/drivers/vfio/pci/pds/dirty.h
@@ -0,0 +1,39 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright(c) 2023 Advanced Micro Devices, Inc. */
+
+#ifndef _DIRTY_H_
+#define _DIRTY_H_
+
+struct pds_vfio_bmp_info {
+        unsigned long *bmp;
+        u32 bmp_bytes;
+        struct pds_lm_sg_elem *sgl;
+        dma_addr_t sgl_addr;
+        u16 num_sge;
+};
+
+struct pds_vfio_dirty {
+        struct pds_vfio_bmp_info host_seq;
+        struct pds_vfio_bmp_info host_ack;
+        u64 region_size;
+        u64 region_start;
+        u64 region_page_size;
+        bool is_enabled;
+};
+
+struct pds_vfio_pci_device;
+
+bool pds_vfio_dirty_is_enabled(struct pds_vfio_pci_device *pds_vfio);
+void pds_vfio_dirty_set_enabled(struct pds_vfio_pci_device *pds_vfio);
+void pds_vfio_dirty_set_disabled(struct pds_vfio_pci_device *pds_vfio);
+void pds_vfio_dirty_disable(struct pds_vfio_pci_device *pds_vfio,
+                            bool send_cmd);
+
+int pds_vfio_dma_logging_report(struct vfio_device *vdev, unsigned long iova,
+                                unsigned long length,
+                                struct iova_bitmap *dirty);
+int pds_vfio_dma_logging_start(struct vfio_device *vdev,
+                               struct rb_root_cached *ranges, u32 nnodes,
+                               u64 *page_size);
+int pds_vfio_dma_logging_stop(struct vfio_device *vdev);
+#endif /* _DIRTY_H_ */
diff --git a/drivers/vfio/pci/pds/lm.c b/drivers/vfio/pci/pds/lm.c
index 7e319529cf74..aec75574cab3 100644
--- a/drivers/vfio/pci/pds/lm.c
+++ b/drivers/vfio/pci/pds/lm.c
@@ -371,7 +371,7 @@ pds_vfio_step_device_state_locked(struct pds_vfio_pci_device *pds_vfio,
 	if (cur == VFIO_DEVICE_STATE_STOP_COPY && next == VFIO_DEVICE_STATE_STOP) {
 		pds_vfio_put_save_file(pds_vfio);
-		pds_vfio_send_host_vf_lm_status_cmd(pds_vfio, PDS_LM_STA_NONE);
+		pds_vfio_dirty_disable(pds_vfio, true);
 		return NULL;
 	}

diff --git a/drivers/vfio/pci/pds/vfio_dev.c b/drivers/vfio/pci/pds/vfio_dev.c
index b37ef96a7fd8..9e6a96b5db62 100644
--- a/drivers/vfio/pci/pds/vfio_dev.c
+++ b/drivers/vfio/pci/pds/vfio_dev.c
@@ -5,6 +5,7 @@
 #include <linux/vfio_pci_core.h>

 #include "lm.h"
+#include "dirty.h"
 #include "vfio_dev.h"

 struct pci_dev *pds_vfio_to_pci_dev(struct pds_vfio_pci_device *pds_vfio)
@@ -25,7 +26,7 @@ struct pds_vfio_pci_device *pds_vfio_pci_drvdata(struct pci_dev *pdev)
 			    vfio_coredev);
 }

-static void pds_vfio_state_mutex_unlock(struct pds_vfio_pci_device *pds_vfio)
+void pds_vfio_state_mutex_unlock(struct pds_vfio_pci_device *pds_vfio)
 {
 again:
 	spin_lock(&pds_vfio->reset_lock);
@@ -35,6 +36,7 @@ static void pds_vfio_state_mutex_unlock(struct pds_vfio_pci_device *pds_vfio)
 		pds_vfio->state = VFIO_DEVICE_STATE_RUNNING;
 		pds_vfio_put_restore_file(pds_vfio);
 		pds_vfio_put_save_file(pds_vfio);
+		pds_vfio_dirty_disable(pds_vfio, false);
 	}
 	spin_unlock(&pds_vfio->reset_lock);
 	goto again;
@@ -117,6 +119,12 @@ static const struct vfio_migration_ops pds_vfio_lm_ops = {
 	.migration_get_data_size = pds_vfio_get_device_state_size
 };

+static const struct vfio_log_ops pds_vfio_log_ops = {
+	.log_start = pds_vfio_dma_logging_start,
+	.log_stop = pds_vfio_dma_logging_stop,
+	.log_read_and_clear = pds_vfio_dma_logging_report,
+};
+
 static int pds_vfio_init_device(struct vfio_device *vdev)
 {
 	struct pds_vfio_pci_device *pds_vfio =
@@ -137,6 +145,7 @@ static int pds_vfio_init_device(struct vfio_device *vdev)
 	vdev->migration_flags = VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_P2P;
 	vdev->mig_ops = &pds_vfio_lm_ops;
+	vdev->log_ops = &pds_vfio_log_ops;

 	pci_id = PCI_DEVID(pdev->bus->number, pdev->devfn);
 	dev_dbg(&pdev->dev,
@@ -175,6 +184,7 @@ static void pds_vfio_close_device(struct vfio_device *vdev)
 	mutex_lock(&pds_vfio->state_mutex);
 	pds_vfio_put_restore_file(pds_vfio);
 	pds_vfio_put_save_file(pds_vfio);
+	pds_vfio_dirty_disable(pds_vfio, true);
 	mutex_unlock(&pds_vfio->state_mutex);
 	mutex_destroy(&pds_vfio->state_mutex);
 	vfio_pci_core_close_device(vdev);
diff --git a/drivers/vfio/pci/pds/vfio_dev.h b/drivers/vfio/pci/pds/vfio_dev.h
index 53ba1dd3cb92..a314f9ffd6ed 100644
--- a/drivers/vfio/pci/pds/vfio_dev.h
+++ b/drivers/vfio/pci/pds/vfio_dev.h
@@ -7,6 +7,7 @@
 #include <linux/pci.h>
 #include <linux/vfio_pci_core.h>

+#include "dirty.h"
 #include "lm.h"

 struct pds_vfio_pci_device {
@@ -14,6 +15,7 @@ struct pds_vfio_pci_device {
 	struct pds_vfio_lm_file *save_file;
 	struct pds_vfio_lm_file *restore_file;
+	struct pds_vfio_dirty dirty;
 	struct mutex state_mutex; /* protect migration state */
 	enum vfio_device_mig_state state;
 	spinlock_t reset_lock; /* protect reset_done flow */
@@ -23,6 +25,8 @@ struct pds_vfio_pci_device {
 	u16 client_id;
 };

+void pds_vfio_state_mutex_unlock(struct pds_vfio_pci_device *pds_vfio);
+
 const struct vfio_device_ops *pds_vfio_ops_info(void);
 struct pds_vfio_pci_device *pds_vfio_pci_drvdata(struct pci_dev *pdev);
 void pds_vfio_reset(struct pds_vfio_pci_device *pds_vfio);
diff --git a/include/linux/pds/pds_adminq.h b/include/linux/pds/pds_adminq.h
index 9c79b3c8fc47..4b4e9a98b37b 100644
--- a/include/linux/pds/pds_adminq.h
+++ b/include/linux/pds/pds_adminq.h
@@ -835,6 +835,13 @@ enum pds_lm_cmd_opcode {
 	PDS_LM_CMD_RESUME = 20,
 	PDS_LM_CMD_SAVE = 21,
 	PDS_LM_CMD_RESTORE = 22,
+
+	/* Dirty page tracking commands */
+	PDS_LM_CMD_DIRTY_STATUS = 32,
+	PDS_LM_CMD_DIRTY_ENABLE = 33,
+	PDS_LM_CMD_DIRTY_DISABLE = 34,
+	PDS_LM_CMD_DIRTY_READ_SEQ = 35,
+	PDS_LM_CMD_DIRTY_WRITE_ACK = 36,
 };

 /**
@@ -992,6 +999,172 @@ enum pds_lm_host_vf_status {
 	PDS_LM_STA_MAX,
 };

+/**
+ * struct pds_lm_dirty_region_info - Memory region info for STATUS and ENABLE
+ * @dma_base: Base address of the DMA-contiguous memory region
+ * @page_count: Number of pages in the memory region
+ * @page_size_log2: Log2 page size in the memory region
+ * @rsvd: Word boundary padding
+ */
+struct pds_lm_dirty_region_info {
+        __le64 dma_base;
+        __le32 page_count;
+        u8 page_size_log2;
+        u8 rsvd[3];
+};
+
+/**
+ * struct pds_lm_dirty_status_cmd - DIRTY_STATUS command
+ * @opcode: Opcode PDS_LM_CMD_DIRTY_STATUS
+ * @rsvd: Word boundary padding
+ * @vf_id: VF id
+ * @max_regions: Capacity of the region info buffer
+ * @rsvd2: Word boundary padding
+ * @regions_dma: DMA address of the region info buffer
+ *
+ * The minimum of max_regions (from the command) and num_regions (from the
+ * completion) of struct pds_lm_dirty_region_info will be written to
+ * regions_dma.
+ *
+ * The max_regions may be zero, in which case regions_dma is ignored. In that
+ * case, the completion will only report the maximum number of regions
+ * supported by the device, and the number of regions currently enabled.
+ */
+struct pds_lm_dirty_status_cmd {
+        u8 opcode;
+        u8 rsvd;
+        __le16 vf_id;
+        u8 max_regions;
+        u8 rsvd2[3];
+        __le64 regions_dma;
+} __packed;
+
+/**
+ * enum pds_lm_dirty_bmp_type - Type of dirty page bitmap
+ * @PDS_LM_DIRTY_BMP_TYPE_NONE: No bitmap / disabled
+ * @PDS_LM_DIRTY_BMP_TYPE_SEQ_ACK: Seq/Ack bitmap representation
+ */
+enum pds_lm_dirty_bmp_type {
+        PDS_LM_DIRTY_BMP_TYPE_NONE = 0,
+        PDS_LM_DIRTY_BMP_TYPE_SEQ_ACK = 1,
+};
+
+/**
+ * struct pds_lm_dirty_status_comp - STATUS command completion
+ * @status: Status of the command (enum pds_core_status_code)
+ * @rsvd: Word boundary padding
+ * @comp_index: Index in the desc ring for which this is the completion
+ * @max_regions: Maximum number of regions supported by the device
+ * @num_regions: Number of regions currently enabled
+ * @bmp_type: Type of dirty bitmap representation
+ * @rsvd2: Word boundary padding
+ * @bmp_type_mask: Mask of supported bitmap types, bit index per type
+ * @rsvd3: Word boundary padding
+ * @color: Color bit
+ *
+ * This completion descriptor is used for STATUS, ENABLE, and DISABLE.
+ */
+struct pds_lm_dirty_status_comp {
+        u8 status;
+        u8 rsvd;
+        __le16 comp_index;
+        u8 max_regions;
+        u8 num_regions;
+        u8 bmp_type;
+        u8 rsvd2;
+        __le32 bmp_type_mask;
+        u8 rsvd3[3];
+        u8 color;
+};
+
+/**
+ * struct pds_lm_dirty_enable_cmd - DIRTY_ENABLE command
+ * @opcode: Opcode PDS_LM_CMD_DIRTY_ENABLE
+ * @rsvd: Word boundary padding
+ * @vf_id: VF id
+ * @bmp_type: Type of dirty bitmap representation
+ * @num_regions: Number of entries in the region info buffer
+ * @rsvd2: Word boundary padding
+ * @regions_dma: DMA address of the region info buffer
+ *
+ * The num_regions must be nonzero, and less than or equal to the maximum
+ * number of regions supported by the device.
+ *
+ * The memory regions should not overlap.
+ *
+ * The information should be initialized by the driver. The device may modify
+ * the information on successful completion, such as by size-aligning the
+ * number of pages in a region.
+ *
+ * The modified number of pages will be greater than or equal to the page count
+ * given in the enable command, and at least as coarsely aligned as the given
+ * value. For example, the count might be aligned to a multiple of 64, but
+ * if the value is already a multiple of 128 or higher, it will not change.
+ * If the driver requires its own minimum alignment of the number of pages, the
+ * driver should account for that already in the region info of this command.
+ *
+ * This command uses struct pds_lm_dirty_status_comp for its completion.
+ */
+struct pds_lm_dirty_enable_cmd {
+        u8 opcode;
+        u8 rsvd;
+        __le16 vf_id;
+        u8 bmp_type;
+        u8 num_regions;
+        u8 rsvd2[2];
+        __le64 regions_dma;
+} __packed;
+
+/**
+ * struct pds_lm_dirty_disable_cmd - DIRTY_DISABLE command
+ * @opcode: Opcode PDS_LM_CMD_DIRTY_DISABLE
+ * @rsvd: Word boundary padding
+ * @vf_id: VF id
+ *
+ * Dirty page tracking will be disabled. This may be called in any state, as
+ * long as dirty page tracking is supported by the device, to ensure that dirty
+ * page tracking is disabled.
+ *
+ * This command uses struct pds_lm_dirty_status_comp for its completion. On
+ * success, num_regions will be zero.
+ */
+struct pds_lm_dirty_disable_cmd {
+        u8 opcode;
+        u8 rsvd;
+        __le16 vf_id;
+};
+
+/**
+ * struct pds_lm_dirty_seq_ack_cmd - DIRTY_READ_SEQ or _WRITE_ACK command
+ * @opcode: Opcode PDS_LM_CMD_DIRTY_[READ_SEQ|WRITE_ACK]
+ * @rsvd: Word boundary padding
+ * @vf_id: VF id
+ * @off_bytes: Byte offset in the bitmap
+ * @len_bytes: Number of bytes to transfer
+ * @num_sge: Number of DMA scatter gather elements
+ * @rsvd2: Word boundary padding
+ * @sgl_addr: DMA address of scatter gather list
+ *
+ * Read bytes from the SEQ bitmap, or write bytes into the ACK bitmap.
+ *
+ * This command treats the entire bitmap as a byte buffer. It does not
+ * distinguish between guest memory regions. The driver should refer to the
+ * number of pages in each region, according to PDS_LM_CMD_DIRTY_STATUS, to
+ * determine the region boundaries in the bitmap. Each region will be
+ * represented by exactly the number of bits as the page count for that region,
+ * immediately following the last bit of the previous region.
+ */
+struct pds_lm_dirty_seq_ack_cmd {
+        u8 opcode;
+        u8 rsvd;
+        __le16 vf_id;
+        __le32 off_bytes;
+        __le32 len_bytes;
+        __le16 num_sge;
+        u8 rsvd2[2];
+        __le64 sgl_addr;
+} __packed;
+
 /**
  * struct pds_lm_host_vf_status_cmd - HOST_VF_STATUS command
  * @opcode: Opcode PDS_LM_CMD_HOST_VF_STATUS
@@ -1039,6 +1212,10 @@ union pds_core_adminq_cmd {
 	struct pds_lm_save_cmd lm_save;
 	struct pds_lm_restore_cmd lm_restore;
 	struct pds_lm_host_vf_status_cmd lm_host_vf_status;
+	struct pds_lm_dirty_status_cmd lm_dirty_status;
+	struct pds_lm_dirty_enable_cmd lm_dirty_enable;
+	struct pds_lm_dirty_disable_cmd lm_dirty_disable;
+	struct pds_lm_dirty_seq_ack_cmd lm_dirty_seq_ack;
 };

 union pds_core_adminq_comp {
@@ -1065,6 +1242,7 @@ union pds_core_adminq_comp {
 	struct pds_vdpa_vq_reset_comp vdpa_vq_reset;

 	struct pds_lm_state_size_comp lm_state_size;
+	struct pds_lm_dirty_status_comp lm_dirty_status;
 };

 #ifndef __CHECKER__
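
As a side note for readers following the bitmap arithmetic above: the byte offset and byte count handed to the DIRTY_READ_SEQ/DIRTY_WRITE_ACK commands are derived from the requested iova range and the region page size as in pds_vfio_dirty_sync(). The standalone sketch below is illustrative only and not part of the patch; the helper name, the macros, and the example values are invented here to show that calculation in plain C.

/*
 * Illustrative sketch only -- not part of the patch. dirty_bmp_window(),
 * the macros, and the example numbers are invented to mirror how
 * pds_vfio_dirty_sync() derives the bitmap window passed to the
 * DIRTY_READ_SEQ/DIRTY_WRITE_ACK commands (one bit per tracked page,
 * with the bitmap modified in whole u64 chunks).
 */
#include <stdint.h>
#include <stdio.h>

#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))
#define ALIGN_UP(x, a)     (DIV_ROUND_UP(x, a) * (a))

static void dirty_bmp_window(uint64_t iova, uint64_t length, uint64_t page_size,
                             uint64_t *bmp_offset, uint64_t *bmp_bytes)
{
        /* Same math as pds_vfio_dirty_sync(): bit index = iova / page_size */
        *bmp_offset = DIV_ROUND_UP(iova / page_size, sizeof(uint64_t));
        *bmp_bytes = ALIGN_UP(DIV_ROUND_UP(length / page_size, sizeof(uint64_t)),
                              sizeof(uint64_t));
}

int main(void)
{
        uint64_t off, bytes;

        /* e.g. a 2 GiB range starting at iova 4 GiB, 4 KiB tracking pages */
        dirty_bmp_window(4ULL << 30, 2ULL << 30, 4096, &off, &bytes);
        printf("bmp_offset %llu bmp_bytes %llu\n",
               (unsigned long long)off, (unsigned long long)bytes);
        return 0;
}

In the sync path above, the same window is then read from the device SEQ bitmap via pds_vfio_dirty_read_seq(), folded into the caller's iova_bitmap by pds_vfio_dirty_process_bitmaps(), and acknowledged with pds_vfio_dirty_write_ack().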