0001 // SPDX-License-Identifier: GPL-2.0
0002 /*
0003  * Copyright (c) Microsoft Corporation.
0004  *
0005  * Author:
0006  *   Jake Oshins <jakeo@microsoft.com>
0007  *
0008  * This driver acts as a paravirtual front-end for PCI Express root buses.
0009  * When a PCI Express function (either an entire device or an SR-IOV
0010  * Virtual Function) is being passed through to the VM, this driver exposes
0011  * a new bus to the guest VM.  This is modeled as a root PCI bus because
0012  * no bridges are being exposed to the VM.  In fact, with a "Generation 2"
0013  * VM within Hyper-V, there may seem to be no PCI bus at all in the VM
0014  * until a device has been exposed using this driver.
0015  *
0016  * Each root PCI bus has its own PCI domain, which is called "Segment" in
0017  * the PCI Firmware Specifications.  Thus while each device passed through
0018  * to the VM using this front-end will appear at "device 0", the domain will
0019  * be unique.  Typically, each bus will have one PCI function on it, though
0020  * this driver does support more than one.
0021  *
0022  * In order to map the interrupts from the device through to the guest VM,
0023  * this driver also implements an IRQ Domain, which handles interrupts (either
0024  * MSI or MSI-X) associated with the functions on the bus.  As interrupts are
0025  * set up, torn down, or reaffined, this driver communicates with the
0026  * underlying hypervisor to adjust the mappings in the I/O MMU so that each
0027  * interrupt will be delivered to the correct virtual processor at the right
0028  * vector.  This driver does not support level-triggered (line-based)
0029  * interrupts, and will report that the Interrupt Line register in the
0030  * function's configuration space is zero.
0031  *
0032  * The rest of this driver mostly maps PCI concepts onto underlying Hyper-V
0033  * facilities.  For instance, the configuration space of a function exposed
0034  * by Hyper-V is mapped into a single page of memory space, and the
0035  * read and write handlers for config space must be aware of this mechanism.
0036  * Similarly, device setup and teardown involves messages sent to and from
0037  * the PCI back-end driver in Hyper-V.
0038  */
0039 
0040 #include <linux/kernel.h>
0041 #include <linux/module.h>
0042 #include <linux/pci.h>
0043 #include <linux/pci-ecam.h>
0044 #include <linux/delay.h>
0045 #include <linux/semaphore.h>
0046 #include <linux/irq.h>
0047 #include <linux/msi.h>
0048 #include <linux/hyperv.h>
0049 #include <linux/refcount.h>
0050 #include <linux/irqdomain.h>
0051 #include <linux/acpi.h>
0052 #include <asm/mshyperv.h>
0053 
0054 /*
0055  * Protocol versions. The low word is the minor version, the high word the
0056  * major version.
0057  */
0058 
0059 #define PCI_MAKE_VERSION(major, minor) ((u32)(((major) << 16) | (minor)))
0060 #define PCI_MAJOR_VERSION(version) ((u32)(version) >> 16)
0061 #define PCI_MINOR_VERSION(version) ((u32)(version) & 0xff)
0062 
0063 enum pci_protocol_version_t {
0064     PCI_PROTOCOL_VERSION_1_1 = PCI_MAKE_VERSION(1, 1),  /* Win10 */
0065     PCI_PROTOCOL_VERSION_1_2 = PCI_MAKE_VERSION(1, 2),  /* RS1 */
0066     PCI_PROTOCOL_VERSION_1_3 = PCI_MAKE_VERSION(1, 3),  /* Vibranium */
0067     PCI_PROTOCOL_VERSION_1_4 = PCI_MAKE_VERSION(1, 4),  /* WS2022 */
0068 };
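/*
 * Worked example of the encoding above: PCI_MAKE_VERSION(1, 4) is 0x00010004,
 * so PCI_MAJOR_VERSION(PCI_PROTOCOL_VERSION_1_4) == 1 and
 * PCI_MINOR_VERSION(PCI_PROTOCOL_VERSION_1_4) == 4.
 */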
0069 
0070 #define CPU_AFFINITY_ALL    -1ULL
0071 
0072 /*
0073  * Supported protocol versions in the order of probing - highest go
0074  * first.
0075  */
0076 static enum pci_protocol_version_t pci_protocol_versions[] = {
0077     PCI_PROTOCOL_VERSION_1_4,
0078     PCI_PROTOCOL_VERSION_1_3,
0079     PCI_PROTOCOL_VERSION_1_2,
0080     PCI_PROTOCOL_VERSION_1_1,
0081 };
0082 
0083 #define PCI_CONFIG_MMIO_LENGTH  0x2000
0084 #define CFG_PAGE_OFFSET 0x1000
0085 #define CFG_PAGE_SIZE (PCI_CONFIG_MMIO_LENGTH - CFG_PAGE_OFFSET)
0086 
0087 #define MAX_SUPPORTED_MSI_MESSAGES 0x400
0088 
0089 #define STATUS_REVISION_MISMATCH 0xC0000059
0090 
0091 /* space for 32-bit serial number as string */
0092 #define SLOT_NAME_SIZE 11
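/*
 * 11 bytes because a u32 serial number is at most 10 decimal digits
 * (4294967295) plus a terminating NUL.
 */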
0093 
0094 /*
0095  * Size of requestor for VMbus; the value is based on the observation
0096  * that having more than one request outstanding is 'rare', and so 64
0097  * should be generous in ensuring that we don't ever run out.
0098  */
0099 #define HV_PCI_RQSTOR_SIZE 64
0100 
0101 /*
0102  * Message Types
0103  */
0104 
0105 enum pci_message_type {
0106     /*
0107      * Version 1.1
0108      */
0109     PCI_MESSAGE_BASE                = 0x42490000,
0110     PCI_BUS_RELATIONS               = PCI_MESSAGE_BASE + 0,
0111     PCI_QUERY_BUS_RELATIONS         = PCI_MESSAGE_BASE + 1,
0112     PCI_POWER_STATE_CHANGE          = PCI_MESSAGE_BASE + 4,
0113     PCI_QUERY_RESOURCE_REQUIREMENTS = PCI_MESSAGE_BASE + 5,
0114     PCI_QUERY_RESOURCE_RESOURCES    = PCI_MESSAGE_BASE + 6,
0115     PCI_BUS_D0ENTRY                 = PCI_MESSAGE_BASE + 7,
0116     PCI_BUS_D0EXIT                  = PCI_MESSAGE_BASE + 8,
0117     PCI_READ_BLOCK                  = PCI_MESSAGE_BASE + 9,
0118     PCI_WRITE_BLOCK                 = PCI_MESSAGE_BASE + 0xA,
0119     PCI_EJECT                       = PCI_MESSAGE_BASE + 0xB,
0120     PCI_QUERY_STOP                  = PCI_MESSAGE_BASE + 0xC,
0121     PCI_REENABLE                    = PCI_MESSAGE_BASE + 0xD,
0122     PCI_QUERY_STOP_FAILED           = PCI_MESSAGE_BASE + 0xE,
0123     PCI_EJECTION_COMPLETE           = PCI_MESSAGE_BASE + 0xF,
0124     PCI_RESOURCES_ASSIGNED          = PCI_MESSAGE_BASE + 0x10,
0125     PCI_RESOURCES_RELEASED          = PCI_MESSAGE_BASE + 0x11,
0126     PCI_INVALIDATE_BLOCK            = PCI_MESSAGE_BASE + 0x12,
0127     PCI_QUERY_PROTOCOL_VERSION      = PCI_MESSAGE_BASE + 0x13,
0128     PCI_CREATE_INTERRUPT_MESSAGE    = PCI_MESSAGE_BASE + 0x14,
0129     PCI_DELETE_INTERRUPT_MESSAGE    = PCI_MESSAGE_BASE + 0x15,
0130     PCI_RESOURCES_ASSIGNED2     = PCI_MESSAGE_BASE + 0x16,
0131     PCI_CREATE_INTERRUPT_MESSAGE2   = PCI_MESSAGE_BASE + 0x17,
0132     PCI_DELETE_INTERRUPT_MESSAGE2   = PCI_MESSAGE_BASE + 0x18, /* unused */
0133     PCI_BUS_RELATIONS2      = PCI_MESSAGE_BASE + 0x19,
0134     PCI_RESOURCES_ASSIGNED3         = PCI_MESSAGE_BASE + 0x1A,
0135     PCI_CREATE_INTERRUPT_MESSAGE3   = PCI_MESSAGE_BASE + 0x1B,
0136     PCI_MESSAGE_MAXIMUM
0137 };
0138 
0139 /*
0140  * Structures defining the virtual PCI Express protocol.
0141  */
0142 
0143 union pci_version {
0144     struct {
0145         u16 minor_version;
0146         u16 major_version;
0147     } parts;
0148     u32 version;
0149 } __packed;
0150 
0151 /*
0152  * Function numbers are 8 bits wide on Express, as interpreted through ARI,
0153  * which is all this driver does.  This representation is the one used in
0154  * Windows, which is what is expected when sending this back and forth with
0155  * the Hyper-V parent partition.
0156  */
0157 union win_slot_encoding {
0158     struct {
0159         u32 dev:5;
0160         u32 func:3;
0161         u32 reserved:24;
0162     } bits;
0163     u32 slot;
0164 } __packed;
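/*
 * Illustrative example, assuming the little-endian bitfield layout used by
 * x86 and arm64 guests: a function at Linux devfn 00.2, i.e. PCI_DEVFN(0, 2),
 * encodes as bits.dev = 0 and bits.func = 2, giving a 32-bit wslot value of
 * 0x00000040 (device number in bits 0-4, function number in bits 5-7).
 * devfn_to_wslot() and wslot_to_devfn() below convert in both directions.
 */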
0165 
0166 /*
0167  * Pretty much as defined in the PCI Specifications.
0168  */
0169 struct pci_function_description {
0170     u16 v_id;   /* vendor ID */
0171     u16 d_id;   /* device ID */
0172     u8  rev;
0173     u8  prog_intf;
0174     u8  subclass;
0175     u8  base_class;
0176     u32 subsystem_id;
0177     union win_slot_encoding win_slot;
0178     u32 ser;    /* serial number */
0179 } __packed;
0180 
0181 enum pci_device_description_flags {
0182     HV_PCI_DEVICE_FLAG_NONE         = 0x0,
0183     HV_PCI_DEVICE_FLAG_NUMA_AFFINITY    = 0x1,
0184 };
0185 
0186 struct pci_function_description2 {
0187     u16 v_id;   /* vendor ID */
0188     u16 d_id;   /* device ID */
0189     u8  rev;
0190     u8  prog_intf;
0191     u8  subclass;
0192     u8  base_class;
0193     u32 subsystem_id;
0194     union   win_slot_encoding win_slot;
0195     u32 ser;    /* serial number */
0196     u32 flags;
0197     u16 virtual_numa_node;
0198     u16 reserved;
0199 } __packed;
0200 
0201 /**
0202  * struct hv_msi_desc
0203  * @vector:     IDT entry
0204  * @delivery_mode:  As defined in Intel's Programmer's
0205  *          Reference Manual, Volume 3, Chapter 8.
0206  * @vector_count:   Number of contiguous entries in the
0207  *          Interrupt Descriptor Table that are
0208  *          occupied by this Message-Signaled
0209  *          Interrupt. For "MSI", as first defined
0210  *          in PCI 2.2, this can be between 1 and
0211  *          32. For "MSI-X," as first defined in PCI
0212  *          3.0, this must be 1, as each MSI-X table
0213  *          entry would have its own descriptor.
0214  * @reserved:       Empty space
0215  * @cpu_mask:       All the target virtual processors.
0216  */
0217 struct hv_msi_desc {
0218     u8  vector;
0219     u8  delivery_mode;
0220     u16 vector_count;
0221     u32 reserved;
0222     u64 cpu_mask;
0223 } __packed;
0224 
0225 /**
0226  * struct hv_msi_desc2 - 1.2 version of hv_msi_desc
0227  * @vector:     IDT entry
0228  * @delivery_mode:  As defined in Intel's Programmer's
0229  *          Reference Manual, Volume 3, Chapter 8.
0230  * @vector_count:   Number of contiguous entries in the
0231  *          Interrupt Descriptor Table that are
0232  *          occupied by this Message-Signaled
0233  *          Interrupt. For "MSI", as first defined
0234  *          in PCI 2.2, this can be between 1 and
0235  *          32. For "MSI-X," as first defined in PCI
0236  *          3.0, this must be 1, as each MSI-X table
0237  *          entry would have its own descriptor.
0238  * @processor_count:    number of bits enabled in array.
0239  * @processor_array:    All the target virtual processors.
0240  */
0241 struct hv_msi_desc2 {
0242     u8  vector;
0243     u8  delivery_mode;
0244     u16 vector_count;
0245     u16 processor_count;
0246     u16 processor_array[32];
0247 } __packed;
0248 
0249 /*
0250  * struct hv_msi_desc3 - 1.3 version of hv_msi_desc
0251  *  Everything is the same as in 'hv_msi_desc2' except that the size of the
0252  *  'vector' field is larger to support bigger vector values, for example LPI
0253  *  vectors on ARM.
0254  */
0255 struct hv_msi_desc3 {
0256     u32 vector;
0257     u8  delivery_mode;
0258     u8  reserved;
0259     u16 vector_count;
0260     u16 processor_count;
0261     u16 processor_array[32];
0262 } __packed;
0263 
0264 /**
0265  * struct tran_int_desc
0266  * @reserved:       unused, padding
0267  * @vector_count:   same as in hv_msi_desc
0268  * @data:       This is the "data payload" value that is
0269  *          written by the device when it generates
0270  *          a message-signaled interrupt, either MSI
0271  *          or MSI-X.
0272  * @address:        This is the address to which the data
0273  *          payload is written on interrupt
0274  *          generation.
0275  */
0276 struct tran_int_desc {
0277     u16 reserved;
0278     u16 vector_count;
0279     u32 data;
0280     u64 address;
0281 } __packed;
0282 
0283 /*
0284  * A generic message format for virtual PCI.
0285  * Specific message formats are defined later in the file.
0286  */
0287 
0288 struct pci_message {
0289     u32 type;
0290 } __packed;
0291 
0292 struct pci_child_message {
0293     struct pci_message message_type;
0294     union win_slot_encoding wslot;
0295 } __packed;
0296 
0297 struct pci_incoming_message {
0298     struct vmpacket_descriptor hdr;
0299     struct pci_message message_type;
0300 } __packed;
0301 
0302 struct pci_response {
0303     struct vmpacket_descriptor hdr;
0304     s32 status;         /* negative values are failures */
0305 } __packed;
0306 
0307 struct pci_packet {
0308     void (*completion_func)(void *context, struct pci_response *resp,
0309                 int resp_packet_size);
0310     void *compl_ctxt;
0311 
0312     struct pci_message message[];
0313 };
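/*
 * Senders of requests typically embed this header in a local structure that
 * reserves space for the concrete message right after it, for example:
 *
 *	struct {
 *		struct pci_packet pkt;
 *		char buf[sizeof(struct pci_read_block)];
 *	} pkt;
 *
 * and then cast &pkt.pkt.message to the specific message type; see
 * hv_read_config_block() and the other senders below.
 */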
0314 
0315 /*
0316  * Specific message types supporting the PCI protocol.
0317  */
0318 
0319 /*
0320  * Version negotiation message. Sent from the guest to the host.
0321  * The guest is free to try different versions until the host
0322  * accepts the version.
0323  *
0324  * pci_version: The protocol version requested.
0325  * is_last_attempt: If TRUE, this is the last version the guest will request.
0326  * reservedz: Reserved field, set to zero.
0327  */
0328 
0329 struct pci_version_request {
0330     struct pci_message message_type;
0331     u32 protocol_version;
0332 } __packed;
0333 
0334 /*
0335  * Bus D0 Entry.  This is sent from the guest to the host when the virtual
0336  * bus (PCI Express port) is ready for action.
0337  */
0338 
0339 struct pci_bus_d0_entry {
0340     struct pci_message message_type;
0341     u32 reserved;
0342     u64 mmio_base;
0343 } __packed;
0344 
0345 struct pci_bus_relations {
0346     struct pci_incoming_message incoming;
0347     u32 device_count;
0348     struct pci_function_description func[];
0349 } __packed;
0350 
0351 struct pci_bus_relations2 {
0352     struct pci_incoming_message incoming;
0353     u32 device_count;
0354     struct pci_function_description2 func[];
0355 } __packed;
0356 
0357 struct pci_q_res_req_response {
0358     struct vmpacket_descriptor hdr;
0359     s32 status;         /* negative values are failures */
0360     u32 probed_bar[PCI_STD_NUM_BARS];
0361 } __packed;
0362 
0363 struct pci_set_power {
0364     struct pci_message message_type;
0365     union win_slot_encoding wslot;
0366     u32 power_state;        /* In Windows terms */
0367     u32 reserved;
0368 } __packed;
0369 
0370 struct pci_set_power_response {
0371     struct vmpacket_descriptor hdr;
0372     s32 status;         /* negative values are failures */
0373     union win_slot_encoding wslot;
0374     u32 resultant_state;        /* In Windows terms */
0375     u32 reserved;
0376 } __packed;
0377 
0378 struct pci_resources_assigned {
0379     struct pci_message message_type;
0380     union win_slot_encoding wslot;
0381     u8 memory_range[0x14][6];   /* not used here */
0382     u32 msi_descriptors;
0383     u32 reserved[4];
0384 } __packed;
0385 
0386 struct pci_resources_assigned2 {
0387     struct pci_message message_type;
0388     union win_slot_encoding wslot;
0389     u8 memory_range[0x14][6];   /* not used here */
0390     u32 msi_descriptor_count;
0391     u8 reserved[70];
0392 } __packed;
0393 
0394 struct pci_create_interrupt {
0395     struct pci_message message_type;
0396     union win_slot_encoding wslot;
0397     struct hv_msi_desc int_desc;
0398 } __packed;
0399 
0400 struct pci_create_int_response {
0401     struct pci_response response;
0402     u32 reserved;
0403     struct tran_int_desc int_desc;
0404 } __packed;
0405 
0406 struct pci_create_interrupt2 {
0407     struct pci_message message_type;
0408     union win_slot_encoding wslot;
0409     struct hv_msi_desc2 int_desc;
0410 } __packed;
0411 
0412 struct pci_create_interrupt3 {
0413     struct pci_message message_type;
0414     union win_slot_encoding wslot;
0415     struct hv_msi_desc3 int_desc;
0416 } __packed;
0417 
0418 struct pci_delete_interrupt {
0419     struct pci_message message_type;
0420     union win_slot_encoding wslot;
0421     struct tran_int_desc int_desc;
0422 } __packed;
0423 
0424 /*
0425  * Note: the VM must pass a valid block id, wslot and bytes_requested.
0426  */
0427 struct pci_read_block {
0428     struct pci_message message_type;
0429     u32 block_id;
0430     union win_slot_encoding wslot;
0431     u32 bytes_requested;
0432 } __packed;
0433 
0434 struct pci_read_block_response {
0435     struct vmpacket_descriptor hdr;
0436     u32 status;
0437     u8 bytes[HV_CONFIG_BLOCK_SIZE_MAX];
0438 } __packed;
0439 
0440 /*
0441  * Note: the VM must pass a valid block id, wslot and byte_count.
0442  */
0443 struct pci_write_block {
0444     struct pci_message message_type;
0445     u32 block_id;
0446     union win_slot_encoding wslot;
0447     u32 byte_count;
0448     u8 bytes[HV_CONFIG_BLOCK_SIZE_MAX];
0449 } __packed;
0450 
0451 struct pci_dev_inval_block {
0452     struct pci_incoming_message incoming;
0453     union win_slot_encoding wslot;
0454     u64 block_mask;
0455 } __packed;
0456 
0457 struct pci_dev_incoming {
0458     struct pci_incoming_message incoming;
0459     union win_slot_encoding wslot;
0460 } __packed;
0461 
0462 struct pci_eject_response {
0463     struct pci_message message_type;
0464     union win_slot_encoding wslot;
0465     u32 status;
0466 } __packed;
0467 
0468 static int pci_ring_size = (4 * PAGE_SIZE);
0469 
0470 /*
0471  * Driver specific state.
0472  */
0473 
0474 enum hv_pcibus_state {
0475     hv_pcibus_init = 0,
0476     hv_pcibus_probed,
0477     hv_pcibus_installed,
0478     hv_pcibus_removing,
0479     hv_pcibus_maximum
0480 };
0481 
0482 struct hv_pcibus_device {
0483 #ifdef CONFIG_X86
0484     struct pci_sysdata sysdata;
0485 #elif defined(CONFIG_ARM64)
0486     struct pci_config_window sysdata;
0487 #endif
0488     struct pci_host_bridge *bridge;
0489     struct fwnode_handle *fwnode;
0490     /* Protocol version negotiated with the host */
0491     enum pci_protocol_version_t protocol_version;
0492     enum hv_pcibus_state state;
0493     struct hv_device *hdev;
0494     resource_size_t low_mmio_space;
0495     resource_size_t high_mmio_space;
0496     struct resource *mem_config;
0497     struct resource *low_mmio_res;
0498     struct resource *high_mmio_res;
0499     struct completion *survey_event;
0500     struct pci_bus *pci_bus;
0501     spinlock_t config_lock; /* Avoid two threads writing index page */
0502     spinlock_t device_list_lock;    /* Protect lists below */
0503     void __iomem *cfg_addr;
0504 
0505     struct list_head children;
0506     struct list_head dr_list;
0507 
0508     struct msi_domain_info msi_info;
0509     struct irq_domain *irq_domain;
0510 
0511     spinlock_t retarget_msi_interrupt_lock;
0512 
0513     struct workqueue_struct *wq;
0514 
0515     /* Highest slot of child device with resources allocated */
0516     int wslot_res_allocated;
0517 
0518     /* hypercall arg, must not cross page boundary */
0519     struct hv_retarget_device_interrupt retarget_msi_interrupt_params;
0520 
0521     /*
0522      * Don't put anything here: retarget_msi_interrupt_params must be last
0523      */
0524 };
0525 
0526 /*
0527  * Tracks "Device Relations" messages from the host, which must be both
0528  * processed in order and deferred so that they don't run in the context
0529  * of the incoming packet callback.
0530  */
0531 struct hv_dr_work {
0532     struct work_struct wrk;
0533     struct hv_pcibus_device *bus;
0534 };
0535 
0536 struct hv_pcidev_description {
0537     u16 v_id;   /* vendor ID */
0538     u16 d_id;   /* device ID */
0539     u8  rev;
0540     u8  prog_intf;
0541     u8  subclass;
0542     u8  base_class;
0543     u32 subsystem_id;
0544     union   win_slot_encoding win_slot;
0545     u32 ser;    /* serial number */
0546     u32 flags;
0547     u16 virtual_numa_node;
0548 };
0549 
0550 struct hv_dr_state {
0551     struct list_head list_entry;
0552     u32 device_count;
0553     struct hv_pcidev_description func[];
0554 };
0555 
0556 enum hv_pcichild_state {
0557     hv_pcichild_init = 0,
0558     hv_pcichild_requirements,
0559     hv_pcichild_resourced,
0560     hv_pcichild_ejecting,
0561     hv_pcichild_maximum
0562 };
0563 
0564 struct hv_pci_dev {
0565     /* List protected by pci_rescan_remove_lock */
0566     struct list_head list_entry;
0567     refcount_t refs;
0568     enum hv_pcichild_state state;
0569     struct pci_slot *pci_slot;
0570     struct hv_pcidev_description desc;
0571     bool reported_missing;
0572     struct hv_pcibus_device *hbus;
0573     struct work_struct wrk;
0574 
0575     void (*block_invalidate)(void *context, u64 block_mask);
0576     void *invalidate_context;
0577 
0578     /*
0579      * What would be observed if one wrote 0xFFFFFFFF to a BAR and then
0580      * read it back, for each of the BAR offsets within config space.
0581      */
0582     u32 probed_bar[PCI_STD_NUM_BARS];
0583 };
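/*
 * probed_bar example: a 64 KiB non-prefetchable 32-bit memory BAR would read
 * back as 0xffff0000 together with its low flag bits after 0xFFFFFFFF is
 * written to it; masking the flag bits and negating recovers the 0x10000-byte
 * size, per the standard PCI BAR sizing sequence.
 */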
0584 
0585 struct hv_pci_compl {
0586     struct completion host_event;
0587     s32 completion_status;
0588 };
0589 
0590 static void hv_pci_onchannelcallback(void *context);
0591 
0592 #ifdef CONFIG_X86
0593 #define DELIVERY_MODE   APIC_DELIVERY_MODE_FIXED
0594 #define FLOW_HANDLER    handle_edge_irq
0595 #define FLOW_NAME   "edge"
0596 
0597 static int hv_pci_irqchip_init(void)
0598 {
0599     return 0;
0600 }
0601 
0602 static struct irq_domain *hv_pci_get_root_domain(void)
0603 {
0604     return x86_vector_domain;
0605 }
0606 
0607 static unsigned int hv_msi_get_int_vector(struct irq_data *data)
0608 {
0609     struct irq_cfg *cfg = irqd_cfg(data);
0610 
0611     return cfg->vector;
0612 }
0613 
0614 static int hv_msi_prepare(struct irq_domain *domain, struct device *dev,
0615               int nvec, msi_alloc_info_t *info)
0616 {
0617     int ret = pci_msi_prepare(domain, dev, nvec, info);
0618 
0619     /*
0620      * By using the interrupt remapper in the hypervisor IOMMU, contiguous
0621      * CPU vectors are not needed for multi-MSI.
0622      */
0623     if (info->type == X86_IRQ_ALLOC_TYPE_PCI_MSI)
0624         info->flags &= ~X86_IRQ_ALLOC_CONTIGUOUS_VECTORS;
0625 
0626     return ret;
0627 }
0628 
0629 /**
0630  * hv_arch_irq_unmask() - "Unmask" the IRQ by setting its current
0631  * affinity.
0632  * @data:   Describes the IRQ
0633  *
0634  * Build a new destination for the MSI and make a hypercall to
0635  * update the Interrupt Redirection Table. "Device Logical ID"
0636  * is built out of this PCI bus's instance GUID and the function
0637  * number of the device.
0638  */
0639 static void hv_arch_irq_unmask(struct irq_data *data)
0640 {
0641     struct msi_desc *msi_desc = irq_data_get_msi_desc(data);
0642     struct hv_retarget_device_interrupt *params;
0643     struct tran_int_desc *int_desc;
0644     struct hv_pcibus_device *hbus;
0645     const struct cpumask *dest;
0646     cpumask_var_t tmp;
0647     struct pci_bus *pbus;
0648     struct pci_dev *pdev;
0649     unsigned long flags;
0650     u32 var_size = 0;
0651     int cpu, nr_bank;
0652     u64 res;
0653 
0654     dest = irq_data_get_effective_affinity_mask(data);
0655     pdev = msi_desc_to_pci_dev(msi_desc);
0656     pbus = pdev->bus;
0657     hbus = container_of(pbus->sysdata, struct hv_pcibus_device, sysdata);
0658     int_desc = data->chip_data;
0659 
0660     spin_lock_irqsave(&hbus->retarget_msi_interrupt_lock, flags);
0661 
0662     params = &hbus->retarget_msi_interrupt_params;
0663     memset(params, 0, sizeof(*params));
0664     params->partition_id = HV_PARTITION_ID_SELF;
0665     params->int_entry.source = HV_INTERRUPT_SOURCE_MSI;
0666     params->int_entry.msi_entry.address.as_uint32 = int_desc->address & 0xffffffff;
0667     params->int_entry.msi_entry.data.as_uint32 = int_desc->data;
0668     params->device_id = (hbus->hdev->dev_instance.b[5] << 24) |
0669                (hbus->hdev->dev_instance.b[4] << 16) |
0670                (hbus->hdev->dev_instance.b[7] << 8) |
0671                (hbus->hdev->dev_instance.b[6] & 0xf8) |
0672                PCI_FUNC(pdev->devfn);
0673     params->int_target.vector = hv_msi_get_int_vector(data);
0674 
0675     /*
0676      * Honoring apic->delivery_mode set to APIC_DELIVERY_MODE_FIXED by
0677      * setting the HV_DEVICE_INTERRUPT_TARGET_MULTICAST flag results in a
0678      * spurious interrupt storm. Not doing so does not seem to have a
0679      * negative effect (yet?).
0680      */
0681 
0682     if (hbus->protocol_version >= PCI_PROTOCOL_VERSION_1_2) {
0683         /*
0684          * PCI_PROTOCOL_VERSION_1_2 supports the VP_SET version of the
0685          * HVCALL_RETARGET_INTERRUPT hypercall, which also coincides
0686          * with >64 VP support.
0687          * ms_hyperv.hints & HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED
0688          * is not sufficient for this hypercall.
0689          */
0690         params->int_target.flags |=
0691             HV_DEVICE_INTERRUPT_TARGET_PROCESSOR_SET;
0692 
0693         if (!alloc_cpumask_var(&tmp, GFP_ATOMIC)) {
0694             res = 1;
0695             goto exit_unlock;
0696         }
0697 
0698         cpumask_and(tmp, dest, cpu_online_mask);
0699         nr_bank = cpumask_to_vpset(&params->int_target.vp_set, tmp);
0700         free_cpumask_var(tmp);
0701 
0702         if (nr_bank <= 0) {
0703             res = 1;
0704             goto exit_unlock;
0705         }
0706 
0707         /*
0708          * var-sized hypercall, var-size starts after vp_mask (thus
0709          * vp_set.format does not count, but vp_set.valid_bank_mask
0710          * does).
0711          */
0712         var_size = 1 + nr_bank;
0713     } else {
0714         for_each_cpu_and(cpu, dest, cpu_online_mask) {
0715             params->int_target.vp_mask |=
0716                 (1ULL << hv_cpu_number_to_vp_number(cpu));
0717         }
0718     }
0719 
0720     res = hv_do_hypercall(HVCALL_RETARGET_INTERRUPT | (var_size << 17),
0721                   params, NULL);
0722 
0723 exit_unlock:
0724     spin_unlock_irqrestore(&hbus->retarget_msi_interrupt_lock, flags);
0725 
0726     /*
0727      * During hibernation, when a CPU is offlined, the kernel tries
0728      * to move the interrupt to the remaining CPUs that haven't
0729      * been offlined yet. In this case, the hv_do_hypercall() above
0730      * always fails since the vmbus channel has been closed:
0731      * refer to cpu_disable_common() -> fixup_irqs() ->
0732      * irq_migrate_all_off_this_cpu() -> migrate_one_irq().
0733      *
0734      * Suppress the error message for hibernation because the failure
0735      * during hibernation does not matter (at this time all the devices
0736      * have been frozen). Note: the correct affinity info is still updated
0737      * into the irqdata data structure in migrate_one_irq() ->
0738      * irq_do_set_affinity() -> hv_set_affinity(), so later when the VM
0739      * resumes, hv_pci_restore_msi_state() is able to correctly restore
0740      * the interrupt with the correct affinity.
0741      */
0742     if (!hv_result_success(res) && hbus->state != hv_pcibus_removing)
0743         dev_err(&hbus->hdev->device,
0744             "%s() failed: %#llx", __func__, res);
0745 }
0746 #elif defined(CONFIG_ARM64)
0747 /*
0748  * SPI vectors to use for vPCI; the architectural SPI range is [32, 1019]. Leave
0749  * some room at the start so that SPIs can also be specified through ACPI, and
0750  * start at a power of two to satisfy the power-of-2 multi-MSI requirement.
0751  */
0752 #define HV_PCI_MSI_SPI_START    64
0753 #define HV_PCI_MSI_SPI_NR   (1020 - HV_PCI_MSI_SPI_START)
0754 #define DELIVERY_MODE       0
0755 #define FLOW_HANDLER        NULL
0756 #define FLOW_NAME       NULL
0757 #define hv_msi_prepare      NULL
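/*
 * With HV_PCI_MSI_SPI_START at 64, HV_PCI_MSI_SPI_NR covers SPIs 64 through
 * 1019 inclusive, i.e. 956 vectors, which also sizes the spi_map bitmap below.
 */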
0758 
0759 struct hv_pci_chip_data {
0760     DECLARE_BITMAP(spi_map, HV_PCI_MSI_SPI_NR);
0761     struct mutex    map_lock;
0762 };
0763 
0764 /* Hyper-V vPCI MSI GIC IRQ domain */
0765 static struct irq_domain *hv_msi_gic_irq_domain;
0766 
0767 /* Hyper-V PCI MSI IRQ chip */
0768 static struct irq_chip hv_arm64_msi_irq_chip = {
0769     .name = "MSI",
0770     .irq_set_affinity = irq_chip_set_affinity_parent,
0771     .irq_eoi = irq_chip_eoi_parent,
0772     .irq_mask = irq_chip_mask_parent,
0773     .irq_unmask = irq_chip_unmask_parent
0774 };
0775 
0776 static unsigned int hv_msi_get_int_vector(struct irq_data *irqd)
0777 {
0778     return irqd->parent_data->hwirq;
0779 }
0780 
0781 /*
0782  * @nr_bm_irqs:     Indicates the number of IRQs that were allocated from
0783  *          the bitmap.
0784  * @nr_dom_irqs:    Indicates the number of IRQs that were allocated from
0785  *          the parent domain.
0786  */
0787 static void hv_pci_vec_irq_free(struct irq_domain *domain,
0788                 unsigned int virq,
0789                 unsigned int nr_bm_irqs,
0790                 unsigned int nr_dom_irqs)
0791 {
0792     struct hv_pci_chip_data *chip_data = domain->host_data;
0793     struct irq_data *d = irq_domain_get_irq_data(domain, virq);
0794     int first = d->hwirq - HV_PCI_MSI_SPI_START;
0795     int i;
0796 
0797     mutex_lock(&chip_data->map_lock);
0798     bitmap_release_region(chip_data->spi_map,
0799                   first,
0800                   get_count_order(nr_bm_irqs));
0801     mutex_unlock(&chip_data->map_lock);
0802     for (i = 0; i < nr_dom_irqs; i++) {
0803         if (i)
0804             d = irq_domain_get_irq_data(domain, virq + i);
0805         irq_domain_reset_irq_data(d);
0806     }
0807 
0808     irq_domain_free_irqs_parent(domain, virq, nr_dom_irqs);
0809 }
0810 
0811 static void hv_pci_vec_irq_domain_free(struct irq_domain *domain,
0812                        unsigned int virq,
0813                        unsigned int nr_irqs)
0814 {
0815     hv_pci_vec_irq_free(domain, virq, nr_irqs, nr_irqs);
0816 }
0817 
0818 static int hv_pci_vec_alloc_device_irq(struct irq_domain *domain,
0819                        unsigned int nr_irqs,
0820                        irq_hw_number_t *hwirq)
0821 {
0822     struct hv_pci_chip_data *chip_data = domain->host_data;
0823     int index;
0824 
0825     /* Find and allocate region from the SPI bitmap */
0826     mutex_lock(&chip_data->map_lock);
0827     index = bitmap_find_free_region(chip_data->spi_map,
0828                     HV_PCI_MSI_SPI_NR,
0829                     get_count_order(nr_irqs));
0830     mutex_unlock(&chip_data->map_lock);
0831     if (index < 0)
0832         return -ENOSPC;
0833 
0834     *hwirq = index + HV_PCI_MSI_SPI_START;
0835 
0836     return 0;
0837 }
0838 
0839 static int hv_pci_vec_irq_gic_domain_alloc(struct irq_domain *domain,
0840                        unsigned int virq,
0841                        irq_hw_number_t hwirq)
0842 {
0843     struct irq_fwspec fwspec;
0844     struct irq_data *d;
0845     int ret;
0846 
0847     fwspec.fwnode = domain->parent->fwnode;
0848     fwspec.param_count = 2;
0849     fwspec.param[0] = hwirq;
0850     fwspec.param[1] = IRQ_TYPE_EDGE_RISING;
0851 
0852     ret = irq_domain_alloc_irqs_parent(domain, virq, 1, &fwspec);
0853     if (ret)
0854         return ret;
0855 
0856     /*
0857      * Since the interrupt specifier is not coming from ACPI or DT, the
0858      * trigger type will need to be set explicitly. Otherwise, it will be
0859      * set to whatever is in the GIC configuration.
0860      */
0861     d = irq_domain_get_irq_data(domain->parent, virq);
0862 
0863     return d->chip->irq_set_type(d, IRQ_TYPE_EDGE_RISING);
0864 }
0865 
0866 static int hv_pci_vec_irq_domain_alloc(struct irq_domain *domain,
0867                        unsigned int virq, unsigned int nr_irqs,
0868                        void *args)
0869 {
0870     irq_hw_number_t hwirq;
0871     unsigned int i;
0872     int ret;
0873 
0874     ret = hv_pci_vec_alloc_device_irq(domain, nr_irqs, &hwirq);
0875     if (ret)
0876         return ret;
0877 
0878     for (i = 0; i < nr_irqs; i++) {
0879         ret = hv_pci_vec_irq_gic_domain_alloc(domain, virq + i,
0880                               hwirq + i);
0881         if (ret) {
0882             hv_pci_vec_irq_free(domain, virq, nr_irqs, i);
0883             return ret;
0884         }
0885 
0886         irq_domain_set_hwirq_and_chip(domain, virq + i,
0887                           hwirq + i,
0888                           &hv_arm64_msi_irq_chip,
0889                           domain->host_data);
0890         pr_debug("pID:%d vID:%u\n", (int)(hwirq + i), virq + i);
0891     }
0892 
0893     return 0;
0894 }
0895 
0896 /*
0897  * Pick the first cpu as the irq affinity that can be temporarily used for
0898  * composing MSI from the hypervisor. GIC will eventually set the right
0899  * affinity for the irq and the 'unmask' will retarget the interrupt to that
0900  * cpu.
0901  */
0902 static int hv_pci_vec_irq_domain_activate(struct irq_domain *domain,
0903                       struct irq_data *irqd, bool reserve)
0904 {
0905     int cpu = cpumask_first(cpu_present_mask);
0906 
0907     irq_data_update_effective_affinity(irqd, cpumask_of(cpu));
0908 
0909     return 0;
0910 }
0911 
0912 static const struct irq_domain_ops hv_pci_domain_ops = {
0913     .alloc  = hv_pci_vec_irq_domain_alloc,
0914     .free   = hv_pci_vec_irq_domain_free,
0915     .activate = hv_pci_vec_irq_domain_activate,
0916 };
0917 
0918 static int hv_pci_irqchip_init(void)
0919 {
0920     static struct hv_pci_chip_data *chip_data;
0921     struct fwnode_handle *fn = NULL;
0922     int ret = -ENOMEM;
0923 
0924     chip_data = kzalloc(sizeof(*chip_data), GFP_KERNEL);
0925     if (!chip_data)
0926         return ret;
0927 
0928     mutex_init(&chip_data->map_lock);
0929     fn = irq_domain_alloc_named_fwnode("hv_vpci_arm64");
0930     if (!fn)
0931         goto free_chip;
0932 
0933     /*
0934      * Once enabled, the IRQ domain should not be removed, since there is
0935      * no way to ensure that all the corresponding devices are also gone
0936      * and that no more interrupts will be generated.
0937      */
0938     hv_msi_gic_irq_domain = acpi_irq_create_hierarchy(0, HV_PCI_MSI_SPI_NR,
0939                               fn, &hv_pci_domain_ops,
0940                               chip_data);
0941 
0942     if (!hv_msi_gic_irq_domain) {
0943         pr_err("Failed to create Hyper-V arm64 vPCI MSI IRQ domain\n");
0944         goto free_chip;
0945     }
0946 
0947     return 0;
0948 
0949 free_chip:
0950     kfree(chip_data);
0951     if (fn)
0952         irq_domain_free_fwnode(fn);
0953 
0954     return ret;
0955 }
0956 
0957 static struct irq_domain *hv_pci_get_root_domain(void)
0958 {
0959     return hv_msi_gic_irq_domain;
0960 }
0961 
0962 /*
0963  * SPIs are used for the interrupts of PCI devices, and SPIs are managed via
0964  * the GICD registers, which Hyper-V already supports, so no hypercall is needed.
0965  */
0966 static void hv_arch_irq_unmask(struct irq_data *data) { }
0967 #endif /* CONFIG_ARM64 */
0968 
0969 /**
0970  * hv_pci_generic_compl() - Invoked for a completion packet
0971  * @context:        Set up by the sender of the packet.
0972  * @resp:       The response packet
0973  * @resp_packet_size:   Size in bytes of the packet
0974  *
0975  * This function is used to trigger an event and report status
0976  * for any message for which the completion packet contains a
0977  * status and nothing else.
0978  */
0979 static void hv_pci_generic_compl(void *context, struct pci_response *resp,
0980                  int resp_packet_size)
0981 {
0982     struct hv_pci_compl *comp_pkt = context;
0983 
0984     comp_pkt->completion_status = resp->status;
0985     complete(&comp_pkt->host_event);
0986 }
0987 
0988 static struct hv_pci_dev *get_pcichild_wslot(struct hv_pcibus_device *hbus,
0989                         u32 wslot);
0990 
0991 static void get_pcichild(struct hv_pci_dev *hpdev)
0992 {
0993     refcount_inc(&hpdev->refs);
0994 }
0995 
0996 static void put_pcichild(struct hv_pci_dev *hpdev)
0997 {
0998     if (refcount_dec_and_test(&hpdev->refs))
0999         kfree(hpdev);
1000 }
1001 
1002 /*
1003  * There is no good way to get notified from vmbus_onoffer_rescind(),
1004  * so let's use polling here, since this is not a hot path.
1005  */
1006 static int wait_for_response(struct hv_device *hdev,
1007                  struct completion *comp)
1008 {
1009     while (true) {
1010         if (hdev->channel->rescind) {
1011             dev_warn_once(&hdev->device, "The device is gone.\n");
1012             return -ENODEV;
1013         }
1014 
1015         if (wait_for_completion_timeout(comp, HZ / 10))
1016             break;
1017     }
1018 
1019     return 0;
1020 }
1021 
1022 /**
1023  * devfn_to_wslot() - Convert from Linux PCI slot to Windows
1024  * @devfn:  The Linux representation of PCI slot
1025  *
1026  * Windows uses a slightly different representation of PCI slot.
1027  *
1028  * Return: The Windows representation
1029  */
1030 static u32 devfn_to_wslot(int devfn)
1031 {
1032     union win_slot_encoding wslot;
1033 
1034     wslot.slot = 0;
1035     wslot.bits.dev = PCI_SLOT(devfn);
1036     wslot.bits.func = PCI_FUNC(devfn);
1037 
1038     return wslot.slot;
1039 }
1040 
1041 /**
1042  * wslot_to_devfn() - Convert from Windows PCI slot to Linux
1043  * @wslot:  The Windows representation of PCI slot
1044  *
1045  * Windows uses a slightly different representation of PCI slot.
1046  *
1047  * Return: The Linux representation
1048  */
1049 static int wslot_to_devfn(u32 wslot)
1050 {
1051     union win_slot_encoding slot_no;
1052 
1053     slot_no.slot = wslot;
1054     return PCI_DEVFN(slot_no.bits.dev, slot_no.bits.func);
1055 }
1056 
1057 /*
1058  * PCI Configuration Space for these root PCI buses is implemented as a pair
1059  * of pages in memory-mapped I/O space.  Writing to the first page chooses
1060  * the PCI function being written or read.  Once the first page has been
1061  * written to, the following page maps in the entire configuration space of
1062  * the function.
1063  */
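/*
 * A minimal sketch of that sequence, ignoring the locking, barriers and the
 * simulated register ranges that the helpers below add: to read a function's
 * Command register, select the function and then access the second page, e.g.
 *
 *	writel(hpdev->desc.win_slot.slot, hbus->cfg_addr);
 *	cmd = readw(hbus->cfg_addr + CFG_PAGE_OFFSET + PCI_COMMAND);
 */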
1064 
1065 /**
1066  * _hv_pcifront_read_config() - Internal PCI config read
1067  * @hpdev:  The PCI driver's representation of the device
1068  * @where:  Offset within config space
1069  * @size:   Size of the transfer
1070  * @val:    Pointer to the buffer receiving the data
1071  */
1072 static void _hv_pcifront_read_config(struct hv_pci_dev *hpdev, int where,
1073                      int size, u32 *val)
1074 {
1075     unsigned long flags;
1076     void __iomem *addr = hpdev->hbus->cfg_addr + CFG_PAGE_OFFSET + where;
1077 
1078     /*
1079      * If the attempt is to read the IDs or the ROM BAR, simulate that.
1080      */
1081     if (where + size <= PCI_COMMAND) {
1082         memcpy(val, ((u8 *)&hpdev->desc.v_id) + where, size);
1083     } else if (where >= PCI_CLASS_REVISION && where + size <=
1084            PCI_CACHE_LINE_SIZE) {
1085         memcpy(val, ((u8 *)&hpdev->desc.rev) + where -
1086                PCI_CLASS_REVISION, size);
1087     } else if (where >= PCI_SUBSYSTEM_VENDOR_ID && where + size <=
1088            PCI_ROM_ADDRESS) {
1089         memcpy(val, (u8 *)&hpdev->desc.subsystem_id + where -
1090                PCI_SUBSYSTEM_VENDOR_ID, size);
1091     } else if (where >= PCI_ROM_ADDRESS && where + size <=
1092            PCI_CAPABILITY_LIST) {
1093         /* ROM BARs are unimplemented */
1094         *val = 0;
1095     } else if (where >= PCI_INTERRUPT_LINE && where + size <=
1096            PCI_INTERRUPT_PIN) {
1097         /*
1098          * Interrupt Line and Interrupt Pin are hard-wired to zero
1099          * because this front-end only supports message-signaled
1100          * interrupts.
1101          */
1102         *val = 0;
1103     } else if (where + size <= CFG_PAGE_SIZE) {
1104         spin_lock_irqsave(&hpdev->hbus->config_lock, flags);
1105         /* Choose the function to be read. (See comment above) */
1106         writel(hpdev->desc.win_slot.slot, hpdev->hbus->cfg_addr);
1107         /* Make sure the function was chosen before we start reading. */
1108         mb();
1109         /* Read from that function's config space. */
1110         switch (size) {
1111         case 1:
1112             *val = readb(addr);
1113             break;
1114         case 2:
1115             *val = readw(addr);
1116             break;
1117         default:
1118             *val = readl(addr);
1119             break;
1120         }
1121         /*
1122          * Make sure the read was done before we release the spinlock
1123          * allowing consecutive reads/writes.
1124          */
1125         mb();
1126         spin_unlock_irqrestore(&hpdev->hbus->config_lock, flags);
1127     } else {
1128         dev_err(&hpdev->hbus->hdev->device,
1129             "Attempt to read beyond a function's config space.\n");
1130     }
1131 }
1132 
1133 static u16 hv_pcifront_get_vendor_id(struct hv_pci_dev *hpdev)
1134 {
1135     u16 ret;
1136     unsigned long flags;
1137     void __iomem *addr = hpdev->hbus->cfg_addr + CFG_PAGE_OFFSET +
1138                  PCI_VENDOR_ID;
1139 
1140     spin_lock_irqsave(&hpdev->hbus->config_lock, flags);
1141 
1142     /* Choose the function to be read. (See comment above) */
1143     writel(hpdev->desc.win_slot.slot, hpdev->hbus->cfg_addr);
1144     /* Make sure the function was chosen before we start reading. */
1145     mb();
1146     /* Read from that function's config space. */
1147     ret = readw(addr);
1148     /*
1149      * mb() is not required here, because the spin_unlock_irqrestore()
1150      * is a barrier.
1151      */
1152 
1153     spin_unlock_irqrestore(&hpdev->hbus->config_lock, flags);
1154 
1155     return ret;
1156 }
1157 
1158 /**
1159  * _hv_pcifront_write_config() - Internal PCI config write
1160  * @hpdev:  The PCI driver's representation of the device
1161  * @where:  Offset within config space
1162  * @size:   Size of the transfer
1163  * @val:    The data being transferred
1164  */
1165 static void _hv_pcifront_write_config(struct hv_pci_dev *hpdev, int where,
1166                       int size, u32 val)
1167 {
1168     unsigned long flags;
1169     void __iomem *addr = hpdev->hbus->cfg_addr + CFG_PAGE_OFFSET + where;
1170 
1171     if (where >= PCI_SUBSYSTEM_VENDOR_ID &&
1172         where + size <= PCI_CAPABILITY_LIST) {
1173         /* SSIDs and ROM BARs are read-only */
1174     } else if (where >= PCI_COMMAND && where + size <= CFG_PAGE_SIZE) {
1175         spin_lock_irqsave(&hpdev->hbus->config_lock, flags);
1176         /* Choose the function to be written. (See comment above) */
1177         writel(hpdev->desc.win_slot.slot, hpdev->hbus->cfg_addr);
1178         /* Make sure the function was chosen before we start writing. */
1179         wmb();
1180         /* Write to that function's config space. */
1181         switch (size) {
1182         case 1:
1183             writeb(val, addr);
1184             break;
1185         case 2:
1186             writew(val, addr);
1187             break;
1188         default:
1189             writel(val, addr);
1190             break;
1191         }
1192         /*
1193          * Make sure the write was done before we release the spinlock
1194          * allowing consecutive reads/writes.
1195          */
1196         mb();
1197         spin_unlock_irqrestore(&hpdev->hbus->config_lock, flags);
1198     } else {
1199         dev_err(&hpdev->hbus->hdev->device,
1200             "Attempt to write beyond a function's config space.\n");
1201     }
1202 }
1203 
1204 /**
1205  * hv_pcifront_read_config() - Read configuration space
1206  * @bus: PCI Bus structure
1207  * @devfn: Device/function
1208  * @where: Offset from base
1209  * @size: Byte/word/dword
1210  * @val: Value to be read
1211  *
1212  * Return: PCIBIOS_SUCCESSFUL on success
1213  *     PCIBIOS_DEVICE_NOT_FOUND on failure
1214  */
1215 static int hv_pcifront_read_config(struct pci_bus *bus, unsigned int devfn,
1216                    int where, int size, u32 *val)
1217 {
1218     struct hv_pcibus_device *hbus =
1219         container_of(bus->sysdata, struct hv_pcibus_device, sysdata);
1220     struct hv_pci_dev *hpdev;
1221 
1222     hpdev = get_pcichild_wslot(hbus, devfn_to_wslot(devfn));
1223     if (!hpdev)
1224         return PCIBIOS_DEVICE_NOT_FOUND;
1225 
1226     _hv_pcifront_read_config(hpdev, where, size, val);
1227 
1228     put_pcichild(hpdev);
1229     return PCIBIOS_SUCCESSFUL;
1230 }
1231 
1232 /**
1233  * hv_pcifront_write_config() - Write configuration space
1234  * @bus: PCI Bus structure
1235  * @devfn: Device/function
1236  * @where: Offset from base
1237  * @size: Byte/word/dword
1238  * @val: Value to be written to device
1239  *
1240  * Return: PCIBIOS_SUCCESSFUL on success
1241  *     PCIBIOS_DEVICE_NOT_FOUND on failure
1242  */
1243 static int hv_pcifront_write_config(struct pci_bus *bus, unsigned int devfn,
1244                     int where, int size, u32 val)
1245 {
1246     struct hv_pcibus_device *hbus =
1247         container_of(bus->sysdata, struct hv_pcibus_device, sysdata);
1248     struct hv_pci_dev *hpdev;
1249 
1250     hpdev = get_pcichild_wslot(hbus, devfn_to_wslot(devfn));
1251     if (!hpdev)
1252         return PCIBIOS_DEVICE_NOT_FOUND;
1253 
1254     _hv_pcifront_write_config(hpdev, where, size, val);
1255 
1256     put_pcichild(hpdev);
1257     return PCIBIOS_SUCCESSFUL;
1258 }
1259 
1260 /* PCIe operations */
1261 static struct pci_ops hv_pcifront_ops = {
1262     .read  = hv_pcifront_read_config,
1263     .write = hv_pcifront_write_config,
1264 };
1265 
1266 /*
1267  * Paravirtual backchannel
1268  *
1269  * Hyper-V SR-IOV provides a backchannel mechanism in software for
1270  * communication between a VF driver and a PF driver.  These
1271  * "configuration blocks" are similar in concept to PCI configuration space,
1272  * but instead of doing reads and writes in 32-bit chunks through a very slow
1273  * path, packets of up to 128 bytes can be sent or received asynchronously.
1274  *
1275  * Nearly every SR-IOV device contains just such a communications channel in
1276  * hardware, so using this one in software is usually optional.  Using the
1277  * software channel, however, allows driver implementers to leverage software
1278  * tools that fuzz the communications channel looking for vulnerabilities.
1279  *
1280  * The usage model for these packets puts the responsibility for reading or
1281  * writing on the VF driver.  The VF driver sends a read or a write packet,
1282  * indicating which "block" is being referred to by number.
1283  *
1284  * If the PF driver wishes to initiate communication, it can "invalidate" one or
1285  * more of the first 64 blocks.  This invalidation is delivered via a callback
1286  * supplied to this driver by the VF driver.
1287  *
1288  * No protocol is implied, except that supplied by the PF and VF drivers.
1289  */
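/*
 * Putting the pieces together, the expected flow is roughly: the VF driver
 * reads a block with hv_read_config_block(pdev, buf, len, block_id,
 * &bytes_returned), writes one with hv_write_config_block(pdev, buf, len,
 * block_id), and registers a block_invalidate() callback through
 * hv_register_block_invalidate() so that it is notified, via a bit mask of
 * block IDs, when the PF driver invalidates any of the first 64 blocks.
 * These helpers are static, so in practice VF drivers reach them indirectly
 * through a separate interface layer rather than calling them here directly.
 */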
1290 
1291 struct hv_read_config_compl {
1292     struct hv_pci_compl comp_pkt;
1293     void *buf;
1294     unsigned int len;
1295     unsigned int bytes_returned;
1296 };
1297 
1298 /**
1299  * hv_pci_read_config_compl() - Invoked when a response packet
1300  * for a read config block operation arrives.
1301  * @context:        Identifies the read config operation
1302  * @resp:       The response packet itself
1303  * @resp_packet_size:   Size in bytes of the response packet
1304  */
1305 static void hv_pci_read_config_compl(void *context, struct pci_response *resp,
1306                      int resp_packet_size)
1307 {
1308     struct hv_read_config_compl *comp = context;
1309     struct pci_read_block_response *read_resp =
1310         (struct pci_read_block_response *)resp;
1311     unsigned int data_len, hdr_len;
1312 
1313     hdr_len = offsetof(struct pci_read_block_response, bytes);
1314     if (resp_packet_size < hdr_len) {
1315         comp->comp_pkt.completion_status = -1;
1316         goto out;
1317     }
1318 
1319     data_len = resp_packet_size - hdr_len;
1320     if (data_len > 0 && read_resp->status == 0) {
1321         comp->bytes_returned = min(comp->len, data_len);
1322         memcpy(comp->buf, read_resp->bytes, comp->bytes_returned);
1323     } else {
1324         comp->bytes_returned = 0;
1325     }
1326 
1327     comp->comp_pkt.completion_status = read_resp->status;
1328 out:
1329     complete(&comp->comp_pkt.host_event);
1330 }
1331 
1332 /**
1333  * hv_read_config_block() - Sends a read config block request to
1334  * the back-end driver running in the Hyper-V parent partition.
1335  * @pdev:       The PCI driver's representation for this device.
1336  * @buf:        Buffer into which the config block will be copied.
1337  * @len:        Size in bytes of buf.
1338  * @block_id:       Identifies the config block which has been requested.
1339  * @bytes_returned: Size which came back from the back-end driver.
1340  *
1341  * Return: 0 on success, -errno on failure
1342  */
1343 static int hv_read_config_block(struct pci_dev *pdev, void *buf,
1344                 unsigned int len, unsigned int block_id,
1345                 unsigned int *bytes_returned)
1346 {
1347     struct hv_pcibus_device *hbus =
1348         container_of(pdev->bus->sysdata, struct hv_pcibus_device,
1349                  sysdata);
1350     struct {
1351         struct pci_packet pkt;
1352         char buf[sizeof(struct pci_read_block)];
1353     } pkt;
1354     struct hv_read_config_compl comp_pkt;
1355     struct pci_read_block *read_blk;
1356     int ret;
1357 
1358     if (len == 0 || len > HV_CONFIG_BLOCK_SIZE_MAX)
1359         return -EINVAL;
1360 
1361     init_completion(&comp_pkt.comp_pkt.host_event);
1362     comp_pkt.buf = buf;
1363     comp_pkt.len = len;
1364 
1365     memset(&pkt, 0, sizeof(pkt));
1366     pkt.pkt.completion_func = hv_pci_read_config_compl;
1367     pkt.pkt.compl_ctxt = &comp_pkt;
1368     read_blk = (struct pci_read_block *)&pkt.pkt.message;
1369     read_blk->message_type.type = PCI_READ_BLOCK;
1370     read_blk->wslot.slot = devfn_to_wslot(pdev->devfn);
1371     read_blk->block_id = block_id;
1372     read_blk->bytes_requested = len;
1373 
1374     ret = vmbus_sendpacket(hbus->hdev->channel, read_blk,
1375                    sizeof(*read_blk), (unsigned long)&pkt.pkt,
1376                    VM_PKT_DATA_INBAND,
1377                    VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
1378     if (ret)
1379         return ret;
1380 
1381     ret = wait_for_response(hbus->hdev, &comp_pkt.comp_pkt.host_event);
1382     if (ret)
1383         return ret;
1384 
1385     if (comp_pkt.comp_pkt.completion_status != 0 ||
1386         comp_pkt.bytes_returned == 0) {
1387         dev_err(&hbus->hdev->device,
1388             "Read Config Block failed: 0x%x, bytes_returned=%d\n",
1389             comp_pkt.comp_pkt.completion_status,
1390             comp_pkt.bytes_returned);
1391         return -EIO;
1392     }
1393 
1394     *bytes_returned = comp_pkt.bytes_returned;
1395     return 0;
1396 }
1397 
1398 /**
1399  * hv_pci_write_config_compl() - Invoked when a response packet for a write
1400  * config block operation arrives.
1401  * @context:        Identifies the write config operation
1402  * @resp:       The response packet itself
1403  * @resp_packet_size:   Size in bytes of the response packet
1404  */
1405 static void hv_pci_write_config_compl(void *context, struct pci_response *resp,
1406                       int resp_packet_size)
1407 {
1408     struct hv_pci_compl *comp_pkt = context;
1409 
1410     comp_pkt->completion_status = resp->status;
1411     complete(&comp_pkt->host_event);
1412 }
1413 
1414 /**
1415  * hv_write_config_block() - Sends a write config block request to the
1416  * back-end driver running in the Hyper-V parent partition.
1417  * @pdev:       The PCI driver's representation for this device.
1418  * @buf:        Buffer from which the config block will be copied.
1419  * @len:        Size in bytes of buf.
1420  * @block_id:       Identifies the config block which is being written.
1421  *
1422  * Return: 0 on success, -errno on failure
1423  */
1424 static int hv_write_config_block(struct pci_dev *pdev, void *buf,
1425                 unsigned int len, unsigned int block_id)
1426 {
1427     struct hv_pcibus_device *hbus =
1428         container_of(pdev->bus->sysdata, struct hv_pcibus_device,
1429                  sysdata);
1430     struct {
1431         struct pci_packet pkt;
1432         char buf[sizeof(struct pci_write_block)];
1433         u32 reserved;
1434     } pkt;
1435     struct hv_pci_compl comp_pkt;
1436     struct pci_write_block *write_blk;
1437     u32 pkt_size;
1438     int ret;
1439 
1440     if (len == 0 || len > HV_CONFIG_BLOCK_SIZE_MAX)
1441         return -EINVAL;
1442 
1443     init_completion(&comp_pkt.host_event);
1444 
1445     memset(&pkt, 0, sizeof(pkt));
1446     pkt.pkt.completion_func = hv_pci_write_config_compl;
1447     pkt.pkt.compl_ctxt = &comp_pkt;
1448     write_blk = (struct pci_write_block *)&pkt.pkt.message;
1449     write_blk->message_type.type = PCI_WRITE_BLOCK;
1450     write_blk->wslot.slot = devfn_to_wslot(pdev->devfn);
1451     write_blk->block_id = block_id;
1452     write_blk->byte_count = len;
1453     memcpy(write_blk->bytes, buf, len);
1454     pkt_size = offsetof(struct pci_write_block, bytes) + len;
1455     /*
1456      * This quirk is required on some hosts shipped around 2018, because
1457      * these hosts don't check the pkt_size correctly (new hosts have been
1458      * fixed since early 2019). The quirk is also safe on very old hosts
1459      * and new hosts, because, on them, what really matters is the length
1460      * specified in write_blk->byte_count.
1461      */
1462     pkt_size += sizeof(pkt.reserved);
1463 
1464     ret = vmbus_sendpacket(hbus->hdev->channel, write_blk, pkt_size,
1465                    (unsigned long)&pkt.pkt, VM_PKT_DATA_INBAND,
1466                    VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
1467     if (ret)
1468         return ret;
1469 
1470     ret = wait_for_response(hbus->hdev, &comp_pkt.host_event);
1471     if (ret)
1472         return ret;
1473 
1474     if (comp_pkt.completion_status != 0) {
1475         dev_err(&hbus->hdev->device,
1476             "Write Config Block failed: 0x%x\n",
1477             comp_pkt.completion_status);
1478         return -EIO;
1479     }
1480 
1481     return 0;
1482 }
1483 
1484 /**
1485  * hv_register_block_invalidate() - Invoked when a config block invalidation
1486  * arrives from the back-end driver.
1487  * @pdev:       The PCI driver's representation for this device.
1488  * @context:        Identifies the device.
1489  * @block_invalidate:   Identifies all of the blocks being invalidated.
1490  *
1491  * Return: 0 on success, -errno on failure
1492  */
1493 static int hv_register_block_invalidate(struct pci_dev *pdev, void *context,
1494                     void (*block_invalidate)(void *context,
1495                                  u64 block_mask))
1496 {
1497     struct hv_pcibus_device *hbus =
1498         container_of(pdev->bus->sysdata, struct hv_pcibus_device,
1499                  sysdata);
1500     struct hv_pci_dev *hpdev;
1501 
1502     hpdev = get_pcichild_wslot(hbus, devfn_to_wslot(pdev->devfn));
1503     if (!hpdev)
1504         return -ENODEV;
1505 
1506     hpdev->block_invalidate = block_invalidate;
1507     hpdev->invalidate_context = context;
1508 
1509     put_pcichild(hpdev);
1510     return 0;
1511 
1512 }
1513 
1514 /* Interrupt management hooks */
1515 static void hv_int_desc_free(struct hv_pci_dev *hpdev,
1516                  struct tran_int_desc *int_desc)
1517 {
1518     struct pci_delete_interrupt *int_pkt;
1519     struct {
1520         struct pci_packet pkt;
1521         u8 buffer[sizeof(struct pci_delete_interrupt)];
1522     } ctxt;
1523 
1524     if (!int_desc->vector_count) {
1525         kfree(int_desc);
1526         return;
1527     }
1528     memset(&ctxt, 0, sizeof(ctxt));
1529     int_pkt = (struct pci_delete_interrupt *)&ctxt.pkt.message;
1530     int_pkt->message_type.type =
1531         PCI_DELETE_INTERRUPT_MESSAGE;
1532     int_pkt->wslot.slot = hpdev->desc.win_slot.slot;
1533     int_pkt->int_desc = *int_desc;
1534     vmbus_sendpacket(hpdev->hbus->hdev->channel, int_pkt, sizeof(*int_pkt),
1535              0, VM_PKT_DATA_INBAND, 0);
1536     kfree(int_desc);
1537 }
1538 
1539 /**
1540  * hv_msi_free() - Free the MSI.
1541  * @domain: The interrupt domain pointer
1542  * @info:   Extra MSI-related context
1543  * @irq:    Identifies the IRQ.
1544  *
1545  * The Hyper-V parent partition and hypervisor are tracking the
1546  * messages that are in use, keeping the interrupt redirection
1547  * table up to date.  This callback sends a message that frees
1548  * the IRT entry and related tracking nonsense.
1549  */
1550 static void hv_msi_free(struct irq_domain *domain, struct msi_domain_info *info,
1551             unsigned int irq)
1552 {
1553     struct hv_pcibus_device *hbus;
1554     struct hv_pci_dev *hpdev;
1555     struct pci_dev *pdev;
1556     struct tran_int_desc *int_desc;
1557     struct irq_data *irq_data = irq_domain_get_irq_data(domain, irq);
1558     struct msi_desc *msi = irq_data_get_msi_desc(irq_data);
1559 
1560     pdev = msi_desc_to_pci_dev(msi);
1561     hbus = info->data;
1562     int_desc = irq_data_get_irq_chip_data(irq_data);
1563     if (!int_desc)
1564         return;
1565 
1566     irq_data->chip_data = NULL;
1567     hpdev = get_pcichild_wslot(hbus, devfn_to_wslot(pdev->devfn));
1568     if (!hpdev) {
1569         kfree(int_desc);
1570         return;
1571     }
1572 
1573     hv_int_desc_free(hpdev, int_desc);
1574     put_pcichild(hpdev);
1575 }
1576 
1577 static void hv_irq_mask(struct irq_data *data)
1578 {
1579     pci_msi_mask_irq(data);
1580     if (data->parent_data->chip->irq_mask)
1581         irq_chip_mask_parent(data);
1582 }
1583 
1584 static void hv_irq_unmask(struct irq_data *data)
1585 {
1586     hv_arch_irq_unmask(data);
1587 
1588     if (data->parent_data->chip->irq_unmask)
1589         irq_chip_unmask_parent(data);
1590     pci_msi_unmask_irq(data);
1591 }
1592 
1593 struct compose_comp_ctxt {
1594     struct hv_pci_compl comp_pkt;
1595     struct tran_int_desc int_desc;
1596 };
1597 
1598 static void hv_pci_compose_compl(void *context, struct pci_response *resp,
1599                  int resp_packet_size)
1600 {
1601     struct compose_comp_ctxt *comp_pkt = context;
1602     struct pci_create_int_response *int_resp =
1603         (struct pci_create_int_response *)resp;
1604 
1605     if (resp_packet_size < sizeof(*int_resp)) {
1606         comp_pkt->comp_pkt.completion_status = -1;
1607         goto out;
1608     }
1609     comp_pkt->comp_pkt.completion_status = resp->status;
1610     comp_pkt->int_desc = int_resp->int_desc;
1611 out:
1612     complete(&comp_pkt->comp_pkt.host_event);
1613 }
1614 
1615 static u32 hv_compose_msi_req_v1(
1616     struct pci_create_interrupt *int_pkt, const struct cpumask *affinity,
1617     u32 slot, u8 vector, u8 vector_count)
1618 {
1619     int_pkt->message_type.type = PCI_CREATE_INTERRUPT_MESSAGE;
1620     int_pkt->wslot.slot = slot;
1621     int_pkt->int_desc.vector = vector;
1622     int_pkt->int_desc.vector_count = vector_count;
1623     int_pkt->int_desc.delivery_mode = DELIVERY_MODE;
1624 
1625     /*
1626      * Create MSI w/ dummy vCPU set, overwritten by subsequent retarget in
1627      * hv_irq_unmask().
1628      */
1629     int_pkt->int_desc.cpu_mask = CPU_AFFINITY_ALL;
1630 
1631     return sizeof(*int_pkt);
1632 }
1633 
1634 /*
1635  * Create MSI w/ dummy vCPU set targeting just one vCPU, overwritten
1636  * by subsequent retarget in hv_irq_unmask().
1637  */
1638 static int hv_compose_msi_req_get_cpu(const struct cpumask *affinity)
1639 {
1640     return cpumask_first_and(affinity, cpu_online_mask);
1641 }
1642 
1643 static u32 hv_compose_msi_req_v2(
1644     struct pci_create_interrupt2 *int_pkt, const struct cpumask *affinity,
1645     u32 slot, u8 vector, u8 vector_count)
1646 {
1647     int cpu;
1648 
1649     int_pkt->message_type.type = PCI_CREATE_INTERRUPT_MESSAGE2;
1650     int_pkt->wslot.slot = slot;
1651     int_pkt->int_desc.vector = vector;
1652     int_pkt->int_desc.vector_count = vector_count;
1653     int_pkt->int_desc.delivery_mode = DELIVERY_MODE;
1654     cpu = hv_compose_msi_req_get_cpu(affinity);
1655     int_pkt->int_desc.processor_array[0] =
1656         hv_cpu_number_to_vp_number(cpu);
1657     int_pkt->int_desc.processor_count = 1;
1658 
1659     return sizeof(*int_pkt);
1660 }
1661 
1662 static u32 hv_compose_msi_req_v3(
1663     struct pci_create_interrupt3 *int_pkt, const struct cpumask *affinity,
1664     u32 slot, u32 vector, u8 vector_count)
1665 {
1666     int cpu;
1667 
1668     int_pkt->message_type.type = PCI_CREATE_INTERRUPT_MESSAGE3;
1669     int_pkt->wslot.slot = slot;
1670     int_pkt->int_desc.vector = vector;
1671     int_pkt->int_desc.reserved = 0;
1672     int_pkt->int_desc.vector_count = vector_count;
1673     int_pkt->int_desc.delivery_mode = DELIVERY_MODE;
1674     cpu = hv_compose_msi_req_get_cpu(affinity);
1675     int_pkt->int_desc.processor_array[0] =
1676         hv_cpu_number_to_vp_number(cpu);
1677     int_pkt->int_desc.processor_count = 1;
1678 
1679     return sizeof(*int_pkt);
1680 }
1681 
1682 /**
1683  * hv_compose_msi_msg() - Supplies a valid MSI address/data
1684  * @data:   Everything about this MSI
1685  * @msg:    Buffer that is filled in by this function
1686  *
1687  * This function unpacks the IRQ, looking for the target CPU set, IDT
1688  * vector and mode, and sends a message to the parent partition
1689  * asking for a mapping for that tuple in this partition.  The
1690  * response supplies a data value and address to which that data
1691  * should be written to trigger that interrupt.
1692  */
1693 static void hv_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
1694 {
1695     struct hv_pcibus_device *hbus;
1696     struct vmbus_channel *channel;
1697     struct hv_pci_dev *hpdev;
1698     struct pci_bus *pbus;
1699     struct pci_dev *pdev;
1700     const struct cpumask *dest;
1701     struct compose_comp_ctxt comp;
1702     struct tran_int_desc *int_desc;
1703     struct msi_desc *msi_desc;
1704     u8 vector, vector_count;
1705     struct {
1706         struct pci_packet pci_pkt;
1707         union {
1708             struct pci_create_interrupt v1;
1709             struct pci_create_interrupt2 v2;
1710             struct pci_create_interrupt3 v3;
1711         } int_pkts;
1712     } __packed ctxt;
1713     u64 trans_id;
1714     u32 size;
1715     int ret;
1716 
1717     /* Reuse the previous allocation */
1718     if (data->chip_data) {
1719         int_desc = data->chip_data;
1720         msg->address_hi = int_desc->address >> 32;
1721         msg->address_lo = int_desc->address & 0xffffffff;
1722         msg->data = int_desc->data;
1723         return;
1724     }
1725 
1726     msi_desc  = irq_data_get_msi_desc(data);
1727     pdev = msi_desc_to_pci_dev(msi_desc);
1728     dest = irq_data_get_effective_affinity_mask(data);
1729     pbus = pdev->bus;
1730     hbus = container_of(pbus->sysdata, struct hv_pcibus_device, sysdata);
1731     channel = hbus->hdev->channel;
1732     hpdev = get_pcichild_wslot(hbus, devfn_to_wslot(pdev->devfn));
1733     if (!hpdev)
1734         goto return_null_message;
1735 
1736     int_desc = kzalloc(sizeof(*int_desc), GFP_ATOMIC);
1737     if (!int_desc)
1738         goto drop_reference;
1739 
1740     if (!msi_desc->pci.msi_attrib.is_msix && msi_desc->nvec_used > 1) {
1741         /*
1742          * If this is not the first MSI of Multi MSI, we already have
1743          * a mapping.  Can exit early.
1744          */
1745         if (msi_desc->irq != data->irq) {
1746             data->chip_data = int_desc;
1747             int_desc->address = msi_desc->msg.address_lo |
1748                         (u64)msi_desc->msg.address_hi << 32;
1749             int_desc->data = msi_desc->msg.data +
1750                      (data->irq - msi_desc->irq);
1751             msg->address_hi = msi_desc->msg.address_hi;
1752             msg->address_lo = msi_desc->msg.address_lo;
1753             msg->data = int_desc->data;
1754             put_pcichild(hpdev);
1755             return;
1756         }
1757         /*
1758          * The vector we select here is a dummy value.  The correct
1759          * value gets sent to the hypervisor in unmask().  This needs
1760          * to be aligned with the count, and also not zero.  Multi-msi
1761          * is powers of 2 up to 32, so 32 will always work here.
1762          */
1763         vector = 32;
1764         vector_count = msi_desc->nvec_used;
1765     } else {
1766         vector = hv_msi_get_int_vector(data);
1767         vector_count = 1;
1768     }
1769 
1770     memset(&ctxt, 0, sizeof(ctxt));
1771     init_completion(&comp.comp_pkt.host_event);
1772     ctxt.pci_pkt.completion_func = hv_pci_compose_compl;
1773     ctxt.pci_pkt.compl_ctxt = &comp;
1774 
1775     switch (hbus->protocol_version) {
1776     case PCI_PROTOCOL_VERSION_1_1:
1777         size = hv_compose_msi_req_v1(&ctxt.int_pkts.v1,
1778                     dest,
1779                     hpdev->desc.win_slot.slot,
1780                     vector,
1781                     vector_count);
1782         break;
1783 
1784     case PCI_PROTOCOL_VERSION_1_2:
1785     case PCI_PROTOCOL_VERSION_1_3:
1786         size = hv_compose_msi_req_v2(&ctxt.int_pkts.v2,
1787                     dest,
1788                     hpdev->desc.win_slot.slot,
1789                     vector,
1790                     vector_count);
1791         break;
1792 
1793     case PCI_PROTOCOL_VERSION_1_4:
1794         size = hv_compose_msi_req_v3(&ctxt.int_pkts.v3,
1795                     dest,
1796                     hpdev->desc.win_slot.slot,
1797                     vector,
1798                     vector_count);
1799         break;
1800 
1801     default:
1802         /* As we only negotiate protocol versions known to this driver,
1803          * this path should never hit. However, this is it not a hot
1804          * path so we print a message to aid future updates.
1805          */
1806         dev_err(&hbus->hdev->device,
1807             "Unexpected vPCI protocol, update driver.");
1808         goto free_int_desc;
1809     }
1810 
1811     ret = vmbus_sendpacket_getid(hpdev->hbus->hdev->channel, &ctxt.int_pkts,
1812                      size, (unsigned long)&ctxt.pci_pkt,
1813                      &trans_id, VM_PKT_DATA_INBAND,
1814                      VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
1815     if (ret) {
1816         dev_err(&hbus->hdev->device,
1817             "Sending request for interrupt failed: 0x%x",
1818             comp.comp_pkt.completion_status);
1819         goto free_int_desc;
1820     }
1821 
1822     /*
1823      * Prevents hv_pci_onchannelcallback() from running concurrently
1824      * in the tasklet.
1825      */
1826     tasklet_disable_in_atomic(&channel->callback_event);
1827 
1828     /*
1829      * Since this function is called with IRQ locks held, can't
1830      * do normal wait for completion; instead poll.
1831      */
1832     while (!try_wait_for_completion(&comp.comp_pkt.host_event)) {
1833         unsigned long flags;
1834 
1835         /* 0xFFFF means an invalid PCI VENDOR ID. */
1836         if (hv_pcifront_get_vendor_id(hpdev) == 0xFFFF) {
1837             dev_err_once(&hbus->hdev->device,
1838                      "the device has gone\n");
1839             goto enable_tasklet;
1840         }
1841 
1842         /*
1843          * Make sure that the ring buffer data structure doesn't get
1844          * freed while we dereference the ring buffer pointer.  Test
1845          * for the channel's onchannel_callback being NULL within a
1846          * sched_lock critical section.  See also the inline comments
1847          * in vmbus_reset_channel_cb().
1848          */
1849         spin_lock_irqsave(&channel->sched_lock, flags);
1850         if (unlikely(channel->onchannel_callback == NULL)) {
1851             spin_unlock_irqrestore(&channel->sched_lock, flags);
1852             goto enable_tasklet;
1853         }
1854         hv_pci_onchannelcallback(hbus);
1855         spin_unlock_irqrestore(&channel->sched_lock, flags);
1856 
1857         if (hpdev->state == hv_pcichild_ejecting) {
1858             dev_err_once(&hbus->hdev->device,
1859                      "the device is being ejected\n");
1860             goto enable_tasklet;
1861         }
1862 
1863         udelay(100);
1864     }
1865 
1866     tasklet_enable(&channel->callback_event);
1867 
1868     if (comp.comp_pkt.completion_status < 0) {
1869         dev_err(&hbus->hdev->device,
1870             "Request for interrupt failed: 0x%x",
1871             comp.comp_pkt.completion_status);
1872         goto free_int_desc;
1873     }
1874 
1875     /*
1876      * Record the assignment so that this can be unwound later. Using
1877      * irq_set_chip_data() here would be appropriate, but the lock it takes
1878      * is already held.
1879      */
1880     *int_desc = comp.int_desc;
1881     data->chip_data = int_desc;
1882 
1883     /* Pass up the result. */
1884     msg->address_hi = comp.int_desc.address >> 32;
1885     msg->address_lo = comp.int_desc.address & 0xffffffff;
1886     msg->data = comp.int_desc.data;
1887 
1888     put_pcichild(hpdev);
1889     return;
1890 
1891 enable_tasklet:
1892     tasklet_enable(&channel->callback_event);
1893     /*
1894      * The completion packet on the stack becomes invalid after 'return';
1895      * remove the ID from the VMbus requestor if the identifier is still
1896      * mapped to/associated with the packet.  (The identifier could have
1897      * been 're-used', i.e., already removed and (re-)mapped.)
1898      *
1899      * Cf. hv_pci_onchannelcallback().
1900      */
1901     vmbus_request_addr_match(channel, trans_id, (unsigned long)&ctxt.pci_pkt);
1902 free_int_desc:
1903     kfree(int_desc);
1904 drop_reference:
1905     put_pcichild(hpdev);
1906 return_null_message:
1907     msg->address_hi = 0;
1908     msg->address_lo = 0;
1909     msg->data = 0;
1910 }
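
/*
 * Worked example (hypothetical values) for the Multi-MSI early-exit path
 * above: if a device was granted a block of 4 messages with base Linux IRQ
 * 30, and the host returned address 0xfee00000 / data 0x40 for the first
 * message, then composing the message for data->irq == 32 reuses the stored
 * address and computes
 *
 *	msg->data = 0x40 + (32 - 30) = 0x42
 *
 * i.e. each interrupt in the block uses the base data value plus its offset
 * from the first IRQ of the group.
 */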
1911 
1912 /* HW Interrupt Chip Descriptor */
1913 static struct irq_chip hv_msi_irq_chip = {
1914     .name           = "Hyper-V PCIe MSI",
1915     .irq_compose_msi_msg    = hv_compose_msi_msg,
1916     .irq_set_affinity   = irq_chip_set_affinity_parent,
1917 #ifdef CONFIG_X86
1918     .irq_ack        = irq_chip_ack_parent,
1919 #elif defined(CONFIG_ARM64)
1920     .irq_eoi        = irq_chip_eoi_parent,
1921 #endif
1922     .irq_mask       = hv_irq_mask,
1923     .irq_unmask     = hv_irq_unmask,
1924 };
1925 
1926 static struct msi_domain_ops hv_msi_ops = {
1927     .msi_prepare    = hv_msi_prepare,
1928     .msi_free   = hv_msi_free,
1929 };
1930 
1931 /**
1932  * hv_pcie_init_irq_domain() - Initialize IRQ domain
1933  * @hbus:   The root PCI bus
1934  *
1935  * This function creates an IRQ domain which will be used for
1936  * interrupts from devices that have been passed through.  These
1937  * devices only support MSI and MSI-X, not line-based interrupts
1938  * or simulations of line-based interrupts through PCIe's
1939  * fabric-layer messages.  Because interrupts are remapped, we
1940  * can support multi-message MSI here.
1941  *
1942  * Return: '0' on success and error value on failure
1943  */
1944 static int hv_pcie_init_irq_domain(struct hv_pcibus_device *hbus)
1945 {
1946     hbus->msi_info.chip = &hv_msi_irq_chip;
1947     hbus->msi_info.ops = &hv_msi_ops;
1948     hbus->msi_info.flags = (MSI_FLAG_USE_DEF_DOM_OPS |
1949         MSI_FLAG_USE_DEF_CHIP_OPS | MSI_FLAG_MULTI_PCI_MSI |
1950         MSI_FLAG_PCI_MSIX);
1951     hbus->msi_info.handler = FLOW_HANDLER;
1952     hbus->msi_info.handler_name = FLOW_NAME;
1953     hbus->msi_info.data = hbus;
1954     hbus->irq_domain = pci_msi_create_irq_domain(hbus->fwnode,
1955                              &hbus->msi_info,
1956                              hv_pci_get_root_domain());
1957     if (!hbus->irq_domain) {
1958         dev_err(&hbus->hdev->device,
1959             "Failed to build an MSI IRQ domain\n");
1960         return -ENODEV;
1961     }
1962 
1963     dev_set_msi_domain(&hbus->bridge->dev, hbus->irq_domain);
1964 
1965     return 0;
1966 }
1967 
1968 /**
1969  * get_bar_size() - Get the address space consumed by a BAR
1970  * @bar_val:    Value that a BAR returned after -1 was written
1971  *              to it.
1972  *
1973  * This function returns the size of the BAR, rounded up to 1
1974  * page.  It has to be rounded up because the hypervisor's page
1975  * table entry that maps the BAR into the VM can't specify an
1976  * offset within a page.  The invariant is that the hypervisor
1977  * must place any BAR smaller than a page at the beginning of
1978  * a page.
1979  *
1980  * Return:  Size in bytes of the consumed MMIO space.
1981  */
1982 static u64 get_bar_size(u64 bar_val)
1983 {
1984     return round_up((1 + ~(bar_val & PCI_BASE_ADDRESS_MEM_MASK)),
1985             PAGE_SIZE);
1986 }
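
/*
 * Worked example (hypothetical values): a 32-bit memory BAR that reads back
 * 0xffff0000 after all ones were written to it is extended by the callers to
 * 0xffffffffffff0000, so
 *
 *	1 + ~(bar_val & PCI_BASE_ADDRESS_MEM_MASK) = 0x10000
 *
 * i.e. 64 KiB, which round_up() leaves unchanged.  A 1 KiB BAR (probed value
 * 0xfffffc00) would yield 0x400 and be rounded up to PAGE_SIZE.
 */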
1987 
1988 /**
1989  * survey_child_resources() - Total all MMIO requirements
1990  * @hbus:   Root PCI bus, as understood by this driver
1991  */
1992 static void survey_child_resources(struct hv_pcibus_device *hbus)
1993 {
1994     struct hv_pci_dev *hpdev;
1995     resource_size_t bar_size = 0;
1996     unsigned long flags;
1997     struct completion *event;
1998     u64 bar_val;
1999     int i;
2000 
2001     /* If nobody is waiting on the answer, don't compute it. */
2002     event = xchg(&hbus->survey_event, NULL);
2003     if (!event)
2004         return;
2005 
2006     /* If the answer has already been computed, go with it. */
2007     if (hbus->low_mmio_space || hbus->high_mmio_space) {
2008         complete(event);
2009         return;
2010     }
2011 
2012     spin_lock_irqsave(&hbus->device_list_lock, flags);
2013 
2014     /*
2015      * Due to an interesting quirk of the PCI spec, all memory regions
2016      * for a child device are a power of 2 in size and aligned in memory,
2017      * so it's sufficient to just add them up without tracking alignment.
2018      */
2019     list_for_each_entry(hpdev, &hbus->children, list_entry) {
2020         for (i = 0; i < PCI_STD_NUM_BARS; i++) {
2021             if (hpdev->probed_bar[i] & PCI_BASE_ADDRESS_SPACE_IO)
2022                 dev_err(&hbus->hdev->device,
2023                     "There's an I/O BAR in this list!\n");
2024 
2025             if (hpdev->probed_bar[i] != 0) {
2026                 /*
2027                  * A probed BAR has all the upper bits set that
2028                  * can be changed.
2029                  */
2030 
2031                 bar_val = hpdev->probed_bar[i];
2032                 if (bar_val & PCI_BASE_ADDRESS_MEM_TYPE_64)
2033                     bar_val |=
2034                     ((u64)hpdev->probed_bar[++i] << 32);
2035                 else
2036                     bar_val |= 0xffffffff00000000ULL;
2037 
2038                 bar_size = get_bar_size(bar_val);
2039 
2040                 if (bar_val & PCI_BASE_ADDRESS_MEM_TYPE_64)
2041                     hbus->high_mmio_space += bar_size;
2042                 else
2043                     hbus->low_mmio_space += bar_size;
2044             }
2045         }
2046     }
2047 
2048     spin_unlock_irqrestore(&hbus->device_list_lock, flags);
2049     complete(event);
2050 }
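
/*
 * Worked example (hypothetical device): a child with one 4 KiB 32-bit BAR
 * and one 64 KiB 64-bit BAR adds 4 KiB to low_mmio_space and 64 KiB to
 * high_mmio_space; a second identical child simply doubles both totals.
 * Because every BAR is a power of two in size, these running sums are all
 * that hv_pci_allocate_bridge_windows() and prepopulate_bars() need later.
 */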
2051 
2052 /**
2053  * prepopulate_bars() - Fill in BARs with defaults
2054  * @hbus:   Root PCI bus, as understood by this driver
2055  *
2056  * The core PCI driver code seems much, much happier if the BARs
2057  * for a device have values upon first scan. So fill them in.
2058  * The algorithm below works down from large sizes to small,
2059  * attempting to pack the assignments optimally. The assumption,
2060  * enforced in other parts of the code, is that the beginning of
2061  * the memory-mapped I/O space will be aligned on the largest
2062  * BAR size.
2063  */
2064 static void prepopulate_bars(struct hv_pcibus_device *hbus)
2065 {
2066     resource_size_t high_size = 0;
2067     resource_size_t low_size = 0;
2068     resource_size_t high_base = 0;
2069     resource_size_t low_base = 0;
2070     resource_size_t bar_size;
2071     struct hv_pci_dev *hpdev;
2072     unsigned long flags;
2073     u64 bar_val;
2074     u32 command;
2075     bool high;
2076     int i;
2077 
2078     if (hbus->low_mmio_space) {
2079         low_size = 1ULL << (63 - __builtin_clzll(hbus->low_mmio_space));
2080         low_base = hbus->low_mmio_res->start;
2081     }
2082 
2083     if (hbus->high_mmio_space) {
2084         high_size = 1ULL <<
2085             (63 - __builtin_clzll(hbus->high_mmio_space));
2086         high_base = hbus->high_mmio_res->start;
2087     }
2088 
2089     spin_lock_irqsave(&hbus->device_list_lock, flags);
2090 
2091     /*
2092      * Clear the memory enable bit, in case it's already set. This occurs
2093      * in the suspend path of hibernation, where the device is suspended,
2094      * resumed and suspended again: see hibernation_snapshot() and
2095      * hibernation_platform_enter().
2096      *
2097      * If the memory enable bit is already set, Hyper-V silently ignores
2098      * the BAR updates below, and the related PCI device driver cannot
2099      * work, because reading from the device register(s) always returns
2100      * 0xFFFFFFFF (PCI_ERROR_RESPONSE).
2101      */
2102     list_for_each_entry(hpdev, &hbus->children, list_entry) {
2103         _hv_pcifront_read_config(hpdev, PCI_COMMAND, 2, &command);
2104         command &= ~PCI_COMMAND_MEMORY;
2105         _hv_pcifront_write_config(hpdev, PCI_COMMAND, 2, command);
2106     }
2107 
2108     /* Pick addresses for the BARs. */
2109     do {
2110         list_for_each_entry(hpdev, &hbus->children, list_entry) {
2111             for (i = 0; i < PCI_STD_NUM_BARS; i++) {
2112                 bar_val = hpdev->probed_bar[i];
2113                 if (bar_val == 0)
2114                     continue;
2115                 high = bar_val & PCI_BASE_ADDRESS_MEM_TYPE_64;
2116                 if (high) {
2117                     bar_val |=
2118                         ((u64)hpdev->probed_bar[i + 1]
2119                          << 32);
2120                 } else {
2121                     bar_val |= 0xffffffffULL << 32;
2122                 }
2123                 bar_size = get_bar_size(bar_val);
2124                 if (high) {
2125                     if (high_size != bar_size) {
2126                         i++;
2127                         continue;
2128                     }
2129                     _hv_pcifront_write_config(hpdev,
2130                         PCI_BASE_ADDRESS_0 + (4 * i),
2131                         4,
2132                         (u32)(high_base & 0xffffff00));
2133                     i++;
2134                     _hv_pcifront_write_config(hpdev,
2135                         PCI_BASE_ADDRESS_0 + (4 * i),
2136                         4, (u32)(high_base >> 32));
2137                     high_base += bar_size;
2138                 } else {
2139                     if (low_size != bar_size)
2140                         continue;
2141                     _hv_pcifront_write_config(hpdev,
2142                         PCI_BASE_ADDRESS_0 + (4 * i),
2143                         4,
2144                         (u32)(low_base & 0xffffff00));
2145                     low_base += bar_size;
2146                 }
2147             }
2148             if (high_size <= 1 && low_size <= 1) {
2149                 /*
2150                  * No need to set the PCI_COMMAND_MEMORY bit as
2151                  * the core PCI driver doesn't require the bit
2152                  * to be pre-set. Actually here we intentionally
2153                  * keep the bit off so that the PCI BAR probing
2154                  * in the core PCI driver doesn't cause Hyper-V
2155                  * to unnecessarily unmap/map the virtual BARs
2156                  * from/to the physical BARs multiple times.
2157                  * This reduces the VM boot time significantly
2158                  * if the BAR sizes are huge.
2159                  */
2160                 break;
2161             }
2162         }
2163 
2164         high_size >>= 1;
2165         low_size >>= 1;
2166     } while (high_size || low_size);
2167 
2168     spin_unlock_irqrestore(&hbus->device_list_lock, flags);
2169 }
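
/*
 * Worked example (hypothetical values): with low-MMIO BARs of 64 KiB, 8 KiB
 * and 4 KiB and a window starting at 0xfe000000, the loop starts at
 * low_size = 64 KiB (the largest power of two not exceeding the 76 KiB
 * total) and places the 64 KiB BAR at 0xfe000000; halving low_size on each
 * pass then places the 8 KiB BAR at 0xfe010000 and the 4 KiB BAR at
 * 0xfe012000, so larger BARs are packed first and no space is wasted on
 * alignment padding.
 */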
2170 
2171 /*
2172  * Assign entries in sysfs pci slot directory.
2173  *
2174  * Note that this function does not need to lock the children list
2175  * because it is called from pci_devices_present_work which
2176  * is serialized with hv_eject_device_work because they are on the
2177  * same ordered workqueue. Therefore hbus->children list will not change
2178  * even when pci_create_slot sleeps.
2179  */
2180 static void hv_pci_assign_slots(struct hv_pcibus_device *hbus)
2181 {
2182     struct hv_pci_dev *hpdev;
2183     char name[SLOT_NAME_SIZE];
2184     int slot_nr;
2185 
2186     list_for_each_entry(hpdev, &hbus->children, list_entry) {
2187         if (hpdev->pci_slot)
2188             continue;
2189 
2190         slot_nr = PCI_SLOT(wslot_to_devfn(hpdev->desc.win_slot.slot));
2191         snprintf(name, SLOT_NAME_SIZE, "%u", hpdev->desc.ser);
2192         hpdev->pci_slot = pci_create_slot(hbus->bridge->bus, slot_nr,
2193                       name, NULL);
2194         if (IS_ERR(hpdev->pci_slot)) {
2195             pr_warn("pci_create_slot %s failed\n", name);
2196             hpdev->pci_slot = NULL;
2197         }
2198     }
2199 }
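
/*
 * Example (hypothetical serial number): a child reported with ser == 2 gets
 * a sysfs slot named "2", visible as /sys/bus/pci/slots/2 with the standard
 * attributes created by pci_create_slot().
 */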
2200 
2201 /*
2202  * Remove entries in sysfs pci slot directory.
2203  */
2204 static void hv_pci_remove_slots(struct hv_pcibus_device *hbus)
2205 {
2206     struct hv_pci_dev *hpdev;
2207 
2208     list_for_each_entry(hpdev, &hbus->children, list_entry) {
2209         if (!hpdev->pci_slot)
2210             continue;
2211         pci_destroy_slot(hpdev->pci_slot);
2212         hpdev->pci_slot = NULL;
2213     }
2214 }
2215 
2216 /*
2217  * Set NUMA node for the devices on the bus
2218  */
2219 static void hv_pci_assign_numa_node(struct hv_pcibus_device *hbus)
2220 {
2221     struct pci_dev *dev;
2222     struct pci_bus *bus = hbus->bridge->bus;
2223     struct hv_pci_dev *hv_dev;
2224 
2225     list_for_each_entry(dev, &bus->devices, bus_list) {
2226         hv_dev = get_pcichild_wslot(hbus, devfn_to_wslot(dev->devfn));
2227         if (!hv_dev)
2228             continue;
2229 
2230         if (hv_dev->desc.flags & HV_PCI_DEVICE_FLAG_NUMA_AFFINITY &&
2231             hv_dev->desc.virtual_numa_node < num_possible_nodes())
2232             /*
2233              * The kernel may boot with some NUMA nodes offline
2234              * (e.g. in a KDUMP kernel) or with NUMA disabled via
2235              * "numa=off". In those cases, adjust the host provided
2236              * NUMA node to a valid NUMA node used by the kernel.
2237              */
2238             set_dev_node(&dev->dev,
2239                      numa_map_to_online_node(
2240                          hv_dev->desc.virtual_numa_node));
2241 
2242         put_pcichild(hv_dev);
2243     }
2244 }
2245 
2246 /**
2247  * create_root_hv_pci_bus() - Expose a new root PCI bus
2248  * @hbus:   Root PCI bus, as understood by this driver
2249  *
2250  * Return: 0 on success, -errno on failure
2251  */
2252 static int create_root_hv_pci_bus(struct hv_pcibus_device *hbus)
2253 {
2254     int error;
2255     struct pci_host_bridge *bridge = hbus->bridge;
2256 
2257     bridge->dev.parent = &hbus->hdev->device;
2258     bridge->sysdata = &hbus->sysdata;
2259     bridge->ops = &hv_pcifront_ops;
2260 
2261     error = pci_scan_root_bus_bridge(bridge);
2262     if (error)
2263         return error;
2264 
2265     pci_lock_rescan_remove();
2266     hv_pci_assign_numa_node(hbus);
2267     pci_bus_assign_resources(bridge->bus);
2268     hv_pci_assign_slots(hbus);
2269     pci_bus_add_devices(bridge->bus);
2270     pci_unlock_rescan_remove();
2271     hbus->state = hv_pcibus_installed;
2272     return 0;
2273 }
2274 
2275 struct q_res_req_compl {
2276     struct completion host_event;
2277     struct hv_pci_dev *hpdev;
2278 };
2279 
2280 /**
2281  * q_resource_requirements() - Query Resource Requirements
2282  * @context:        The completion context.
2283  * @resp:       The response that came from the host.
2284  * @resp_packet_size:   The size in bytes of resp.
2285  *
2286  * This function is invoked on completion of a Query Resource
2287  * Requirements packet.
2288  */
2289 static void q_resource_requirements(void *context, struct pci_response *resp,
2290                     int resp_packet_size)
2291 {
2292     struct q_res_req_compl *completion = context;
2293     struct pci_q_res_req_response *q_res_req =
2294         (struct pci_q_res_req_response *)resp;
2295     s32 status;
2296     int i;
2297 
2298     status = (resp_packet_size < sizeof(*q_res_req)) ? -1 : resp->status;
2299     if (status < 0) {
2300         dev_err(&completion->hpdev->hbus->hdev->device,
2301             "query resource requirements failed: %x\n",
2302             status);
2303     } else {
2304         for (i = 0; i < PCI_STD_NUM_BARS; i++) {
2305             completion->hpdev->probed_bar[i] =
2306                 q_res_req->probed_bar[i];
2307         }
2308     }
2309 
2310     complete(&completion->host_event);
2311 }
2312 
2313 /**
2314  * new_pcichild_device() - Create a new child device
2315  * @hbus:   The internal struct tracking this root PCI bus.
2316  * @desc:   The information supplied so far from the host
2317  *              about the device.
2318  *
2319  * This function creates the tracking structure for a new child
2320  * device and kicks off the process of figuring out what it is.
2321  *
2322  * Return: Pointer to the new tracking struct
2323  */
2324 static struct hv_pci_dev *new_pcichild_device(struct hv_pcibus_device *hbus,
2325         struct hv_pcidev_description *desc)
2326 {
2327     struct hv_pci_dev *hpdev;
2328     struct pci_child_message *res_req;
2329     struct q_res_req_compl comp_pkt;
2330     struct {
2331         struct pci_packet init_packet;
2332         u8 buffer[sizeof(struct pci_child_message)];
2333     } pkt;
2334     unsigned long flags;
2335     int ret;
2336 
2337     hpdev = kzalloc(sizeof(*hpdev), GFP_KERNEL);
2338     if (!hpdev)
2339         return NULL;
2340 
2341     hpdev->hbus = hbus;
2342 
2343     memset(&pkt, 0, sizeof(pkt));
2344     init_completion(&comp_pkt.host_event);
2345     comp_pkt.hpdev = hpdev;
2346     pkt.init_packet.compl_ctxt = &comp_pkt;
2347     pkt.init_packet.completion_func = q_resource_requirements;
2348     res_req = (struct pci_child_message *)&pkt.init_packet.message;
2349     res_req->message_type.type = PCI_QUERY_RESOURCE_REQUIREMENTS;
2350     res_req->wslot.slot = desc->win_slot.slot;
2351 
2352     ret = vmbus_sendpacket(hbus->hdev->channel, res_req,
2353                    sizeof(struct pci_child_message),
2354                    (unsigned long)&pkt.init_packet,
2355                    VM_PKT_DATA_INBAND,
2356                    VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
2357     if (ret)
2358         goto error;
2359 
2360     if (wait_for_response(hbus->hdev, &comp_pkt.host_event))
2361         goto error;
2362 
2363     hpdev->desc = *desc;
2364     refcount_set(&hpdev->refs, 1);
2365     get_pcichild(hpdev);
2366     spin_lock_irqsave(&hbus->device_list_lock, flags);
2367 
2368     list_add_tail(&hpdev->list_entry, &hbus->children);
2369     spin_unlock_irqrestore(&hbus->device_list_lock, flags);
2370     return hpdev;
2371 
2372 error:
2373     kfree(hpdev);
2374     return NULL;
2375 }
2376 
2377 /**
2378  * get_pcichild_wslot() - Find device from slot
2379  * @hbus:   Root PCI bus, as understood by this driver
2380  * @wslot:  Location on the bus
2381  *
2382  * This function looks up a PCI device and returns the internal
2383  * representation of it.  It acquires a reference on it, so that
2384  * the device won't be deleted while somebody is using it.  The
2385  * caller is responsible for calling put_pcichild() to release
2386  * this reference.
2387  *
2388  * Return:  Internal representation of a PCI device
2389  */
2390 static struct hv_pci_dev *get_pcichild_wslot(struct hv_pcibus_device *hbus,
2391                          u32 wslot)
2392 {
2393     unsigned long flags;
2394     struct hv_pci_dev *iter, *hpdev = NULL;
2395 
2396     spin_lock_irqsave(&hbus->device_list_lock, flags);
2397     list_for_each_entry(iter, &hbus->children, list_entry) {
2398         if (iter->desc.win_slot.slot == wslot) {
2399             hpdev = iter;
2400             get_pcichild(hpdev);
2401             break;
2402         }
2403     }
2404     spin_unlock_irqrestore(&hbus->device_list_lock, flags);
2405 
2406     return hpdev;
2407 }
2408 
2409 /**
2410  * pci_devices_present_work() - Handle new list of child devices
2411  * @work:   Work struct embedded in struct hv_dr_work
2412  *
2413  * "Bus Relations" is the Windows term for "children of this
2414  * bus."  The terminology is preserved here for people trying to
2415  * debug the interaction between Hyper-V and Linux.  This
2416  * function is called when the parent partition reports a list
2417  * of functions that should be observed under this PCI Express
2418  * port (bus).
2419  *
2420  * This function updates the list, and must tolerate being
2421  * called multiple times with the same information.  The typical
2422  * number of child devices is one, with very atypical cases
2423  * involving three or four, so the algorithms used here can be
2424  * simple and inefficient.
2425  *
2426  * It must also treat the omission of a previously observed device as
2427  * notification that the device no longer exists.
2428  *
2429  * Note that this function is serialized with hv_eject_device_work(),
2430  * because both are pushed to the ordered workqueue hbus->wq.
2431  */
2432 static void pci_devices_present_work(struct work_struct *work)
2433 {
2434     u32 child_no;
2435     bool found;
2436     struct hv_pcidev_description *new_desc;
2437     struct hv_pci_dev *hpdev;
2438     struct hv_pcibus_device *hbus;
2439     struct list_head removed;
2440     struct hv_dr_work *dr_wrk;
2441     struct hv_dr_state *dr = NULL;
2442     unsigned long flags;
2443 
2444     dr_wrk = container_of(work, struct hv_dr_work, wrk);
2445     hbus = dr_wrk->bus;
2446     kfree(dr_wrk);
2447 
2448     INIT_LIST_HEAD(&removed);
2449 
2450     /* Pull this off the queue and process it if it was the last one. */
2451     spin_lock_irqsave(&hbus->device_list_lock, flags);
2452     while (!list_empty(&hbus->dr_list)) {
2453         dr = list_first_entry(&hbus->dr_list, struct hv_dr_state,
2454                       list_entry);
2455         list_del(&dr->list_entry);
2456 
2457         /* Throw this away if the list still has stuff in it. */
2458         if (!list_empty(&hbus->dr_list)) {
2459             kfree(dr);
2460             continue;
2461         }
2462     }
2463     spin_unlock_irqrestore(&hbus->device_list_lock, flags);
2464 
2465     if (!dr)
2466         return;
2467 
2468     /* First, mark all existing children as reported missing. */
2469     spin_lock_irqsave(&hbus->device_list_lock, flags);
2470     list_for_each_entry(hpdev, &hbus->children, list_entry) {
2471         hpdev->reported_missing = true;
2472     }
2473     spin_unlock_irqrestore(&hbus->device_list_lock, flags);
2474 
2475     /* Next, add back any reported devices. */
2476     for (child_no = 0; child_no < dr->device_count; child_no++) {
2477         found = false;
2478         new_desc = &dr->func[child_no];
2479 
2480         spin_lock_irqsave(&hbus->device_list_lock, flags);
2481         list_for_each_entry(hpdev, &hbus->children, list_entry) {
2482             if ((hpdev->desc.win_slot.slot == new_desc->win_slot.slot) &&
2483                 (hpdev->desc.v_id == new_desc->v_id) &&
2484                 (hpdev->desc.d_id == new_desc->d_id) &&
2485                 (hpdev->desc.ser == new_desc->ser)) {
2486                 hpdev->reported_missing = false;
2487                 found = true;
2488             }
2489         }
2490         spin_unlock_irqrestore(&hbus->device_list_lock, flags);
2491 
2492         if (!found) {
2493             hpdev = new_pcichild_device(hbus, new_desc);
2494             if (!hpdev)
2495                 dev_err(&hbus->hdev->device,
2496                     "couldn't record a child device.\n");
2497         }
2498     }
2499 
2500     /* Move missing children to a list on the stack. */
2501     spin_lock_irqsave(&hbus->device_list_lock, flags);
2502     do {
2503         found = false;
2504         list_for_each_entry(hpdev, &hbus->children, list_entry) {
2505             if (hpdev->reported_missing) {
2506                 found = true;
2507                 put_pcichild(hpdev);
2508                 list_move_tail(&hpdev->list_entry, &removed);
2509                 break;
2510             }
2511         }
2512     } while (found);
2513     spin_unlock_irqrestore(&hbus->device_list_lock, flags);
2514 
2515     /* Delete everything that should no longer exist. */
2516     while (!list_empty(&removed)) {
2517         hpdev = list_first_entry(&removed, struct hv_pci_dev,
2518                      list_entry);
2519         list_del(&hpdev->list_entry);
2520 
2521         if (hpdev->pci_slot)
2522             pci_destroy_slot(hpdev->pci_slot);
2523 
2524         put_pcichild(hpdev);
2525     }
2526 
2527     switch (hbus->state) {
2528     case hv_pcibus_installed:
2529         /*
2530          * Tell the core to rescan bus
2531          * because there may have been changes.
2532          */
2533         pci_lock_rescan_remove();
2534         pci_scan_child_bus(hbus->bridge->bus);
2535         hv_pci_assign_numa_node(hbus);
2536         hv_pci_assign_slots(hbus);
2537         pci_unlock_rescan_remove();
2538         break;
2539 
2540     case hv_pcibus_init:
2541     case hv_pcibus_probed:
2542         survey_child_resources(hbus);
2543         break;
2544 
2545     default:
2546         break;
2547     }
2548 
2549     kfree(dr);
2550 }
2551 
2552 /**
2553  * hv_pci_start_relations_work() - Queue work to start device discovery
2554  * @hbus:   Root PCI bus, as understood by this driver
2555  * @dr:     The list of children returned from host
2556  *
2557  * Return:  0 on success, -errno on failure
2558  */
2559 static int hv_pci_start_relations_work(struct hv_pcibus_device *hbus,
2560                        struct hv_dr_state *dr)
2561 {
2562     struct hv_dr_work *dr_wrk;
2563     unsigned long flags;
2564     bool pending_dr;
2565 
2566     if (hbus->state == hv_pcibus_removing) {
2567         dev_info(&hbus->hdev->device,
2568              "PCI VMBus BUS_RELATIONS: ignored\n");
2569         return -ENOENT;
2570     }
2571 
2572     dr_wrk = kzalloc(sizeof(*dr_wrk), GFP_NOWAIT);
2573     if (!dr_wrk)
2574         return -ENOMEM;
2575 
2576     INIT_WORK(&dr_wrk->wrk, pci_devices_present_work);
2577     dr_wrk->bus = hbus;
2578 
2579     spin_lock_irqsave(&hbus->device_list_lock, flags);
2580     /*
2581      * If pending_dr is true, we have already queued a work,
2582      * which will see the new dr. Otherwise, we need to
2583      * queue a new work.
2584      */
2585     pending_dr = !list_empty(&hbus->dr_list);
2586     list_add_tail(&dr->list_entry, &hbus->dr_list);
2587     spin_unlock_irqrestore(&hbus->device_list_lock, flags);
2588 
2589     if (pending_dr)
2590         kfree(dr_wrk);
2591     else
2592         queue_work(hbus->wq, &dr_wrk->wrk);
2593 
2594     return 0;
2595 }
2596 
2597 /**
2598  * hv_pci_devices_present() - Handle list of new children
2599  * @hbus:      Root PCI bus, as understood by this driver
2600  * @relations: Packet from host listing children
2601  *
2602  * Process a new list of devices on the bus. The list of devices is
2603  * discovered by VSP and sent to us via VSP message PCI_BUS_RELATIONS,
2604  * whenever a new list of devices for this bus appears.
2605  */
2606 static void hv_pci_devices_present(struct hv_pcibus_device *hbus,
2607                    struct pci_bus_relations *relations)
2608 {
2609     struct hv_dr_state *dr;
2610     int i;
2611 
2612     dr = kzalloc(struct_size(dr, func, relations->device_count),
2613              GFP_NOWAIT);
2614     if (!dr)
2615         return;
2616 
2617     dr->device_count = relations->device_count;
2618     for (i = 0; i < dr->device_count; i++) {
2619         dr->func[i].v_id = relations->func[i].v_id;
2620         dr->func[i].d_id = relations->func[i].d_id;
2621         dr->func[i].rev = relations->func[i].rev;
2622         dr->func[i].prog_intf = relations->func[i].prog_intf;
2623         dr->func[i].subclass = relations->func[i].subclass;
2624         dr->func[i].base_class = relations->func[i].base_class;
2625         dr->func[i].subsystem_id = relations->func[i].subsystem_id;
2626         dr->func[i].win_slot = relations->func[i].win_slot;
2627         dr->func[i].ser = relations->func[i].ser;
2628     }
2629 
2630     if (hv_pci_start_relations_work(hbus, dr))
2631         kfree(dr);
2632 }
2633 
2634 /**
2635  * hv_pci_devices_present2() - Handle list of new children
2636  * @hbus:   Root PCI bus, as understood by this driver
2637  * @relations:  Packet from host listing children
2638  *
2639  * This function is the v2 version of hv_pci_devices_present()
2640  */
2641 static void hv_pci_devices_present2(struct hv_pcibus_device *hbus,
2642                     struct pci_bus_relations2 *relations)
2643 {
2644     struct hv_dr_state *dr;
2645     int i;
2646 
2647     dr = kzalloc(struct_size(dr, func, relations->device_count),
2648              GFP_NOWAIT);
2649     if (!dr)
2650         return;
2651 
2652     dr->device_count = relations->device_count;
2653     for (i = 0; i < dr->device_count; i++) {
2654         dr->func[i].v_id = relations->func[i].v_id;
2655         dr->func[i].d_id = relations->func[i].d_id;
2656         dr->func[i].rev = relations->func[i].rev;
2657         dr->func[i].prog_intf = relations->func[i].prog_intf;
2658         dr->func[i].subclass = relations->func[i].subclass;
2659         dr->func[i].base_class = relations->func[i].base_class;
2660         dr->func[i].subsystem_id = relations->func[i].subsystem_id;
2661         dr->func[i].win_slot = relations->func[i].win_slot;
2662         dr->func[i].ser = relations->func[i].ser;
2663         dr->func[i].flags = relations->func[i].flags;
2664         dr->func[i].virtual_numa_node =
2665             relations->func[i].virtual_numa_node;
2666     }
2667 
2668     if (hv_pci_start_relations_work(hbus, dr))
2669         kfree(dr);
2670 }
2671 
2672 /**
2673  * hv_eject_device_work() - Asynchronously handles ejection
2674  * @work:   Work struct embedded in internal device struct
2675  *
2676  * This function handles ejecting a device.  Windows will
2677  * attempt to gracefully eject a device, waiting 60 seconds to
2678  * hear back from the guest OS that this completed successfully.
2679  * If this timer expires, the device will be forcibly removed.
2680  */
2681 static void hv_eject_device_work(struct work_struct *work)
2682 {
2683     struct pci_eject_response *ejct_pkt;
2684     struct hv_pcibus_device *hbus;
2685     struct hv_pci_dev *hpdev;
2686     struct pci_dev *pdev;
2687     unsigned long flags;
2688     int wslot;
2689     struct {
2690         struct pci_packet pkt;
2691         u8 buffer[sizeof(struct pci_eject_response)];
2692     } ctxt;
2693 
2694     hpdev = container_of(work, struct hv_pci_dev, wrk);
2695     hbus = hpdev->hbus;
2696 
2697     WARN_ON(hpdev->state != hv_pcichild_ejecting);
2698 
2699     /*
2700      * Ejection can come before or after the PCI bus has been set up, so
2701      * attempt to find it and tear down the bus state, if it exists.  This
2702      * must be done without constructs like pci_domain_nr(hbus->bridge->bus)
2703      * because hbus->bridge->bus may not exist yet.
2704      */
2705     wslot = wslot_to_devfn(hpdev->desc.win_slot.slot);
2706     pdev = pci_get_domain_bus_and_slot(hbus->bridge->domain_nr, 0, wslot);
2707     if (pdev) {
2708         pci_lock_rescan_remove();
2709         pci_stop_and_remove_bus_device(pdev);
2710         pci_dev_put(pdev);
2711         pci_unlock_rescan_remove();
2712     }
2713 
2714     spin_lock_irqsave(&hbus->device_list_lock, flags);
2715     list_del(&hpdev->list_entry);
2716     spin_unlock_irqrestore(&hbus->device_list_lock, flags);
2717 
2718     if (hpdev->pci_slot)
2719         pci_destroy_slot(hpdev->pci_slot);
2720 
2721     memset(&ctxt, 0, sizeof(ctxt));
2722     ejct_pkt = (struct pci_eject_response *)&ctxt.pkt.message;
2723     ejct_pkt->message_type.type = PCI_EJECTION_COMPLETE;
2724     ejct_pkt->wslot.slot = hpdev->desc.win_slot.slot;
2725     vmbus_sendpacket(hbus->hdev->channel, ejct_pkt,
2726              sizeof(*ejct_pkt), 0,
2727              VM_PKT_DATA_INBAND, 0);
2728 
2729     /* For the get_pcichild() in hv_pci_eject_device() */
2730     put_pcichild(hpdev);
2731     /* For the two refs got in new_pcichild_device() */
2732     put_pcichild(hpdev);
2733     put_pcichild(hpdev);
2734     /* hpdev has been freed. Do not use it any more. */
2735 }
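
/*
 * Reference-count ledger for the teardown above (a sketch based on the calls
 * in this file): a child normally holds the initial reference from
 * refcount_set() plus the extra get_pcichild() taken in
 * new_pcichild_device(), and ejection adds a third in hv_pci_eject_device().
 * The three put_pcichild() calls above therefore drop the count to zero and
 * free hpdev, which is why it must not be touched afterwards.
 */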
2736 
2737 /**
2738  * hv_pci_eject_device() - Handles device ejection
2739  * @hpdev:  Internal device tracking struct
2740  *
2741  * This function is invoked when an ejection packet arrives.  It
2742  * just schedules work so that we don't re-enter the packet
2743  * delivery code handling the ejection.
2744  */
2745 static void hv_pci_eject_device(struct hv_pci_dev *hpdev)
2746 {
2747     struct hv_pcibus_device *hbus = hpdev->hbus;
2748     struct hv_device *hdev = hbus->hdev;
2749 
2750     if (hbus->state == hv_pcibus_removing) {
2751         dev_info(&hdev->device, "PCI VMBus EJECT: ignored\n");
2752         return;
2753     }
2754 
2755     hpdev->state = hv_pcichild_ejecting;
2756     get_pcichild(hpdev);
2757     INIT_WORK(&hpdev->wrk, hv_eject_device_work);
2758     queue_work(hbus->wq, &hpdev->wrk);
2759 }
2760 
2761 /**
2762  * hv_pci_onchannelcallback() - Handles incoming packets
2763  * @context:    Internal bus tracking struct
2764  *
2765  * This function is invoked whenever the host sends a packet to
2766  * this channel (which is private to this root PCI bus).
2767  */
2768 static void hv_pci_onchannelcallback(void *context)
2769 {
2770     const int packet_size = 0x100;
2771     int ret;
2772     struct hv_pcibus_device *hbus = context;
2773     struct vmbus_channel *chan = hbus->hdev->channel;
2774     u32 bytes_recvd;
2775     u64 req_id, req_addr;
2776     struct vmpacket_descriptor *desc;
2777     unsigned char *buffer;
2778     int bufferlen = packet_size;
2779     struct pci_packet *comp_packet;
2780     struct pci_response *response;
2781     struct pci_incoming_message *new_message;
2782     struct pci_bus_relations *bus_rel;
2783     struct pci_bus_relations2 *bus_rel2;
2784     struct pci_dev_inval_block *inval;
2785     struct pci_dev_incoming *dev_message;
2786     struct hv_pci_dev *hpdev;
2787     unsigned long flags;
2788 
2789     buffer = kmalloc(bufferlen, GFP_ATOMIC);
2790     if (!buffer)
2791         return;
2792 
2793     while (1) {
2794         ret = vmbus_recvpacket_raw(chan, buffer, bufferlen,
2795                        &bytes_recvd, &req_id);
2796 
2797         if (ret == -ENOBUFS) {
2798             kfree(buffer);
2799             /* Handle large packet */
2800             bufferlen = bytes_recvd;
2801             buffer = kmalloc(bytes_recvd, GFP_ATOMIC);
2802             if (!buffer)
2803                 return;
2804             continue;
2805         }
2806 
2807         /* Zero length indicates there are no more packets. */
2808         if (ret || !bytes_recvd)
2809             break;
2810 
2811         /*
2812          * All incoming packets must be at least as large as a
2813          * response.
2814          */
2815         if (bytes_recvd <= sizeof(struct pci_response))
2816             continue;
2817         desc = (struct vmpacket_descriptor *)buffer;
2818 
2819         switch (desc->type) {
2820         case VM_PKT_COMP:
2821 
2822             lock_requestor(chan, flags);
2823             req_addr = __vmbus_request_addr_match(chan, req_id,
2824                                   VMBUS_RQST_ADDR_ANY);
2825             if (req_addr == VMBUS_RQST_ERROR) {
2826                 unlock_requestor(chan, flags);
2827                 dev_err(&hbus->hdev->device,
2828                     "Invalid transaction ID %llx\n",
2829                     req_id);
2830                 break;
2831             }
2832             comp_packet = (struct pci_packet *)req_addr;
2833             response = (struct pci_response *)buffer;
2834             /*
2835              * Call ->completion_func() within the critical section to make
2836              * sure that the packet pointer is still valid during the call:
2837              * here 'valid' means that there's a task still waiting for the
2838              * completion, and that the packet data is still on the waiting
2839              * task's stack.  Cf. hv_compose_msi_msg().
2840              */
2841             comp_packet->completion_func(comp_packet->compl_ctxt,
2842                              response,
2843                              bytes_recvd);
2844             unlock_requestor(chan, flags);
2845             break;
2846 
2847         case VM_PKT_DATA_INBAND:
2848 
2849             new_message = (struct pci_incoming_message *)buffer;
2850             switch (new_message->message_type.type) {
2851             case PCI_BUS_RELATIONS:
2852 
2853                 bus_rel = (struct pci_bus_relations *)buffer;
2854                 if (bytes_recvd < sizeof(*bus_rel) ||
2855                     bytes_recvd <
2856                     struct_size(bus_rel, func,
2857                             bus_rel->device_count)) {
2858                     dev_err(&hbus->hdev->device,
2859                         "bus relations too small\n");
2860                     break;
2861                 }
2862 
2863                 hv_pci_devices_present(hbus, bus_rel);
2864                 break;
2865 
2866             case PCI_BUS_RELATIONS2:
2867 
2868                 bus_rel2 = (struct pci_bus_relations2 *)buffer;
2869                 if (bytes_recvd < sizeof(*bus_rel2) ||
2870                     bytes_recvd <
2871                     struct_size(bus_rel2, func,
2872                             bus_rel2->device_count)) {
2873                     dev_err(&hbus->hdev->device,
2874                         "bus relations v2 too small\n");
2875                     break;
2876                 }
2877 
2878                 hv_pci_devices_present2(hbus, bus_rel2);
2879                 break;
2880 
2881             case PCI_EJECT:
2882 
2883                 dev_message = (struct pci_dev_incoming *)buffer;
2884                 if (bytes_recvd < sizeof(*dev_message)) {
2885                     dev_err(&hbus->hdev->device,
2886                         "eject message too small\n");
2887                     break;
2888                 }
2889                 hpdev = get_pcichild_wslot(hbus,
2890                               dev_message->wslot.slot);
2891                 if (hpdev) {
2892                     hv_pci_eject_device(hpdev);
2893                     put_pcichild(hpdev);
2894                 }
2895                 break;
2896 
2897             case PCI_INVALIDATE_BLOCK:
2898 
2899                 inval = (struct pci_dev_inval_block *)buffer;
2900                 if (bytes_recvd < sizeof(*inval)) {
2901                     dev_err(&hbus->hdev->device,
2902                         "invalidate message too small\n");
2903                     break;
2904                 }
2905                 hpdev = get_pcichild_wslot(hbus,
2906                                inval->wslot.slot);
2907                 if (hpdev) {
2908                     if (hpdev->block_invalidate) {
2909                         hpdev->block_invalidate(
2910                             hpdev->invalidate_context,
2911                             inval->block_mask);
2912                     }
2913                     put_pcichild(hpdev);
2914                 }
2915                 break;
2916 
2917             default:
2918                 dev_warn(&hbus->hdev->device,
2919                     "Unimplemented protocol message %x\n",
2920                     new_message->message_type.type);
2921                 break;
2922             }
2923             break;
2924 
2925         default:
2926             dev_err(&hbus->hdev->device,
2927                 "unhandled packet type %d, tid %llx len %d\n",
2928                 desc->type, req_id, bytes_recvd);
2929             break;
2930         }
2931     }
2932 
2933     kfree(buffer);
2934 }
2935 
2936 /**
2937  * hv_pci_protocol_negotiation() - Set up protocol
2938  * @hdev:       VMBus's tracking struct for this root PCI bus.
2939  * @version:        Array of supported channel protocol versions in
2940  *          the order of probing - highest go first.
2941  * @num_version:    Number of elements in the version array.
2942  *
2943  * This driver is intended to support running on Windows 10
2944  * (server) and later versions. It will not run on earlier
2945  * versions, as they assume that many of the operations which
2946  * Linux needs accomplished with a spinlock held were done through
2947  * asynchronous messaging over VMBus.  Windows 10 increases the
2948  * surface area of PCI emulation so that these actions can take
2949  * place by suspending a virtual processor for their duration.
2950  *
2951  * This function negotiates the channel protocol version,
2952  * failing if the host doesn't support the necessary protocol
2953  * level.
2954  */
2955 static int hv_pci_protocol_negotiation(struct hv_device *hdev,
2956                        enum pci_protocol_version_t version[],
2957                        int num_version)
2958 {
2959     struct hv_pcibus_device *hbus = hv_get_drvdata(hdev);
2960     struct pci_version_request *version_req;
2961     struct hv_pci_compl comp_pkt;
2962     struct pci_packet *pkt;
2963     int ret;
2964     int i;
2965 
2966     /*
2967      * Initiate the handshake with the host and negotiate
2968      * a version that the host can support. We start with the
2969      * highest version number and go down if the host cannot
2970      * support it.
2971      */
2972     pkt = kzalloc(sizeof(*pkt) + sizeof(*version_req), GFP_KERNEL);
2973     if (!pkt)
2974         return -ENOMEM;
2975 
2976     init_completion(&comp_pkt.host_event);
2977     pkt->completion_func = hv_pci_generic_compl;
2978     pkt->compl_ctxt = &comp_pkt;
2979     version_req = (struct pci_version_request *)&pkt->message;
2980     version_req->message_type.type = PCI_QUERY_PROTOCOL_VERSION;
2981 
2982     for (i = 0; i < num_version; i++) {
2983         version_req->protocol_version = version[i];
2984         ret = vmbus_sendpacket(hdev->channel, version_req,
2985                 sizeof(struct pci_version_request),
2986                 (unsigned long)pkt, VM_PKT_DATA_INBAND,
2987                 VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
2988         if (!ret)
2989             ret = wait_for_response(hdev, &comp_pkt.host_event);
2990 
2991         if (ret) {
2992             dev_err(&hdev->device,
2993                 "PCI Pass-through VSP failed to request version: %d",
2994                 ret);
2995             goto exit;
2996         }
2997 
2998         if (comp_pkt.completion_status >= 0) {
2999             hbus->protocol_version = version[i];
3000             dev_info(&hdev->device,
3001                 "PCI VMBus probing: Using version %#x\n",
3002                 hbus->protocol_version);
3003             goto exit;
3004         }
3005 
3006         if (comp_pkt.completion_status != STATUS_REVISION_MISMATCH) {
3007             dev_err(&hdev->device,
3008                 "PCI Pass-through VSP failed version request: %#x",
3009                 comp_pkt.completion_status);
3010             ret = -EPROTO;
3011             goto exit;
3012         }
3013 
3014         reinit_completion(&comp_pkt.host_event);
3015     }
3016 
3017     dev_err(&hdev->device,
3018         "PCI pass-through VSP failed to find supported version");
3019     ret = -EPROTO;
3020 
3021 exit:
3022     kfree(pkt);
3023     return ret;
3024 }
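
/*
 * Worked example (hypothetical host): if the host supports only protocol
 * 1.2, probing 1.4, then 1.3, then 1.2 means the first two requests
 * complete with STATUS_REVISION_MISMATCH, the loop reinitializes the
 * completion and retries, and the third request succeeds, leaving
 * hbus->protocol_version set to PCI_PROTOCOL_VERSION_1_2.
 */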
3025 
3026 /**
3027  * hv_pci_free_bridge_windows() - Release memory regions for the
3028  * bus
3029  * @hbus:   Root PCI bus, as understood by this driver
3030  */
3031 static void hv_pci_free_bridge_windows(struct hv_pcibus_device *hbus)
3032 {
3033     /*
3034      * Set the resources back to the way they looked when they
3035      * were allocated by setting IORESOURCE_BUSY again.
3036      */
3037 
3038     if (hbus->low_mmio_space && hbus->low_mmio_res) {
3039         hbus->low_mmio_res->flags |= IORESOURCE_BUSY;
3040         vmbus_free_mmio(hbus->low_mmio_res->start,
3041                 resource_size(hbus->low_mmio_res));
3042     }
3043 
3044     if (hbus->high_mmio_space && hbus->high_mmio_res) {
3045         hbus->high_mmio_res->flags |= IORESOURCE_BUSY;
3046         vmbus_free_mmio(hbus->high_mmio_res->start,
3047                 resource_size(hbus->high_mmio_res));
3048     }
3049 }
3050 
3051 /**
3052  * hv_pci_allocate_bridge_windows() - Allocate memory regions
3053  * for the bus
3054  * @hbus:   Root PCI bus, as understood by this driver
3055  *
3056  * This function calls vmbus_allocate_mmio(), which is itself a
3057  * bit of a compromise.  Ideally, we might change the pnp layer
3058  * in the kernel such that it comprehends either PCI devices
3059  * which are "grandchildren of ACPI," with some intermediate bus
3060  * node (in this case, VMBus) or change it such that it
3061  * understands VMBus.  The pnp layer, however, has been declared
3062  * deprecated, and not subject to change.
3063  *
3064  * The workaround, implemented here, is to ask VMBus to allocate
3065  * MMIO space for this bus.  VMBus itself knows which ranges are
3066  * appropriate by looking at its own ACPI objects.  Then, after
3067  * these ranges are claimed, they're modified to look like they
3068  * would have looked if the ACPI and pnp code had allocated
3069  * bridge windows.  These descriptors have to exist in this form
3070  * in order to satisfy the code which will get invoked when the
3071  * endpoint PCI function driver calls request_mem_region() or
3072  * request_mem_region_exclusive().
3073  *
3074  * Return: 0 on success, -errno on failure
3075  */
3076 static int hv_pci_allocate_bridge_windows(struct hv_pcibus_device *hbus)
3077 {
3078     resource_size_t align;
3079     int ret;
3080 
3081     if (hbus->low_mmio_space) {
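        /* Use the largest power of two not exceeding the requested size as the alignment. */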
3082         align = 1ULL << (63 - __builtin_clzll(hbus->low_mmio_space));
3083         ret = vmbus_allocate_mmio(&hbus->low_mmio_res, hbus->hdev, 0,
3084                       (u64)(u32)0xffffffff,
3085                       hbus->low_mmio_space,
3086                       align, false);
3087         if (ret) {
3088             dev_err(&hbus->hdev->device,
3089                 "Need %#llx of low MMIO space. Consider reconfiguring the VM.\n",
3090                 hbus->low_mmio_space);
3091             return ret;
3092         }
3093 
3094         /* Modify this resource to become a bridge window. */
3095         hbus->low_mmio_res->flags |= IORESOURCE_WINDOW;
3096         hbus->low_mmio_res->flags &= ~IORESOURCE_BUSY;
3097         pci_add_resource(&hbus->bridge->windows, hbus->low_mmio_res);
3098     }
3099 
3100     if (hbus->high_mmio_space) {
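        /* Same power-of-two alignment; this window is placed at or above 4 GiB. */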
3101         align = 1ULL << (63 - __builtin_clzll(hbus->high_mmio_space));
3102         ret = vmbus_allocate_mmio(&hbus->high_mmio_res, hbus->hdev,
3103                       0x100000000, -1,
3104                       hbus->high_mmio_space, align,
3105                       false);
3106         if (ret) {
3107             dev_err(&hbus->hdev->device,
3108                 "Need %#llx of high MMIO space. Consider reconfiguring the VM.\n",
3109                 hbus->high_mmio_space);
3110             goto release_low_mmio;
3111         }
3112 
3113         /* Modify this resource to become a bridge window. */
3114         hbus->high_mmio_res->flags |= IORESOURCE_WINDOW;
3115         hbus->high_mmio_res->flags &= ~IORESOURCE_BUSY;
3116         pci_add_resource(&hbus->bridge->windows, hbus->high_mmio_res);
3117     }
3118 
3119     return 0;
3120 
3121 release_low_mmio:
3122     if (hbus->low_mmio_res) {
3123         vmbus_free_mmio(hbus->low_mmio_res->start,
3124                 resource_size(hbus->low_mmio_res));
3125     }
3126 
3127     return ret;
3128 }
3129 
3130 /**
3131  * hv_allocate_config_window() - Find MMIO space for PCI Config
3132  * @hbus:   Root PCI bus, as understood by this driver
3133  *
3134  * This function claims memory-mapped I/O space for accessing
3135  * configuration space for the functions on this bus.
3136  *
3137  * Return: 0 on success, -errno on failure
3138  */
3139 static int hv_allocate_config_window(struct hv_pcibus_device *hbus)
3140 {
3141     int ret;
3142 
3143     /*
3144      * Set up a region of MMIO space to use for accessing configuration
3145      * space.
3146      */
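    /* Any MMIO range will do (0 through -1); require 4 KiB (0x1000) alignment. */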
3147     ret = vmbus_allocate_mmio(&hbus->mem_config, hbus->hdev, 0, -1,
3148                   PCI_CONFIG_MMIO_LENGTH, 0x1000, false);
3149     if (ret)
3150         return ret;
3151 
3152     /*
3153      * vmbus_allocate_mmio() gets used for allocating both device endpoint
3154      * resource claims (those which cannot be overlapped) and the ranges
3155      * which are valid for the children of this bus, which are intended
3156      * to be overlapped by those children.  Set the flag on this claim
3157      * meaning that this region can't be overlapped.
3158      */
3159 
3160     hbus->mem_config->flags |= IORESOURCE_BUSY;
3161 
3162     return 0;
3163 }
3164 
3165 static void hv_free_config_window(struct hv_pcibus_device *hbus)
3166 {
3167     vmbus_free_mmio(hbus->mem_config->start, PCI_CONFIG_MMIO_LENGTH);
3168 }
3169 
3170 static int hv_pci_bus_exit(struct hv_device *hdev, bool keep_devs);
3171 
3172 /**
3173  * hv_pci_enter_d0() - Bring the "bus" into the D0 power state
3174  * @hdev:   VMBus's tracking struct for this root PCI bus
3175  *
3176  * Return: 0 on success, -errno on failure
3177  */
3178 static int hv_pci_enter_d0(struct hv_device *hdev)
3179 {
3180     struct hv_pcibus_device *hbus = hv_get_drvdata(hdev);
3181     struct pci_bus_d0_entry *d0_entry;
3182     struct hv_pci_compl comp_pkt;
3183     struct pci_packet *pkt;
3184     int ret;
3185 
3186     /*
3187      * Tell the host that the bus is ready to use, and is moving into the
3188      * powered-on state.  This includes telling the host which region
3189      * of memory-mapped I/O space has been chosen for configuration space
3190      * access.
3191      */
3192     pkt = kzalloc(sizeof(*pkt) + sizeof(*d0_entry), GFP_KERNEL);
3193     if (!pkt)
3194         return -ENOMEM;
3195 
3196     init_completion(&comp_pkt.host_event);
3197     pkt->completion_func = hv_pci_generic_compl;
3198     pkt->compl_ctxt = &comp_pkt;
3199     d0_entry = (struct pci_bus_d0_entry *)&pkt->message;
3200     d0_entry->message_type.type = PCI_BUS_D0ENTRY;
3201     d0_entry->mmio_base = hbus->mem_config->start;
3202 
3203     ret = vmbus_sendpacket(hdev->channel, d0_entry, sizeof(*d0_entry),
3204                    (unsigned long)pkt, VM_PKT_DATA_INBAND,
3205                    VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
3206     if (!ret)
3207         ret = wait_for_response(hdev, &comp_pkt.host_event);
3208 
3209     if (ret)
3210         goto exit;
3211 
3212     if (comp_pkt.completion_status < 0) {
3213         dev_err(&hdev->device,
3214             "PCI Pass-through VSP failed D0 Entry with status %x\n",
3215             comp_pkt.completion_status);
3216         ret = -EPROTO;
3217         goto exit;
3218     }
3219 
3220     ret = 0;
3221 
3222 exit:
3223     kfree(pkt);
3224     return ret;
3225 }
3226 
3227 /**
3228  * hv_pci_query_relations() - Ask host to send list of child
3229  * devices
3230  * @hdev:   VMBus's tracking struct for this root PCI bus
3231  *
3232  * Return: 0 on success, -errno on failure
3233  */
3234 static int hv_pci_query_relations(struct hv_device *hdev)
3235 {
3236     struct hv_pcibus_device *hbus = hv_get_drvdata(hdev);
3237     struct pci_message message;
3238     struct completion comp;
3239     int ret;
3240 
3241     /* Ask the host to send along the list of child devices */
3242     init_completion(&comp);
3243     if (cmpxchg(&hbus->survey_event, NULL, &comp))
3244         return -ENOTEMPTY;
3245 
3246     memset(&message, 0, sizeof(message));
3247     message.type = PCI_QUERY_BUS_RELATIONS;
3248 
3249     ret = vmbus_sendpacket(hdev->channel, &message, sizeof(message),
3250                    0, VM_PKT_DATA_INBAND, 0);
3251     if (!ret)
3252         ret = wait_for_response(hdev, &comp);
3253 
3254     return ret;
3255 }
3256 
3257 /**
3258  * hv_send_resources_allocated() - Report local resource choices
3259  * @hdev:   VMBus's tracking struct for this root PCI bus
3260  *
3261  * The host OS expects to be sent a request as a message
3262  * which contains all the resources that the device will use.
3263  * The response contains those same resources, "translated",
3264  * which is to say, the values that the hardware should use
3265  * when it delivers an interrupt.  (MMIO resources are
3266  * used in local terms.)  This is convenient for Windows, and lines up
3267  * with the FDO/PDO split, which doesn't exist in Linux.  Linux
3268  * instead expects to scan an emulated PCI configuration
3269  * space.  So this message is sent here only to drive the state
3270  * machine on the host forward.
3271  *
3272  * Return: 0 on success, -errno on failure
3273  */
3274 static int hv_send_resources_allocated(struct hv_device *hdev)
3275 {
3276     struct hv_pcibus_device *hbus = hv_get_drvdata(hdev);
3277     struct pci_resources_assigned *res_assigned;
3278     struct pci_resources_assigned2 *res_assigned2;
3279     struct hv_pci_compl comp_pkt;
3280     struct hv_pci_dev *hpdev;
3281     struct pci_packet *pkt;
3282     size_t size_res;
3283     int wslot;
3284     int ret;
3285 
3286     size_res = (hbus->protocol_version < PCI_PROTOCOL_VERSION_1_2)
3287             ? sizeof(*res_assigned) : sizeof(*res_assigned2);
3288 
3289     pkt = kmalloc(sizeof(*pkt) + size_res, GFP_KERNEL);
3290     if (!pkt)
3291         return -ENOMEM;
3292 
3293     ret = 0;
3294 
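    /* wslot is an 8-bit encoding of the PCI device and function numbers; walk all 256 possibilities. */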
3295     for (wslot = 0; wslot < 256; wslot++) {
3296         hpdev = get_pcichild_wslot(hbus, wslot);
3297         if (!hpdev)
3298             continue;
3299 
3300         memset(pkt, 0, sizeof(*pkt) + size_res);
3301         init_completion(&comp_pkt.host_event);
3302         pkt->completion_func = hv_pci_generic_compl;
3303         pkt->compl_ctxt = &comp_pkt;
3304 
3305         if (hbus->protocol_version < PCI_PROTOCOL_VERSION_1_2) {
3306             res_assigned =
3307                 (struct pci_resources_assigned *)&pkt->message;
3308             res_assigned->message_type.type =
3309                 PCI_RESOURCES_ASSIGNED;
3310             res_assigned->wslot.slot = hpdev->desc.win_slot.slot;
3311         } else {
3312             res_assigned2 =
3313                 (struct pci_resources_assigned2 *)&pkt->message;
3314             res_assigned2->message_type.type =
3315                 PCI_RESOURCES_ASSIGNED2;
3316             res_assigned2->wslot.slot = hpdev->desc.win_slot.slot;
3317         }
3318         put_pcichild(hpdev);
3319 
3320         ret = vmbus_sendpacket(hdev->channel, &pkt->message,
3321                 size_res, (unsigned long)pkt,
3322                 VM_PKT_DATA_INBAND,
3323                 VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
3324         if (!ret)
3325             ret = wait_for_response(hdev, &comp_pkt.host_event);
3326         if (ret)
3327             break;
3328 
3329         if (comp_pkt.completion_status < 0) {
3330             ret = -EPROTO;
3331             dev_err(&hdev->device,
3332                 "resource allocated returned 0x%x",
3333                 comp_pkt.completion_status);
3334             break;
3335         }
3336 
3337         hbus->wslot_res_allocated = wslot;
3338     }
3339 
3340     kfree(pkt);
3341     return ret;
3342 }
3343 
3344 /**
3345  * hv_send_resources_released() - Report local resources
3346  * released
3347  * @hdev:   VMBus's tracking struct for this root PCI bus
3348  *
3349  * Return: 0 on success, -errno on failure
3350  */
3351 static int hv_send_resources_released(struct hv_device *hdev)
3352 {
3353     struct hv_pcibus_device *hbus = hv_get_drvdata(hdev);
3354     struct pci_child_message pkt;
3355     struct hv_pci_dev *hpdev;
3356     int wslot;
3357     int ret;
3358 
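    /* Walk the slots in reverse, starting from the last one reported as allocated. */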
3359     for (wslot = hbus->wslot_res_allocated; wslot >= 0; wslot--) {
3360         hpdev = get_pcichild_wslot(hbus, wslot);
3361         if (!hpdev)
3362             continue;
3363 
3364         memset(&pkt, 0, sizeof(pkt));
3365         pkt.message_type.type = PCI_RESOURCES_RELEASED;
3366         pkt.wslot.slot = hpdev->desc.win_slot.slot;
3367 
3368         put_pcichild(hpdev);
3369 
3370         ret = vmbus_sendpacket(hdev->channel, &pkt, sizeof(pkt), 0,
3371                        VM_PKT_DATA_INBAND, 0);
3372         if (ret)
3373             return ret;
3374 
3375         hbus->wslot_res_allocated = wslot - 1;
3376     }
3377 
3378     hbus->wslot_res_allocated = -1;
3379 
3380     return 0;
3381 }
3382 
3383 #define HVPCI_DOM_MAP_SIZE (64 * 1024)
3384 static DECLARE_BITMAP(hvpci_dom_map, HVPCI_DOM_MAP_SIZE);
3385 
3386 /*
3387  * PCI domain number 0 is used by emulated devices on Gen1 VMs, so define 0
3388  * as invalid for passthrough PCI devices of this driver.
3389  */
3390 #define HVPCI_DOM_INVALID 0
3391 
3392 /**
3393  * hv_get_dom_num() - Get a valid PCI domain number
3394  * @dom: Requested domain number
3395  *
3396  * Check if the requested PCI domain number is already in use, and if so,
3397  * return a different number that is not in use.
3398  *
3399  * Return: domain number on success, HVPCI_DOM_INVALID on failure
3400  */
3401 static u16 hv_get_dom_num(u16 dom)
3402 {
3403     unsigned int i;
3404 
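    /* Try the requested domain first; on collision, take the first free bit in the map. */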
3405     if (test_and_set_bit(dom, hvpci_dom_map) == 0)
3406         return dom;
3407 
3408     for_each_clear_bit(i, hvpci_dom_map, HVPCI_DOM_MAP_SIZE) {
3409         if (test_and_set_bit(i, hvpci_dom_map) == 0)
3410             return i;
3411     }
3412 
3413     return HVPCI_DOM_INVALID;
3414 }
3415 
3416 /**
3417  * hv_put_dom_num() - Mark the PCI domain number as free
3418  * @dom: Domain number to be freed
3419  */
3420 static void hv_put_dom_num(u16 dom)
3421 {
3422     clear_bit(dom, hvpci_dom_map);
3423 }
3424 
3425 /**
3426  * hv_pci_probe() - New VMBus channel probe, for a root PCI bus
3427  * @hdev:   VMBus's tracking struct for this root PCI bus
3428  * @dev_id: Identifies the device itself
3429  *
3430  * Return: 0 on success, -errno on failure
3431  */
3432 static int hv_pci_probe(struct hv_device *hdev,
3433             const struct hv_vmbus_device_id *dev_id)
3434 {
3435     struct pci_host_bridge *bridge;
3436     struct hv_pcibus_device *hbus;
3437     u16 dom_req, dom;
3438     char *name;
3439     bool enter_d0_retry = true;
3440     int ret;
3441 
3442     /*
3443      * hv_pcibus_device contains the hypercall arguments for retargeting in
3444      * hv_irq_unmask(). Those must not cross a page boundary.
3445      */
3446     BUILD_BUG_ON(sizeof(*hbus) > HV_HYP_PAGE_SIZE);
3447 
3448     bridge = devm_pci_alloc_host_bridge(&hdev->device, 0);
3449     if (!bridge)
3450         return -ENOMEM;
3451 
3452     /*
3453      * With the recent 59bb47985c1d ("mm, sl[aou]b: guarantee natural
3454      * alignment for kmalloc(power-of-two)"), kzalloc() is able to allocate
3455      * a 4KB buffer that is guaranteed to be 4KB-aligned. Here the size and
3456      * alignment of hbus is important because hbus's field
3457      * retarget_msi_interrupt_params must not cross a 4KB page boundary.
3458      *
3459      * Here we prefer kzalloc to get_zeroed_page(), because a buffer
3460      * allocated by the latter is not tracked and scanned by kmemleak, and
3461      * hence kmemleak reports the pointer contained in the hbus buffer
3462      * (i.e. the hpdev struct, which is created in new_pcichild_device() and
3463      * is tracked by hbus->children) as a memory leak (a false positive).
3464      *
3465      * If the kernel doesn't have 59bb47985c1d, get_zeroed_page() *must* be
3466      * used to allocate the hbus buffer and we can avoid the kmemleak false
3467      * positive by using kmemleak_alloc() and kmemleak_free() to ask
3468      * kmemleak to track and scan the hbus buffer.
3469      */
3470     hbus = kzalloc(HV_HYP_PAGE_SIZE, GFP_KERNEL);
3471     if (!hbus)
3472         return -ENOMEM;
3473 
3474     hbus->bridge = bridge;
3475     hbus->state = hv_pcibus_init;
3476     hbus->wslot_res_allocated = -1;
3477 
3478     /*
3479      * The PCI bus "domain" is what is called "segment" in ACPI and other
3480      * specs. Pull it from the instance ID, to get something usually
3481      * unique. In rare cases of collision, we will find another number
3482      * that is not in use.
3483      *
3484      * Note that, since this code only runs in a Hyper-V VM, Hyper-V
3485      * together with this guest driver can guarantee that (1) The only
3486      * domain used by Gen1 VMs for something that looks like a physical
3487      * PCI bus (which is actually emulated by the hypervisor) is domain 0.
3488      * (2) There will be no overlap between domains (after fixing possible
3489      * collisions) in the same VM.
3490      */
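    /* Bytes 4 and 5 of the VMBus instance GUID give the 16-bit candidate domain. */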
3491     dom_req = hdev->dev_instance.b[5] << 8 | hdev->dev_instance.b[4];
3492     dom = hv_get_dom_num(dom_req);
3493 
3494     if (dom == HVPCI_DOM_INVALID) {
3495         dev_err(&hdev->device,
3496             "Unable to use dom# 0x%x or other numbers", dom_req);
3497         ret = -EINVAL;
3498         goto free_bus;
3499     }
3500 
3501     if (dom != dom_req)
3502         dev_info(&hdev->device,
3503              "PCI dom# 0x%x has collision, using 0x%x",
3504              dom_req, dom);
3505 
3506     hbus->bridge->domain_nr = dom;
3507 #ifdef CONFIG_X86
3508     hbus->sysdata.domain = dom;
3509 #elif defined(CONFIG_ARM64)
3510     /*
3511      * Set the PCI bus parent to be the corresponding VMbus
3512      * device. Then the VMbus device will be assigned as the
3513      * ACPI companion in pcibios_root_bridge_prepare() and
3514      * pci_dma_configure() will propagate device coherence
3515      * information to devices created on the bus.
3516      */
3517     hbus->sysdata.parent = hdev->device.parent;
3518 #endif
3519 
3520     hbus->hdev = hdev;
3521     INIT_LIST_HEAD(&hbus->children);
3522     INIT_LIST_HEAD(&hbus->dr_list);
3523     spin_lock_init(&hbus->config_lock);
3524     spin_lock_init(&hbus->device_list_lock);
3525     spin_lock_init(&hbus->retarget_msi_interrupt_lock);
3526     hbus->wq = alloc_ordered_workqueue("hv_pci_%x", 0,
3527                        hbus->bridge->domain_nr);
3528     if (!hbus->wq) {
3529         ret = -ENOMEM;
3530         goto free_dom;
3531     }
3532 
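    /* Configure the VMBus requestor (request-ID table) before opening the channel. */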
3533     hdev->channel->next_request_id_callback = vmbus_next_request_id;
3534     hdev->channel->request_addr_callback = vmbus_request_addr;
3535     hdev->channel->rqstor_size = HV_PCI_RQSTOR_SIZE;
3536 
3537     ret = vmbus_open(hdev->channel, pci_ring_size, pci_ring_size, NULL, 0,
3538              hv_pci_onchannelcallback, hbus);
3539     if (ret)
3540         goto destroy_wq;
3541 
3542     hv_set_drvdata(hdev, hbus);
3543 
3544     ret = hv_pci_protocol_negotiation(hdev, pci_protocol_versions,
3545                       ARRAY_SIZE(pci_protocol_versions));
3546     if (ret)
3547         goto close;
3548 
3549     ret = hv_allocate_config_window(hbus);
3550     if (ret)
3551         goto close;
3552 
3553     hbus->cfg_addr = ioremap(hbus->mem_config->start,
3554                  PCI_CONFIG_MMIO_LENGTH);
3555     if (!hbus->cfg_addr) {
3556         dev_err(&hdev->device,
3557             "Unable to map a virtual address for config space\n");
3558         ret = -ENOMEM;
3559         goto free_config;
3560     }
3561 
3562     name = kasprintf(GFP_KERNEL, "%pUL", &hdev->dev_instance);
3563     if (!name) {
3564         ret = -ENOMEM;
3565         goto unmap;
3566     }
3567 
3568     hbus->fwnode = irq_domain_alloc_named_fwnode(name);
3569     kfree(name);
3570     if (!hbus->fwnode) {
3571         ret = -ENOMEM;
3572         goto unmap;
3573     }
3574 
3575     ret = hv_pcie_init_irq_domain(hbus);
3576     if (ret)
3577         goto free_fwnode;
3578 
3579 retry:
3580     ret = hv_pci_query_relations(hdev);
3581     if (ret)
3582         goto free_irq_domain;
3583 
3584     ret = hv_pci_enter_d0(hdev);
3585     /*
3586      * In certain cases (e.g. kdump) the PCI device of interest was
3587      * not cleanly shut down and its resources are still held on the
3588      * host side, so the host could return an invalid device status.
3589      * We need to explicitly request that the host release the resources
3590      * and then try to enter D0 again.
3591      * Since the hv_pci_bus_exit() call releases the structures
3592      * of all its child devices, we need to start the retry from the
3593      * hv_pci_query_relations() call, requesting the host to send
3594      * the synchronous child device relations message before this
3595      * information is needed in the hv_send_resources_allocated()
3596      * call later.
3597      */
3598     if (ret == -EPROTO && enter_d0_retry) {
3599         enter_d0_retry = false;
3600 
3601         dev_err(&hdev->device, "Retrying D0 Entry\n");
3602 
3603         /*
3604          * hv_pci_bus_exit() calls hv_send_resources_released()
3605          * to free up the resources of its child devices.
3606          * In the kdump kernel we need to set
3607          * wslot_res_allocated to 255 so that it scans all child
3608          * devices to release the resources that the normal kernel
3609          * had allocated before the panic happened.
3610          */
3611         hbus->wslot_res_allocated = 255;
3612         ret = hv_pci_bus_exit(hdev, true);
3613 
3614         if (ret == 0)
3615             goto retry;
3616 
3617         dev_err(&hdev->device,
3618             "Retrying D0 failed with ret %d\n", ret);
3619     }
3620     if (ret)
3621         goto free_irq_domain;
3622 
3623     ret = hv_pci_allocate_bridge_windows(hbus);
3624     if (ret)
3625         goto exit_d0;
3626 
3627     ret = hv_send_resources_allocated(hdev);
3628     if (ret)
3629         goto free_windows;
3630 
3631     prepopulate_bars(hbus);
3632 
3633     hbus->state = hv_pcibus_probed;
3634 
3635     ret = create_root_hv_pci_bus(hbus);
3636     if (ret)
3637         goto free_windows;
3638 
3639     return 0;
3640 
3641 free_windows:
3642     hv_pci_free_bridge_windows(hbus);
3643 exit_d0:
3644     (void) hv_pci_bus_exit(hdev, true);
3645 free_irq_domain:
3646     irq_domain_remove(hbus->irq_domain);
3647 free_fwnode:
3648     irq_domain_free_fwnode(hbus->fwnode);
3649 unmap:
3650     iounmap(hbus->cfg_addr);
3651 free_config:
3652     hv_free_config_window(hbus);
3653 close:
3654     vmbus_close(hdev->channel);
3655 destroy_wq:
3656     destroy_workqueue(hbus->wq);
3657 free_dom:
3658     hv_put_dom_num(hbus->bridge->domain_nr);
3659 free_bus:
3660     kfree(hbus);
3661     return ret;
3662 }
3663 
3664 static int hv_pci_bus_exit(struct hv_device *hdev, bool keep_devs)
3665 {
3666     struct hv_pcibus_device *hbus = hv_get_drvdata(hdev);
3667     struct vmbus_channel *chan = hdev->channel;
3668     struct {
3669         struct pci_packet teardown_packet;
3670         u8 buffer[sizeof(struct pci_message)];
3671     } pkt;
3672     struct hv_pci_compl comp_pkt;
3673     struct hv_pci_dev *hpdev, *tmp;
3674     unsigned long flags;
3675     u64 trans_id;
3676     int ret;
3677 
3678     /*
3679      * After the host sends the RESCIND_CHANNEL message, it doesn't
3680      * access the per-channel ringbuffer any longer.
3681      */
3682     if (chan->rescind)
3683         return 0;
3684 
3685     if (!keep_devs) {
3686         struct list_head removed;
3687 
3688         /* Move all present children to the list on stack */
3689         INIT_LIST_HEAD(&removed);
3690         spin_lock_irqsave(&hbus->device_list_lock, flags);
3691         list_for_each_entry_safe(hpdev, tmp, &hbus->children, list_entry)
3692             list_move_tail(&hpdev->list_entry, &removed);
3693         spin_unlock_irqrestore(&hbus->device_list_lock, flags);
3694 
3695         /* Remove all children in the list */
3696         list_for_each_entry_safe(hpdev, tmp, &removed, list_entry) {
3697             list_del(&hpdev->list_entry);
3698             if (hpdev->pci_slot)
3699                 pci_destroy_slot(hpdev->pci_slot);
3700             /* For the two refs got in new_pcichild_device() */
3701             put_pcichild(hpdev);
3702             put_pcichild(hpdev);
3703         }
3704     }
3705 
3706     ret = hv_send_resources_released(hdev);
3707     if (ret) {
3708         dev_err(&hdev->device,
3709             "Couldn't send resources released packet(s)\n");
3710         return ret;
3711     }
3712 
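    /* Tell the host that the bus is leaving the D0 power state. */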
3713     memset(&pkt.teardown_packet, 0, sizeof(pkt.teardown_packet));
3714     init_completion(&comp_pkt.host_event);
3715     pkt.teardown_packet.completion_func = hv_pci_generic_compl;
3716     pkt.teardown_packet.compl_ctxt = &comp_pkt;
3717     pkt.teardown_packet.message[0].type = PCI_BUS_D0EXIT;
3718 
3719     ret = vmbus_sendpacket_getid(chan, &pkt.teardown_packet.message,
3720                      sizeof(struct pci_message),
3721                      (unsigned long)&pkt.teardown_packet,
3722                      &trans_id, VM_PKT_DATA_INBAND,
3723                      VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
3724     if (ret)
3725         return ret;
3726 
3727     if (wait_for_completion_timeout(&comp_pkt.host_event, 10 * HZ) == 0) {
3728         /*
3729          * The completion packet on the stack becomes invalid after
3730          * 'return'; remove the ID from the VMbus requestor if the
3731          * identifier is still mapped to/associated with the packet.
3732          *
3733          * Cf. hv_pci_onchannelcallback().
3734          */
3735         vmbus_request_addr_match(chan, trans_id,
3736                      (unsigned long)&pkt.teardown_packet);
3737         return -ETIMEDOUT;
3738     }
3739 
3740     return 0;
3741 }
3742 
3743 /**
3744  * hv_pci_remove() - Remove routine for this VMBus channel
3745  * @hdev:   VMBus's tracking struct for this root PCI bus
3746  *
3747  * Return: 0 on success, -errno on failure
3748  */
3749 static int hv_pci_remove(struct hv_device *hdev)
3750 {
3751     struct hv_pcibus_device *hbus;
3752     int ret;
3753 
3754     hbus = hv_get_drvdata(hdev);
3755     if (hbus->state == hv_pcibus_installed) {
3756         tasklet_disable(&hdev->channel->callback_event);
3757         hbus->state = hv_pcibus_removing;
3758         tasklet_enable(&hdev->channel->callback_event);
3759         destroy_workqueue(hbus->wq);
3760         hbus->wq = NULL;
3761         /*
3762          * At this point, no work is running or can be scheduled
3763          * on hbus->wq. We can't race with hv_pci_devices_present()
3764          * or hv_pci_eject_device(), so it's safe to proceed.
3765          */
3766 
3767         /* Remove the bus from PCI's point of view. */
3768         pci_lock_rescan_remove();
3769         pci_stop_root_bus(hbus->bridge->bus);
3770         hv_pci_remove_slots(hbus);
3771         pci_remove_root_bus(hbus->bridge->bus);
3772         pci_unlock_rescan_remove();
3773     }
3774 
3775     ret = hv_pci_bus_exit(hdev, false);
3776 
3777     vmbus_close(hdev->channel);
3778 
3779     iounmap(hbus->cfg_addr);
3780     hv_free_config_window(hbus);
3781     hv_pci_free_bridge_windows(hbus);
3782     irq_domain_remove(hbus->irq_domain);
3783     irq_domain_free_fwnode(hbus->fwnode);
3784 
3785     hv_put_dom_num(hbus->bridge->domain_nr);
3786 
3787     kfree(hbus);
3788     return ret;
3789 }
3790 
3791 static int hv_pci_suspend(struct hv_device *hdev)
3792 {
3793     struct hv_pcibus_device *hbus = hv_get_drvdata(hdev);
3794     enum hv_pcibus_state old_state;
3795     int ret;
3796 
3797     /*
3798      * hv_pci_suspend() must make sure there are no pending work items
3799      * before calling vmbus_close(), since it runs in a process context
3800      * as a callback in dpm_suspend().  When it starts to run, the channel
3801      * callback hv_pci_onchannelcallback(), which runs in a tasklet
3802      * context, can still be running concurrently and scheduling new work
3803      * items onto hbus->wq in hv_pci_devices_present() and
3804      * hv_pci_eject_device(), and the work item handlers can access the
3805      * vmbus channel, which hv_pci_suspend() may be closing, e.g.
3806      * the work item handler pci_devices_present_work() ->
3807      * new_pcichild_device() writes to the vmbus channel.
3808      *
3809      * To eliminate the race, hv_pci_suspend() disables the channel
3810      * callback tasklet, sets hbus->state to hv_pcibus_removing, and
3811      * re-enables the tasklet. This way, when hv_pci_suspend() proceeds,
3812      * it knows that no new work item can be scheduled, and then it flushes
3813      * hbus->wq and safely closes the vmbus channel.
3814      */
3815     tasklet_disable(&hdev->channel->callback_event);
3816 
3817     /* Change the hbus state to prevent new work items. */
3818     old_state = hbus->state;
3819     if (hbus->state == hv_pcibus_installed)
3820         hbus->state = hv_pcibus_removing;
3821 
3822     tasklet_enable(&hdev->channel->callback_event);
3823 
3824     if (old_state != hv_pcibus_installed)
3825         return -EINVAL;
3826 
3827     flush_workqueue(hbus->wq);
3828 
3829     ret = hv_pci_bus_exit(hdev, true);
3830     if (ret)
3831         return ret;
3832 
3833     vmbus_close(hdev->channel);
3834 
3835     return 0;
3836 }
3837 
3838 static int hv_pci_restore_msi_msg(struct pci_dev *pdev, void *arg)
3839 {
3840     struct irq_data *irq_data;
3841     struct msi_desc *entry;
3842     int ret = 0;
3843 
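    /* Recompute each message via hv_compose_msi_msg(), so the hypervisor re-creates its remapping entry. */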
3844     msi_lock_descs(&pdev->dev);
3845     msi_for_each_desc(entry, &pdev->dev, MSI_DESC_ASSOCIATED) {
3846         irq_data = irq_get_irq_data(entry->irq);
3847         if (WARN_ON_ONCE(!irq_data)) {
3848             ret = -EINVAL;
3849             break;
3850         }
3851 
3852         hv_compose_msi_msg(irq_data, &entry->msg);
3853     }
3854     msi_unlock_descs(&pdev->dev);
3855 
3856     return ret;
3857 }
3858 
3859 /*
3860  * Upon resume, pci_restore_msi_state() -> ... -> __pci_write_msi_msg()
3861  * directly writes the MSI/MSI-X registers via MMIO, but since Hyper-V
3862  * doesn't trap and emulate the MMIO accesses, here hv_compose_msi_msg()
3863  * must be used to ask Hyper-V to re-create the IOMMU Interrupt Remapping
3864  * Table entries.
3865  */
3866 static void hv_pci_restore_msi_state(struct hv_pcibus_device *hbus)
3867 {
3868     pci_walk_bus(hbus->bridge->bus, hv_pci_restore_msi_msg, NULL);
3869 }
3870 
3871 static int hv_pci_resume(struct hv_device *hdev)
3872 {
3873     struct hv_pcibus_device *hbus = hv_get_drvdata(hdev);
3874     enum pci_protocol_version_t version[1];
3875     int ret;
3876 
3877     hbus->state = hv_pcibus_init;
3878 
3879     hdev->channel->next_request_id_callback = vmbus_next_request_id;
3880     hdev->channel->request_addr_callback = vmbus_request_addr;
3881     hdev->channel->rqstor_size = HV_PCI_RQSTOR_SIZE;
3882 
3883     ret = vmbus_open(hdev->channel, pci_ring_size, pci_ring_size, NULL, 0,
3884              hv_pci_onchannelcallback, hbus);
3885     if (ret)
3886         return ret;
3887 
3888     /* Only use the version that was in use before hibernation. */
3889     version[0] = hbus->protocol_version;
3890     ret = hv_pci_protocol_negotiation(hdev, version, 1);
3891     if (ret)
3892         goto out;
3893 
3894     ret = hv_pci_query_relations(hdev);
3895     if (ret)
3896         goto out;
3897 
3898     ret = hv_pci_enter_d0(hdev);
3899     if (ret)
3900         goto out;
3901 
3902     ret = hv_send_resources_allocated(hdev);
3903     if (ret)
3904         goto out;
3905 
3906     prepopulate_bars(hbus);
3907 
3908     hv_pci_restore_msi_state(hbus);
3909 
3910     hbus->state = hv_pcibus_installed;
3911     return 0;
3912 out:
3913     vmbus_close(hdev->channel);
3914     return ret;
3915 }
3916 
3917 static const struct hv_vmbus_device_id hv_pci_id_table[] = {
3918     /* PCI Pass-through Class ID */
3919     /* 44C4F61D-4444-4400-9D52-802E27EDE19F */
3920     { HV_PCIE_GUID, },
3921     { },
3922 };
3923 
3924 MODULE_DEVICE_TABLE(vmbus, hv_pci_id_table);
3925 
3926 static struct hv_driver hv_pci_drv = {
3927     .name       = "hv_pci",
3928     .id_table   = hv_pci_id_table,
3929     .probe      = hv_pci_probe,
3930     .remove     = hv_pci_remove,
3931     .suspend    = hv_pci_suspend,
3932     .resume     = hv_pci_resume,
3933 };
3934 
3935 static void __exit exit_hv_pci_drv(void)
3936 {
3937     vmbus_driver_unregister(&hv_pci_drv);
3938 
3939     hvpci_block_ops.read_block = NULL;
3940     hvpci_block_ops.write_block = NULL;
3941     hvpci_block_ops.reg_blk_invalidate = NULL;
3942 }
3943 
3944 static int __init init_hv_pci_drv(void)
3945 {
3946     int ret;
3947 
3948     if (!hv_is_hyperv_initialized())
3949         return -ENODEV;
3950 
3951     ret = hv_pci_irqchip_init();
3952     if (ret)
3953         return ret;
3954 
3955     /* Set the invalid domain number's bit, so it will not be used */
3956     set_bit(HVPCI_DOM_INVALID, hvpci_dom_map);
3957 
3958     /* Initialize PCI block r/w interface */
3959     hvpci_block_ops.read_block = hv_read_config_block;
3960     hvpci_block_ops.write_block = hv_write_config_block;
3961     hvpci_block_ops.reg_blk_invalidate = hv_register_block_invalidate;
3962 
3963     return vmbus_driver_register(&hv_pci_drv);
3964 }
3965 
3966 module_init(init_hv_pci_drv);
3967 module_exit(exit_hv_pci_drv);
3968 
3969 MODULE_DESCRIPTION("Hyper-V PCI");
3970 MODULE_LICENSE("GPL v2");