Sfoglia il codice sorgente

DragonOS虚拟化 (#389)

* try some ioctl flow & kvm device

* add sys ioctl

* 删掉一些debug信息

* 修改run-qemu.sh脚本,在QEMU中enable vmx

* 修改cr0,cr4,msr寄存器enable VMX operations

* enable vmx operation

* allocate memory for vmcs with bug

* allocate memory for vmcs

* cpu virt-50%

* single vcpu virt

* add vmcs fields

* CPU virt overall flow with bug

* run vmlaunch success

* run CPU virt with bug

* 成功运行non-root模式的guest

* 成功运行vmexit,进入vmx_return函数

* 成功运行vmlaunch, vmexit, vmresume

* vmexit handler with bug

* 完成vmexit cpuid handler

* fix vmresume guest状态恢复的bug

* 增加vm ioctl

* refactor kvm 50%

* refactor kvm 80%

* FIXME: kvm vmlaunch failed

* vmlaunch success

* FIXME: output error

* update guest_rsp

* cpu virt refactor

* add mmu related struct

* add usermemory region workflow

* add mem-virt workflow

* add mem-virt

* refactor code

* add vcpu ioctl set_regs

* rename hypervisor to vm & solve some deadlock bugs

* workout mem pipeline

* fix vmcs control setting bugs

* refactor segment regs initialization

* resolve conflicts

* resolve conflicts

* format code
Xiaoye Zheng 1 anno fa
parent
commit
40314b30ab
45 ha cambiato i file con 3652 aggiunte e 12 eliminazioni
  1. 5 1
      kernel/Cargo.toml
  2. 117 0
      kernel/src/arch/x86_64/kvm/mod.rs
  3. 112 0
      kernel/src/arch/x86_64/kvm/vmx/ept.rs
  4. 7 0
      kernel/src/arch/x86_64/kvm/vmx/kvm_emulation.rs
  5. 254 0
      kernel/src/arch/x86_64/kvm/vmx/mmu.rs
  6. 45 0
      kernel/src/arch/x86_64/kvm/vmx/mod.rs
  7. 89 0
      kernel/src/arch/x86_64/kvm/vmx/seg.rs
  8. 653 0
      kernel/src/arch/x86_64/kvm/vmx/vcpu.rs
  9. 539 0
      kernel/src/arch/x86_64/kvm/vmx/vmcs.rs
  10. 269 0
      kernel/src/arch/x86_64/kvm/vmx/vmexit.rs
  11. 96 0
      kernel/src/arch/x86_64/kvm/vmx/vmx_asm_wrapper.rs
  12. 18 6
      kernel/src/arch/x86_64/mm/mod.rs
  13. 3 0
      kernel/src/arch/x86_64/mod.rs
  14. 5 0
      kernel/src/filesystem/devfs/mod.rs
  15. 1 0
      kernel/src/filesystem/vfs/file.rs
  16. 3 0
      kernel/src/filesystem/vfs/mod.rs
  17. 23 1
      kernel/src/filesystem/vfs/syscall.rs
  18. 1 0
      kernel/src/ktest/ktest.h
  19. 23 0
      kernel/src/ktest/test-kvm.c
  20. 4 0
      kernel/src/lib.rs
  21. 5 0
      kernel/src/main.c
  22. 27 0
      kernel/src/mm/allocator/kernel_allocator.rs
  23. 4 0
      kernel/src/mm/mod.rs
  24. 21 0
      kernel/src/syscall/mod.rs
  25. 190 0
      kernel/src/virt/kvm/host_mem.rs
  26. 2 0
      kernel/src/virt/kvm/kvm.h
  27. 188 0
      kernel/src/virt/kvm/kvm_dev.rs
  28. 85 0
      kernel/src/virt/kvm/mod.rs
  29. 9 0
      kernel/src/virt/kvm/vcpu.rs
  30. 212 0
      kernel/src/virt/kvm/vcpu_dev.rs
  31. 175 0
      kernel/src/virt/kvm/vm.rs
  32. 224 0
      kernel/src/virt/kvm/vm_dev.rs
  33. 1 0
      kernel/src/virt/mod.rs
  34. 1 1
      tools/.gdbinit
  35. 2 2
      tools/run-qemu.sh
  36. 1 1
      user/apps/Makefile
  37. 9 0
      user/apps/test_kvm/Makefile
  38. 3 0
      user/apps/test_kvm/bootstrap/Makefile
  39. BIN
      user/apps/test_kvm/bootstrap/boot.bin
  40. 32 0
      user/apps/test_kvm/bootstrap/boot.hex
  41. 54 0
      user/apps/test_kvm/link.lds
  42. 114 0
      user/apps/test_kvm/main.c
  43. 13 0
      user/libs/libc/src/fcntl.c
  44. 12 0
      user/libs/libc/src/include/export/fcntl.h
  45. 1 0
      user/libs/libsystem/syscall.h

+ 5 - 1
kernel/Cargo.toml

@@ -17,6 +17,7 @@ members = [ "src/libs/intertrait" ]
 x86 = "0.52.0"
 x86_64 = "0.14.10"
 bitflags = "1.3.2"
+bitfield-struct = "0.5.3"
 virtio-drivers = { git = "https://git.mirrors.dragonos.org/DragonOS-Community/virtio-drivers.git", rev = "f1d1cbb" }
 # 一个无锁MPSC队列
 thingbuf = { version = "0.1.3", default-features = false, features = ["alloc"] }
@@ -46,7 +47,10 @@ version = "1.4.0"
 # 由于在no_std环境,而lazy_static依赖了spin库,因此需要指定其使用no_std
 features = ["spin_no_std"]
 
-
+# The development profile, used for `cargo build`
+[profile.dev]
+# opt-level = 0  # Controls the --opt-level the compiler builds with
+debug = true   # Controls whether the compiler passes `-g`
 # The release profile, used for `cargo build --release`
 [profile.release]
 debug = false

+ 117 - 0
kernel/src/arch/x86_64/kvm/mod.rs

@@ -0,0 +1,117 @@
+use crate::arch::kvm::vmx::vmcs::VmcsFields;
+use crate::arch::kvm::vmx::vmx_asm_wrapper::{vmx_vmlaunch, vmx_vmread};
+use crate::libs::mutex::Mutex;
+use crate::virt::kvm::vm;
+use crate::{
+    kdebug,
+    kerror,
+    // libs::spinlock::{SpinLock, SpinLockGuard},
+    syscall::SystemError,
+};
+use alloc::sync::Arc;
+use core::arch::asm;
+use raw_cpuid::CpuId;
+// use crate::virt::kvm::guest_code;
+use self::vmx::mmu::{kvm_mmu_setup, kvm_vcpu_mtrr_init};
+use self::vmx::vcpu::VmxVcpu;
+pub mod vmx;
+
+/// x86_64-specific KVM state.
+///
+/// The struct currently has no live fields; the MMU page bookkeeping listed
+/// below is commented out.
+#[derive(Default, Debug, Clone)]
+pub struct X86_64KVMArch {
+    // n_used_mmu_pages: u32,
+    // n_requested_mmu_pages: u32,
+    // n_max_mmu_pages: u32,
+    // mmu_valid_gen: u64,
+    // // mmu_page_hash:[],
+    // active_mmu_pages: LinkedList<KvmMmuPage>, // every allocated mmu page is linked onto active_mmu_pages
+    // zapped_obsolete_pages: LinkedList<KvmMmuPage>, // released mmu pages are linked onto zapped_obsolete_pages, a global invalid_list
+}
+
+impl X86_64KVMArch {
+    /// Checks whether this CPU can run virtual machines.
+    ///
+    /// Succeeds only when CPUID reports the vendor string `GenuineIntel` and the
+    /// VMX feature flag. A missing vendor or feature leaf is treated as
+    /// "not supported" instead of silently passing the check.
+    ///
+    /// # Errors
+    ///
+    /// Returns `EOPNOTSUPP_OR_ENOTSUP` when either condition cannot be confirmed.
+    pub fn kvm_arch_cpu_supports_vm() -> Result<(), SystemError> {
+        let cpuid = CpuId::new();
+        // Check to see if CPU is Intel (“GenuineIntel”).
+        let is_intel = cpuid
+            .get_vendor_info()
+            .map_or(false, |vi| vi.as_str() == "GenuineIntel");
+        if !is_intel {
+            return Err(SystemError::EOPNOTSUPP_OR_ENOTSUP);
+        }
+        // Check processor supports for Virtual Machine Extension (VMX) technology
+        // CPUID.1:ECX.VMX[bit 5] = 1 (Intel Manual: 24.6 Discovering Support for VMX)
+        let has_vmx = cpuid.get_feature_info().map_or(false, |fi| fi.has_vmx());
+        if !has_vmx {
+            return Err(SystemError::EOPNOTSUPP_OR_ENOTSUP);
+        }
+        Ok(())
+    }
+
+    /// Initializes architecture-specific KVM state. Currently a no-op.
+    pub fn kvm_arch_init() -> Result<(), SystemError> {
+        Ok(())
+    }
+
+    /// Handles architecture-specific ioctls on the KVM device.
+    ///
+    /// No command is implemented yet, so every `cmd` is logged and rejected
+    /// with `EINVAL`. The `match` is kept as the extension point for future
+    /// commands.
+    pub fn kvm_arch_dev_ioctl(cmd: u32, _arg: usize) -> Result<usize, SystemError> {
+        match cmd {
+            _ => {
+                kerror!("unknown kvm ioctl cmd: {}", cmd);
+                Err(SystemError::EINVAL)
+            }
+        }
+    }
+
+    /// Creates the vCPU with the given `id`, attached to VM 0.
+    ///
+    /// # Errors
+    ///
+    /// Propagates any error from `VmxVcpu::new` instead of panicking.
+    pub fn kvm_arch_vcpu_create(id: u32) -> Result<Arc<Mutex<VmxVcpu>>, SystemError> {
+        // let guest_rip = current_kvm.lock().memslots[0].memslots[0].userspace_addr;
+        // NOTE(review): `vm(0).unwrap()` still panics when VM 0 does not exist;
+        // the return type of `vm` is not visible here, so it is left unchanged.
+        let vcpu = VmxVcpu::new(id, vm(0).unwrap())?;
+        Ok(Arc::new(Mutex::new(vcpu)))
+    }
+
+    /// Prepares a freshly created vCPU: runs the MTRR check, then installs the
+    /// MMU callbacks.
+    pub fn kvm_arch_vcpu_setup(vcpu: &Mutex<VmxVcpu>) -> Result<(), SystemError> {
+        kvm_vcpu_mtrr_init(vcpu)?;
+        kvm_mmu_setup(vcpu);
+        Ok(())
+    }
+
+    /// Enters the guest with `vmlaunch`.
+    ///
+    /// # Errors
+    ///
+    /// Returns the error reported by `vmx_vmlaunch`. The VM-instruction error
+    /// field is read for diagnostics only; a failure to read it is logged and
+    /// never panics, so the original error always reaches the caller.
+    pub fn kvm_arch_vcpu_ioctl_run(_vcpu: &Mutex<VmxVcpu>) -> Result<(), SystemError> {
+        if let Err(e) = vmx_vmlaunch() {
+            match vmx_vmread(VmcsFields::VMEXIT_INSTR_ERR as u32) {
+                Ok(vmx_err) => {
+                    kdebug!("vmlaunch failed: {:?}", vmx_err);
+                }
+                Err(_) => {
+                    kdebug!("vmlaunch failed: VM-instruction error field unreadable");
+                }
+            }
+            return Err(e);
+        }
+        Ok(())
+    }
+
+    // pub fn kvm_arch_create_memslot(_slot: &mut KvmMemorySlot, _npages: u64) {
+
+    // }
+
+    // pub fn kvm_arch_commit_memory_region(
+    //     _mem: &KvmUserspaceMemoryRegion,
+    //     _new_slot: &KvmMemorySlot,
+    //     _old_slot: &KvmMemorySlot,
+    //     _change: KvmMemoryChange) {
+    //         // let kvm = KVM();
+    //         // let mut num_mmu_pages = 0;
+    //         // if kvm.lock().arch.n_requested_mmu_pages == 0{
+    //         //     num_mmu_pages = kvm_mmu_calculate_mmu_pages();
+    //         // }
+    //         // if num_mmu_pages != 0 {
+    //         //     // kvm_mmu_change_mmu_pages(num_mmu_pages);
+    //         // }
+    // }
+}
+
+/// Test payload: logs a message, then loops forever executing `cpuid`
+/// (with RAX and RCX zeroed), a `nop`, and another log message.
+///
+/// Exported unmangled with the C ABI.
+// NOTE(review): the first `asm!` block writes rax/rcx and `cpuid` overwrites
+// further registers, but no clobbers are declared — confirm this is acceptable
+// for how this function is used.
+#[no_mangle]
+pub extern "C" fn guest_code() {
+    kdebug!("guest_code");
+    loop {
+        unsafe {
+            asm!("mov rax, 0", "mov rcx, 0", "cpuid");
+        }
+        unsafe { asm!("nop") };
+        kdebug!("guest_code");
+    }
+}

+ 112 - 0
kernel/src/arch/x86_64/kvm/vmx/ept.rs

@@ -0,0 +1,112 @@
+use crate::arch::mm::PageMapper;
+use crate::arch::MMArch;
+use crate::mm::page::PageFlags;
+use crate::mm::{PageTableKind, PhysAddr, VirtAddr};
+use crate::smp::core::smp_get_processor_id;
+use crate::{arch::mm::LockedFrameAllocator, syscall::SystemError};
+use core::sync::atomic::{compiler_fence, AtomicUsize, Ordering};
+use x86::msr;
+
+/// Verifies that bit 11 of the `IA32_MTRR_DEF_TYPE` MSR is set.
+///
+/// # Errors
+///
+/// Returns `EOPNOTSUPP_OR_ENOTSUP` when the bit is clear.
+pub fn check_ept_features() -> Result<(), SystemError> {
+    const MTRR_ENABLE_BIT: u64 = 1 << 11;
+    let def_type = unsafe { msr::rdmsr(msr::IA32_MTRR_DEF_TYPE) };
+    match def_type & MTRR_ENABLE_BIT {
+        0 => Err(SystemError::EOPNOTSUPP_OR_ENOTSUP),
+        _ => Ok(()),
+    }
+}
+
+// pub fn ept_build_mtrr_map() -> Result<(), SystemError> {
+// let ia32_mtrr_cap = unsafe { msr::rdmsr(msr::IA32_MTRRCAP) };
+// Ok(())
+// }
+
+/// Sentinel meaning that no processor currently holds the mapper lock.
+/// A dedicated value is needed because `AtomicUsize::new(0)` would treat 0 as
+/// a valid processor id.
+const EPT_MAPPER_NO_PROCESSOR: usize = !0;
+/// Id of the processor that currently holds the mapper lock.
+static EPT_MAPPER_LOCK_OWNER: AtomicUsize = AtomicUsize::new(EPT_MAPPER_NO_PROCESSOR);
+/// Lock counter of the mapper (incremented on every successful lock).
+static EPT_MAPPER_LOCK_COUNT: AtomicUsize = AtomicUsize::new(0);
+
+/// Handle used to insert guest-physical to host-physical mappings into the
+/// EPT page table. Obtained through `EptMapper::lock()`.
+pub struct EptMapper {
+    /// EPT page-table mapper
+    mapper: PageMapper,
+    /// Marks whether this mapper is read-only
+    readonly: bool,
+    // Root address of the EPT page table
+    // root_hpa: PhysAddr,
+}
+
+impl EptMapper {
+    /// Acquires the global EPT mapper lock on behalf of processor `cpuid` and
+    /// wraps `mapper` in a handle.
+    ///
+    /// The lock is re-entrant per processor: if `cpuid` already owns it, the
+    /// call succeeds immediately but the returned handle is read-only. Any
+    /// other processor spins until the owner releases the lock (see the `Drop`
+    /// impl below).
+    fn lock_cpu(cpuid: usize, mapper: PageMapper) -> Self {
+        loop {
+            match EPT_MAPPER_LOCK_OWNER.compare_exchange_weak(
+                EPT_MAPPER_NO_PROCESSOR,
+                cpuid,
+                Ordering::Acquire,
+                Ordering::Relaxed,
+            ) {
+                Ok(_) => break,
+                // This processor already holds the lock
+                Err(id) if id == cpuid => break,
+                // either CAS failed, or some other hardware thread holds the lock
+                Err(_) => core::hint::spin_loop(),
+            }
+        }
+
+        let prev_count = EPT_MAPPER_LOCK_COUNT.fetch_add(1, Ordering::Relaxed);
+        compiler_fence(Ordering::Acquire);
+
+        // The local core already held the lock, so the mapper obtained by this
+        // acquisition is marked read-only
+        let readonly = prev_count > 0;
+
+        Self { mapper, readonly }
+    }
+
+    /// Locks the EPT mapper for the current processor and returns a handle
+    /// built on the current EPT page table.
+    #[inline(always)]
+    pub fn lock() -> Self {
+        let cpuid = smp_get_processor_id() as usize;
+        let mapper = unsafe { PageMapper::current(PageTableKind::EPT, LockedFrameAllocator) };
+        Self::lock_cpu(cpuid, mapper)
+    }
+
+    /// Maps a guest physical address (gpa) to the given host physical address (hpa).
+    ///
+    /// ## Parameters
+    ///
+    /// - `gpa`: guest physical address to map
+    /// - `hpa`: host physical address to map it to
+    /// - `flags`: page flags
+    ///
+    /// ## Returns
+    ///
+    /// - Success: `Ok(())`
+    /// - Failure: `EAGAIN_OR_EWOULDBLOCK` if this mapper is read-only;
+    ///   `ENOMEM` if the page mapper could not create the mapping (previously
+    ///   this case panicked).
+    pub unsafe fn walk(
+        &mut self,
+        gpa: u64,
+        hpa: u64,
+        flags: PageFlags<MMArch>,
+    ) -> Result<(), SystemError> {
+        if self.readonly {
+            return Err(SystemError::EAGAIN_OR_EWOULDBLOCK);
+        }
+        self.mapper
+            .map_phys(
+                VirtAddr::new(gpa as usize),
+                PhysAddr::new(hpa as usize),
+                flags,
+            )
+            .ok_or(SystemError::ENOMEM)?
+            .flush();
+        Ok(())
+    }
+
+    // fn get_ept_index(addr: u64, level: usize) -> u64 {
+    //     let pt64_level_shift = PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS;
+    //     (addr >> pt64_level_shift) & ((1 << PT64_LEVEL_BITS) - 1)
+    // }
+}
+
+impl Drop for EptMapper {
+    /// Releases one level of the re-entrant lock taken in `lock_cpu`.
+    ///
+    /// Without this, the counter only ever grows: the owner is never cleared,
+    /// every later handle on the owning processor is read-only, and every
+    /// other processor spins forever in `lock_cpu`.
+    fn drop(&mut self) {
+        let prev_count = EPT_MAPPER_LOCK_COUNT.fetch_sub(1, Ordering::Relaxed);
+        if prev_count == 1 {
+            // Last handle on this processor: hand the lock back.
+            EPT_MAPPER_LOCK_OWNER.store(EPT_MAPPER_NO_PROCESSOR, Ordering::Release);
+        }
+        compiler_fence(Ordering::Release);
+        // NOTE(review): if the mapper can ever be locked from interrupt
+        // context, interrupts should be disabled around the two atomic
+        // operations above — confirm against callers.
+    }
+}

+ 7 - 0
kernel/src/arch/x86_64/kvm/vmx/kvm_emulation.rs

@@ -0,0 +1,7 @@
+// pub struct X86Exception {
+// 	vector: u8,
+// 	error_code_valid: bool,
+// 	error_code: u16,
+// 	// bool nested_page_fault;
+// 	address: u64, /* cr2 or nested page fault gpa */
+// }

+ 254 - 0
kernel/src/arch/x86_64/kvm/vmx/mmu.rs

@@ -0,0 +1,254 @@
+use crate::{
+    arch::kvm::vmx::ept::EptMapper,
+    kdebug,
+    libs::mutex::Mutex,
+    mm::{page::PageFlags, syscall::ProtFlags},
+    syscall::SystemError,
+    virt::kvm::host_mem::{__gfn_to_pfn, kvm_vcpu_gfn_to_memslot, PAGE_MASK, PAGE_SHIFT},
+};
+use bitfield_struct::bitfield;
+
+use super::{
+    ept::check_ept_features,
+    vcpu::VmxVcpu,
+    vmcs::VmcsFields,
+    vmx_asm_wrapper::{vmx_vmread, vmx_vmwrite},
+};
+use crate::arch::kvm::vmx::mmu::VmcsFields::CTRL_EPTP_PTR;
+
+// pub const PT64_ROOT_LEVEL: u32 = 4;
+// pub const PT32_ROOT_LEVEL: u32 = 2;
+// pub const PT32E_ROOT_LEVEL: u32 = 3;
+
+// pub struct KvmMmuPage{
+//     gfn: u64, // 管理地址范围的起始地址对应的 gfn
+//     role: KvmMmuPageRole, // 基本信息,包括硬件特性和所属层级等
+//     // spt: *mut u64, // spt: shadow page table,指向 struct page 的地址,其包含了所有页表项 (pte)。同时 page->private 会指向该 kvm_mmu_page
+// }
+
+// Packed 32-bit role descriptor of an MMU page.
+#[bitfield(u32)]
+pub struct KvmMmuPageRole {
+    #[bits(4)]
+    level: usize, // level of the page in the paging hierarchy
+    cr4_pae: bool, // cr4.pae; 1 means 64-bit gptes are used
+    #[bits(2)]
+    quadrant: usize, // if cr4.pae=0 the gpte is 32-bit while the spte is 64-bit, so several sptes represent one gpte; this field says which part of the gpte this is
+    direct: bool,
+    #[bits(3)]
+    access: usize, // access permissions
+    invalid: bool,        // invalid; destroyed as soon as it is unpinned
+    nxe: bool,            // efer.nxe, no-execute
+    cr0_wp: bool,         // cr0.wp, write protect
+    smep_andnot_wp: bool, // smep && !cr0.wp; with SMEP enabled, user-mode code cannot execute instructions located in the kernel address space
+    smap_andnot_wp: bool, // smap && !cr0.wp
+    #[bits(8)]
+    unused: usize,
+    #[bits(8)]
+    smm: usize, // 1 means system management mode, 0 means not in SMM
+}
+
+//  We don't want allocation failures within the mmu code, so we preallocate
+// enough memory for a single page fault in a cache.
+// pub struct KvmMmuMemoryCache {
+//     num_objs: u32,
+//     objs: [*mut u8; KVM_NR_MEM_OBJS as usize],
+// }
+
+/// Per-vCPU MMU context: the paging root plus the callbacks installed by
+/// `init_kvm_tdp_mmu`.
+#[derive(Default)]
+pub struct KvmMmu {
+    pub root_hpa: u64,
+    pub root_level: u32,
+    pub base_role: KvmMmuPageRole,
+    // ...there are further variables whose purpose is not yet known
+    pub get_cr3: Option<fn(&VmxVcpu) -> u64>,
+    pub set_eptp: Option<fn(u64) -> Result<(), SystemError>>,
+    pub page_fault: Option<
+        fn(
+            vcpu: &mut VmxVcpu,
+            gpa: u64,
+            error_code: u32,
+            prefault: bool,
+        ) -> Result<(), SystemError>,
+    >,
+    // get_pdptr: Option<fn(& VmxVcpu, index:u32) -> u64>, // Page Directory Pointer Table Register? not yet clear how it differs from CR3
+    // inject_page_fault: Option<fn(&mut VmxVcpu, fault: &X86Exception)>,
+    // gva_to_gpa: Option<fn(&mut VmxVcpu, gva: u64, access: u32, exception: &X86Exception) -> u64>,
+    // translate_gpa: Option<fn(&mut VmxVcpu, gpa: u64, access: u32, exception: &X86Exception) -> u64>,
+    // sync_page: Option<fn(&mut VmxVcpu, &mut KvmMmuPage)>,
+    // invlpg: Option<fn(&mut VmxVcpu, gva: u64)>, // invalid entry
+    // update_pte: Option<fn(&mut VmxVcpu, sp: &KvmMmuPage, spte: u64, pte: u64)>,
+}
+
+impl core::fmt::Debug for KvmMmu {
+    /// Prints only `root_hpa`, `root_level` and `base_role`; the callback
+    /// fields are left out of the output.
+    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+        let mut out = f.debug_struct("KvmMmu");
+        out.field("root_hpa", &self.root_hpa);
+        out.field("root_level", &self.root_level);
+        out.field("base_role", &self.base_role);
+        out.finish()
+    }
+}
+
+/// Returns the guest CR3 value stored in the current VMCS.
+///
+/// # Panics
+///
+/// Panics if the `GUEST_CR3` field cannot be read. The message now names the
+/// field actually being read (it previously said "eptp").
+fn tdp_get_cr3(_vcpu: &VmxVcpu) -> u64 {
+    vmx_vmread(VmcsFields::GUEST_CR3 as u32).expect("Failed to read guest CR3")
+}
+
+/// Builds the EPT pointer for `root_hpa` and writes it to the `CTRL_EPTP_PTR`
+/// VMCS field.
+///
+/// The low bits are hard-coded: bits 2:0 stay 0 (the original comment
+/// describes this as the "Uncacheable" paging-structure memory type) and
+/// bits 5:3 hold 3.
+fn tdp_set_eptp(root_hpa: u64) -> Result<(), SystemError> {
+    // This value is 1 less than the EPT page-walk length.  3 means 4-level paging.
+    let walk_length_bits: u64 = 0x3 << 3;
+    // Only the bits of the root address selected by PAGE_MASK are kept.
+    let root_bits = root_hpa & (PAGE_MASK as u64);
+    let eptp = walk_length_bits | root_bits;
+    vmx_vmwrite(CTRL_EPTP_PTR as u32, eptp)?;
+    Ok(())
+}
+
+/// Handles a guest page fault at guest-physical address `gpa` by resolving
+/// the backing host frame and mapping it into the EPT.
+///
+/// ## Parameters
+///
+/// - `vcpu`: the faulting vCPU
+/// - `gpa`: faulting guest physical address
+/// - `error_code`: fault flags; bit 1 is interpreted as "write access"
+/// - `prefault`: forwarded unchanged to the helpers
+///
+/// ## Errors
+///
+/// Propagates failures from the cache top-up, the gfn-to-pfn lookup and the
+/// EPT mapping step.
+fn tdp_page_fault(
+    vcpu: &mut VmxVcpu,
+    gpa: u64,
+    error_code: u32,
+    prefault: bool,
+) -> Result<(), SystemError> {
+    kdebug!("tdp_page_fault");
+    // Shift the guest physical address right by PAGE_SHIFT to get the guest frame number.
+    let gfn = gpa >> PAGE_SHIFT;
+    // Top up the caches first so that no allocation can fail later on this path.
+    mmu_topup_memory_caches(vcpu)?;
+    // TODO: determine the level used by this gfn and handle huge pages
+    let level = 1; // 4KB page
+    // TODO: fast path for violations caused by reads/writes, i.e. non-mmio
+    // page faults on a present page that already has write permission
+    // fast_page_fault(vcpu, gpa, level, error_code)
+    let mut map_writable = false;
+    let write = error_code & ((1 as u32) << 1);
+    // The lookup takes "is this a write access?". The previous code passed
+    // `write == 0`, which is true exactly when the fault is NOT a write.
+    let is_write = write != 0;
+    // gfn -> pfn
+    let pfn = mmu_gfn_to_pfn_fast(vcpu, gpa, prefault, gfn, is_write, &mut map_writable)?;
+    // The direct map step installs the translation into the EPT page table.
+    __direct_map(vcpu, gpa, write, map_writable, level, gfn, pfn, prefault)?;
+    Ok(())
+}
+
+/*
+ * Calculate mmu pages needed for kvm.
+ */
+// pub fn kvm_mmu_calculate_mmu_pages() -> u32 {
+// 	let mut nr_mmu_pages:u32;
+//     let mut nr_pages = 0;
+
+//     let kvm = vm(0).unwrap();
+//     for as_id in 0..KVM_ADDRESS_SPACE_NUM {
+//         let slots = kvm.memslots[as_id];
+//         for i in 0..KVM_MEM_SLOTS_NUM {
+//             let memslot = slots.memslots[i as usize];
+//             nr_pages += memslot.npages;
+//         }
+//     }
+
+// 	nr_mmu_pages = (nr_pages as u32)* KVM_PERMILLE_MMU_PAGES / 1000;
+// 	nr_mmu_pages = nr_mmu_pages.max(KVM_MIN_ALLOC_MMU_PAGES);
+// 	return nr_mmu_pages;
+// }
+
+// pub fn kvm_mmu_change_mmu_pages(mut goal_nr_mmu_pages: u32){
+//     let kvm = KVM();
+//     // 释放多余的mmu page
+//     if kvm.lock().arch.n_used_mmu_pages > goal_nr_mmu_pages {
+//         while kvm.lock().arch.n_used_mmu_pages > goal_nr_mmu_pages {
+//             if !prepare_zap_oldest_mmu_page() {
+//                 break;
+//             }
+//         }
+//         // kvm_mmu_commit_zap_page();
+//         goal_nr_mmu_pages = kvm.lock().arch.n_used_mmu_pages;
+
+//     }
+//     kvm.lock().arch.n_max_mmu_pages = goal_nr_mmu_pages;
+// }
+
+// pub fn prepare_zap_oldest_mmu_page() -> bool {
+//     return false;
+// }
+
+/// Sets up the MMU of `vcpu`. Only the TDP variant is initialized for now.
+pub fn kvm_mmu_setup(vcpu: &Mutex<VmxVcpu>) {
+    // TODO: init_kvm_softmmu(vcpu), init_kvm_nested_mmu(vcpu)
+    init_kvm_tdp_mmu(vcpu);
+}
+
+/// MTRR initialization hook for a vCPU.
+///
+/// At present this only runs `check_ept_features` and returns its result; the
+/// vCPU argument is unused.
+pub fn kvm_vcpu_mtrr_init(_vcpu: &Mutex<VmxVcpu>) -> Result<(), SystemError> {
+    check_ept_features()
+}
+
+/// Installs the TDP callbacks (`page_fault`, `get_cr3`, `set_eptp`) into the
+/// MMU context of `vcpu`. The vCPU lock is held for the duration of the call.
+pub fn init_kvm_tdp_mmu(vcpu: &Mutex<VmxVcpu>) {
+    let mut guard = vcpu.lock();
+    let mmu = &mut guard.mmu;
+    mmu.page_fault = Some(tdp_page_fault);
+    mmu.get_cr3 = Some(tdp_get_cr3);
+    mmu.set_eptp = Some(tdp_set_eptp);
+    // mmu.inject_page_fault = kvm_inject_page_fault; TODO: inject_page_fault
+    // mmu.invlpg = nonpaging_invlpg;
+    // mmu.sync_page = nonpaging_sync_page;
+    // mmu.update_pte = nonpaging_update_pte;
+
+    // TODO: gva to gpa in kvm
+    // if !is_paging(vcpu) { // the vcpu does not use paging
+    //     mmu.gva_to_gpa = nonpaging_gva_to_gpa;
+    //     mmu.root_level = 0;
+    // } else if (is_long_mode(vcpu)) {
+    //     mmu.gva_to_gpa = paging64_gva_to_gpa;
+    //     mmu.root_level = PT64_ROOT_LEVEL;
+    // TODO:: different paging strategy
+    // } else if (is_pae(vcpu)) {
+    //     mmu.gva_to_gpa = paging64_gva_to_gpa;
+    //     mmu.root_level = PT32E_ROOT_LEVEL;
+    // } else {
+    //     mmu.gva_to_gpa = paging32_gva_to_gpa;
+    //     mmu.root_level = PT32_ROOT_LEVEL;
+    // }
+}
+
+/// Maps guest physical address `gpa` to host frame `pfn` in the EPT.
+///
+/// Only `vcpu`, `gpa` and `pfn` are used; the remaining parameters are
+/// accepted for interface compatibility and ignored. The mapping is always
+/// created with protection bits `0x7`.
+///
+/// ## Errors
+///
+/// - `KVM_HVA_ERR_BAD` if the vCPU's `root_hpa` is 0.
+/// - Any error returned by `EptMapper::walk`. This used to be an `assert!`,
+///   which panicked the kernel on a recoverable failure.
+pub fn __direct_map(
+    vcpu: &mut VmxVcpu,
+    gpa: u64,
+    _write: u32,
+    _map_writable: bool,
+    _level: i32,
+    _gfn: u64,
+    pfn: u64,
+    _prefault: bool,
+) -> Result<u32, SystemError> {
+    kdebug!("gpa={}, pfn={}, root_hpa={:x}", gpa, pfn, vcpu.mmu.root_hpa);
+    // Check that vcpu.mmu.root_hpa is valid
+    if vcpu.mmu.root_hpa == 0 {
+        return Err(SystemError::KVM_HVA_ERR_BAD);
+    }
+    // Map gpa to hpa
+    let mut ept_mapper = EptMapper::lock();
+    let page_flags = PageFlags::from_prot_flags(ProtFlags::from_bits_truncate(0x7 as u64), false);
+    let mapped = unsafe { ept_mapper.walk(gpa, pfn << PAGE_SHIFT, page_flags) };
+    // Drop the mapper handle before reporting the outcome.
+    drop(ept_mapper);
+    mapped?;
+    Ok(0)
+}
+
+/// Resolves guest frame number `gfn` to a host frame number.
+///
+/// Looks up the memslot containing `gfn` for this vCPU and delegates to
+/// `__gfn_to_pfn` (with its third argument fixed to `false`). `_gpa` and
+/// `_prefault` are unused.
+pub fn mmu_gfn_to_pfn_fast(
+    vcpu: &mut VmxVcpu,
+    _gpa: u64,
+    _prefault: bool,
+    gfn: u64,
+    write: bool,
+    writable: &mut bool,
+) -> Result<u64, SystemError> {
+    let host_frame = __gfn_to_pfn(
+        kvm_vcpu_gfn_to_memslot(vcpu, gfn),
+        gfn,
+        false,
+        write,
+        writable,
+    )?;
+    Ok(host_frame)
+}
+
+// TODO: add the cache
+/// Placeholder for topping up the per-vCPU MMU memory caches; currently does
+/// nothing and always returns `Ok(())`.
+pub fn mmu_topup_memory_caches(_vcpu: &mut VmxVcpu) -> Result<(), SystemError> {
+    // If vcpu->arch.mmu_page_header_cache runs short, allocate from mmu_page_header_cache
+    // pte_list_desc_cache and mmu_page_header_cache are two global slab caches created in kvm_mmu_module_init
+    // mmu_topup_memory_cache(vcpu.mmu_page_header_cache,
+    //     mmu_page_header_cache, 4);
+    Ok(())
+}

+ 45 - 0
kernel/src/arch/x86_64/kvm/vmx/mod.rs

@@ -0,0 +1,45 @@
+pub mod ept;
+pub mod kvm_emulation;
+pub mod mmu;
+pub mod seg;
+pub mod vcpu;
+pub mod vmcs;
+pub mod vmexit;
+pub mod vmx_asm_wrapper;
+
+/// Names for the 16 general-purpose register slots, numbered 0 through 15.
+// NOTE(review): presumably these index `VcpuContextFrame::regs` — confirm against users.
+#[allow(dead_code)]
+pub enum VcpuRegIndex {
+    Rax = 0,
+    Rbx = 1,
+    Rcx = 2,
+    Rdx = 3,
+    Rsi = 4,
+    Rdi = 5,
+    Rsp = 6,
+    Rbp = 7,
+    R8 = 8,
+    R9 = 9,
+    R10 = 10,
+    R11 = 11,
+    R12 = 12,
+    R13 = 13,
+    R14 = 14,
+    R15 = 15,
+}
+
+// Bit flags of the x86 CR0 control register, one constant per architectural bit.
+bitflags! {
+    #[allow(non_camel_case_types)]
+    pub struct X86_CR0: u32{
+        const CR0_PE = 1 << 0; /* Protection Enable */
+        const CR0_MP = 1 << 1; /* Monitor Coprocessor */
+        const CR0_EM = 1 << 2; /* Emulation */
+        const CR0_TS = 1 << 3; /* Task Switched */
+        const CR0_ET = 1 << 4; /* Extension Type */
+        const CR0_NE = 1 << 5; /* Numeric Error */
+        const CR0_WP = 1 << 16; /* Write Protect */
+        const CR0_AM = 1 << 18; /* Alignment Mask */
+        const CR0_NW = 1 << 29; /* Not Write-through */
+        const CR0_CD = 1 << 30; /* Cache Disable */
+        const CR0_PG = 1 << 31; /* Paging */
+    }
+}

+ 89 - 0
kernel/src/arch/x86_64/kvm/vmx/seg.rs

@@ -0,0 +1,89 @@
+use crate::arch::kvm::VmcsFields::{
+    GUEST_CS_ACCESS_RIGHTS, GUEST_CS_BASE, GUEST_CS_LIMIT, GUEST_CS_SELECTOR,
+};
+use crate::arch::kvm::VmcsFields::{
+    GUEST_DS_ACCESS_RIGHTS, GUEST_DS_BASE, GUEST_DS_LIMIT, GUEST_DS_SELECTOR,
+};
+use crate::arch::kvm::VmcsFields::{
+    GUEST_ES_ACCESS_RIGHTS, GUEST_ES_BASE, GUEST_ES_LIMIT, GUEST_ES_SELECTOR,
+};
+use crate::arch::kvm::VmcsFields::{
+    GUEST_FS_ACCESS_RIGHTS, GUEST_FS_BASE, GUEST_FS_LIMIT, GUEST_FS_SELECTOR,
+};
+use crate::arch::kvm::VmcsFields::{
+    GUEST_GS_ACCESS_RIGHTS, GUEST_GS_BASE, GUEST_GS_LIMIT, GUEST_GS_SELECTOR,
+};
+use crate::arch::kvm::VmcsFields::{
+    GUEST_LDTR_ACCESS_RIGHTS, GUEST_LDTR_BASE, GUEST_LDTR_LIMIT, GUEST_LDTR_SELECTOR,
+};
+use crate::arch::kvm::VmcsFields::{
+    GUEST_SS_ACCESS_RIGHTS, GUEST_SS_BASE, GUEST_SS_LIMIT, GUEST_SS_SELECTOR,
+};
+use crate::arch::kvm::VmcsFields::{
+    GUEST_TR_ACCESS_RIGHTS, GUEST_TR_BASE, GUEST_TR_LIMIT, GUEST_TR_SELECTOR,
+};
+use crate::syscall::SystemError;
+
+use super::vmx_asm_wrapper::vmx_vmwrite;
+
+// pub const TSS_IOPB_BASE_OFFSET: usize = 0x66;
+// pub const TSS_BASE_SIZE: usize = 0x68;
+// pub const TSS_IOPB_SIZE: usize = 65536 / 8;
+// pub const TSS_REDIRECTION_SIZE: usize = 256 / 8;
+// pub const RMODE_TSS_SIZE: usize = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE + TSS_IOPB_SIZE + 1;
+
+/// The four VMCS field identifiers that describe one guest segment register.
+///
+/// Each member stores the numeric value of a `VmcsFields` variant (as passed
+/// to `vmx_vmwrite`), not the segment's own selector/base/limit/rights.
+#[derive(Debug)]
+pub struct KvmVmxSegmentField {
+    selector: u32,
+    base: u32,
+    limit: u32,
+    access_rights: u32,
+}
+
+// Builds a `KvmVmxSegmentField` for the segment named `$struct_name` by
+// pasting the name into `GUEST_<name>_SELECTOR`, `GUEST_<name>_BASE`,
+// `GUEST_<name>_LIMIT` and `GUEST_<name>_ACCESS_RIGHTS`, each cast to `u32`.
+macro_rules! VMX_SEGMENT_FIELD {
+    ($struct_name: ident) => {
+        KvmVmxSegmentField {
+            selector: concat_idents!(GUEST_, $struct_name, _SELECTOR) as u32,
+            base: concat_idents!(GUEST_, $struct_name, _BASE) as u32,
+            limit: concat_idents!(GUEST_, $struct_name, _LIMIT) as u32,
+            access_rights: concat_idents!(GUEST_, $struct_name, _ACCESS_RIGHTS) as u32,
+        }
+    };
+}
+/// Segment register identifiers. The discriminants (0 through 7) follow the
+/// same order as the entries of `KVM_VMX_SEGMENT_FIELDS`.
+#[derive(FromPrimitive)]
+pub enum Sreg {
+    ES = 0,
+    CS = 1,
+    SS = 2,
+    DS = 3,
+    FS = 4,
+    GS = 5,
+    TR = 6,
+    LDTR = 7,
+}
+
+/// VMCS field identifiers for every segment register, listed in `Sreg`
+/// discriminant order (ES, CS, SS, DS, FS, GS, TR, LDTR). `seg_setup` indexes
+/// this table with the segment number.
+static KVM_VMX_SEGMENT_FIELDS: [KvmVmxSegmentField; 8] = [
+    VMX_SEGMENT_FIELD!(ES),
+    VMX_SEGMENT_FIELD!(CS),
+    VMX_SEGMENT_FIELD!(SS),
+    VMX_SEGMENT_FIELD!(DS),
+    VMX_SEGMENT_FIELD!(FS),
+    VMX_SEGMENT_FIELD!(GS),
+    VMX_SEGMENT_FIELD!(TR),
+    VMX_SEGMENT_FIELD!(LDTR),
+];
+
+/// Writes the initial state of one guest segment register into the VMCS.
+///
+/// The segment gets selector 0, base 0, limit `0xFFFF` and access rights
+/// `0x93`; for CS the value `0x08` is OR-ed into the access rights.
+///
+/// ## Parameters
+///
+/// - `seg`: index into `KVM_VMX_SEGMENT_FIELDS` (an `Sreg` discriminant)
+///
+/// ## Errors
+///
+/// - `EINVAL` if `seg` is not a valid table index (previously this panicked
+///   on the out-of-bounds access).
+/// - Any error reported by `vmx_vmwrite`.
+pub fn seg_setup(seg: usize) -> Result<(), SystemError> {
+    let seg_field = KVM_VMX_SEGMENT_FIELDS
+        .get(seg)
+        .ok_or(SystemError::EINVAL)?;
+    let mut access_rights = 0x0093;
+    if seg == Sreg::CS as usize {
+        access_rights |= 0x08;
+    }
+    // setup segment fields
+    vmx_vmwrite(seg_field.selector, 0)?;
+    vmx_vmwrite(seg_field.base, 0)?;
+    vmx_vmwrite(seg_field.limit, 0x0000_FFFF)?;
+    vmx_vmwrite(seg_field.access_rights, access_rights)?;
+
+    Ok(())
+}

+ 653 - 0
kernel/src/arch/x86_64/kvm/vmx/vcpu.rs

@@ -0,0 +1,653 @@
+use super::vmcs::{
+    VMCSRegion, VmcsFields, VmxEntryCtrl, VmxPrimaryExitCtrl, VmxPrimaryProcessBasedExecuteCtrl,
+    VmxSecondaryProcessBasedExecuteCtrl,
+};
+use super::vmx_asm_wrapper::{vmx_vmclear, vmx_vmptrld, vmx_vmread, vmx_vmwrite, vmxoff, vmxon};
+use crate::arch::kvm::vmx::mmu::KvmMmu;
+use crate::arch::kvm::vmx::seg::{seg_setup, Sreg};
+use crate::arch::kvm::vmx::{VcpuRegIndex, X86_CR0};
+use crate::arch::mm::{LockedFrameAllocator, PageMapper};
+use crate::arch::x86_64::mm::X86_64MMArch;
+use crate::arch::MMArch;
+use crate::kdebug;
+use crate::mm::{phys_2_virt, VirtAddr};
+use crate::mm::{MemoryManagementArch, PageTableKind};
+use crate::syscall::SystemError;
+use crate::virt::kvm::vcpu::Vcpu;
+use crate::virt::kvm::vm::Vm;
+use alloc::alloc::Global;
+use alloc::boxed::Box;
+use core::slice;
+use raw_cpuid::CpuId;
+use x86;
+use x86::{controlregs, msr, segmentation};
+// use crate::arch::kvm::vmx::seg::RMODE_TSS_SIZE;
+// use crate::virt::kvm::{KVM};
+
+// KERNEL_ALLOCATOR
+/// Size in bytes (0x1000) used for the page-sized regions declared in this file.
+pub const PAGE_SIZE: usize = 0x1000;
+/// Number of general-purpose register slots in `VcpuContextFrame::regs`.
+pub const NR_VCPU_REGS: usize = 16;
+
+/// 4096-byte-aligned, page-sized VMXON region: a 32-bit revision identifier
+/// followed by the rest of the page.
+#[repr(C, align(4096))]
+#[derive(Debug)]
+pub struct VmxonRegion {
+    pub revision_id: u32,
+    pub data: [u8; PAGE_SIZE - 4],
+}
+
+/// 4096-byte-aligned, page-sized buffer used as the MSR bitmap.
+#[repr(C, align(4096))]
+#[derive(Debug)]
+pub struct MSRBitmap {
+    pub data: [u8; PAGE_SIZE],
+}
+
+/// Heap-allocated VMX regions owned by one vCPU, each stored together with
+/// its physical address.
+#[derive(Debug)]
+pub struct VcpuData {
+    /// The virtual and physical address of the Vmxon naturally aligned 4-KByte region of memory
+    pub vmxon_region: Box<VmxonRegion>,
+    pub vmxon_region_physical_address: u64, // vmxon needs this address
+    /// The virtual and physical address of the Vmcs naturally aligned 4-KByte region of memory
+    /// holds the complete CPU state of both the host and the guest.
+    /// includes the segment registers, GDT, IDT, TR, various MSR’s
+    /// and control field structures for handling exit and entry operations
+    pub vmcs_region: Box<VMCSRegion>,
+    pub vmcs_region_physical_address: u64, // vmptrld and vmclear need this address
+    pub msr_bitmap: Box<MSRBitmap>,
+    pub msr_bitmap_physical_address: u64,
+}
+
+/// Saved register state of a vCPU, laid out with `repr(C)`.
+#[derive(Default, Debug)]
+#[repr(C)]
+pub struct VcpuContextFrame {
+    pub regs: [usize; NR_VCPU_REGS], // general-purpose registers
+    pub rip: usize,
+    pub rflags: usize,
+}
+
+/// Run state of a vCPU. A new `VmxVcpu` starts in `VcpuInv`.
+// NOTE(review): the variant names look like abbreviations of
+// invalid / pending / active — confirm the intended meaning.
+#[derive(Debug)]
+#[allow(dead_code)]
+pub enum VcpuState {
+    VcpuInv = 0,
+    VcpuPend = 1,
+    VcpuAct = 2,
+}
+
+/// A virtual CPU backed by Intel VMX.
+#[derive(Debug)]
+pub struct VmxVcpu {
+    pub vcpu_id: u32,
+    pub vcpu_ctx: VcpuContextFrame, // context saved across vcpu switches, e.g. general-purpose registers
+    pub vcpu_state: VcpuState,      // current run state of the vcpu
+    pub mmu: KvmMmu,                // memory management unit of the vcpu
+    pub data: VcpuData,             // data regions of the vcpu
+    pub parent_vm: Vm,              // parent KVM
+}
+
+impl VcpuData {
+    /// Allocates a zero-filled `T` on the heap.
+    ///
+    /// Returns `ENOMEM` when the allocator fails, instead of panicking.
+    fn alloc_zeroed_region<T>() -> Result<Box<T>, SystemError> {
+        let region = Box::<T>::try_new_zeroed_in(Global).map_err(|_| SystemError::ENOMEM)?;
+        // SAFETY: only used for the region types of this file, which the
+        // original code also materialized from zeroed memory.
+        Ok(unsafe { region.assume_init() })
+    }
+
+    /// Translates the virtual address of `region` to a physical address.
+    // FIXME: the correctness of the virt_2_phys conversion is in doubt
+    fn region_phys_addr<T>(region: &T) -> u64 {
+        let vaddr = VirtAddr::new(region as *const T as usize);
+        unsafe { MMArch::virt_2_phys(vaddr).unwrap().data() as u64 }
+    }
+
+    /// Allocates the VMXON region, the VMCS region and the MSR bitmap, records
+    /// their physical addresses and stamps the revision id via `init_region`.
+    ///
+    /// # Errors
+    ///
+    /// Returns `ENOMEM` if any of the three allocations fails (this used to
+    /// panic through `expect`), or the error from `init_region`.
+    pub fn alloc() -> Result<Self, SystemError> {
+        let vmxon_region: Box<VmxonRegion> = Self::alloc_zeroed_region()?;
+        let vmcs_region: Box<VMCSRegion> = Self::alloc_zeroed_region()?;
+        let msr_bitmap: Box<MSRBitmap> = Self::alloc_zeroed_region()?;
+
+        let vmxon_region_physical_address = Self::region_phys_addr(vmxon_region.as_ref());
+        let vmcs_region_physical_address = Self::region_phys_addr(vmcs_region.as_ref());
+        let msr_bitmap_physical_address = Self::region_phys_addr(msr_bitmap.as_ref());
+
+        let mut instance = Self {
+            // Allocate a naturally aligned 4-KByte VMXON region of memory to enable VMX operation (Intel Manual: 25.11.5 VMXON Region)
+            vmxon_region,
+            vmxon_region_physical_address,
+            // Allocate a naturally aligned 4-KByte VMCS region of memory
+            vmcs_region,
+            vmcs_region_physical_address,
+            msr_bitmap,
+            msr_bitmap_physical_address,
+        };
+        // printk_color!(GREEN, BLACK, "[+] init_region\n");
+        instance.init_region()?;
+        Ok(instance)
+    }
+
+    /// Writes the VMCS revision identifier into both the VMXON and the VMCS
+    /// region, logging the region addresses along the way.
+    ///
+    /// The identifier is the low 31 bits of the `IA32_VMX_BASIC` MSR.
+    pub fn init_region(&mut self) -> Result<(), SystemError> {
+        // Get the Virtual Machine Control Structure revision identifier (VMCS revision ID)
+        // (Intel Manual: 25.11.5 VMXON Region)
+        let revision_id = unsafe { (msr::rdmsr(msr::IA32_VMX_BASIC) as u32) & 0x7FFF_FFFF };
+        kdebug!("[+] VMXON Region Virtual Address: {:p}", self.vmxon_region);
+        kdebug!(
+            "[+] VMXON Region Physical Addresss: 0x{:x}",
+            self.vmxon_region_physical_address
+        );
+        kdebug!("[+] VMCS Region Virtual Address: {:p}", self.vmcs_region);
+        kdebug!(
+            "[+] VMCS Region Physical Address1: 0x{:x}",
+            self.vmcs_region_physical_address
+        );
+        self.vmxon_region.revision_id = revision_id;
+        self.vmcs_region.revision_id = revision_id;
+        Ok(())
+    }
+}
+
+impl VmxVcpu {
+    /// Creates vCPU `vcpu_id` belonging to `parent_vm`.
+    ///
+    /// The new vCPU starts with an all-zero register context, state
+    /// `VcpuInv`, a default MMU context and freshly allocated VMX regions.
+    ///
+    /// # Errors
+    ///
+    /// Propagates any error from `VcpuData::alloc`.
+    pub fn new(vcpu_id: u32, parent_vm: Vm) -> Result<Self, SystemError> {
+        kdebug!("Creating processor {}", vcpu_id);
+        let data = VcpuData::alloc()?;
+        Ok(Self {
+            vcpu_id,
+            vcpu_ctx: VcpuContextFrame::default(),
+            vcpu_state: VcpuState::VcpuInv,
+            mmu: KvmMmu::default(),
+            data,
+            parent_vm,
+        })
+    }
+
+    /// Writes the guest CR0 derived from `cr0` into the `GUEST_CR0` VMCS field.
+    ///
+    /// The value written has `CR0_NW` and `CR0_CD` cleared and `CR0_WP` and
+    /// `CR0_NE` set. The adjusted value was previously computed but then
+    /// discarded, and the unmodified `cr0` was written instead.
+    ///
+    /// # Errors
+    ///
+    /// Returns the error from `vmx_vmwrite`.
+    pub fn vmx_set_cr0(cr0: X86_CR0) -> Result<(), SystemError> {
+        let mut hw_cr0 = cr0 & !(X86_CR0::CR0_NW | X86_CR0::CR0_CD);
+        hw_cr0 |= X86_CR0::CR0_WP | X86_CR0::CR0_NE;
+
+        vmx_vmwrite(VmcsFields::GUEST_CR0 as u32, hw_cr0.bits() as u64)?;
+        Ok(())
+    }
+
+    /// Fills the guest-state area of the current VMCS.
+    ///
+    /// Segment, descriptor-table and control-register fields are set to fixed
+    /// initial values (see the sandpile.org link below). RSP and RIP are taken
+    /// from `self.vcpu_ctx`. The guest SYSENTER fields are copied from the
+    /// host SYSENTER fields already stored in the VMCS.
+    ///
+    /// # Errors
+    /// Returns the first error reported by `seg_setup`, `vmx_vmwrite` or
+    /// `vmx_vmread`. VMREAD failures are propagated instead of panicking.
+    pub fn vmcs_init_guest(&self) -> Result<(), SystemError> {
+        // https://www.sandpile.org/x86/initial.htm
+        // segment field initialization
+        seg_setup(Sreg::CS as usize)?;
+        vmx_vmwrite(VmcsFields::GUEST_CS_SELECTOR as u32, 0xf000)?;
+        vmx_vmwrite(VmcsFields::GUEST_CS_BASE as u32, 0xffff0000)?;
+
+        seg_setup(Sreg::DS as usize)?;
+        seg_setup(Sreg::ES as usize)?;
+        seg_setup(Sreg::FS as usize)?;
+        seg_setup(Sreg::GS as usize)?;
+        seg_setup(Sreg::SS as usize)?;
+
+        vmx_vmwrite(VmcsFields::GUEST_TR_SELECTOR as u32, 0)?;
+        vmx_vmwrite(VmcsFields::GUEST_TR_BASE as u32, 0)?;
+        vmx_vmwrite(VmcsFields::GUEST_TR_LIMIT as u32, 0xffff)?;
+        vmx_vmwrite(VmcsFields::GUEST_TR_ACCESS_RIGHTS as u32, 0x008b)?;
+
+        vmx_vmwrite(VmcsFields::GUEST_LDTR_SELECTOR as u32, 0)?;
+        vmx_vmwrite(VmcsFields::GUEST_LDTR_BASE as u32, 0)?;
+        vmx_vmwrite(VmcsFields::GUEST_LDTR_LIMIT as u32, 0xffff)?;
+        vmx_vmwrite(VmcsFields::GUEST_LDTR_ACCESS_RIGHTS as u32, 0x00082)?;
+
+        vmx_vmwrite(VmcsFields::GUEST_RFLAGS as u32, 2)?;
+
+        vmx_vmwrite(VmcsFields::GUEST_GDTR_BASE as u32, 0)?;
+        vmx_vmwrite(VmcsFields::GUEST_GDTR_LIMIT as u32, 0x0000_FFFF as u64)?;
+
+        vmx_vmwrite(VmcsFields::GUEST_IDTR_BASE as u32, 0)?;
+        vmx_vmwrite(VmcsFields::GUEST_IDTR_LIMIT as u32, 0x0000_FFFF as u64)?;
+
+        vmx_vmwrite(VmcsFields::GUEST_ACTIVITY_STATE as u32, 0)?; // State = Active
+        vmx_vmwrite(VmcsFields::GUEST_INTERRUPTIBILITY_STATE as u32, 0)?;
+        vmx_vmwrite(VmcsFields::GUEST_PENDING_DBG_EXCEPTIONS as u32, 0)?;
+
+        vmx_vmwrite(VmcsFields::CTRL_VM_ENTRY_INTR_INFO_FIELD as u32, 0)?;
+
+        let cr0 = X86_CR0::CR0_NW | X86_CR0::CR0_CD | X86_CR0::CR0_ET;
+        Self::vmx_set_cr0(cr0)?;
+
+        // GUEST_CR0 is written a second time with the unadjusted value, so this
+        // is the value that ends up in the VMCS.
+        vmx_vmwrite(VmcsFields::GUEST_CR0 as u32, cr0.bits() as u64)?;
+
+        vmx_vmwrite(
+            VmcsFields::GUEST_SYSENTER_CS as u32,
+            vmx_vmread(VmcsFields::HOST_SYSENTER_CS as u32)?,
+        )?;
+        vmx_vmwrite(VmcsFields::GUEST_VMX_PREEMPT_TIMER_VALUE as u32, 0)?;
+
+        vmx_vmwrite(VmcsFields::GUEST_INTR_STATUS as u32, 0)?;
+        vmx_vmwrite(VmcsFields::GUEST_PML_INDEX as u32, 0)?;
+
+        vmx_vmwrite(VmcsFields::GUEST_VMCS_LINK_PTR as u32, u64::MAX)?;
+        vmx_vmwrite(VmcsFields::GUEST_DEBUGCTL as u32, unsafe {
+            msr::rdmsr(msr::IA32_DEBUGCTL)
+        })?;
+
+        vmx_vmwrite(
+            VmcsFields::GUEST_SYSENTER_ESP as u32,
+            vmx_vmread(VmcsFields::HOST_SYSENTER_ESP as u32)?,
+        )?;
+        vmx_vmwrite(
+            VmcsFields::GUEST_SYSENTER_EIP as u32,
+            vmx_vmread(VmcsFields::HOST_SYSENTER_EIP as u32)?,
+        )?;
+
+        // Self::vmx_set_cr0();
+        vmx_vmwrite(VmcsFields::GUEST_CR3 as u32, 0)?;
+        vmx_vmwrite(
+            VmcsFields::GUEST_CR4 as u32,
+            1, // enable vme
+        )?;
+        vmx_vmwrite(VmcsFields::GUEST_DR7 as u32, 0x0000_0000_0000_0400)?;
+        vmx_vmwrite(
+            VmcsFields::GUEST_RSP as u32,
+            self.vcpu_ctx.regs[VcpuRegIndex::Rsp as usize] as u64,
+        )?;
+        vmx_vmwrite(VmcsFields::GUEST_RIP as u32, self.vcpu_ctx.rip as u64)?;
+        kdebug!("vmcs init guest rip: {:#x}", self.vcpu_ctx.rip as u64);
+        kdebug!(
+            "vmcs init guest rsp: {:#x}",
+            self.vcpu_ctx.regs[VcpuRegIndex::Rsp as usize] as u64
+        );
+
+        // vmx_vmwrite(VmcsFields::GUEST_RFLAGS as u32, x86::bits64::rflags::read().bits())?;
+        Ok(())
+    }
+
+    /// Fills the host-state area of the current VMCS from the running CPU.
+    ///
+    /// Control registers, segment selectors, the FS/GS/TR/GDTR/IDTR bases and
+    /// the SYSENTER MSRs are read from the processor and stored in the matching
+    /// `HOST_*` fields. `HOST_RSP` and `HOST_RIP` are not written here.
+    #[allow(deprecated)]
+    pub fn vmcs_init_host(&self) -> Result<(), SystemError> {
+        // Bits 0..=2 of a selector are cleared before it is stored.
+        const LOW_SELECTOR_BITS: u16 = 0x07;
+        let write_selector = |field: VmcsFields, raw: u16| {
+            vmx_vmwrite(field as u32, (raw & !LOW_SELECTOR_BITS).into())
+        };
+
+        // Control registers.
+        let host_cr0: u64 = unsafe { controlregs::cr0() }.bits().try_into().unwrap();
+        vmx_vmwrite(VmcsFields::HOST_CR0 as u32, host_cr0)?;
+        let host_cr3 = unsafe { controlregs::cr3() };
+        vmx_vmwrite(VmcsFields::HOST_CR3 as u32, host_cr3)?;
+        let host_cr4: u64 = unsafe { controlregs::cr4() }.bits().try_into().unwrap();
+        vmx_vmwrite(VmcsFields::HOST_CR4 as u32, host_cr4)?;
+
+        // Segment selectors.
+        write_selector(VmcsFields::HOST_ES_SELECTOR, segmentation::es().bits())?;
+        write_selector(VmcsFields::HOST_CS_SELECTOR, segmentation::cs().bits())?;
+        write_selector(VmcsFields::HOST_SS_SELECTOR, segmentation::ss().bits())?;
+        write_selector(VmcsFields::HOST_DS_SELECTOR, segmentation::ds().bits())?;
+        write_selector(VmcsFields::HOST_FS_SELECTOR, segmentation::fs().bits())?;
+        write_selector(VmcsFields::HOST_GS_SELECTOR, segmentation::gs().bits())?;
+        let tr_selector: u16 = unsafe { x86::task::tr() }.bits();
+        write_selector(VmcsFields::HOST_TR_SELECTOR, tr_selector)?;
+
+        // FS / GS bases come straight from their MSRs.
+        let fs_base = unsafe { msr::rdmsr(msr::IA32_FS_BASE) };
+        vmx_vmwrite(VmcsFields::HOST_FS_BASE as u32, fs_base)?;
+        let gs_base = unsafe { msr::rdmsr(msr::IA32_GS_BASE) };
+        vmx_vmwrite(VmcsFields::HOST_GS_BASE as u32, gs_base)?;
+
+        // The GDTR supplies both the TR base lookup table and HOST_GDTR_BASE.
+        let mut gdtr: x86::dtables::DescriptorTablePointer<u64> = Default::default();
+        unsafe {
+            x86::dtables::sgdt(&mut gdtr);
+        };
+        vmx_vmwrite(
+            VmcsFields::HOST_TR_BASE as u32,
+            get_segment_base(gdtr.base, gdtr.limit, tr_selector),
+        )?;
+        vmx_vmwrite(
+            VmcsFields::HOST_GDTR_BASE as u32,
+            gdtr.base.to_bits() as u64,
+        )?;
+
+        let mut idtr: x86::dtables::DescriptorTablePointer<u64> = Default::default();
+        unsafe {
+            x86::dtables::sidt(&mut idtr);
+        };
+        vmx_vmwrite(
+            VmcsFields::HOST_IDTR_BASE as u32,
+            idtr.base.to_bits() as u64,
+        )?;
+
+        // fast entry into the kernel
+        let sysenter_esp = unsafe { msr::rdmsr(msr::IA32_SYSENTER_ESP) };
+        vmx_vmwrite(VmcsFields::HOST_SYSENTER_ESP as u32, sysenter_esp)?;
+        let sysenter_eip = unsafe { msr::rdmsr(msr::IA32_SYSENTER_EIP) };
+        vmx_vmwrite(VmcsFields::HOST_SYSENTER_EIP as u32, sysenter_eip)?;
+        let sysenter_cs = unsafe { msr::rdmsr(msr::IA32_SYSENTER_CS) };
+        vmx_vmwrite(VmcsFields::HOST_SYSENTER_CS as u32, sysenter_cs)?;
+
+        // vmx_vmwrite(VmcsFields::HOST_RIP as u32, vmx_return as *const () as u64)?;
+        // kdebug!("vmcs init host rip: {:#x}", vmx_return as *const () as u64);
+
+        Ok(())
+    }
+
+    /// Initializes the control, host-state and guest-state areas of the
+    /// current VMCS, in that order.
+    ///
+    /// See Intel SDM Volume 3C Chapter 25.3 "Organization of VMCS Data".
+    pub fn vmcs_init(&self) -> Result<(), SystemError> {
+        // Page-fault filtering and CR3 targets are all zeroed.
+        vmx_vmwrite(VmcsFields::CTRL_PAGE_FAULT_ERR_CODE_MASK as u32, 0)?;
+        vmx_vmwrite(VmcsFields::CTRL_PAGE_FAULT_ERR_CODE_MATCH as u32, 0)?;
+        vmx_vmwrite(VmcsFields::CTRL_CR3_TARGET_COUNT as u32, 0)?;
+
+        let pin_based = adjust_vmx_pinbased_controls() as u64;
+        vmx_vmwrite(VmcsFields::CTRL_PIN_BASED_VM_EXEC_CTRLS as u32, pin_based)?;
+
+        let msr_bitmap = self.data.msr_bitmap_physical_address;
+        vmx_vmwrite(VmcsFields::CTRL_MSR_BITMAP_ADDR as u32, msr_bitmap)?;
+
+        // The read shadows are seeded with the host's current CR0 / CR4.
+        let cr0_shadow: u64 = unsafe { controlregs::cr0() }.bits().try_into().unwrap();
+        vmx_vmwrite(VmcsFields::CTRL_CR0_READ_SHADOW as u32, cr0_shadow)?;
+        let cr4_shadow: u64 = unsafe { controlregs::cr4() }.bits().try_into().unwrap();
+        vmx_vmwrite(VmcsFields::CTRL_CR4_READ_SHADOW as u32, cr4_shadow)?;
+
+        let entry_ctrls = adjust_vmx_entry_controls() as u64;
+        vmx_vmwrite(VmcsFields::CTRL_VM_ENTRY_CTRLS as u32, entry_ctrls)?;
+        let exit_ctrls = adjust_vmx_exit_controls() as u64;
+        vmx_vmwrite(VmcsFields::CTRL_PRIMARY_VM_EXIT_CTRLS as u32, exit_ctrls)?;
+        let primary_exec = adjust_vmx_primary_process_exec_controls() as u64;
+        vmx_vmwrite(
+            VmcsFields::CTRL_PRIMARY_PROCESSOR_VM_EXEC_CTRLS as u32,
+            primary_exec,
+        )?;
+        let secondary_exec = adjust_vmx_secondary_process_exec_controls() as u64;
+        vmx_vmwrite(
+            VmcsFields::CTRL_SECONDARY_PROCESSOR_VM_EXEC_CTRLS as u32,
+            secondary_exec,
+        )?;
+
+        // Host state must be written first: the guest setup reads HOST_* fields.
+        self.vmcs_init_host()?;
+        self.vmcs_init_guest()
+    }
+
+    /// Creates a fresh EPT root page table and installs it for this vCPU.
+    ///
+    /// The root's physical address is passed to the `set_eptp` callback of
+    /// `self.mmu` and then recorded in `self.mmu.root_hpa`.
+    ///
+    /// # Errors
+    /// * `EOPNOTSUPP_OR_ENOTSUP` if no `set_eptp` callback is installed
+    ///   (previously this panicked).
+    /// * `ENOMEM` if the page table cannot be created.
+    /// * Any error returned by the `set_eptp` callback.
+    fn kvm_mmu_load(&mut self) -> Result<(), SystemError> {
+        kdebug!("kvm_mmu_load!");
+        // Check for the callback before allocating, so nothing is allocated
+        // when the EPT pointer could not be installed anyway.
+        let set_eptp_fn = self
+            .mmu
+            .set_eptp
+            .ok_or(SystemError::EOPNOTSUPP_OR_ENOTSUP)?;
+
+        // Allocate and create a new page table.
+        let mapper: crate::mm::page::PageMapper<X86_64MMArch, LockedFrameAllocator> = unsafe {
+            PageMapper::create(PageTableKind::EPT, LockedFrameAllocator)
+                .ok_or(SystemError::ENOMEM)?
+        };
+
+        let ept_root_hpa = mapper.table().phys();
+        set_eptp_fn(ept_root_hpa.data() as u64)?;
+        self.mmu.root_hpa = ept_root_hpa.data() as u64;
+        kdebug!("ept_root_hpa:{:x}!", ept_root_hpa.data() as u64);
+
+        Ok(())
+    }
+
+    /// Replaces this vCPU's saved register frame with `regs`.
+    ///
+    /// Only the in-memory `vcpu_ctx` is updated; this method always returns `Ok`.
+    pub fn set_regs(&mut self, regs: VcpuContextFrame) -> Result<(), SystemError> {
+        // The previous frame is dropped and the new one takes its place.
+        self.vcpu_ctx = regs;
+        Ok(())
+    }
+}
+
+impl Vcpu for VmxVcpu {
+    /// Virtualize the CPU
+    ///
+    /// Steps, in order: check VMX support, enable VMX operation, `VMXON`,
+    /// `VMCLEAR` + `VMPTRLD` on this vCPU's VMCS region, initialize the VMCS,
+    /// then load the EPT root.
+    ///
+    /// # Errors
+    /// * `EOPNOTSUPP_OR_ENOTSUP` if VMX is unsupported or cannot be enabled.
+    /// * Any error from the VMX instructions, VMCS initialization or MMU load.
+    ///   A VMCS initialization failure is returned instead of panicking.
+    fn virtualize_cpu(&mut self) -> Result<(), SystemError> {
+        match has_intel_vmx_support() {
+            Ok(_) => {
+                kdebug!("[+] CPU supports Intel VMX");
+            }
+            Err(e) => {
+                kdebug!("[-] CPU does not support Intel VMX: {:?}", e);
+                return Err(SystemError::EOPNOTSUPP_OR_ENOTSUP);
+            }
+        };
+
+        match enable_vmx_operation() {
+            Ok(_) => {
+                kdebug!("[+] Enabling Virtual Machine Extensions (VMX)");
+            }
+            Err(_) => {
+                kdebug!("[-] VMX operation is not supported on this processor.");
+                return Err(SystemError::EOPNOTSUPP_OR_ENOTSUP);
+            }
+        }
+
+        vmxon(self.data.vmxon_region_physical_address)?;
+        kdebug!("[+] VMXON successful!");
+        vmx_vmclear(self.data.vmcs_region_physical_address)?;
+        vmx_vmptrld(self.data.vmcs_region_physical_address)?;
+        kdebug!("[+] VMPTRLD successful!");
+        // Propagate the failure to the caller rather than panicking the kernel.
+        self.vmcs_init()?;
+        kdebug!("[+] VMCS init!");
+        // kdebug!("vmcs init host rip: {:#x}", vmx_return as *const () as u64);
+        // kdebug!("vmcs init host rsp: {:#x}", x86::bits64::registers::rsp());
+        // vmx_vmwrite(VmcsFields::HOST_RSP as u32, x86::bits64::registers::rsp())?;
+        // vmx_vmwrite(VmcsFields::HOST_RIP as u32, vmx_return as *const () as u64)?;
+        // vmx_vmwrite(VmcsFields::HOST_RSP as u32,  x86::bits64::registers::rsp())?;
+        self.kvm_mmu_load()?;
+        Ok(())
+    }
+
+    /// Leaves VMX operation by executing `vmxoff`.
+    fn devirtualize_cpu(&self) -> Result<(), SystemError> {
+        vmxoff()?;
+        Ok(())
+    }
+
+    /// Gets the index of the current logical/virtual processor
+    fn id(&self) -> u32 {
+        self.vcpu_id
+    }
+}
+
+/// Looks up `segment_selector` in the GDT and returns its base address after
+/// passing it through `phys_2_virt`.
+///
+/// * `gdt_base` - pointer to the first 8-byte GDT entry.
+/// * `gdt_size` - the GDTR limit as returned by `sgdt`, i.e. the offset of the
+///   last valid byte of the table (table size in bytes minus one).
+/// * `segment_selector` - selector whose descriptor is decoded.
+///
+/// Returns 0 for the null selector (index 0, table indicator clear).
+///
+/// # Panics
+/// Panics if the selector's index lies outside the table described by
+/// `gdt_size`, or if a base conversion fails.
+///
+/// NOTE(review): only the low 32 base bits of one 8-byte entry are decoded; a
+/// 16-byte system descriptor keeps base[63:32] in the following entry, which is
+/// ignored here - confirm this is intended together with `phys_2_virt`.
+pub fn get_segment_base(gdt_base: *const u64, gdt_size: u16, segment_selector: u16) -> u64 {
+    let table = segment_selector & 0x0004; // get table indicator in selector
+    let index = (segment_selector >> 3) as usize; // get index in selector
+    if table == 0 && index == 0 {
+        return 0;
+    }
+    // The limit is a byte offset, so the number of 8-byte entries is
+    // (limit + 1) / 8. Using the limit itself as the element count made the
+    // slice cover roughly eight times the real table.
+    let entry_count = (usize::from(gdt_size) + 1) / 8;
+    // SAFETY: the caller must pass the base and limit of a live descriptor
+    // table; the slice then spans exactly that table.
+    let descriptor_table = unsafe { slice::from_raw_parts(gdt_base, entry_count) };
+    let descriptor = descriptor_table[index];
+
+    // Reassemble base[31:0] from the three base fields of the descriptor.
+    let base_high = (descriptor & 0xFF00_0000_0000_0000) >> 32;
+    let base_mid = (descriptor & 0x0000_00FF_0000_0000) >> 16;
+    let base_low = (descriptor & 0x0000_0000_FFFF_0000) >> 16;
+    let segment_base = (base_high | base_mid | base_low) & 0xFFFFFFFF;
+    let virtaddr: u64 = phys_2_virt(segment_base.try_into().unwrap())
+        .try_into()
+        .unwrap();
+    kdebug!("segment_base={:x}", virtaddr);
+    virtaddr
+}
+
+// FIXME: may have bug
+// pub fn read_segment_access_rights(segement_selector: u16) -> u32{
+//     let table = segement_selector & 0x0004; // get table indicator in selector
+//     let index = segement_selector & 0xFFF8; // get index in selector
+//     let mut flag: u16;
+//     if table==0 && index==0 {
+//         return 0;
+//     }
+//     unsafe{
+//         asm!(
+//             "lar {0:r}, rcx",
+//             "mov {1:r}, {0:r}",
+//             in(reg) segement_selector,
+//             out(reg) flag,
+//         );
+//     }
+//     return (flag >> 8) as u32;
+// }
+/// Combines the requested VMX control bits with the constraints reported by the
+/// capability MSR `msr` and stores the outcome in `*result`.
+///
+/// * `ctl_min`, `ctl_opt` - requested control bits; they are OR-ed together.
+/// * `msr` - capability MSR whose low word holds the bits that must be 1 and
+///   whose high word holds the bits that may be 1.
+/// * `result` - overwritten unconditionally with the adjusted value.
+///
+/// The high word used to be extracted with `<< 32`, which always produced 0
+/// and therefore discarded every requested bit; it is now taken with `>> 32`.
+pub fn adjust_vmx_controls(ctl_min: u32, ctl_opt: u32, msr: u32, result: &mut u32) {
+    // Read the MSR once; both halves come from the same value.
+    let capability = unsafe { msr::rdmsr(msr) };
+    let vmx_msr_low: u32 = (capability & 0x0000_0000_FFFF_FFFF) as u32;
+    let vmx_msr_high: u32 = (capability >> 32) as u32;
+    let mut ctl: u32 = ctl_min | ctl_opt;
+    ctl &= vmx_msr_high; /* bit == 0 in high word ==> must be zero */
+    ctl |= vmx_msr_low; /* bit == 1 in low word  ==> must be one  */
+    *result = ctl;
+}
+
+/// Returns the VM-entry controls value produced by `adjust_vmx_controls`.
+///
+/// `LOAD_DBG_CTRLS` is passed as `ctl_min` and `IA32E_MODE_GUEST` as `ctl_opt`,
+/// checked against `IA32_VMX_ENTRY_CTLS`.
+pub fn adjust_vmx_entry_controls() -> u32 {
+    let ctl_min = VmxEntryCtrl::LOAD_DBG_CTRLS.bits();
+    let ctl_opt = VmxEntryCtrl::IA32E_MODE_GUEST.bits();
+    let mut adjusted: u32 = 0;
+    // IA32_VMX_ENTRY_CTLS: Capability Reporting Register of VM-entry Controls (R/O)
+    // msr::IA32_VMX_TRUE_ENTRY_CTLS: Capability Reporting Register of VM-entry Flex Controls (R/O) See Table 35-2
+    adjust_vmx_controls(ctl_min, ctl_opt, msr::IA32_VMX_ENTRY_CTLS, &mut adjusted);
+    adjusted
+}
+
+/// Returns the primary VM-exit controls value produced by `adjust_vmx_controls`.
+///
+/// `SAVE_DBG_CTRLS` is passed as `ctl_min` and `HOST_ADDR_SPACE_SIZE` as
+/// `ctl_opt`, checked against `IA32_VMX_EXIT_CTLS`.
+pub fn adjust_vmx_exit_controls() -> u32 {
+    let ctl_min = VmxPrimaryExitCtrl::SAVE_DBG_CTRLS.bits();
+    let ctl_opt = VmxPrimaryExitCtrl::HOST_ADDR_SPACE_SIZE.bits();
+    let mut adjusted: u32 = 0;
+    adjust_vmx_controls(ctl_min, ctl_opt, msr::IA32_VMX_EXIT_CTLS, &mut adjusted);
+    adjusted
+}
+
+/// Returns the pin-based VM-execution controls value produced by
+/// `adjust_vmx_controls`.
+///
+/// No control bits are requested (`ctl_min` and `ctl_opt` are both 0); the
+/// constraints come from `IA32_VMX_TRUE_PINBASED_CTLS`.
+pub fn adjust_vmx_pinbased_controls() -> u32 {
+    // The old initial value `0000_0016` was decimal 16 written with leading
+    // zeros and was never observed: `adjust_vmx_controls` always overwrites
+    // its output argument. Start from 0 to avoid the misleading literal.
+    let mut controls: u32 = 0;
+    adjust_vmx_controls(0, 0, msr::IA32_VMX_TRUE_PINBASED_CTLS, &mut controls);
+    // kdebug!("adjust_vmx_pinbased_controls: {:x}", controls);
+    controls
+}
+
+/// Returns the primary processor-based VM-execution controls value produced by
+/// `adjust_vmx_controls`.
+///
+/// `USE_MSR_BITMAPS` and `ACTIVATE_SECONDARY_CONTROLS` are passed as `ctl_opt`
+/// (`ctl_min` is 0), checked against `IA32_VMX_PROCBASED_CTLS`.
+pub fn adjust_vmx_primary_process_exec_controls() -> u32 {
+    let ctl_opt = VmxPrimaryProcessBasedExecuteCtrl::USE_MSR_BITMAPS.bits()
+        | VmxPrimaryProcessBasedExecuteCtrl::ACTIVATE_SECONDARY_CONTROLS.bits();
+    let mut adjusted: u32 = 0;
+    adjust_vmx_controls(0, ctl_opt, msr::IA32_VMX_PROCBASED_CTLS, &mut adjusted);
+    adjusted
+}
+
+/// Returns the secondary processor-based VM-execution controls value produced
+/// by `adjust_vmx_controls`.
+///
+/// RDTSCP, XSAVES/XRSTORS, INVPCID, EPT and unrestricted guest are passed as
+/// `ctl_opt` (`ctl_min` is 0), checked against `IA32_VMX_PROCBASED_CTLS2`.
+pub fn adjust_vmx_secondary_process_exec_controls() -> u32 {
+    let ctl_opt = VmxSecondaryProcessBasedExecuteCtrl::ENABLE_RDTSCP.bits()
+        | VmxSecondaryProcessBasedExecuteCtrl::ENABLE_XSAVES_XRSTORS.bits()
+        | VmxSecondaryProcessBasedExecuteCtrl::ENABLE_INVPCID.bits()
+        | VmxSecondaryProcessBasedExecuteCtrl::ENABLE_EPT.bits()
+        | VmxSecondaryProcessBasedExecuteCtrl::UNRESTRICTED_GUEST.bits();
+    let mut adjusted: u32 = 0;
+    adjust_vmx_controls(0, ctl_opt, msr::IA32_VMX_PROCBASED_CTLS2, &mut adjusted);
+    adjusted
+}
+
+/// Check to see if CPU is Intel ("GenuineIntel").
+/// Check processor supports for Virtual Machine Extension (VMX) technology
+///
+/// Returns `Ok(())` only when CPUID reports the vendor string "GenuineIntel"
+/// and the VMX feature flag. If either CPUID leaf is unavailable the CPU is
+/// treated as unsupported; previously a missing leaf was silently accepted.
+///
+/// # Errors
+/// `EOPNOTSUPP_OR_ENOTSUP` when the vendor or the feature check fails.
+//  CPUID.1:ECX.VMX[bit 5] = 1 (Intel Manual: 24.6 Discovering Support for VMX)
+pub fn has_intel_vmx_support() -> Result<(), SystemError> {
+    let cpuid = CpuId::new();
+
+    let is_intel = cpuid
+        .get_vendor_info()
+        .map_or(false, |vi| vi.as_str() == "GenuineIntel");
+    if !is_intel {
+        return Err(SystemError::EOPNOTSUPP_OR_ENOTSUP);
+    }
+
+    let has_vmx = cpuid.get_feature_info().map_or(false, |fi| fi.has_vmx());
+    if !has_vmx {
+        return Err(SystemError::EOPNOTSUPP_OR_ENOTSUP);
+    }
+
+    Ok(())
+}
+
+/// Enables Virtual Machine Extensions
+///
+/// Sets `CR4_ENABLE_VMX`, then runs `set_lock_bit`, `set_cr0_bits` and
+/// `set_cr4_bits` in that order. Only `set_lock_bit` can fail; its error is
+/// returned unchanged.
+// - CR4.VMXE[bit 13] = 1 (Intel Manual: 24.7 Enabling and Entering VMX Operation)
+pub fn enable_vmx_operation() -> Result<(), SystemError> {
+    // Turn on CR4.VMXE while leaving every other CR4 bit as it is.
+    let cr4_with_vmxe = unsafe { controlregs::cr4() } | controlregs::Cr4::CR4_ENABLE_VMX;
+    unsafe { controlregs::cr4_write(cr4_with_vmxe) };
+
+    set_lock_bit()?;
+    kdebug!("[+] Lock bit set via IA32_FEATURE_CONTROL");
+
+    set_cr0_bits();
+    kdebug!("[+] Mandatory bits in CR0 set/cleared");
+
+    set_cr4_bits();
+    kdebug!("[+] Mandatory bits in CR4 set/cleared");
+
+    Ok(())
+}
+
+/// Check if we need to set bits in IA32_FEATURE_CONTROL
+///
+/// If the lock bit (bit 0) is clear, the MSR is rewritten with the lock bit
+/// and the "VMXON outside SMX" bit (bit 2) added. If it is already locked
+/// without bit 2, `EPERM` is returned.
+// (Intel Manual: 24.7 Enabling and Entering VMX Operation)
+fn set_lock_bit() -> Result<(), SystemError> {
+    const VMX_LOCK_BIT: u64 = 1 << 0;
+    const VMXON_OUTSIDE_SMX: u64 = 1 << 2;
+
+    let feature_control = unsafe { msr::rdmsr(msr::IA32_FEATURE_CONTROL) };
+    let is_locked = (feature_control & VMX_LOCK_BIT) != 0;
+    let vmxon_permitted = (feature_control & VMXON_OUTSIDE_SMX) != 0;
+
+    if !is_locked {
+        // Still writable: enable VMXON outside SMX and lock the register.
+        unsafe {
+            msr::wrmsr(
+                msr::IA32_FEATURE_CONTROL,
+                VMXON_OUTSIDE_SMX | VMX_LOCK_BIT | feature_control,
+            )
+        };
+        return Ok(());
+    }
+
+    if !vmxon_permitted {
+        // Locked with VMXON outside SMX disabled; this cannot be changed here.
+        return Err(SystemError::EPERM);
+    }
+
+    Ok(())
+}
+
+/// Set the mandatory bits in CR0 and clear bits that are mandatory zero
+/// (Intel Manual: 24.8 Restrictions on VMX Operation)
+///
+/// CR0 is OR-ed with `IA32_VMX_CR0_FIXED0`, AND-ed with `IA32_VMX_CR0_FIXED1`
+/// and written back. MSR bits unknown to `Cr0` are dropped by
+/// `from_bits_truncate`.
+fn set_cr0_bits() {
+    let forced_on =
+        controlregs::Cr0::from_bits_truncate(unsafe { msr::rdmsr(msr::IA32_VMX_CR0_FIXED0) } as usize);
+    let allowed_on =
+        controlregs::Cr0::from_bits_truncate(unsafe { msr::rdmsr(msr::IA32_VMX_CR0_FIXED1) } as usize);
+
+    let adjusted = (unsafe { controlregs::cr0() } | forced_on) & allowed_on;
+
+    unsafe { controlregs::cr0_write(adjusted) };
+}
+
+/// Set the mandatory bits in CR4 and clear bits that are mandatory zero
+/// (Intel Manual: 24.8 Restrictions on VMX Operation)
+///
+/// CR4 is OR-ed with `IA32_VMX_CR4_FIXED0`, AND-ed with `IA32_VMX_CR4_FIXED1`
+/// and written back. MSR bits unknown to `Cr4` are dropped by
+/// `from_bits_truncate`.
+fn set_cr4_bits() {
+    let forced_on =
+        controlregs::Cr4::from_bits_truncate(unsafe { msr::rdmsr(msr::IA32_VMX_CR4_FIXED0) } as usize);
+    let allowed_on =
+        controlregs::Cr4::from_bits_truncate(unsafe { msr::rdmsr(msr::IA32_VMX_CR4_FIXED1) } as usize);
+
+    let adjusted = (unsafe { controlregs::cr4() } | forced_on) & allowed_on;
+
+    unsafe { controlregs::cr4_write(adjusted) };
+}

+ 539 - 0
kernel/src/arch/x86_64/kvm/vmx/vmcs.rs

@@ -0,0 +1,539 @@
+use bitflags::bitflags;
+use num_derive::FromPrimitive;
+
+pub const PAGE_SIZE: usize = 0x1000;
+
+/// A `PAGE_SIZE`-byte, 4096-byte-aligned VMCS region.
+///
+/// Layout (fixed by `repr(C)`): a 32-bit revision identifier, a 32-bit abort
+/// indicator, then the remaining `PAGE_SIZE - 8` bytes of data.
+#[repr(C, align(4096))]
+#[derive(Clone, Debug)]
+pub struct VMCSRegion {
+    /// Revision identifier stored in the first 4 bytes of the region.
+    pub revision_id: u32,
+    /// Abort indicator stored in the next 4 bytes.
+    pub abort_indicator: u32,
+    // Rest of the page; not accessed through this struct's public API.
+    data: [u8; PAGE_SIZE - 8],
+}
+
+/// Access-type values used when building a VMCS field encoding.
+// (Intel Manual: 25.11.2 VMREAD, VMWRITE, and Encodings of VMCS Fields)
+#[derive(FromPrimitive)]
+enum VmcsAccessType {
+    /// Value 0.
+    FULL = 0,
+    /// Value 1.
+    HIGH = 1,
+}
+
+/// Field-group values passed to `encode_vmcs_field_full`: control fields,
+/// VM-exit information fields, guest-state fields and host-state fields.
+#[derive(FromPrimitive)]
+enum VmcsType {
+    /// Control fields (`CTRL_*`).
+    CONTROL = 0,
+    /// VM-exit information fields (`VMEXIT_*`).
+    VMEXIT = 1,
+    /// Guest-state fields (`GUEST_*`).
+    GUEST = 2,
+    /// Host-state fields (`HOST_*`).
+    HOST = 3,
+}
+
+/// Field-width values passed to `encode_vmcs_field_full`.
+///
+/// Note the numbering: 64-bit is 1 and 32-bit is 2.
+#[derive(FromPrimitive)]
+enum VmcsWidth {
+    /// 16-bit field.
+    BIT16 = 0,
+    /// 64-bit field.
+    BIT64 = 1,
+    /// 32-bit field.
+    BIT32 = 2,
+    /// Natural-width field.
+    NATURAL = 3,
+}
+
+#[derive(FromPrimitive)]
+#[allow(non_camel_case_types)]
+// (Intel Manual: APPENDIX B FIELD ENCODING IN VMCS)
+pub enum VmcsFields {
+    // [CONTROL] fields
+    // 16-bit control fields
+    CTRL_VIRT_PROC_ID = encode_vmcs_field_full(VmcsType::CONTROL, VmcsWidth::BIT16, 0) as isize,
+    CTRL_POSTED_INTR_N_VECTOR =
+        encode_vmcs_field_full(VmcsType::CONTROL, VmcsWidth::BIT16, 1) as isize,
+    CTRL_EPTP_INDEX = encode_vmcs_field_full(VmcsType::CONTROL, VmcsWidth::BIT16, 2) as isize,
+    // 64-bit control fields
+    CTRL_IO_BITMAP_A_ADDR = encode_vmcs_field_full(VmcsType::CONTROL, VmcsWidth::BIT64, 0) as isize,
+    CTRL_IO_BITMAP_B_ADDR = encode_vmcs_field_full(VmcsType::CONTROL, VmcsWidth::BIT64, 1) as isize,
+    CTRL_MSR_BITMAP_ADDR = encode_vmcs_field_full(VmcsType::CONTROL, VmcsWidth::BIT64, 2) as isize, // control whether RDMSR or WRMSR cause VM exit
+    CTRL_VMEXIT_MSR_STORE_ADDR =
+        encode_vmcs_field_full(VmcsType::CONTROL, VmcsWidth::BIT64, 3) as isize,
+    CTRL_VMEXIT_MSR_LOAD_ADDR =
+        encode_vmcs_field_full(VmcsType::CONTROL, VmcsWidth::BIT64, 4) as isize,
+    CTRL_VMENTRY_MSR_LOAD_ADDR =
+        encode_vmcs_field_full(VmcsType::CONTROL, VmcsWidth::BIT64, 5) as isize,
+    CTRL_EXECUTIVE_VMCS_PTR =
+        encode_vmcs_field_full(VmcsType::CONTROL, VmcsWidth::BIT64, 6) as isize,
+    CTRL_PML_ADDR = encode_vmcs_field_full(VmcsType::CONTROL, VmcsWidth::BIT64, 7) as isize,
+    CTRL_TSC_ADDR = encode_vmcs_field_full(VmcsType::CONTROL, VmcsWidth::BIT64, 8) as isize,
+    CTRL_VIRT_APIC_ADDR = encode_vmcs_field_full(VmcsType::CONTROL, VmcsWidth::BIT64, 9) as isize,
+    CTRL_APIC_ACCESS_ADDR =
+        encode_vmcs_field_full(VmcsType::CONTROL, VmcsWidth::BIT64, 10) as isize,
+    CTRL_POSTED_INTR_DESC_ADDR =
+        encode_vmcs_field_full(VmcsType::CONTROL, VmcsWidth::BIT64, 11) as isize,
+    CTRL_VMFUNC_CTRL = encode_vmcs_field_full(VmcsType::CONTROL, VmcsWidth::BIT64, 12) as isize,
+    CTRL_EPTP_PTR = encode_vmcs_field_full(VmcsType::CONTROL, VmcsWidth::BIT64, 13) as isize,
+    CTRL_EOI_EXIT_BITMAP_0 =
+        encode_vmcs_field_full(VmcsType::CONTROL, VmcsWidth::BIT64, 14) as isize,
+    CTRL_EOI_EXIT_BITMAP_1 =
+        encode_vmcs_field_full(VmcsType::CONTROL, VmcsWidth::BIT64, 15) as isize,
+    CTRL_EOI_EXIT_BITMAP_2 =
+        encode_vmcs_field_full(VmcsType::CONTROL, VmcsWidth::BIT64, 16) as isize,
+    CTRL_EOI_EXIT_BITMAP_3 =
+        encode_vmcs_field_full(VmcsType::CONTROL, VmcsWidth::BIT64, 17) as isize,
+    CTRL_EPT_LIST_ADDR = encode_vmcs_field_full(VmcsType::CONTROL, VmcsWidth::BIT64, 18) as isize,
+    CTRL_VMREAD_BITMAP_ADDR =
+        encode_vmcs_field_full(VmcsType::CONTROL, VmcsWidth::BIT64, 19) as isize,
+    CTRL_VMWRITE_BITMAP_ADDR =
+        encode_vmcs_field_full(VmcsType::CONTROL, VmcsWidth::BIT64, 20) as isize,
+    CTRL_VIRT_EXECPT_INFO_ADDR =
+        encode_vmcs_field_full(VmcsType::CONTROL, VmcsWidth::BIT64, 21) as isize,
+    CTRL_XSS_EXITING_BITMAP =
+        encode_vmcs_field_full(VmcsType::CONTROL, VmcsWidth::BIT64, 22) as isize,
+    CTRL_ENCLS_EXITING_BITMAP =
+        encode_vmcs_field_full(VmcsType::CONTROL, VmcsWidth::BIT64, 23) as isize,
+    CTRL_TSC_MULTIPLIER = encode_vmcs_field_full(VmcsType::CONTROL, VmcsWidth::BIT64, 25) as isize,
+    // 32-bit control fields
+    CTRL_PIN_BASED_VM_EXEC_CTRLS =
+        encode_vmcs_field_full(VmcsType::CONTROL, VmcsWidth::BIT32, 0) as isize, // control async event handling (i.e. interrupts)
+    CTRL_PRIMARY_PROCESSOR_VM_EXEC_CTRLS =
+        encode_vmcs_field_full(VmcsType::CONTROL, VmcsWidth::BIT32, 1) as isize, // control sync event handling (i.e. instruction exits)
+    CTRL_EXPECTION_BITMAP = encode_vmcs_field_full(VmcsType::CONTROL, VmcsWidth::BIT32, 2) as isize, // bitmap to control exceptions that cause a VM exit
+    CTRL_PAGE_FAULT_ERR_CODE_MASK =
+        encode_vmcs_field_full(VmcsType::CONTROL, VmcsWidth::BIT32, 3) as isize,
+    CTRL_PAGE_FAULT_ERR_CODE_MATCH =
+        encode_vmcs_field_full(VmcsType::CONTROL, VmcsWidth::BIT32, 4) as isize,
+    CTRL_CR3_TARGET_COUNT = encode_vmcs_field_full(VmcsType::CONTROL, VmcsWidth::BIT32, 5) as isize,
+    CTRL_PRIMARY_VM_EXIT_CTRLS =
+        encode_vmcs_field_full(VmcsType::CONTROL, VmcsWidth::BIT32, 6) as isize,
+    CTRL_VM_EXIT_MSR_STORE_COUNT =
+        encode_vmcs_field_full(VmcsType::CONTROL, VmcsWidth::BIT32, 7) as isize,
+    CTRL_VM_EXIT_MSR_LOAD_COUNT =
+        encode_vmcs_field_full(VmcsType::CONTROL, VmcsWidth::BIT32, 8) as isize,
+    CTRL_VM_ENTRY_CTRLS = encode_vmcs_field_full(VmcsType::CONTROL, VmcsWidth::BIT32, 9) as isize,
+    CTRL_VM_ENTRY_MSR_LOAD_COUNT =
+        encode_vmcs_field_full(VmcsType::CONTROL, VmcsWidth::BIT32, 10) as isize,
+    CTRL_VM_ENTRY_INTR_INFO_FIELD =
+        encode_vmcs_field_full(VmcsType::CONTROL, VmcsWidth::BIT32, 11) as isize,
+    CTRL_VM_ENTRY_EXCEPTION_ERR_CODE =
+        encode_vmcs_field_full(VmcsType::CONTROL, VmcsWidth::BIT32, 12) as isize,
+    CTRL_VM_ENTRY_INSTR_LEN =
+        encode_vmcs_field_full(VmcsType::CONTROL, VmcsWidth::BIT32, 13) as isize,
+    CTRL_TPR_THRESHOLD = encode_vmcs_field_full(VmcsType::CONTROL, VmcsWidth::BIT32, 14) as isize,
+    CTRL_SECONDARY_PROCESSOR_VM_EXEC_CTRLS =
+        encode_vmcs_field_full(VmcsType::CONTROL, VmcsWidth::BIT32, 15) as isize,
+    CTRL_PLE_GAP = encode_vmcs_field_full(VmcsType::CONTROL, VmcsWidth::BIT32, 16) as isize,
+    CTRL_PLE_WINDOW = encode_vmcs_field_full(VmcsType::CONTROL, VmcsWidth::BIT32, 17) as isize,
+    // natural control fields
+    CTRL_CR0_GUEST_HOST_MASK =
+        encode_vmcs_field_full(VmcsType::CONTROL, VmcsWidth::NATURAL, 0) as isize, // control executions of insts that access cr0
+    CTRL_CR4_GUEST_HOST_MASK =
+        encode_vmcs_field_full(VmcsType::CONTROL, VmcsWidth::NATURAL, 1) as isize,
+    CTRL_CR0_READ_SHADOW =
+        encode_vmcs_field_full(VmcsType::CONTROL, VmcsWidth::NATURAL, 2) as isize, // control executions of insts that access cr0
+    CTRL_CR4_READ_SHADOW =
+        encode_vmcs_field_full(VmcsType::CONTROL, VmcsWidth::NATURAL, 3) as isize,
+    CTRL_CR3_TARGET_VALUE_0 =
+        encode_vmcs_field_full(VmcsType::CONTROL, VmcsWidth::NATURAL, 4) as isize,
+    CTRL_CR3_TARGET_VALUE_1 =
+        encode_vmcs_field_full(VmcsType::CONTROL, VmcsWidth::NATURAL, 5) as isize,
+    CTRL_CR3_TARGET_VALUE_2 =
+        encode_vmcs_field_full(VmcsType::CONTROL, VmcsWidth::NATURAL, 6) as isize,
+    CTRL_CR3_TARGET_VALUE_3 =
+        encode_vmcs_field_full(VmcsType::CONTROL, VmcsWidth::NATURAL, 7) as isize,
+
+    // [VMEXIT] fields read-only
+    // No 16-bit vmexit fields
+    // 64-bit vmexit fields
+    VMEXIT_GUEST_PHY_ADDR = encode_vmcs_field_full(VmcsType::VMEXIT, VmcsWidth::BIT64, 0) as isize,
+    // 32-bit vmexit fields
+    VMEXIT_INSTR_ERR = encode_vmcs_field_full(VmcsType::VMEXIT, VmcsWidth::BIT32, 0) as isize,
+    VMEXIT_EXIT_REASON = encode_vmcs_field_full(VmcsType::VMEXIT, VmcsWidth::BIT32, 1) as isize,
+    VMEXIT_INT_INFO = encode_vmcs_field_full(VmcsType::VMEXIT, VmcsWidth::BIT32, 2) as isize,
+    VMEXIT_INT_ERR_CODE = encode_vmcs_field_full(VmcsType::VMEXIT, VmcsWidth::BIT32, 3) as isize,
+    VMEXIT_IDT_VECTOR_INFO = encode_vmcs_field_full(VmcsType::VMEXIT, VmcsWidth::BIT32, 4) as isize,
+    VMEXIT_IDT_VECTOR_ERR_CODE =
+        encode_vmcs_field_full(VmcsType::VMEXIT, VmcsWidth::BIT32, 5) as isize,
+    VMEXIT_INSTR_LEN = encode_vmcs_field_full(VmcsType::VMEXIT, VmcsWidth::BIT32, 6) as isize,
+    VMEXIT_INSTR_INFO = encode_vmcs_field_full(VmcsType::VMEXIT, VmcsWidth::BIT32, 7) as isize,
+    // natural vmexit fields
+    VMEXIT_QUALIFICATION = encode_vmcs_field_full(VmcsType::VMEXIT, VmcsWidth::NATURAL, 0) as isize,
+    VMEXIT_IO_RCX = encode_vmcs_field_full(VmcsType::VMEXIT, VmcsWidth::NATURAL, 1) as isize,
+    VMEXIT_IO_RSX = encode_vmcs_field_full(VmcsType::VMEXIT, VmcsWidth::NATURAL, 2) as isize,
+    VMEXIT_IO_RDI = encode_vmcs_field_full(VmcsType::VMEXIT, VmcsWidth::NATURAL, 3) as isize,
+    VMEXIT_IO_RIP = encode_vmcs_field_full(VmcsType::VMEXIT, VmcsWidth::NATURAL, 4) as isize,
+    VMEXIT_GUEST_LINEAR_ADDR =
+        encode_vmcs_field_full(VmcsType::VMEXIT, VmcsWidth::NATURAL, 5) as isize,
+
+    // [GUEST] fields
+    // 16-bit guest fields
+    GUEST_ES_SELECTOR = encode_vmcs_field_full(VmcsType::GUEST, VmcsWidth::BIT16, 0) as isize,
+    GUEST_CS_SELECTOR = encode_vmcs_field_full(VmcsType::GUEST, VmcsWidth::BIT16, 1) as isize,
+    GUEST_SS_SELECTOR = encode_vmcs_field_full(VmcsType::GUEST, VmcsWidth::BIT16, 2) as isize,
+    GUEST_DS_SELECTOR = encode_vmcs_field_full(VmcsType::GUEST, VmcsWidth::BIT16, 3) as isize,
+    GUEST_FS_SELECTOR = encode_vmcs_field_full(VmcsType::GUEST, VmcsWidth::BIT16, 4) as isize,
+    GUEST_GS_SELECTOR = encode_vmcs_field_full(VmcsType::GUEST, VmcsWidth::BIT16, 5) as isize,
+    GUEST_LDTR_SELECTOR = encode_vmcs_field_full(VmcsType::GUEST, VmcsWidth::BIT16, 6) as isize,
+    GUEST_TR_SELECTOR = encode_vmcs_field_full(VmcsType::GUEST, VmcsWidth::BIT16, 7) as isize,
+    GUEST_INTR_STATUS = encode_vmcs_field_full(VmcsType::GUEST, VmcsWidth::BIT16, 8) as isize,
+    GUEST_PML_INDEX = encode_vmcs_field_full(VmcsType::GUEST, VmcsWidth::BIT16, 9) as isize,
+    // 64-bit guest fields
+    GUEST_VMCS_LINK_PTR = encode_vmcs_field_full(VmcsType::GUEST, VmcsWidth::BIT64, 0) as isize,
+    GUEST_DEBUGCTL = encode_vmcs_field_full(VmcsType::GUEST, VmcsWidth::BIT64, 1) as isize,
+    GUEST_PAT = encode_vmcs_field_full(VmcsType::GUEST, VmcsWidth::BIT64, 2) as isize,
+    GUEST_EFER = encode_vmcs_field_full(VmcsType::GUEST, VmcsWidth::BIT64, 3) as isize,
+    GUEST_PERF_GLOBAL_CTRL = encode_vmcs_field_full(VmcsType::GUEST, VmcsWidth::BIT64, 4) as isize,
+    GUEST_PDPTE0 = encode_vmcs_field_full(VmcsType::GUEST, VmcsWidth::BIT64, 5) as isize,
+    GUEST_PDPTE1 = encode_vmcs_field_full(VmcsType::GUEST, VmcsWidth::BIT64, 6) as isize,
+    GUEST_PDPTE2 = encode_vmcs_field_full(VmcsType::GUEST, VmcsWidth::BIT64, 7) as isize,
+    GUEST_PDPTE3 = encode_vmcs_field_full(VmcsType::GUEST, VmcsWidth::BIT64, 8) as isize,
+    // 32-bit guest fields
+    GUEST_ES_LIMIT = encode_vmcs_field_full(VmcsType::GUEST, VmcsWidth::BIT32, 0) as isize,
+    GUEST_CS_LIMIT = encode_vmcs_field_full(VmcsType::GUEST, VmcsWidth::BIT32, 1) as isize,
+    GUEST_SS_LIMIT = encode_vmcs_field_full(VmcsType::GUEST, VmcsWidth::BIT32, 2) as isize,
+    GUEST_DS_LIMIT = encode_vmcs_field_full(VmcsType::GUEST, VmcsWidth::BIT32, 3) as isize,
+    GUEST_FS_LIMIT = encode_vmcs_field_full(VmcsType::GUEST, VmcsWidth::BIT32, 4) as isize,
+    GUEST_GS_LIMIT = encode_vmcs_field_full(VmcsType::GUEST, VmcsWidth::BIT32, 5) as isize,
+    GUEST_LDTR_LIMIT = encode_vmcs_field_full(VmcsType::GUEST, VmcsWidth::BIT32, 6) as isize,
+    GUEST_TR_LIMIT = encode_vmcs_field_full(VmcsType::GUEST, VmcsWidth::BIT32, 7) as isize,
+    GUEST_GDTR_LIMIT = encode_vmcs_field_full(VmcsType::GUEST, VmcsWidth::BIT32, 8) as isize,
+    GUEST_IDTR_LIMIT = encode_vmcs_field_full(VmcsType::GUEST, VmcsWidth::BIT32, 9) as isize,
+    GUEST_ES_ACCESS_RIGHTS = encode_vmcs_field_full(VmcsType::GUEST, VmcsWidth::BIT32, 10) as isize,
+    GUEST_CS_ACCESS_RIGHTS = encode_vmcs_field_full(VmcsType::GUEST, VmcsWidth::BIT32, 11) as isize,
+    GUEST_SS_ACCESS_RIGHTS = encode_vmcs_field_full(VmcsType::GUEST, VmcsWidth::BIT32, 12) as isize,
+    GUEST_DS_ACCESS_RIGHTS = encode_vmcs_field_full(VmcsType::GUEST, VmcsWidth::BIT32, 13) as isize,
+    GUEST_FS_ACCESS_RIGHTS = encode_vmcs_field_full(VmcsType::GUEST, VmcsWidth::BIT32, 14) as isize,
+    GUEST_GS_ACCESS_RIGHTS = encode_vmcs_field_full(VmcsType::GUEST, VmcsWidth::BIT32, 15) as isize,
+    GUEST_LDTR_ACCESS_RIGHTS =
+        encode_vmcs_field_full(VmcsType::GUEST, VmcsWidth::BIT32, 16) as isize,
+    GUEST_TR_ACCESS_RIGHTS = encode_vmcs_field_full(VmcsType::GUEST, VmcsWidth::BIT32, 17) as isize,
+    GUEST_INTERRUPTIBILITY_STATE =
+        encode_vmcs_field_full(VmcsType::GUEST, VmcsWidth::BIT32, 18) as isize,
+    GUEST_ACTIVITY_STATE = encode_vmcs_field_full(VmcsType::GUEST, VmcsWidth::BIT32, 19) as isize,
+    GUEST_SMBASE = encode_vmcs_field_full(VmcsType::GUEST, VmcsWidth::BIT32, 20) as isize,
+    GUEST_SYSENTER_CS = encode_vmcs_field_full(VmcsType::GUEST, VmcsWidth::BIT32, 21) as isize,
+    GUEST_VMX_PREEMPT_TIMER_VALUE = 0x482E as isize,
+    // natural guest fields
+    GUEST_CR0 = encode_vmcs_field_full(VmcsType::GUEST, VmcsWidth::NATURAL, 0) as isize,
+    GUEST_CR3 = encode_vmcs_field_full(VmcsType::GUEST, VmcsWidth::NATURAL, 1) as isize,
+    GUEST_CR4 = encode_vmcs_field_full(VmcsType::GUEST, VmcsWidth::NATURAL, 2) as isize,
+    GUEST_ES_BASE = encode_vmcs_field_full(VmcsType::GUEST, VmcsWidth::NATURAL, 3) as isize,
+    GUEST_CS_BASE = encode_vmcs_field_full(VmcsType::GUEST, VmcsWidth::NATURAL, 4) as isize,
+    GUEST_SS_BASE = encode_vmcs_field_full(VmcsType::GUEST, VmcsWidth::NATURAL, 5) as isize,
+    GUEST_DS_BASE = encode_vmcs_field_full(VmcsType::GUEST, VmcsWidth::NATURAL, 6) as isize,
+    GUEST_FS_BASE = encode_vmcs_field_full(VmcsType::GUEST, VmcsWidth::NATURAL, 7) as isize,
+    GUEST_GS_BASE = encode_vmcs_field_full(VmcsType::GUEST, VmcsWidth::NATURAL, 8) as isize,
+    GUEST_LDTR_BASE = encode_vmcs_field_full(VmcsType::GUEST, VmcsWidth::NATURAL, 9) as isize,
+    GUEST_TR_BASE = encode_vmcs_field_full(VmcsType::GUEST, VmcsWidth::NATURAL, 10) as isize,
+    GUEST_GDTR_BASE = encode_vmcs_field_full(VmcsType::GUEST, VmcsWidth::NATURAL, 11) as isize,
+    GUEST_IDTR_BASE = encode_vmcs_field_full(VmcsType::GUEST, VmcsWidth::NATURAL, 12) as isize,
+    GUEST_DR7 = encode_vmcs_field_full(VmcsType::GUEST, VmcsWidth::NATURAL, 13) as isize,
+    GUEST_RSP = encode_vmcs_field_full(VmcsType::GUEST, VmcsWidth::NATURAL, 14) as isize,
+    GUEST_RIP = encode_vmcs_field_full(VmcsType::GUEST, VmcsWidth::NATURAL, 15) as isize,
+    GUEST_RFLAGS = encode_vmcs_field_full(VmcsType::GUEST, VmcsWidth::NATURAL, 16) as isize,
+    GUEST_PENDING_DBG_EXCEPTIONS =
+        encode_vmcs_field_full(VmcsType::GUEST, VmcsWidth::NATURAL, 17) as isize,
+    GUEST_SYSENTER_ESP = encode_vmcs_field_full(VmcsType::GUEST, VmcsWidth::NATURAL, 18) as isize,
+    GUEST_SYSENTER_EIP = encode_vmcs_field_full(VmcsType::GUEST, VmcsWidth::NATURAL, 19) as isize,
+
+    // [HOST] fields
+    // host 16 bit fields
+    HOST_ES_SELECTOR = encode_vmcs_field_full(VmcsType::HOST, VmcsWidth::BIT16, 0) as isize,
+    HOST_CS_SELECTOR = encode_vmcs_field_full(VmcsType::HOST, VmcsWidth::BIT16, 1) as isize,
+    HOST_SS_SELECTOR = encode_vmcs_field_full(VmcsType::HOST, VmcsWidth::BIT16, 2) as isize,
+    HOST_DS_SELECTOR = encode_vmcs_field_full(VmcsType::HOST, VmcsWidth::BIT16, 3) as isize,
+    HOST_FS_SELECTOR = encode_vmcs_field_full(VmcsType::HOST, VmcsWidth::BIT16, 4) as isize,
+    HOST_GS_SELECTOR = encode_vmcs_field_full(VmcsType::HOST, VmcsWidth::BIT16, 5) as isize,
+    HOST_TR_SELECTOR = encode_vmcs_field_full(VmcsType::HOST, VmcsWidth::BIT16, 6) as isize,
+    // host 64 bit fields
+    HOST_PAT = encode_vmcs_field_full(VmcsType::HOST, VmcsWidth::BIT64, 0) as isize,
+    HOST_EFER = encode_vmcs_field_full(VmcsType::HOST, VmcsWidth::BIT64, 1) as isize,
+    HOST_PERF_GLOBAL_CTRL = encode_vmcs_field_full(VmcsType::HOST, VmcsWidth::BIT64, 2) as isize,
+    // host 32 bit fields
+    HOST_SYSENTER_CS = encode_vmcs_field_full(VmcsType::HOST, VmcsWidth::BIT32, 0) as isize,
+    // host natural fields
+    HOST_CR0 = encode_vmcs_field_full(VmcsType::HOST, VmcsWidth::NATURAL, 0) as isize,
+    HOST_CR3 = encode_vmcs_field_full(VmcsType::HOST, VmcsWidth::NATURAL, 1) as isize,
+    HOST_CR4 = encode_vmcs_field_full(VmcsType::HOST, VmcsWidth::NATURAL, 2) as isize,
+    HOST_FS_BASE = encode_vmcs_field_full(VmcsType::HOST, VmcsWidth::NATURAL, 3) as isize,
+    HOST_GS_BASE = encode_vmcs_field_full(VmcsType::HOST, VmcsWidth::NATURAL, 4) as isize,
+    HOST_TR_BASE = encode_vmcs_field_full(VmcsType::HOST, VmcsWidth::NATURAL, 5) as isize,
+    HOST_GDTR_BASE = encode_vmcs_field_full(VmcsType::HOST, VmcsWidth::NATURAL, 6) as isize,
+    HOST_IDTR_BASE = encode_vmcs_field_full(VmcsType::HOST, VmcsWidth::NATURAL, 7) as isize,
+    HOST_SYSENTER_ESP = encode_vmcs_field_full(VmcsType::HOST, VmcsWidth::NATURAL, 8) as isize,
+    HOST_SYSENTER_EIP = encode_vmcs_field_full(VmcsType::HOST, VmcsWidth::NATURAL, 9) as isize,
+    HOST_RSP = encode_vmcs_field_full(VmcsType::HOST, VmcsWidth::NATURAL, 10) as isize,
+    HOST_RIP = encode_vmcs_field_full(VmcsType::HOST, VmcsWidth::NATURAL, 11) as isize,
+}
+
+// (Intel Manual: 25.6 VM-EXECUTION CONTROL FIELDS)
+// Bit-flag types for the 32-bit VMX control words. Every constant below is a single
+// bit (`1 << n`) of the control word named by its enclosing struct.
+bitflags! {
+    // (Intel Manual: 25.6.1 Pin-Based VM-Execution Controls)
+    #[allow(non_camel_case_types)]
+    pub struct VmxPinBasedExecuteCtrl: u32 {
+        const EXTERNAL_INTERRUPT_EXITING = 1 << 0; // external interrupts cause VM exits
+        const NMI_EXITING = 1 << 3; // non-maskable interrupts (NMIs) cause VM exits.
+        const VIRTUAL_NMIS = 1 << 5; // NMIs are never blocked and the “blocking by NMI” bit (bit 3) in the interruptibility-state field indicates “virtual-NMI blocking”
+        const VMX_PREEMPTION_TIMER = 1 << 6; // the VMX-preemption timer counts down in VMX non-root operation
+        const PROCESS_POSTED_INTERRUPTS = 1 << 7; // the processor treats interrupts with the posted-interrupt notification vector specially
+    }
+
+    // (Intel Manual: 25.6.2 Processor-Based VM-Execution Controls)
+    // Primary processor-based controls; bit 31 activates the secondary controls below.
+    #[allow(non_camel_case_types)]
+    pub struct VmxPrimaryProcessBasedExecuteCtrl: u32{
+        const INTERRUPT_WINDOW_EXITING = 1 << 2; // VM exits on interrupt window RFLAGS.IF = 1
+        const USE_TSC_OFFSETTING = 1 << 3; // TSC offsetting is enabled
+        const HLT_EXITING = 1 << 7;
+        const INVLPG_EXITING = 1 << 9;
+        const MWAIT_EXITING = 1 << 10;
+        const RDPMC_EXITING = 1 << 11;
+        const RDTSC_EXITING = 1 << 12;
+        const CR3_LOAD_EXITING = 1 << 15;
+        const CR3_STR_EXITING = 1 << 16;
+        const CR8_LOAD_EXITING = 1 << 19;
+        const CR8_STR_EXITING = 1 << 20;
+        const USE_TPR_SHADOW = 1 << 21;
+        const NMI_WINDOW_EXITING = 1 << 22;
+        const MOV_DR_EXITING = 1 << 23;
+        const UNCOND_IO_EXITING = 1 << 24;
+        const USE_IO_BITMAPS = 1 << 25;
+        const MONITOR_TRAP_FLAG = 1 << 27;
+        const USE_MSR_BITMAPS = 1 << 28;
+        const MONITOR_EXITING = 1 << 29;
+        const PAUSE_EXITING = 1 << 30;
+        const ACTIVATE_SECONDARY_CONTROLS = 1 << 31;
+    }
+
+    // (Intel Manual: 25.6.2 Processor-Based VM-Execution Controls)
+    // Secondary processor-based controls.
+    // NOTE(review): `APCI_REGISTER_VIRT` and `PT_USE_GUEST_PYH_ADDR` look like misspellings of
+    // "APIC" and "PHY"; renaming would change the public constant names, so they are left as-is.
+    pub struct VmxSecondaryProcessBasedExecuteCtrl: u32{
+        const VIRT_APIC_ACCESS = 1 << 0;
+        const ENABLE_EPT = 1 << 1;
+        const DESCRIPTOR_TABLE_EXITING = 1 << 2;
+        const ENABLE_RDTSCP = 1 << 3;
+        const VIRT_X2APIC_MODE = 1 << 4;
+        const ENABLE_VPID = 1 << 5;
+        const WBINVD_EXITING = 1 << 6;
+        const UNRESTRICTED_GUEST = 1 << 7;
+        const APCI_REGISTER_VIRT = 1 << 8;
+        const VIRT_INTR_DELIVERY = 1 << 9;
+        const PAUSE_LOOP_EXITING = 1 << 10;
+        const RDRAND_EXITING = 1 << 11;
+        const ENABLE_INVPCID = 1 << 12;
+        const ENABLE_VM_FUNCTIONS = 1 << 13;
+        const VMCS_SHADOWING = 1 << 14;
+        const ENABLE_ENCLS_EXITING = 1 << 15;
+        const RDSEED_EXITING = 1 << 16;
+        const ENABLE_PML = 1 << 17;
+        const EPT_VIOLATION_VE = 1 << 18;
+        const CONCEAL_VMX_FROM_PT = 1 << 19;
+        const ENABLE_XSAVES_XRSTORS = 1 << 20;
+        const PASID_TRANSLATION = 1 << 21;
+        const MODE_BASED_EPT_EXEC = 1 << 22;
+        const SUB_PAGE_WRITE_PERM = 1 << 23;
+        const PT_USE_GUEST_PYH_ADDR = 1 << 24;
+        const USE_TSC_SCALING = 1 << 25;
+        const ENABLE_USER_WAIT_PAUSE = 1 << 26;
+        const ENABLE_PCONFIG = 1 << 27;
+        const ENABLE_ENCLV_EXITING = 1 << 28;
+        const VMM_BUS_LOCK_DETECTION = 1 << 30;
+        const INST_TIMEOUT = 1 << 31;
+    }
+
+    // (Intel Manual: 25.7.1 VM-Exit Controls)
+    #[allow(non_camel_case_types)]
+    pub struct VmxPrimaryExitCtrl: u32 {
+        const SAVE_DBG_CTRLS = 1 << 2;
+        const HOST_ADDR_SPACE_SIZE = 1 << 9; // determines if a virtual processor will be in 64-bit mode after a VM exit
+        const LOAD_IA32_PERF_GLOBAL_CTRL = 1 << 12;
+        const ACK_INTERRUPT_ON_EXIT = 1 << 15;
+        const SAVE_IA32_PAT = 1 << 18;
+        const LOAD_IA32_PAT = 1 << 19;
+        const SAVE_IA32_EFER = 1 << 20;
+        const LOAD_IA32_EFER = 1 << 21;
+        const SAVE_VMX_PREEMPT_TIMER_VALUE = 1 << 22;
+        const CLEAR_IA32_BNDCFGS = 1 << 23;
+        const CONCEAL_VMX_FROM_PT = 1 << 24;
+        const CLEAR_IA32_RTIT_CTL = 1 << 25;
+        const CLEAR_IA32_LBR_CTL = 1 << 26;
+        const CLEAR_UINV = 1 << 27;
+        const LOAD_CET_STATE = 1 << 28;
+        const LOAD_PKRS = 1 << 29;
+        const SAVE_IA32_PERF_GLOBAL_CTL = 1 << 30;
+        const ACTIVATE_SECONDARY_CONTROLS = 1 << 31;
+    }
+
+    // (Intel Manual: 25.8.1 VM-Entry Controls)
+    // NOTE(review): `LOAD_IA32_PERF_GLOBAL_CTRL` (bit 13) and `LOAD_IA32_PERF_GLOBAL_CTL` (bit 22)
+    // look like one control under two spellings, and the Intel SDM appears to list
+    // "load guest IA32_LBR_CTL" at bit 21 and "load PKRS" at bit 22 — confirm bits 21/22
+    // against the manual before relying on `LOAD_PKRS` or `LOAD_IA32_PERF_GLOBAL_CTL`.
+    #[allow(non_camel_case_types)]
+    pub struct VmxEntryCtrl: u32 {
+        const LOAD_DBG_CTRLS = 1 << 2;
+        const IA32E_MODE_GUEST = 1 << 9;
+        const ENTRY_TO_SMM = 1 << 10;
+        const DEACTIVATE_DUAL_MONITOR = 1 << 11;
+        const LOAD_IA32_PERF_GLOBAL_CTRL = 1 << 13;
+        const LOAD_IA32_PAT = 1 << 14;
+        const LOAD_IA32_EFER = 1 << 15;
+        const LOAD_IA32_BNDCFGS = 1 << 16;
+        const CONCEAL_VMX_FROM_PT = 1 << 17;
+        const LOAD_IA32_RTIT_CTL = 1 << 18;
+        const LOAD_UINV = 1 << 19;
+        const LOAD_CET_STATE = 1 << 20;
+        const LOAD_PKRS = 1 << 21;
+        const LOAD_IA32_PERF_GLOBAL_CTL = 1 << 22;
+    }
+
+}
+
+/// Numeric VM-exit reasons, one variant per reason code.
+///
+/// The discriminants are the raw reason numbers; codes 35, 38 and 42 have no variant.
+/// A value is obtained either through the derived `FromPrimitive` implementation or
+/// through the hand-written `From<i32>` conversion defined for this type.
+#[derive(FromPrimitive)]
+#[allow(non_camel_case_types)]
+pub enum VmxExitReason {
+    EXCEPTION_OR_NMI = 0,
+    EXTERNAL_INTERRUPT = 1,
+    TRIPLE_FAULT = 2,
+    INIT_SIGNAL = 3,
+    SIPI = 4,
+    IO_SMI = 5,
+    OTHER_SMI = 6,
+    INTERRUPT_WINDOW = 7,
+    NMI_WINDOW = 8,
+    TASK_SWITCH = 9,
+    CPUID = 10,
+    GETSEC = 11,
+    HLT = 12,
+    INVD = 13,
+    INVLPG = 14,
+    RDPMC = 15,
+    RDTSC = 16,
+    RSM = 17,
+    // VMX instructions executed by the guest (18..=27).
+    VMCALL = 18,
+    VMCLEAR = 19,
+    VMLAUNCH = 20,
+    VMPTRLD = 21,
+    VMPTRST = 22,
+    VMREAD = 23,
+    VMRESUME = 24,
+    VMWRITE = 25,
+    VMXOFF = 26,
+    VMXON = 27,
+    CR_ACCESS = 28,
+    DR_ACCESS = 29,
+    IO_INSTRUCTION = 30,
+    RDMSR = 31,
+    WRMSR = 32,
+    VM_ENTRY_FAILURE_INVALID_GUEST_STATE = 33,
+    VM_ENTRY_FAILURE_MSR_LOADING = 34,
+    // 35 is intentionally absent.
+    MWAIT = 36,
+    MONITOR_TRAP_FLAG = 37,
+    // 38 is intentionally absent.
+    MONITOR = 39,
+    PAUSE = 40,
+    VM_ENTRY_FAILURE_MACHINE_CHECK_EVENT = 41,
+    // 42 is intentionally absent.
+    TPR_BELOW_THRESHOLD = 43,
+    APIC_ACCESS = 44,
+    VIRTUALIZED_EOI = 45,
+    ACCESS_GDTR_OR_IDTR = 46,
+    ACCESS_LDTR_OR_TR = 47,
+    EPT_VIOLATION = 48,
+    EPT_MISCONFIG = 49,
+    INVEPT = 50,
+    RDTSCP = 51,
+    VMX_PREEMPTION_TIMER_EXPIRED = 52,
+    INVVPID = 53,
+    WBINVD = 54,
+    XSETBV = 55,
+    APIC_WRITE = 56,
+    RDRAND = 57,
+    INVPCID = 58,
+    VMFUNC = 59,
+    ENCLS = 60,
+    RDSEED = 61,
+    PML_FULL = 62,
+    XSAVES = 63,
+    XRSTORS = 64,
+}
+
+/// Converts a raw exit-reason number into its `VmxExitReason` variant.
+///
+/// # Panics
+/// Panics with "Invalid VmxExitReason number" when `num` does not correspond to any
+/// variant (this includes the unassigned codes 35, 38 and 42 and anything outside 0..=64).
+impl From<i32> for VmxExitReason {
+    fn from(num: i32) -> Self {
+        match num {
+            0 => Self::EXCEPTION_OR_NMI,
+            1 => Self::EXTERNAL_INTERRUPT,
+            2 => Self::TRIPLE_FAULT,
+            3 => Self::INIT_SIGNAL,
+            4 => Self::SIPI,
+            5 => Self::IO_SMI,
+            6 => Self::OTHER_SMI,
+            7 => Self::INTERRUPT_WINDOW,
+            8 => Self::NMI_WINDOW,
+            9 => Self::TASK_SWITCH,
+            10 => Self::CPUID,
+            11 => Self::GETSEC,
+            12 => Self::HLT,
+            13 => Self::INVD,
+            14 => Self::INVLPG,
+            15 => Self::RDPMC,
+            16 => Self::RDTSC,
+            17 => Self::RSM,
+            18 => Self::VMCALL,
+            19 => Self::VMCLEAR,
+            20 => Self::VMLAUNCH,
+            21 => Self::VMPTRLD,
+            22 => Self::VMPTRST,
+            23 => Self::VMREAD,
+            24 => Self::VMRESUME,
+            25 => Self::VMWRITE,
+            26 => Self::VMXOFF,
+            27 => Self::VMXON,
+            28 => Self::CR_ACCESS,
+            29 => Self::DR_ACCESS,
+            30 => Self::IO_INSTRUCTION,
+            31 => Self::RDMSR,
+            32 => Self::WRMSR,
+            33 => Self::VM_ENTRY_FAILURE_INVALID_GUEST_STATE,
+            34 => Self::VM_ENTRY_FAILURE_MSR_LOADING,
+            36 => Self::MWAIT,
+            37 => Self::MONITOR_TRAP_FLAG,
+            39 => Self::MONITOR,
+            40 => Self::PAUSE,
+            41 => Self::VM_ENTRY_FAILURE_MACHINE_CHECK_EVENT,
+            43 => Self::TPR_BELOW_THRESHOLD,
+            44 => Self::APIC_ACCESS,
+            45 => Self::VIRTUALIZED_EOI,
+            46 => Self::ACCESS_GDTR_OR_IDTR,
+            47 => Self::ACCESS_LDTR_OR_TR,
+            48 => Self::EPT_VIOLATION,
+            49 => Self::EPT_MISCONFIG,
+            50 => Self::INVEPT,
+            51 => Self::RDTSCP,
+            52 => Self::VMX_PREEMPTION_TIMER_EXPIRED,
+            53 => Self::INVVPID,
+            54 => Self::WBINVD,
+            55 => Self::XSETBV,
+            56 => Self::APIC_WRITE,
+            57 => Self::RDRAND,
+            58 => Self::INVPCID,
+            59 => Self::VMFUNC,
+            60 => Self::ENCLS,
+            61 => Self::RDSEED,
+            62 => Self::PML_FULL,
+            63 => Self::XSAVES,
+            64 => Self::XRSTORS,
+            // Unassigned or out-of-range code: no variant to return.
+            _ => panic!("Invalid VmxExitReason number: {}", num),
+        }
+    }
+}
+
+/// Packs the parts of a VMCS field description into a single `u32` encoding.
+///
+/// The result is the bitwise OR of `access_type` (unshifted), `index << 1`,
+/// `vmcs_type << 10` and `vmcs_width << 13`.
+const fn encode_vmcs_field(
+    access_type: VmcsAccessType,
+    vmcs_type: VmcsType,
+    vmcs_width: VmcsWidth,
+    index: u32,
+) -> u32 {
+    let access_bits = access_type as u32;
+    let index_bits = index << 1;
+    let type_bits = (vmcs_type as u32) << 10;
+    let width_bits = (vmcs_width as u32) << 13;
+    access_bits | index_bits | type_bits | width_bits
+}
+
+/// Shorthand for `encode_vmcs_field` with the access type fixed to `VmcsAccessType::FULL`.
+const fn encode_vmcs_field_full(vmcs_type: VmcsType, vmcs_width: VmcsWidth, index: u32) -> u32 {
+    encode_vmcs_field(VmcsAccessType::FULL, vmcs_type, vmcs_width, index)
+}
+
+// fn decode_vmcs_field(field: u32) -> (VmcsAccessType, VmcsType, VmcsWidth, u16){
+//     (FromPrimitive::from_u32(field & 1).unwrap() ,
+//         FromPrimitive::from_u32((field>>10) & 0x3).unwrap(),
+//         FromPrimitive::from_u32((field>>13) & 0x3).unwrap(),
+//         ((field>>1) & 0x1ff) as u16
+//     )
+// }

+ 269 - 0
kernel/src/arch/x86_64/kvm/vmx/vmexit.rs

@@ -0,0 +1,269 @@
+use super::vmcs::{VmcsFields, VmxExitReason};
+use super::vmx_asm_wrapper::{vmx_vmread, vmx_vmwrite};
+use crate::kdebug;
+use crate::{syscall::SystemError, virt::kvm::vm};
+use core::arch::asm;
+use x86::vmx::vmcs::ro::GUEST_PHYSICAL_ADDR_FULL;
+
+/// Exception vector numbers 0..=31.
+///
+/// No explicit discriminants are given, so variants are numbered from 0 in declaration
+/// order (e.g. `EXCEPTION_UNDEFINED_OPCODE` is 6, `EXCEPTION_PAGE_FAULT` is 14).
+/// Reordering or inserting variants therefore changes the vector values.
+#[derive(FromPrimitive)]
+#[allow(non_camel_case_types)]
+pub enum APICExceptionVectors {
+    EXCEPTION_DIVIDE_ERROR,
+    EXCEPTION_DEBUG_BREAKPOINT,
+    EXCEPTION_NMI,
+    EXCEPTION_BREAKPOINT,
+    EXCEPTION_OVERFLOW,
+    EXCEPTION_BOUND_RANGE_EXCEEDED,
+    EXCEPTION_UNDEFINED_OPCODE,
+    EXCEPTION_NO_MATH_COPROCESSOR,
+    EXCEPTION_DOUBLE_FAULT,
+    EXCEPTION_RESERVED0,
+    EXCEPTION_INVALID_TASK_SEGMENT_SELECTOR,
+    EXCEPTION_SEGMENT_NOT_PRESENT,
+    EXCEPTION_STACK_SEGMENT_FAULT,
+    EXCEPTION_GENERAL_PROTECTION_FAULT,
+    EXCEPTION_PAGE_FAULT,
+    EXCEPTION_RESERVED1,
+    EXCEPTION_MATH_FAULT,
+    EXCEPTION_ALIGNMENT_CHECK,
+    EXCEPTION_MACHINE_CHECK,
+    EXCEPTION_SIMD_FLOATING_POINT_NUMERIC_ERROR,
+    EXCEPTION_VIRTUAL_EXCEPTION,
+    EXCEPTION_RESERVED2,
+    EXCEPTION_RESERVED3,
+    EXCEPTION_RESERVED4,
+    EXCEPTION_RESERVED5,
+    EXCEPTION_RESERVED6,
+    EXCEPTION_RESERVED7,
+    EXCEPTION_RESERVED8,
+    EXCEPTION_RESERVED9,
+    EXCEPTION_RESERVED10,
+    EXCEPTION_RESERVED11,
+    EXCEPTION_RESERVED12,
+}
+
+/// Event type codes (0..=7) for an interruption-information value.
+///
+/// `vmexit_vmx_instruction_executed` places one of these codes at bit 8 and above of the
+/// value it writes to `CTRL_VM_ENTRY_INTR_INFO_FIELD`.
+#[derive(FromPrimitive)]
+#[allow(non_camel_case_types)]
+pub enum InterruptType {
+    INTERRUPT_TYPE_EXTERNAL_INTERRUPT = 0,
+    INTERRUPT_TYPE_RESERVED = 1,
+    INTERRUPT_TYPE_NMI = 2,
+    INTERRUPT_TYPE_HARDWARE_EXCEPTION = 3,
+    INTERRUPT_TYPE_SOFTWARE_INTERRUPT = 4,
+    INTERRUPT_TYPE_PRIVILEGED_SOFTWARE_INTERRUPT = 5,
+    INTERRUPT_TYPE_SOFTWARE_EXCEPTION = 6,
+    INTERRUPT_TYPE_OTHER_EVENT = 7,
+}
+
+pub fn vmexit_vmx_instruction_executed() -> Result<(), SystemError> {
+    let valid: u32 = 1;
+    let vector: u32 = APICExceptionVectors::EXCEPTION_UNDEFINED_OPCODE as u32;
+    let interrupt_type = InterruptType::INTERRUPT_TYPE_HARDWARE_EXCEPTION as u32;
+    let deliver_code: u32 = 0;
+    let interrupt_info = valid << 31 | interrupt_type << 8 | deliver_code << 11 | vector;
+    vmx_vmwrite(
+        VmcsFields::CTRL_VM_ENTRY_INTR_INFO_FIELD as u32,
+        interrupt_info as u64,
+    )?;
+    vmx_vmwrite(VmcsFields::CTRL_VM_ENTRY_INSTR_LEN as u32, 0)?;
+    let rflags: u64 = vmx_vmread(VmcsFields::GUEST_RFLAGS as u32).unwrap() | 0x0001_0000; // set RF flags
+    vmx_vmwrite(VmcsFields::GUEST_RFLAGS as u32, rflags)?;
+    Ok(())
+}
+
+// pub fn vmexit_cpuid_handler(guest_cpu_context: &mut GuestCpuContext) -> Result<(), SystemError>{
+//     let rax = guest_cpu_context.rax;
+//     let rcx = guest_cpu_context.rcx;
+//     // let rdx = guest_cpu_context.rdx;
+//     // let rbx = guest_cpu_context.rbx;
+//     cpuid!(rax, rcx);
+//     unsafe{asm!("mov {}, rax", out(reg) guest_cpu_context.rax)};
+//     unsafe{asm!("mov {}, rcx", out(reg) guest_cpu_context.rcx)};
+//     unsafe{asm!("mov {}, rdx", out(reg) guest_cpu_context.rdx)};
+//     unsafe{asm!("mov {}, rbx", out(reg) guest_cpu_context.rbx)};
+//     Ok(())
+// }
+
+/// Pushes fifteen general-purpose registers onto the current stack, in the order
+/// rax, rcx, rdx, rbx, rbp, rsi, rdi, r8..r15 (rsp itself is not pushed).
+///
+/// `restore_rpg` pops the same registers in the reverse order.
+///
+/// NOTE(review): the asm block ends with rsp 15 * 8 bytes lower than it started and
+/// declares no operands or clobbers. Rust's inline-assembly rules require the stack
+/// pointer to be restored by the end of an `asm!` block, and this is an ordinary
+/// function whose own return relies on the stack layout — confirm this only works
+/// because of how the compiler currently inlines it, and consider a naked/global-asm
+/// trampoline instead.
+unsafe fn save_rpg() {
+    asm!(
+        "push    rax",
+        "push    rcx",
+        "push    rdx",
+        "push    rbx",
+        "push    rbp",
+        "push    rsi",
+        "push    rdi",
+        "push    r8",
+        "push    r9",
+        "push    r10",
+        "push    r11",
+        "push    r12",
+        "push    r13",
+        "push    r14",
+        "push    r15",
+    );
+}
+
+/// Pops fifteen general-purpose registers from the current stack, in the order
+/// r15..r8, rdi, rsi, rbp, rbx, rdx, rcx, rax — the exact reverse of the push order
+/// used by `save_rpg`.
+///
+/// NOTE(review): the asm block raises rsp by 15 * 8 bytes and overwrites every listed
+/// register without declaring clobbers; it is only meaningful when the top of the
+/// stack still holds the values pushed by `save_rpg` — confirm against the caller.
+unsafe fn restore_rpg() {
+    asm!(
+        "pop    r15",
+        "pop    r14",
+        "pop    r13",
+        "pop    r12",
+        "pop    r11",
+        "pop    r10",
+        "pop    r9",
+        "pop    r8",
+        "pop    rdi",
+        "pop    rsi",
+        "pop    rbp",
+        "pop    rbx",
+        "pop    rdx",
+        "pop    rcx",
+        "pop    rax",
+    );
+}
+
+/// Fifteen 64-bit general-purpose register slots in a fixed (`repr(C)`) layout.
+///
+/// The field order (r15 first, rax last) is the reverse of the push order in
+/// `save_rpg`, i.e. it matches the ascending-address layout of the registers on the
+/// stack right after `save_rpg` has run. The struct is currently unused in live code
+/// (hence `allow(dead_code)`); presumably it is meant to overlay that stack area —
+/// TODO confirm.
+#[repr(C)]
+#[allow(dead_code)]
+pub struct GuestCpuContext {
+    pub r15: u64,
+    pub r14: u64,
+    pub r13: u64,
+    pub r12: u64,
+    pub r11: u64,
+    pub r10: u64,
+    pub r9: u64,
+    pub r8: u64,
+    pub rdi: u64,
+    pub rsi: u64,
+    pub rbp: u64,
+    pub rbx: u64,
+    pub rdx: u64,
+    pub rcx: u64,
+    pub rax: u64,
+}
+
+/// Host-side landing routine reached via `call vmx_return` from `vmx_vmlaunch`.
+///
+/// Sequence: log, push the general-purpose registers (`save_rpg`), run
+/// `vmexit_handler`, pop the registers again (`restore_rpg`), then execute `vmresume`.
+///
+/// NOTE(review): this is an ordinary `extern "C"` function, so the compiler-generated
+/// prologue and the `kdebug!` call run before `save_rpg`; the values pushed are
+/// therefore presumably not the guest's registers at the moment of the VM exit —
+/// confirm. If `vmresume` fails, execution falls through and the function returns.
+#[no_mangle]
+pub extern "C" fn vmx_return() {
+    kdebug!("vmx_return!");
+    unsafe { save_rpg() };
+    vmexit_handler();
+    // XMM registers are vector registers. They're renamed onto the FP/SIMD register file
+    // unsafe {asm!(
+    //     "sub     rsp, 60h",
+    //     "movaps  xmmword ptr [rsp +  0h], xmm0",
+    //     "movaps  xmmword ptr [rsp + 10h], xmm1",
+    //     "movaps  xmmword ptr [rsp + 20h], xmm2",
+    //     "movaps  xmmword ptr [rsp + 30h], xmm3",
+    //     "movaps  xmmword ptr [rsp + 40h], xmm4",
+    //     "movaps  xmmword ptr [rsp + 50h], xmm5",
+
+    //     "mov     rdi, rsp",
+    //     "sub     rsp, 20h",
+    //     "call vmexit_handler",
+    //     "add     rsp, 20h",
+
+    //     "movaps  xmm0, xmmword ptr [rsp +  0h]",
+    //     "movaps  xmm1, xmmword ptr [rsp + 10h]",
+    //     "movaps  xmm2, xmmword ptr [rsp + 20h]",
+    //     "movaps  xmm3, xmmword ptr [rsp + 30h]",
+    //     "movaps  xmm4, xmmword ptr [rsp + 40h]",
+    //     "movaps  xmm5, xmmword ptr [rsp + 50h]",
+    //     "add     rsp, 60h",
+    // clobber_abi("C"),
+    // )};
+    unsafe { restore_rpg() };
+    // Re-enter the guest using the current VMCS.
+    unsafe { asm!("vmresume",) };
+}
+
+/// Reads the exit reason from the current VMCS and dispatches on it.
+///
+/// * VMX instructions (and VMFUNC / INVEPT / INVVPID): queue an exception through
+///   `vmexit_vmx_instruction_executed`.
+/// * CPUID, RDMSR, WRMSR, triple fault: log and advance the guest RIP past the
+///   exiting instruction.
+/// * EPT violation: build an error code from the exit qualification and invoke the
+///   vCPU's `mmu.page_fault` callback with the faulting guest-physical address.
+/// * Anything else: log a few VMCS fields and advance the guest RIP.
+///
+/// Every VMCS access is unwrapped, so a failed `vmread`/`vmwrite` panics.
+#[no_mangle]
+extern "C" fn vmexit_handler() {
+    kdebug!("vmexit handler!");
+
+    let raw_reason = vmx_vmread(VmcsFields::VMEXIT_EXIT_REASON as u32).unwrap() as u32;
+    // Only the low 16 bits are used to select the handler.
+    let basic_reason = raw_reason & 0x0000_ffff;
+    let rip = vmx_vmread(VmcsFields::GUEST_RIP as u32).unwrap();
+    kdebug!("guest_rip={:x}", rip);
+    // Value is unused, but the read (and its unwrap) is kept.
+    let _rflags = vmx_vmread(VmcsFields::GUEST_RFLAGS as u32).unwrap();
+
+    // Moves the guest RIP past the instruction that caused the exit.
+    let skip_instruction = || adjust_rip(rip).unwrap();
+
+    let reason = VmxExitReason::from(basic_reason as i32);
+    match reason {
+        VmxExitReason::VMCALL
+        | VmxExitReason::VMCLEAR
+        | VmxExitReason::VMLAUNCH
+        | VmxExitReason::VMPTRLD
+        | VmxExitReason::VMPTRST
+        | VmxExitReason::VMREAD
+        | VmxExitReason::VMRESUME
+        | VmxExitReason::VMWRITE
+        | VmxExitReason::VMXOFF
+        | VmxExitReason::VMXON
+        | VmxExitReason::VMFUNC
+        | VmxExitReason::INVEPT
+        | VmxExitReason::INVVPID => {
+            kdebug!("vmexit handler: vmx instruction!");
+            vmexit_vmx_instruction_executed().expect("previledge instruction handle error");
+        }
+        VmxExitReason::CPUID => {
+            kdebug!("vmexit handler: cpuid instruction!");
+            skip_instruction();
+        }
+        VmxExitReason::RDMSR => {
+            kdebug!("vmexit handler: rdmsr instruction!");
+            skip_instruction();
+        }
+        VmxExitReason::WRMSR => {
+            kdebug!("vmexit handler: wrmsr instruction!");
+            skip_instruction();
+        }
+        VmxExitReason::TRIPLE_FAULT => {
+            kdebug!("vmexit handler: triple fault!");
+            skip_instruction();
+        }
+        VmxExitReason::EPT_VIOLATION => {
+            kdebug!("vmexit handler: ept violation!");
+            let fault_gpa = vmx_vmread(GUEST_PHYSICAL_ADDR_FULL as u32).unwrap();
+            let qualification = vmx_vmread(VmcsFields::VMEXIT_QUALIFICATION as u32).unwrap();
+
+            // Qualification bit 1 stays at error-code bit 1 (write fault).
+            let write_bit = qualification & (1 << 1);
+            // Qualification bit 2 moves to error-code bit 4 (fetch fault).
+            let fetch_bit = (qualification << 2) & (1 << 4);
+            // Qualification bit 3 moves to error-code bit 0 (entry present).
+            let present_bit = (qualification >> 3) & (1 << 0);
+            let error_code = write_bit | fetch_bit | present_bit;
+
+            let kvm = vm(0).unwrap();
+            let vcpu = kvm.vcpu[0].clone();
+            // The callback is fetched in its own statement so the first lock guard is
+            // released before the vCPU is locked again for the call.
+            let page_fault_handler = vcpu.lock().mmu.page_fault.unwrap();
+            page_fault_handler(&mut (*vcpu.lock()), fault_gpa, error_code as u32, false)
+                .expect("ept page fault error");
+        }
+        _ => {
+            kdebug!(
+                "vmexit handler: unhandled vmexit reason: {}!",
+                basic_reason
+            );
+
+            let instr_len = vmx_vmread(VmcsFields::VMEXIT_INSTR_LEN as u32).unwrap() as u32;
+            kdebug!("vmexit handler: VMEXIT_INSTR_LEN: {}!", instr_len);
+            let instr_info = vmx_vmread(VmcsFields::VMEXIT_INSTR_INFO as u32).unwrap() as u32;
+            kdebug!("vmexit handler: VMEXIT_INSTR_INFO: {}!", instr_info);
+            let exception_bitmap =
+                vmx_vmread(VmcsFields::CTRL_EXPECTION_BITMAP as u32).unwrap() as u32;
+            kdebug!("vmexit handler: CTRL_EXPECTION_BITMAP: {}!", exception_bitmap);
+
+            skip_instruction();
+        }
+    }
+}
+
+/// Advances the guest instruction pointer past the instruction that caused the exit.
+///
+/// Reads `VMEXIT_INSTR_LEN` from the current VMCS and writes `rip` plus that length
+/// to `GUEST_RIP`.
+///
+/// # Errors
+/// Returns the `SystemError` of the failing VMCS read or write.
+#[no_mangle]
+fn adjust_rip(rip: u64) -> Result<(), SystemError> {
+    let next_rip = rip + vmx_vmread(VmcsFields::VMEXIT_INSTR_LEN as u32)?;
+    vmx_vmwrite(VmcsFields::GUEST_RIP as u32, next_rip)
+}

+ 96 - 0
kernel/src/arch/x86_64/kvm/vmx/vmx_asm_wrapper.rs

@@ -0,0 +1,96 @@
+use super::vmcs::VmcsFields;
+use crate::kdebug;
+use crate::syscall::SystemError;
+use core::arch::asm;
+use x86;
+/// Executes `vmxon` with the given address to enable VMX operation.
+///
+/// # Errors
+/// Logs the failure and returns `SystemError::EVMXONFailed` when the instruction fails.
+pub fn vmxon(vmxon_pa: u64) -> Result<(), SystemError> {
+    let outcome = unsafe { x86::bits64::vmx::vmxon(vmxon_pa) };
+    if let Err(e) = outcome {
+        kdebug!("vmxon fail: {:?}", e);
+        return Err(SystemError::EVMXONFailed);
+    }
+    Ok(())
+}
+
+/// Executes `vmxoff` to leave VMX operation.
+///
+/// # Errors
+/// Returns `SystemError::EVMXOFFFailed` when the instruction fails.
+pub fn vmxoff() -> Result<(), SystemError> {
+    let outcome = unsafe { x86::bits64::vmx::vmxoff() };
+    if outcome.is_err() {
+        return Err(SystemError::EVMXOFFFailed);
+    }
+    Ok(())
+}
+
+/// Writes `value` to field `vmcs_field` of the current VMCS (`vmwrite`).
+///
+/// # Errors
+/// Logs the failure and the field encoding, then returns `SystemError::EVMWRITEFailed`.
+pub fn vmx_vmwrite(vmcs_field: u32, value: u64) -> Result<(), SystemError> {
+    let outcome = unsafe { x86::bits64::vmx::vmwrite(vmcs_field, value) };
+    if let Err(e) = outcome {
+        kdebug!("vmx_write fail: {:?}", e);
+        kdebug!("vmcs_field: {:x}", vmcs_field);
+        return Err(SystemError::EVMWRITEFailed);
+    }
+    Ok(())
+}
+
+/// Reads field `vmcs_field` of the current VMCS (`vmread`) and returns its value.
+///
+/// # Errors
+/// Logs the failure and returns `SystemError::EVMREADFailed`.
+pub fn vmx_vmread(vmcs_field: u32) -> Result<u64, SystemError> {
+    unsafe { x86::bits64::vmx::vmread(vmcs_field) }.map_err(|e| {
+        kdebug!("vmx_read fail: {:?}", e);
+        SystemError::EVMREADFailed
+    })
+}
+
+/// Executes `vmptrld` with the given address, making that VMCS the current one.
+///
+/// # Errors
+/// Returns `SystemError::EVMPRTLDFailed` when the instruction fails.
+pub fn vmx_vmptrld(vmcs_pa: u64) -> Result<(), SystemError> {
+    let outcome = unsafe { x86::bits64::vmx::vmptrld(vmcs_pa) };
+    if outcome.is_err() {
+        return Err(SystemError::EVMPRTLDFailed);
+    }
+    Ok(())
+}
+
+/// Records the host resume point in the current VMCS and executes `vmlaunch`.
+///
+/// The asm block pushes rbp, rcx, rdx, rsi and rdi, writes the resulting rsp to
+/// `HOST_RSP` and the address of local label `1:` to `HOST_RIP`, then runs `vmlaunch`.
+/// The code at label `1:` pops the five registers again and calls `vmx_return`.
+/// Label `1:` is reached both by falling through (when `vmlaunch` does not enter the
+/// guest) and, presumably, on a VM exit via the `HOST_RIP`/`HOST_RSP` values written
+/// here — TODO confirm against the VMCS host-state setup.
+///
+/// This function always returns `Ok(())`; a failing `vmlaunch` or `vmwrite` inside
+/// the asm block is not detected.
+///
+/// NOTE(review): `rax` is used as a scratch register while the two `in(reg)` operands
+/// are still live; nothing visible here prevents the register allocator from placing
+/// `host_rip` in `rax` — confirm, or bind the operands to explicit registers.
+pub fn vmx_vmlaunch() -> Result<(), SystemError> {
+    let host_rsp = VmcsFields::HOST_RSP as u32;
+    let host_rip = VmcsFields::HOST_RIP as u32;
+    unsafe {
+        asm!(
+            "push    rbp",
+            "push    rcx",
+            "push    rdx",
+            "push    rsi",
+            "push    rdi",
+            // HOST_RSP <- stack pointer after the five pushes above.
+            "vmwrite {0:r}, rsp",
+            // HOST_RIP <- address of label 1 below.
+            "lea rax, 1f[rip]",
+            "vmwrite {1:r}, rax",
+            "vmlaunch",
+            "1:",
+            "pop    rdi",
+            "pop    rsi",
+            "pop    rdx",
+            "pop    rcx",
+            "pop    rbp",
+            "call vmx_return",
+            in(reg) host_rsp,
+            in(reg) host_rip,
+            clobber_abi("C"),
+        )
+    }
+    Ok(())
+    // match unsafe { x86::bits64::vmx::vmlaunch() } {
+    //     Ok(_) => Ok(()),
+    //     Err(e) => {
+    //         kdebug!("vmx_launch fail: {:?}", e);
+    //         Err(SystemError::EVMLAUNCHFailed)
+    //     },
+    // }
+}
+
+/// Executes `vmclear` on the VMCS at the given address.
+///
+/// # Errors
+/// Returns `SystemError::EVMPRTLDFailed` when the instruction fails.
+///
+/// NOTE(review): the error variant is the one used for `vmptrld`; this looks like a
+/// copy-paste leftover — confirm whether a dedicated vmclear error should be used.
+pub fn vmx_vmclear(vmcs_pa: u64) -> Result<(), SystemError> {
+    let outcome = unsafe { x86::bits64::vmx::vmclear(vmcs_pa) };
+    if outcome.is_err() {
+        return Err(SystemError::EVMPRTLDFailed);
+    }
+    Ok(())
+}

+ 18 - 6
kernel/src/arch/x86_64/mm/mod.rs

@@ -34,6 +34,9 @@ use core::mem::{self};
 
 use core::sync::atomic::{compiler_fence, AtomicBool, Ordering};
 
+use super::kvm::vmx::vmcs::VmcsFields;
+use super::kvm::vmx::vmx_asm_wrapper::vmx_vmread;
+
 pub type PageMapper =
     crate::mm::page::PageMapper<crate::arch::x86_64::mm::X86_64MMArch, LockedFrameAllocator>;
 
@@ -169,12 +172,21 @@ impl MemoryManagementArch for X86_64MMArch {
     }
 
     /// @brief 获取顶级页表的物理地址
-    unsafe fn table(_table_kind: PageTableKind) -> PhysAddr {
-        let paddr: usize;
-        compiler_fence(Ordering::SeqCst);
-        asm!("mov {}, cr3", out(reg) paddr, options(nomem, nostack, preserves_flags));
-        compiler_fence(Ordering::SeqCst);
-        return PhysAddr::new(paddr);
+    unsafe fn table(table_kind: PageTableKind) -> PhysAddr {
+        // Pick the raw root value according to the kind of page table requested.
+        let root: usize = match table_kind {
+            // EPT: the value comes from the `CTRL_EPTP_PTR` field of the current VMCS.
+            // NOTE(review): the value is used unmasked; if the field also carries
+            // non-address bits they end up in the returned address — confirm.
+            PageTableKind::EPT => {
+                vmx_vmread(VmcsFields::CTRL_EPTP_PTR as u32).expect("Failed to read eptp") as usize
+            }
+            // Kernel and user page tables: the value comes from CR3.
+            PageTableKind::Kernel | PageTableKind::User => {
+                let cr3: usize;
+                compiler_fence(Ordering::SeqCst);
+                asm!("mov {}, cr3", out(reg) cr3, options(nomem, nostack, preserves_flags));
+                compiler_fence(Ordering::SeqCst);
+                cr3
+            }
+        };
+        PhysAddr::new(root)
     }
 
     /// @brief 设置顶级页表的物理地址到处理器中

+ 3 - 0
kernel/src/arch/x86_64/mod.rs

@@ -6,6 +6,7 @@ pub mod cpu;
 pub mod fpu;
 pub mod interrupt;
 pub mod ipc;
+pub mod kvm;
 pub mod libs;
 pub mod mm;
 pub mod msi;
@@ -25,4 +26,6 @@ pub use self::mm::X86_64MMArch as MMArch;
 pub use interrupt::X86_64InterruptArch as CurrentIrqArch;
 
 pub use crate::arch::asm::pio::X86_64PortIOArch as CurrentPortIOArch;
+pub use kvm::X86_64KVMArch as KVMArch;
+
 pub use crate::arch::ipc::signal::X86_64SignalArch as CurrentSignalArch;

+ 5 - 0
kernel/src/filesystem/devfs/mod.rs

@@ -149,6 +149,11 @@ impl DevFS {
                 dev_block_inode.add_dev(name, device.clone())?;
                 device.set_fs(dev_block_inode.0.lock().fs.clone());
             }
+            FileType::KvmDevice => {
+                dev_root_inode
+                    .add_dev(name, device.clone())
+                    .expect("DevFS: Failed to register /dev/kvm");
+            }
             _ => {
                 return Err(SystemError::EOPNOTSUPP_OR_ENOTSUP);
             }

+ 1 - 0
kernel/src/filesystem/vfs/file.rs

@@ -136,6 +136,7 @@ impl File {
         };
         // kdebug!("inode:{:?}",f.inode);
         f.inode.open(&mut f.private_data, &mode)?;
+
         return Ok(f);
     }
 

+ 3 - 0
kernel/src/filesystem/vfs/mod.rs

@@ -39,6 +39,8 @@ pub enum FileType {
     BlockDevice,
     /// 字符设备
     CharDevice,
+    /// kvm设备
+    KvmDevice,
     /// 管道文件
     Pipe,
     /// 符号链接
@@ -88,6 +90,7 @@ impl FileType {
             FileType::Dir => DT_DIR,
             FileType::BlockDevice => DT_BLK,
             FileType::CharDevice => DT_CHR,
+            FileType::KvmDevice => DT_CHR,
             FileType::Pipe => DT_FIFO,
             FileType::SymLink => DT_LNK,
             FileType::Socket => DT_SOCK,

+ 23 - 1
kernel/src/filesystem/vfs/syscall.rs

@@ -25,6 +25,7 @@ use super::{
     utils::rsplit_path,
     Dirent, FileType, IndexNode, MAX_PATHLEN, ROOT_INODE, VFS_MAX_FOLLOW_SYMLINK_TIMES,
 };
+// use crate::kdebug;
 
 pub const SEEK_SET: u32 = 0;
 pub const SEEK_CUR: u32 = 1;
@@ -207,7 +208,6 @@ impl Syscall {
         if mode.contains(FileMode::O_APPEND) {
             file.lseek(SeekFrom::SeekEnd(0))?;
         }
-
         // 把文件对象存入pcb
         let r = ProcessManager::current_pcb()
             .fd_table()
@@ -232,6 +232,27 @@ impl Syscall {
         return res;
     }
 
+    /// @brief Send a command to the device behind a file descriptor.
+    ///
+    /// @param fd file descriptor number
+    /// @param cmd device-specific request type
+    /// @param data argument passed through unchanged to the inode's `ioctl`
+    ///
+    /// @return Ok(usize) the value returned by the inode's `ioctl`
+    /// @return Err(SystemError) `EBADF` if `fd` has no open file, otherwise the error
+    ///         returned by the inode's `ioctl`
+    pub fn ioctl(fd: usize, cmd: u32, data: usize) -> Result<usize, SystemError> {
+        let fd_table = ProcessManager::current_pcb().fd_table();
+        let table_guard = fd_table.read();
+
+        let lookup = table_guard.get_file_by_fd(fd as i32);
+        let file = lookup.ok_or(SystemError::EBADF)?;
+
+        // Release the fd-table guard before calling into the device, to avoid
+        // scheduling problems while it is held.
+        drop(table_guard);
+
+        // Bound to a local first so the temporary file lock guard is released at the
+        // end of this statement, before `file` goes out of scope.
+        let result = file.lock_no_preempt().inode().ioctl(cmd, data);
+        result
+    }
+
     /// @brief 根据文件描述符,读取文件数据。尝试读取的数据长度与buf的长度相同。
     ///
     /// @param fd 文件描述符编号
@@ -700,6 +721,7 @@ impl Syscall {
             FileType::SymLink => kstat.mode.insert(ModeType::S_IFLNK),
             FileType::Socket => kstat.mode.insert(ModeType::S_IFSOCK),
             FileType::Pipe => kstat.mode.insert(ModeType::S_IFIFO),
+            FileType::KvmDevice => kstat.mode.insert(ModeType::S_IFCHR),
         }
 
         return Ok(kstat);

+ 1 - 0
kernel/src/ktest/ktest.h

@@ -5,6 +5,7 @@ int ktest_test_bitree(void* arg);
 int ktest_test_kfifo(void* arg);
 int ktest_test_mutex(void* arg);
 int ktest_test_idr(void* arg);
+int ktest_test_kvm(void* arg);
 
 /**
  * @brief 开启一个新的内核线程以进行测试

+ 23 - 0
kernel/src/ktest/test-kvm.c

@@ -0,0 +1,23 @@
+#include "ktest.h"
+#include "ktest_utils.h"
+
+/**
+ * @brief Test case 0: placeholder for the /dev/kvm device test.
+ *
+ * Only prints a banner for now; no checks are performed.
+ *
+ * @param arg0 unused
+ * @param arg1 unused
+ * @return long always 0
+ */
+static long ktest_kvm_case0_1(uint64_t arg0, uint64_t arg1){
+    kTEST("Testing /dev/kvm device...");
+    // A function declared to return long must return a value on every path.
+    return 0;
+}
+
+// Table of kvm test cases; ktest_test_kvm() calls every entry in order.
+static ktest_case_table kt_kvm_func_table[] = {
+    ktest_kvm_case0_1,
+};
+
+/**
+ * @brief Run every case in kt_kvm_func_table, passing the case index as first argument.
+ *
+ * @param arg unused
+ * @return int always 0
+ */
+int ktest_test_kvm(void* arg)
+{
+    const int case_count = sizeof(kt_kvm_func_table) / sizeof(ktest_case_table);
+    int idx = 0;
+
+    kTEST("Testing kvm...");
+    while (idx < case_count)
+    {
+        kTEST("Testing case %d", idx);
+        kt_kvm_func_table[idx](idx, 0);
+        ++idx;
+    }
+    kTEST("kvm Test done.");
+    return 0;
+}

+ 4 - 0
kernel/src/lib.rs

@@ -16,6 +16,9 @@
 #![feature(trait_upcasting)]
 #![feature(slice_ptr_get)]
 #![feature(vec_into_raw_parts)]
+#![feature(new_uninit)]
+#![feature(ptr_to_from_bits)]
+#![feature(concat_idents)]
 #![cfg_attr(target_os = "none", no_std)]
 
 #[cfg(test)]
@@ -46,6 +49,7 @@ mod sched;
 mod smp;
 mod syscall;
 mod time;
+mod virt;
 
 #[macro_use]
 extern crate alloc;

+ 5 - 0
kernel/src/main.c

@@ -30,6 +30,7 @@
 #include <time/timer.h>
 
 #include <driver/interrupt/apic/apic_timer.h>
+#include <virt/kvm/kvm.h>
 
 extern int rs_driver_init();
 extern void rs_softirq_init();
@@ -158,6 +159,10 @@ void system_initialize()
     cli();
     HPET_enable();
 
+    io_mfence();
+    
+    kvm_init();
+
     io_mfence();
     // 系统初始化到此结束,剩下的初始化功能应当放在初始内核线程中执行
 

+ 27 - 0
kernel/src/mm/allocator/kernel_allocator.rs

@@ -94,6 +94,33 @@ unsafe impl GlobalAlloc for KernelAllocator {
     }
 }
 
+// Allocator trait implementation for the kernel slab allocator (currently disabled, kept for reference).
+// unsafe impl Allocator for KernelAllocator {
+//     fn allocate(&self, layout: Layout) -> Result<NonNull<[u8]>, AllocError> {
+//         let memory = unsafe {self.local_alloc(layout)};
+//         if memory.is_null() {
+//             Err(AllocError)
+//         } else {
+//             let slice = unsafe { core::slice::from_raw_parts_mut(memory, layout.size()) };
+//             Ok(unsafe { NonNull::new_unchecked(slice) })
+//         }
+//     }
+
+//     fn allocate_zeroed(&self, layout: Layout) -> Result<NonNull<[u8]>, AllocError> {
+//         let memory = unsafe {self.local_alloc_zeroed(layout)};
+//         if memory.is_null() {
+//             Err(AllocError)
+//         } else {
+//             let slice = unsafe { core::slice::from_raw_parts_mut(memory, layout.size()) };
+//             Ok(unsafe { NonNull::new_unchecked(slice) })
+//         }
+//     }
+
+//     unsafe fn deallocate(&self, ptr: NonNull<u8>, layout: Layout) {
+//         self.local_dealloc(ptr.cast().as_ptr(), layout);
+//     }
+// }
+
 /// 内存分配错误处理函数
 #[cfg(target_os = "none")]
 #[alloc_error_handler]

+ 4 - 0
kernel/src/mm/mod.rs

@@ -73,6 +73,8 @@ pub enum PageTableKind {
     User,
     /// 内核页表
     Kernel,
+    /// 内存虚拟化中使用的EPT
+    EPT,
 }
 
 /// 物理内存地址
@@ -365,6 +367,8 @@ pub trait MemoryManagementArch: Clone + Copy + Debug {
     const PAGE_SIZE: usize = 1 << Self::PAGE_SHIFT;
     /// 通过这个mask,获取地址的页内偏移量
     const PAGE_OFFSET_MASK: usize = Self::PAGE_SIZE - 1;
+    /// 通过这个mask,获取页的首地址
+    const PAGE_MASK: usize = !(Self::PAGE_OFFSET_MASK);
     /// 页表项的地址、数据部分的shift。
     /// 打个比方,如果这个值为52,那么意味着页表项的[0, 52)位,用于表示地址以及其他的标志位
     const PAGE_ADDRESS_SHIFT: usize = Self::PAGE_LEVELS * Self::PAGE_ENTRY_SHIFT + Self::PAGE_SHIFT;

+ 21 - 0
kernel/src/syscall/mod.rs

@@ -3,6 +3,8 @@ use core::{
     sync::atomic::{AtomicBool, Ordering},
 };
 
+use crate::kdebug;
+
 use num_traits::{FromPrimitive, ToPrimitive};
 
 use crate::{
@@ -297,6 +299,16 @@ pub enum SystemError {
     EOWNERDEAD = 129,
     /// 状态不可恢复 State not recoverable.
     ENOTRECOVERABLE = 130,
+    // VMX on 虚拟化开启指令出错
+    EVMXONFailed = 131,
+    // VMX off 虚拟化关闭指令出错
+    EVMXOFFFailed = 132,
+    // VMX VMWRITE 写入虚拟化VMCS内存出错
+    EVMWRITEFailed = 133,
+    EVMREADFailed = 134,
+    EVMPRTLDFailed = 135,
+    EVMLAUNCHFailed = 136,
+    KVM_HVA_ERR_BAD = 137,
 }
 
 impl SystemError {
@@ -377,6 +389,8 @@ pub const SYS_FCNTL: usize = 51;
 pub const SYS_FTRUNCATE: usize = 52;
 pub const SYS_MKNOD: usize = 53;
 
+pub const SYS_IOCTL: usize = 54;
+
 #[derive(Debug)]
 pub struct Syscall;
 
@@ -477,6 +491,13 @@ impl Syscall {
 
                 Self::lseek(fd, w)
             }
+            SYS_IOCTL => {
+                kdebug!("SYS_IOCTL");
+                let fd = args[0];
+                let cmd = args[1];
+                let data = args[2];
+                Self::ioctl(fd, cmd as u32, data)
+            }
 
             SYS_FORK => Self::fork(frame),
             SYS_VFORK => Self::vfork(frame),

+ 190 - 0
kernel/src/virt/kvm/host_mem.rs

@@ -0,0 +1,190 @@
+use super::{vcpu::Vcpu, vm};
+use crate::{
+    kdebug,
+    mm::{kernel_mapper::KernelMapper, page::PageFlags, VirtAddr},
+    syscall::SystemError,
+};
+
+/*
+ * Address types:
+ *
+ *  gva - guest virtual address
+ *  gpa - guest physical address
+ *  gfn - guest frame number
+ *  hva - host virtual address
+ *  hpa - host physical address
+ *  hfn - host frame number
+ */
+// Slot ids below KVM_USER_MEM_SLOTS are treated as user slots by the overlap check
+// in Vm::set_user_memory_region; ids at or above it are skipped there.
+pub const KVM_USER_MEM_SLOTS: u32 = 16;
+pub const KVM_PRIVATE_MEM_SLOTS: u32 = 3;
+// Total number of slots per address space (user + private).
+pub const KVM_MEM_SLOTS_NUM: u32 = KVM_USER_MEM_SLOTS + KVM_PRIVATE_MEM_SLOTS;
+// Number of address spaces a VM holds a memslot table for.
+pub const KVM_ADDRESS_SPACE_NUM: usize = 2;
+
+// Flag bits of a memory region / memory slot.
+pub const KVM_MEM_LOG_DIRTY_PAGES: u32 = 1 << 0;
+pub const KVM_MEM_READONLY: u32 = 1 << 1;
+// Largest number of pages a single slot may cover.
+pub const KVM_MEM_MAX_NR_PAGES: u32 = (1 << 31) - 1;
+
+/*
+ * Bits 16 ~ 31 of kvm_memory_region::flags are used internally by
+ * kvm; the other bits are visible to userspace and are defined in
+ * include/linux/kvm.h.
+ */
+pub const KVM_MEMSLOT_INVALID: u32 = 1 << 16;
+// pub const  KVM_MEMSLOT_INCOHERENT:u32 = 1 << 17;
+
+// pub const KVM_PERMILLE_MMU_PAGES: u32 = 20; //  the proportion of MMU pages required per thousand (out of 1000) memory pages.
+// pub const KVM_MIN_ALLOC_MMU_PAGES: u32 = 64;
+
+// Page geometry used for gfn <-> address conversions in this module: 4 KiB pages.
+pub const PAGE_SHIFT: u32 = 12;
+pub const PAGE_SIZE: u32 = 1 << PAGE_SHIFT;
+// Clears the in-page offset bits of a (32-bit) address.
+pub const PAGE_MASK: u32 = !(PAGE_SIZE - 1);
+
+#[repr(C)]
+/// Maps a range of guest physical addresses onto virtual addresses of the user process.
+/// Describes one range of the virtual machine's physical memory.
+pub struct KvmUserspaceMemoryRegion {
+    pub slot: u32, // slot on which the memory range is registered
+    // flags has two defined values, KVM_MEM_LOG_DIRTY_PAGES and KVM_MEM_READONLY, telling kvm how to treat this range.
+    // KVM_MEM_LOG_DIRTY_PAGES enables dirty-page logging, KVM_MEM_READONLY makes the memory read-only.
+    pub flags: u32,
+    pub guest_phys_addr: u64, // guest physical start address of the range
+    pub memory_size: u64,     // size of the guest memory range
+    pub userspace_addr: u64,  // host virtual address backing the range
+}
+
+/// One registered range of guest memory. A slot with `npages == 0` is unused.
+#[derive(Default, Clone, Copy, Debug)]
+pub struct KvmMemorySlot {
+    pub base_gfn: u64,       // first guest frame number of the range
+    pub npages: u64,         // number of pages in the range, i.e. its size
+    pub userspace_addr: u64, // host virtual address backing the range
+    pub flags: u32,          // attributes of the range
+    pub id: u16,             // id of the slot
+                             // Records dirty-page information for the range: one bit per page, 1 = dirty, 0 = clean.
+                             // pub dirty_bitmap: *mut u8,
+                             // unsigned long *rmap[KVM_NR_PAGE_SIZES]; reverse-mapping structure that records the page-table-entry address for a GPA when the EPT entry is created (GPA --> entry address); not needed for now
+}
+
+/// The memslot table of one address space.
+#[derive(Default, Clone, Copy, Debug)]
+pub struct KvmMemorySlots {
+    pub memslots: [KvmMemorySlot; KVM_MEM_SLOTS_NUM as usize], // array of guest memory ranges
+    pub used_slots: u32,                                       // number of slots already in use
+}
+
+/// Kind of modification a memory-region request makes to a slot
+/// (determined in `Vm::set_user_memory_region`).
+#[derive(PartialEq, Eq, Debug)]
+pub enum KvmMemoryChange {
+    // the slot was empty and receives a range
+    Create,
+    // the request has size 0 and the slot holds a range
+    Delete,
+    // the guest base frame differs from the existing slot
+    Move,
+    // only the flags differ from the existing slot
+    FlagsOnly,
+}
+
+impl Default for KvmUserspaceMemoryRegion {
+    /// Returns a region with every field set to zero.
+    fn default() -> Self {
+        Self {
+            slot: u32::default(),
+            flags: u32::default(),
+            guest_phys_addr: u64::default(),
+            memory_size: u64::default(),
+            userspace_addr: u64::default(),
+        }
+    }
+}
+
+/// Returns a copy of the memslot table of address space 0 of VM 0.
+///
+/// The vcpu argument is currently ignored.
+///
+/// # Panics
+/// Panics if VM 0 is not present in the VM list.
+pub fn kvm_vcpu_memslots(_vcpu: &mut dyn Vcpu) -> KvmMemorySlots {
+    const AS_ID: usize = 0;
+    let current_vm = vm(0).unwrap();
+    current_vm.memslots[AS_ID]
+}
+
+/// Finds the memory slot whose guest frame range contains `gfn`.
+///
+/// Slots are stored at the array index equal to their id, so the occupied entries are
+/// not necessarily the first `used_slots` ones; the whole table is therefore scanned.
+/// Unused entries have `npages == 0` and can never match.
+///
+/// Returns a copy of the matching slot, or `None` when no slot covers `gfn`.
+fn __gfn_to_memslot(slots: KvmMemorySlots, gfn: u64) -> Option<KvmMemorySlot> {
+    kdebug!("__gfn_to_memslot");
+    // TODO: optimise with a binary search
+    slots
+        .memslots
+        .iter()
+        .copied()
+        // `gfn - base_gfn < npages` is the overflow-free form of `gfn < base_gfn + npages`
+        .find(|memslot| gfn >= memslot.base_gfn && gfn - memslot.base_gfn < memslot.npages)
+}
+
+/// Translates `gfn` into the host virtual address backing it inside `slot`.
+///
+/// The caller must make sure `gfn` is not below `slot.base_gfn`.
+fn __gfn_to_hva(slot: KvmMemorySlot, gfn: u64) -> u64 {
+    let page_index = gfn - slot.base_gfn;
+    let byte_offset = page_index * (PAGE_SIZE as u64);
+    slot.userspace_addr + byte_offset
+}
+/// Translates `gfn` to a host virtual address and optionally reports how many pages
+/// remain in the slot starting at `gfn`.
+///
+/// # Errors
+/// Returns `SystemError::KVM_HVA_ERR_BAD` when `slot` is `None`, when the slot is marked
+/// invalid, or when a write is requested on a read-only slot.
+fn __gfn_to_hva_many(
+    slot: Option<KvmMemorySlot>,
+    gfn: u64,
+    nr_pages: Option<&mut u64>,
+    write: bool,
+) -> Result<u64, SystemError> {
+    kdebug!("__gfn_to_hva_many");
+    let slot = match slot {
+        Some(found) => found,
+        None => return Err(SystemError::KVM_HVA_ERR_BAD),
+    };
+    let is_invalid = slot.flags & KVM_MEMSLOT_INVALID != 0;
+    let is_readonly = slot.flags & KVM_MEM_READONLY != 0;
+    if is_invalid || (is_readonly && write) {
+        return Err(SystemError::KVM_HVA_ERR_BAD);
+    }
+
+    if let Some(remaining) = nr_pages {
+        // pages from `gfn` up to the end of the slot
+        *remaining = slot.npages - (gfn - slot.base_gfn);
+    }
+    Ok(__gfn_to_hva(slot, gfn))
+}
+
+/* From Linux kernel
+ * Pin guest page in memory and return its pfn.
+ * @addr: host virtual address which maps memory to the guest
+ * @atomic: whether this function can sleep
+ * @async: whether this function need to wait IO complete if the
+ *         host page is not in the memory
+ * @write_fault: whether we should get a writable host page
+ * @writable: whether it allows to map a writable host page for !@write_fault
+ *
+ * The function will map a writable host page for these two cases:
+ * 1): @write_fault = true
+ * 2): @write_fault = false && @writable, @writable will tell the caller
+ *     whether the mapping is writable.
+ */
+/// Translates the host virtual address `addr` into a host page frame number.
+///
+/// The address is looked up through the kernel mapper. When no translation exists yet,
+/// a page is mapped at `addr` with `PageFlags::mmio_flags()` and the lookup is repeated.
+/// `_atomic` and `_writable` are currently unused. (Linux splits this work into
+/// hva_to_pfn_fast and hva_to_pfn_slow; the correctness of this version is still to be
+/// verified.)
+///
+/// # Errors
+/// Returns `SystemError::KVM_HVA_ERR_BAD` if `addr` still has no translation after the
+/// mapping attempt.
+///
+/// # Panics
+/// Panics if `KernelMapper::as_mut` yields no mapper.
+fn hva_to_pfn(addr: u64, _atomic: bool, _writable: &mut bool) -> Result<u64, SystemError> {
+    kdebug!("hva_to_pfn");
+    // `addr` is deliberately NOT dereferenced for logging: it may not be mapped yet
+    // (that case is handled below), and reading it here could fault.
+    let hva = VirtAddr::new(addr as usize);
+    let mut mapper = KernelMapper::lock();
+    let mapper = mapper.as_mut().unwrap();
+    if let Some((hpa, _)) = mapper.translate(hva) {
+        return Ok(hpa.data() as u64 >> PAGE_SHIFT);
+    }
+    // NOTE(review): the result of `map` is ignored, as before; a failed mapping is
+    // detected by the second translation below.
+    unsafe {
+        mapper.map(hva, PageFlags::mmio_flags());
+    }
+    let (hpa, _) = mapper
+        .translate(hva)
+        .ok_or(SystemError::KVM_HVA_ERR_BAD)?;
+    Ok(hpa.data() as u64 >> PAGE_SHIFT)
+}
+
+/// Resolves a guest frame number to a host page frame number via the given slot.
+///
+/// First converts `gfn` to a host virtual address (checking slot validity and write
+/// permission), then translates that address with `hva_to_pfn`.
+///
+/// # Errors
+/// Propagates the errors of `__gfn_to_hva_many` and `hva_to_pfn`.
+pub fn __gfn_to_pfn(
+    slot: Option<KvmMemorySlot>,
+    gfn: u64,
+    atomic: bool,
+    write: bool,
+    writable: &mut bool,
+) -> Result<u64, SystemError> {
+    kdebug!("__gfn_to_pfn");
+    // filled in by __gfn_to_hva_many but not used further here
+    let mut pages_left = 0;
+    let hva = __gfn_to_hva_many(slot, gfn, Some(&mut pages_left), write)?;
+    hva_to_pfn(hva, atomic, writable).map(|pfn| {
+        kdebug!("hva={}, pfn={}", hva, pfn);
+        pfn
+    })
+}
+
+/// Looks up the memory slot containing `gfn` in the memslot table used by `vcpu`.
+pub fn kvm_vcpu_gfn_to_memslot(vcpu: &mut dyn Vcpu, gfn: u64) -> Option<KvmMemorySlot> {
+    let slots = kvm_vcpu_memslots(vcpu);
+    __gfn_to_memslot(slots, gfn)
+}

+ 2 - 0
kernel/src/virt/kvm/kvm.h

@@ -0,0 +1,2 @@
+// ================= Implemented in Rust =============
+// `(void)` makes this a real prototype; an empty `()` leaves the parameters unspecified in C.
+extern void kvm_init(void);

+ 188 - 0
kernel/src/virt/kvm/kvm_dev.rs

@@ -0,0 +1,188 @@
+use crate::filesystem::devfs::{DevFS, DeviceINode};
+use crate::filesystem::vfs::{
+    core::generate_inode_id,
+    file::{File, FileMode},
+    make_rawdev, FilePrivateData, FileSystem, FileType, IndexNode, Metadata, PollStatus,
+};
+use crate::process::ProcessManager;
+use crate::{arch::KVMArch, libs::spinlock::SpinLock, syscall::SystemError, time::TimeSpec};
+use crate::{filesystem, kdebug};
+// use crate::virt::kvm::{host_stack};
+use super::push_vm;
+use crate::virt::kvm::vm_dev::LockedVmInode;
+use alloc::{
+    string::String,
+    sync::{Arc, Weak},
+    vec::Vec,
+};
+
+pub const KVM_API_VERSION: u32 = 12;
+
+// use crate::virt::kvm::kvm_dev_ioctl_create_vm;
+/*
+ * ioctls for /dev/kvm fds:
+ */
+// Answered with KVM_API_VERSION.
+pub const KVM_GET_API_VERSION: u32 = 0x00;
+// Handled by kvm_dev_ioctl_create_vm.
+pub const KVM_CREATE_VM: u32 = 0x01;
+// The remaining commands are recognised but answered with EOPNOTSUPP_OR_ENOTSUP.
+pub const KVM_CHECK_EXTENSION: u32 = 0x03;
+pub const KVM_GET_VCPU_MMAP_SIZE: u32 = 0x04; // Get size for mmap(vcpu_fd) in bytes
+pub const KVM_TRACE_ENABLE: u32 = 0x05;
+pub const KVM_TRACE_PAUSE: u32 = 0x06;
+pub const KVM_TRACE_DISABLE: u32 = 0x07;
+
+/// Inode state of the kvm device node.
+#[derive(Debug)]
+pub struct KvmInode {
+    /// uuid: purpose not yet known
+    // uuid: Uuid,
+    /// weak reference to itself
+    self_ref: Weak<LockedKvmInode>,
+    /// pointer to the filesystem object this inode lives in
+    fs: Weak<DevFS>,
+    /// inode metadata
+    metadata: Metadata,
+}
+
+/// `KvmInode` protected by a spinlock; this is the type that implements the inode traits.
+#[derive(Debug)]
+pub struct LockedKvmInode(SpinLock<KvmInode>);
+
+impl LockedKvmInode {
+    /// Builds the kvm device inode and sets its `self_ref` to point back at the
+    /// returned `Arc`. The filesystem reference stays empty until `set_fs` is called.
+    pub fn new() -> Arc<Self> {
+        let metadata = Metadata {
+            dev_id: 1,
+            inode_id: generate_inode_id(),
+            size: 0,
+            blk_size: 0,
+            blocks: 0,
+            atime: TimeSpec::default(),
+            mtime: TimeSpec::default(),
+            ctime: TimeSpec::default(),
+            file_type: FileType::KvmDevice,
+            mode: filesystem::vfs::syscall::ModeType::S_IALLUGO,
+            nlinks: 1,
+            uid: 0,
+            gid: 0,
+            raw_dev: make_rawdev(1, 4), // used as the device number
+        };
+
+        let locked = Arc::new(Self(SpinLock::new(KvmInode {
+            self_ref: Weak::default(),
+            fs: Weak::default(),
+            metadata,
+        })));
+        // the weak self reference can only be filled in once the Arc exists
+        locked.0.lock().self_ref = Arc::downgrade(&locked);
+
+        locked
+    }
+}
+
+impl DeviceINode for LockedKvmInode {
+    /// Records the devfs instance this inode belongs to.
+    fn set_fs(&self, fs: Weak<DevFS>) {
+        let mut inode = self.0.lock();
+        inode.fs = fs;
+    }
+}
+
+/// Inode operations of the kvm device node. Only `ioctl` does real work; reading,
+/// writing and listing are rejected with `EOPNOTSUPP_OR_ENOTSUP`.
+impl IndexNode for LockedKvmInode {
+    fn as_any_ref(&self) -> &dyn core::any::Any {
+        self
+    }
+
+    /// Opening only logs the file private data and always succeeds.
+    fn open(&self, _data: &mut FilePrivateData, _mode: &FileMode) -> Result<(), SystemError> {
+        kdebug!("file private data:{:?}", _data);
+        return Ok(());
+    }
+
+    /// Closing needs no cleanup and always succeeds.
+    fn close(&self, _data: &mut FilePrivateData) -> Result<(), SystemError> {
+        return Ok(());
+    }
+
+    /// Returns a copy of the inode metadata.
+    fn metadata(&self) -> Result<Metadata, SystemError> {
+        return Ok(self.0.lock().metadata.clone());
+    }
+
+    /// Returns the owning filesystem. Panics if the weak reference set via `set_fs`
+    /// cannot be upgraded (never set, or the filesystem is gone).
+    fn fs(&self) -> Arc<dyn FileSystem> {
+        return self.0.lock().fs.upgrade().unwrap();
+    }
+
+    fn list(&self) -> Result<Vec<String>, SystemError> {
+        Err(SystemError::EOPNOTSUPP_OR_ENOTSUP)
+    }
+
+    /// Copies only the timestamps, mode, uid and gid from `metadata`; all other
+    /// fields keep their current values.
+    fn set_metadata(&self, metadata: &Metadata) -> Result<(), SystemError> {
+        let mut inode = self.0.lock();
+        inode.metadata.atime = metadata.atime;
+        inode.metadata.mtime = metadata.mtime;
+        inode.metadata.ctime = metadata.ctime;
+        inode.metadata.mode = metadata.mode;
+        inode.metadata.uid = metadata.uid;
+        inode.metadata.gid = metadata.gid;
+
+        return Ok(());
+    }
+
+    /// Always reports the device as both readable and writable.
+    fn poll(&self) -> Result<PollStatus, SystemError> {
+        return Ok(PollStatus::READ | PollStatus::WRITE);
+    }
+
+    /// @brief io control interface
+    ///
+    /// @param cmd command
+    /// @param data data
+    ///
+    /// @return on success: Ok()
+    ///         on failure: Err(error code)
+    fn ioctl(&self, cmd: u32, data: usize) -> Result<usize, SystemError> {
+        match cmd {
+            // debugging command: only logs
+            0xdeadbeef => {
+                kdebug!("kvm ioctl");
+                Ok(0)
+            }
+            KVM_GET_API_VERSION => Ok(KVM_API_VERSION as usize),
+            KVM_CREATE_VM => {
+                kdebug!("kvm KVM_CREATE_VM");
+                kvm_dev_ioctl_create_vm(data)
+            }
+            // known commands that are not implemented yet
+            KVM_CHECK_EXTENSION
+            | KVM_GET_VCPU_MMAP_SIZE
+            | KVM_TRACE_ENABLE
+            | KVM_TRACE_PAUSE
+            | KVM_TRACE_DISABLE => Err(SystemError::EOPNOTSUPP_OR_ENOTSUP),
+            // everything else is forwarded to the architecture-specific handler
+            _ => KVMArch::kvm_arch_dev_ioctl(cmd, data),
+        }
+    }
+    /// Read from the device: device-specific functions should be used for I/O, not the filesystem interface
+    fn read_at(
+        &self,
+        _offset: usize,
+        _len: usize,
+        _buf: &mut [u8],
+        _data: &mut FilePrivateData,
+    ) -> Result<usize, SystemError> {
+        Err(SystemError::EOPNOTSUPP_OR_ENOTSUP)
+    }
+
+    /// Write to the device: device-specific functions should be used for I/O, not the filesystem interface
+    fn write_at(
+        &self,
+        _offset: usize,
+        _len: usize,
+        _buf: &[u8],
+        _data: &mut FilePrivateData,
+    ) -> Result<usize, SystemError> {
+        Err(SystemError::EOPNOTSUPP_OR_ENOTSUP)
+    }
+}
+
+/// Handles `KVM_CREATE_VM`: registers VM 0 and returns a new file descriptor that
+/// refers to it. `_vmtype` is currently ignored.
+///
+/// # Errors
+/// * `SystemError::EEXIST` if VM 0 has already been created.
+/// * Any error reported while creating the file or allocating the descriptor.
+#[no_mangle]
+pub fn kvm_dev_ioctl_create_vm(_vmtype: usize) -> Result<usize, SystemError> {
+    // Only a single VM (id 0) is supported. A repeated KVM_CREATE_VM from user space
+    // must fail with an error code instead of panicking the kernel.
+    push_vm(0).map_err(|_| SystemError::EEXIST)?;
+
+    // create the vm file and return its file descriptor
+    let vm_inode = LockedVmInode::new();
+    let file: File = File::new(vm_inode, FileMode::O_RDWR)?;
+    ProcessManager::current_pcb()
+        .fd_table()
+        .write()
+        .alloc_fd(file, None)
+        .map(|fd| fd as usize)
+}

+ 85 - 0
kernel/src/virt/kvm/mod.rs

@@ -0,0 +1,85 @@
+use self::kvm_dev::LockedKvmInode;
+use crate::arch::KVMArch;
+use crate::filesystem::devfs::devfs_register;
+use crate::kdebug;
+use crate::libs::mutex::Mutex;
+use alloc::vec::Vec;
+use vm::Vm;
+
+pub mod host_mem;
+mod kvm_dev;
+pub mod vcpu;
+mod vcpu_dev;
+pub mod vm;
+mod vm_dev;
+
+// pub const KVM_MAX_VCPUS:u32 = 255;
+// pub const GUEST_STACK_SIZE:usize = 1024;
+// pub const HOST_STACK_SIZE:usize = 0x1000 * 6;
+
+/// @brief The global VM list, protected by a mutex
+pub static VM_LIST: Mutex<Vec<Vm>> = Mutex::new(Vec::new());
+
+/// Creates VM `id` and appends it to `VM_LIST`.
+///
+/// Returns `Err(())` (after logging) when a VM with that id is already registered.
+pub fn push_vm(id: usize) -> Result<(), ()> {
+    let mut vm_list = VM_LIST.lock();
+    let already_present = vm_list.iter().any(|x| x.id == id);
+    if already_present {
+        kdebug!("push_vm: vm {} already exists", id);
+        return Err(());
+    }
+    vm_list.push(Vm::new(id).unwrap());
+    Ok(())
+}
+
+/// Removes VM `id` from `VM_LIST` and returns it.
+///
+/// # Panics
+/// Panics if no VM with that id is registered.
+pub fn remove_vm(id: usize) -> Vm {
+    let mut vm_list = VM_LIST.lock();
+    if let Some(idx) = vm_list.iter().position(|x| x.id == id) {
+        return vm_list.remove(idx);
+    }
+    panic!("VM[{}] not exist in VM LIST", id);
+}
+
+/// Replaces the registered VM `id` with `new_vm` (the new entry is appended at the end
+/// of the list).
+///
+/// Removal and insertion happen under a single lock acquisition, so no other thread
+/// can observe the list while the VM is missing from it.
+///
+/// # Panics
+/// Panics if no VM with that id is registered.
+pub fn update_vm(id: usize, new_vm: Vm) {
+    let mut vm_list = VM_LIST.lock();
+    let idx = vm_list
+        .iter()
+        .position(|x| x.id == id)
+        .unwrap_or_else(|| panic!("VM[{}] not exist in VM LIST", id));
+    let old_vm = vm_list.remove(idx);
+    vm_list.push(new_vm);
+    // release the lock before the old VM is dropped, as the previous version did
+    drop(vm_list);
+    drop(old_vm);
+}
+
+/// Returns a clone of the registered VM `id`, or `None` when it does not exist.
+///
+/// The result is a snapshot: changes to it are not visible in `VM_LIST` unless it is
+/// written back.
+pub fn vm(id: usize) -> Option<Vm> {
+    let vm_list = VM_LIST.lock();
+    for candidate in vm_list.iter() {
+        if candidate.id == id {
+            return Some(candidate.clone());
+        }
+    }
+    None
+}
+
+/// Initialises kvm support; called once from the C boot path (`system_initialize`).
+///
+/// When the CPU reports no VMX support, this logs the reason and returns without
+/// running the arch initialisation and without registering the kvm device, so that
+/// nothing later tries to use virtualization that the CPU cannot provide.
+///
+/// # Panics
+/// Panics if the arch initialisation or the devfs registration fails.
+#[no_mangle]
+pub extern "C" fn kvm_init() {
+    kdebug!("kvm init");
+
+    if let Err(e) = KVMArch::kvm_arch_cpu_supports_vm() {
+        kdebug!("[-] CPU does not support Intel VMX: {:?}", e);
+        return;
+    }
+    kdebug!("[+] CPU supports Intel VMX");
+
+    KVMArch::kvm_arch_init().expect("kvm arch init");
+
+    devfs_register("kvm", LockedKvmInode::new()).expect("Failed to register /dev/kvm");
+}

+ 9 - 0
kernel/src/virt/kvm/vcpu.rs

@@ -0,0 +1,9 @@
+use crate::syscall::SystemError;
+
+/// Operations every virtual CPU implementation provides.
+pub trait Vcpu: Send + Sync {
+    /// Virtualize the CPU
+    fn virtualize_cpu(&mut self) -> Result<(), SystemError>;
+    /// NOTE(review): presumably undoes `virtualize_cpu`; confirm against the implementation
+    fn devirtualize_cpu(&self) -> Result<(), SystemError>;
+    /// Gets the index of the current logical/virtual processor
+    fn id(&self) -> u32;
+}

+ 212 - 0
kernel/src/virt/kvm/vcpu_dev.rs

@@ -0,0 +1,212 @@
+use crate::arch::kvm::vmx::vcpu::VcpuContextFrame;
+use crate::arch::KVMArch;
+use crate::filesystem::devfs::DevFS;
+use crate::filesystem::vfs::{
+    core::generate_inode_id, file::FileMode, make_rawdev, FilePrivateData, FileSystem, FileType,
+    IndexNode, Metadata, PollStatus,
+};
+use crate::mm::VirtAddr;
+use crate::syscall::user_access::copy_from_user;
+use crate::virt::kvm::vcpu::Vcpu;
+use crate::virt::kvm::vm;
+use crate::{filesystem, kdebug};
+use crate::{libs::spinlock::SpinLock, syscall::SystemError, time::TimeSpec};
+use alloc::{
+    string::String,
+    sync::{Arc, Weak},
+    vec::Vec,
+};
+
+// pub const KVM_API_VERSION:u32 = 12;
+// ioctl commands accepted on a vcpu fd.
+// KVM_RUN: virtualize vcpu 0 of VM 0 and run it.
+pub const KVM_RUN: u32 = 0x00;
+// pub const KVM_GET_REGS: u32 = 0x01;
+// KVM_SET_REGS: copy a VcpuContextFrame from user space into vcpu 0 of VM 0.
+pub const KVM_SET_REGS: u32 = 0x02;
+
+// pub const GUEST_STACK_SIZE:usize = 1024;
+// pub const HOST_STACK_SIZE:usize = 0x1000 * 6;
+
+/*
+ * ioctls for /dev/vm fds:
+ */
+// pub const KVM_CREATE_VCPU: u32 = 0x00;
+// pub const KVM_SET_USER_MEMORY_REGION: u32 = 0x01;
+// pub const KVM_GET_DIRTY_LOG: u32 = 0x02;
+// pub const KVM_IRQFD: u32 = 0x03;
+// pub const KVM_IOEVENTFD: u32 = 0x04;
+// pub const KVM_IRQ_LINE_STATUS: u32 = 0x05;
+
+//  #[derive(Debug)]
+//  pub struct InodeInfo {
+//     kvm: Arc<Hypervisor>,
+//  }
+
+/// Inode state of a vcpu device node.
+#[derive(Debug)]
+pub struct VcpuInode {
+    /// uuid: purpose not yet known
+    // uuid: Uuid,
+    /// weak reference to itself
+    self_ref: Weak<LockedVcpuInode>,
+    /// pointer to the filesystem object this inode lives in
+    fs: Weak<DevFS>,
+    /// inode metadata
+    metadata: Metadata,
+    // fdata: InodeInfo,
+}
+
+/// `VcpuInode` protected by a spinlock; this is the type that implements `IndexNode`.
+#[derive(Debug)]
+pub struct LockedVcpuInode(SpinLock<VcpuInode>);
+
+impl LockedVcpuInode {
+    /// Builds a vcpu device inode and sets its `self_ref` to point back at the
+    /// returned `Arc`. The filesystem reference is left empty.
+    pub fn new() -> Arc<Self> {
+        let metadata = Metadata {
+            dev_id: 1,
+            inode_id: generate_inode_id(),
+            size: 0,
+            blk_size: 0,
+            blocks: 0,
+            atime: TimeSpec::default(),
+            mtime: TimeSpec::default(),
+            ctime: TimeSpec::default(),
+            file_type: FileType::KvmDevice,
+            mode: filesystem::vfs::syscall::ModeType::S_IALLUGO,
+            nlinks: 1,
+            uid: 0,
+            gid: 0,
+            raw_dev: make_rawdev(1, 4), // used as the device number
+        };
+
+        let locked = Arc::new(Self(SpinLock::new(VcpuInode {
+            self_ref: Weak::default(),
+            fs: Weak::default(),
+            metadata,
+        })));
+        // the weak self reference can only be filled in once the Arc exists
+        locked.0.lock().self_ref = Arc::downgrade(&locked);
+
+        locked
+    }
+}
+
+/// Inode operations of a vcpu device node. Only `ioctl` does real work; reading,
+/// writing and listing are rejected with `EOPNOTSUPP_OR_ENOTSUP`.
+impl IndexNode for LockedVcpuInode {
+    fn as_any_ref(&self) -> &dyn core::any::Any {
+        self
+    }
+
+    /// Opening only logs the file private data and always succeeds.
+    fn open(&self, _data: &mut FilePrivateData, _mode: &FileMode) -> Result<(), SystemError> {
+        kdebug!("file private data:{:?}", _data);
+        Ok(())
+    }
+
+    /// Closing needs no cleanup and always succeeds.
+    fn close(&self, _data: &mut FilePrivateData) -> Result<(), SystemError> {
+        Ok(())
+    }
+
+    /// Returns a copy of the inode metadata.
+    fn metadata(&self) -> Result<Metadata, SystemError> {
+        Ok(self.0.lock().metadata.clone())
+    }
+
+    /// Returns the owning filesystem. Panics if the weak filesystem reference cannot
+    /// be upgraded.
+    fn fs(&self) -> Arc<dyn FileSystem> {
+        self.0.lock().fs.upgrade().unwrap()
+    }
+
+    fn list(&self) -> Result<Vec<String>, SystemError> {
+        Err(SystemError::EOPNOTSUPP_OR_ENOTSUP)
+    }
+
+    /// Copies only the timestamps, mode, uid and gid from `metadata`.
+    fn set_metadata(&self, metadata: &Metadata) -> Result<(), SystemError> {
+        let mut inode = self.0.lock();
+        inode.metadata.atime = metadata.atime;
+        inode.metadata.mtime = metadata.mtime;
+        inode.metadata.ctime = metadata.ctime;
+        inode.metadata.mode = metadata.mode;
+        inode.metadata.uid = metadata.uid;
+        inode.metadata.gid = metadata.gid;
+
+        Ok(())
+    }
+
+    /// Always reports the device as both readable and writable.
+    fn poll(&self) -> Result<PollStatus, SystemError> {
+        Ok(PollStatus::READ | PollStatus::WRITE)
+    }
+
+    /// @brief io control interface
+    ///
+    /// @param cmd command
+    /// @param data data (for KVM_SET_REGS: user address of a VcpuContextFrame)
+    ///
+    /// @return on success: Ok()
+    ///         on failure: Err(error code)
+    ///         - EINVAL when VM 0 or its first vcpu does not exist
+    ///         - EOPNOTSUPP_OR_ENOTSUP for an unknown command
+    fn ioctl(&self, cmd: u32, data: usize) -> Result<usize, SystemError> {
+        match cmd {
+            // debugging command: only logs
+            0xdeadbeef => {
+                kdebug!("kvm_cpu ioctl");
+                Ok(0)
+            }
+            KVM_RUN => {
+                kdebug!("kvm_cpu ioctl");
+                // The ioctl can arrive before a VM or vcpu exists: report an error
+                // to user space instead of panicking on unwrap / indexing.
+                let vcpu = vm(0)
+                    .and_then(|vm| vm.vcpu.first().cloned())
+                    .ok_or(SystemError::EINVAL)?;
+                vcpu.lock().virtualize_cpu()?;
+                KVMArch::kvm_arch_vcpu_ioctl_run(vcpu.as_ref())?;
+                Ok(0)
+            }
+            KVM_SET_REGS => {
+                let mut kvm_regs = VcpuContextFrame::default();
+                // View the register frame as raw bytes so it can be filled from user
+                // memory. NOTE(review): assumes every byte pattern is a valid
+                // VcpuContextFrame (plain integer fields); confirm at its definition.
+                unsafe {
+                    copy_from_user(
+                        core::slice::from_raw_parts_mut(
+                            (&mut kvm_regs as *mut _) as *mut u8,
+                            core::mem::size_of::<VcpuContextFrame>(),
+                        ),
+                        VirtAddr::new(data),
+                    )?;
+                }
+                kdebug!(
+                    "rip={:x}, rflags={:x}, rsp={:x}, rax={:x}",
+                    kvm_regs.rip,
+                    kvm_regs.rflags,
+                    kvm_regs.regs[6],
+                    kvm_regs.regs[0],
+                );
+
+                let vcpu = vm(0)
+                    .and_then(|vm| vm.vcpu.first().cloned())
+                    .ok_or(SystemError::EINVAL)?;
+                vcpu.lock().set_regs(kvm_regs)?;
+
+                Ok(0)
+            }
+            _ => {
+                kdebug!("kvm_cpu ioctl");
+                // An unknown command is an error, not a successful call returning usize::MAX.
+                Err(SystemError::EOPNOTSUPP_OR_ENOTSUP)
+            }
+        }
+    }
+    /// Read from the device: device-specific functions should be used for I/O, not the filesystem interface
+    fn read_at(
+        &self,
+        _offset: usize,
+        _len: usize,
+        _buf: &mut [u8],
+        _data: &mut FilePrivateData,
+    ) -> Result<usize, SystemError> {
+        Err(SystemError::EOPNOTSUPP_OR_ENOTSUP)
+    }
+
+    /// Write to the device: device-specific functions should be used for I/O, not the filesystem interface
+    fn write_at(
+        &self,
+        _offset: usize,
+        _len: usize,
+        _buf: &[u8],
+        _data: &mut FilePrivateData,
+    ) -> Result<usize, SystemError> {
+        Err(SystemError::EOPNOTSUPP_OR_ENOTSUP)
+    }
+}

+ 175 - 0
kernel/src/virt/kvm/vm.rs

@@ -0,0 +1,175 @@
+use crate::arch::kvm::vmx::vcpu::VmxVcpu;
+use crate::libs::mutex::Mutex;
+use crate::syscall::SystemError;
+use crate::{arch::KVMArch, kdebug};
+use alloc::sync::Arc;
+use alloc::vec::Vec;
+
+// use super::HOST_STACK_SIZE;
+use super::host_mem::{
+    KvmMemoryChange, KvmMemorySlot, KvmMemorySlots, KvmUserspaceMemoryRegion,
+    KVM_ADDRESS_SPACE_NUM, KVM_MEM_LOG_DIRTY_PAGES, KVM_MEM_MAX_NR_PAGES, KVM_MEM_READONLY,
+    KVM_MEM_SLOTS_NUM, KVM_USER_MEM_SLOTS, PAGE_SHIFT,
+};
+use crate::arch::kvm::vmx::vmcs::PAGE_SIZE;
+// use crate::kdebug;
+
+/// State of one virtual machine. Cloning copies the memslot tables but shares the
+/// vcpus, which are held behind `Arc`.
+#[derive(Debug, Clone)]
+pub struct Vm {
+    pub id: usize,
+    // vcpu config
+    pub nr_vcpus: u32, /* Number of cpus to run */
+    pub vcpu: Vec<Arc<Mutex<VmxVcpu>>>,
+    // memory config
+    pub nr_mem_slots: u32, /* Number of memory slots in each address space */
+    // one memslot table per address space
+    pub memslots: [KvmMemorySlots; KVM_ADDRESS_SPACE_NUM],
+    // arch related config
+    pub arch: KVMArch,
+}
+
+impl Vm {
+    /// Creates an empty VM with the given id: no vcpus, every memory slot unused and
+    /// the architecture state set to its default. Currently always returns `Ok`.
+    pub fn new(id: usize) -> Result<Self, SystemError> {
+        Ok(Self {
+            id,
+            nr_vcpus: 0,
+            vcpu: Vec::new(),
+            nr_mem_slots: KVM_MEM_SLOTS_NUM,
+            memslots: [KvmMemorySlots::default(); KVM_ADDRESS_SPACE_NUM],
+            arch: Default::default(),
+        })
+    }
+
+    /// Registers a range of user memory as guest physical memory.
+    ///
+    /// The low 16 bits of `mem.slot` select the slot, the high 16 bits the address
+    /// space. The request is classified as create / delete / move / flags-only by
+    /// comparing it with the slot's current content; only creation is applied so far,
+    /// the other kinds are validated and then accepted without any effect.
+    ///
+    /// # Errors
+    /// * `EINVAL`: slot or address space out of range, unsupported flags, size or
+    ///   guest address not page aligned, too many pages, an attempt to change the
+    ///   size, host address or read-only attribute of an existing slot, or deletion
+    ///   of an empty slot.
+    /// * `EEXIST`: the new guest range overlaps another user slot.
+    pub fn set_user_memory_region(
+        &mut self,
+        mem: &KvmUserspaceMemoryRegion,
+    ) -> Result<(), SystemError> {
+        kdebug!("set_user_memory_region");
+        let id: u16 = mem.slot as u16; // slot id
+        let as_id = mem.slot >> 16; // address space id
+        kdebug!("id={}, as_id={}", id, as_id);
+
+        // the slot number must be valid
+        // NOTE(review): this compares the whole `mem.slot`, so any non-zero address
+        // space id is rejected here; kept unchanged.
+        if mem.slot as usize >= self.nr_mem_slots as usize {
+            return Err(SystemError::EINVAL);
+        }
+        // the flags must be valid
+        self.check_memory_region_flag(mem)?;
+        // memory size and guest address must be page aligned
+        if (mem.memory_size & (PAGE_SIZE - 1) as u64) != 0
+            || (mem.guest_phys_addr & (PAGE_SIZE - 1) as u64) != 0
+        {
+            return Err(SystemError::EINVAL);
+        }
+        // address space and slot id must be in range
+        if as_id >= (KVM_ADDRESS_SPACE_NUM as u32) || id >= KVM_MEM_SLOTS_NUM as u16 {
+            return Err(SystemError::EINVAL);
+        }
+
+        // current content of the slot (copied, so `self` stays free for later updates)
+        let old_slot = self.memslots[as_id as usize].memslots[id as usize];
+        let base_gfn = mem.guest_phys_addr >> PAGE_SHIFT;
+        let npages = mem.memory_size >> PAGE_SHIFT;
+        if npages > KVM_MEM_MAX_NR_PAGES as u64 {
+            return Err(SystemError::EINVAL);
+        }
+
+        let mut new_slot = KvmMemorySlot {
+            base_gfn, // first guest frame number of the range
+            npages,   // number of pages in the range
+            // dirty_bitmap: old_slot.dirty_bitmap,
+            userspace_addr: mem.userspace_addr, // host virtual address backing the range
+            flags: mem.flags,                   // attributes of the range
+            id,                                 // slot id
+        };
+
+        // classify the request
+        let change = if npages != 0 {
+            // the request has a size, so it is not a deletion
+            if old_slot.npages == 0 {
+                // the slot is empty: a new range is created
+                KvmMemoryChange::Create
+            } else {
+                // The slot is in use: only its flags or guest base may change. Size,
+                // host address and the read-only attribute must stay the same.
+                // The XOR is parenthesised on purpose: in Rust `&` binds tighter than
+                // `^`, and only a difference in the READONLY bit is forbidden.
+                if mem.userspace_addr != old_slot.userspace_addr
+                    || npages != old_slot.npages
+                    || ((new_slot.flags ^ old_slot.flags) & KVM_MEM_READONLY) != 0
+                {
+                    return Err(SystemError::EINVAL);
+                }
+                if new_slot.base_gfn != old_slot.base_gfn {
+                    // different guest address: the range is moved
+                    KvmMemoryChange::Move
+                } else if new_slot.flags != old_slot.flags {
+                    // different flags only
+                    KvmMemoryChange::FlagsOnly
+                } else {
+                    // nothing changes
+                    return Ok(());
+                }
+            }
+        } else {
+            if old_slot.npages == 0 {
+                // an empty slot cannot be deleted
+                return Err(SystemError::EINVAL);
+            }
+            // size 0 on a slot that is in use means deletion
+            new_slot.base_gfn = 0;
+            new_slot.flags = 0;
+            KvmMemoryChange::Delete
+        };
+
+        if change == KvmMemoryChange::Create || change == KvmMemoryChange::Move {
+            // the new guest range must not overlap any other user slot
+            let overlaps = self.memslots[as_id as usize]
+                .memslots
+                .iter()
+                .any(|memslot| {
+                    memslot.id != id
+                        && (memslot.id as u32) < KVM_USER_MEM_SLOTS
+                        && !(base_gfn + npages <= memslot.base_gfn
+                            || memslot.base_gfn + memslot.npages <= base_gfn)
+                });
+            if overlaps {
+                return Err(SystemError::EEXIST);
+            }
+        }
+
+        // TODO: allocate / clear the dirty bitmap according to KVM_MEM_LOG_DIRTY_PAGES
+        // once KvmMemorySlot has a dirty_bitmap field.
+
+        if change == KvmMemoryChange::Create {
+            let slots = &mut self.memslots[as_id as usize];
+            slots.memslots[id as usize] = new_slot;
+            slots.used_slots += 1;
+            // KVMArch::kvm_arch_create_memslot(&mut new_slot, npages);
+            // KVMArch::kvm_arch_commit_memory_region(mem, &new_slot, old_slot, change);
+        }
+        // TODO--KvmMemoryChange::Delete & Move
+        Ok(())
+    }
+
+    /// Checks that `mem.flags` contains no bits other than the supported ones
+    /// (currently only `KVM_MEM_LOG_DIRTY_PAGES`); returns `EINVAL` otherwise.
+    fn check_memory_region_flag(&self, mem: &KvmUserspaceMemoryRegion) -> Result<(), SystemError> {
+        let valid_flags = KVM_MEM_LOG_DIRTY_PAGES;
+        // any bit outside valid_flags is an error
+        if mem.flags & !valid_flags != 0 {
+            return Err(SystemError::EINVAL);
+        }
+        Ok(())
+    }
+}

+ 224 - 0
kernel/src/virt/kvm/vm_dev.rs

@@ -0,0 +1,224 @@
+use crate::filesystem::devfs::DevFS;
+use crate::filesystem::vfs::{
+    core::generate_inode_id,
+    file::{File, FileMode},
+    make_rawdev, FilePrivateData, FileSystem, FileType, IndexNode, Metadata, PollStatus,
+};
+use crate::mm::VirtAddr;
+use crate::process::ProcessManager;
+use crate::syscall::user_access::copy_from_user;
+use crate::virt::kvm::host_mem::KvmUserspaceMemoryRegion;
+use crate::virt::kvm::update_vm;
+use crate::virt::kvm::vcpu_dev::LockedVcpuInode;
+use crate::virt::kvm::vm;
+use crate::{arch::KVMArch, libs::spinlock::SpinLock, syscall::SystemError, time::TimeSpec};
+use crate::{filesystem, kdebug};
+use alloc::{
+    string::String,
+    sync::{Arc, Weak},
+    vec::Vec,
+};
+
+// pub const KVM_API_VERSION:u32 = 12;
+// pub const GUEST_STACK_SIZE:usize = 1024;
+// pub const HOST_STACK_SIZE:usize = 0x1000 * 6;
+
+/*
+ * ioctls for /dev/vm fds:
+ */
+pub const KVM_CREATE_VCPU: u32 = 0x00;
+pub const KVM_SET_USER_MEMORY_REGION: u32 = 0x01;
+pub const KVM_GET_DIRTY_LOG: u32 = 0x02;
+pub const KVM_IRQFD: u32 = 0x03;
+pub const KVM_IOEVENTFD: u32 = 0x04;
+pub const KVM_IRQ_LINE_STATUS: u32 = 0x05;
+
+//  #[derive(Debug)]
+//  pub struct InodeInfo {
+//     kvm: Arc<Hypervisor>,
+//  }
+
+/// Inode state for the VM device implemented in this file.
+#[derive(Debug)]
+pub struct VmInode {
+    // uuid: purpose not decided yet, so the field stays disabled.
+    // uuid: Uuid,
+    /// Weak reference back to the `LockedVmInode` that wraps this inode.
+    self_ref: Weak<LockedVmInode>,
+    /// Weak reference to the `DevFS` filesystem object this inode belongs to.
+    fs: Weak<DevFS>,
+    /// Inode metadata.
+    metadata: Metadata,
+    // fdata: InodeInfo,
+}
+
+/// A `VmInode` guarded by a `SpinLock`; this wrapper is the type that implements `IndexNode`.
+#[derive(Debug)]
+pub struct LockedVmInode(SpinLock<VmInode>);
+
+impl LockedVmInode {
+    /// Builds a VM device inode and returns it behind an `Arc`.
+    ///
+    /// The inode is created with empty `self_ref` and `fs` weak references;
+    /// `self_ref` is then pointed at the freshly created `Arc` before returning.
+    /// `fs` is left empty by this constructor.
+    pub fn new() -> Arc<Self> {
+        let metadata = Metadata {
+            dev_id: 1,
+            inode_id: generate_inode_id(),
+            size: 0,
+            blk_size: 0,
+            blocks: 0,
+            atime: TimeSpec::default(),
+            mtime: TimeSpec::default(),
+            ctime: TimeSpec::default(),
+            file_type: FileType::KvmDevice,
+            mode: filesystem::vfs::syscall::ModeType::S_IALLUGO,
+            nlinks: 1,
+            uid: 0,
+            gid: 0,
+            // Used as the device number.
+            raw_dev: make_rawdev(1, 4),
+        };
+
+        let locked = Arc::new(LockedVmInode(SpinLock::new(VmInode {
+            self_ref: Weak::default(),
+            fs: Weak::default(),
+            metadata,
+        })));
+        // The self reference can only be filled in once the Arc exists.
+        locked.0.lock().self_ref = Arc::downgrade(&locked);
+        locked
+    }
+}
+
+/// `IndexNode` implementation for the VM device inode: the device is driven
+/// through `ioctl`; `read_at`, `write_at` and `list` are not supported.
+impl IndexNode for LockedVmInode {
+    fn as_any_ref(&self) -> &dyn core::any::Any {
+        self
+    }
+
+    /// Opening the device only logs the file private data and succeeds.
+    fn open(&self, _data: &mut FilePrivateData, _mode: &FileMode) -> Result<(), SystemError> {
+        kdebug!("file private data:{:?}", _data);
+        return Ok(());
+    }
+
+    /// Closing the device is a no-op.
+    fn close(&self, _data: &mut FilePrivateData) -> Result<(), SystemError> {
+        return Ok(());
+    }
+
+    /// Returns a copy of the inode metadata.
+    fn metadata(&self) -> Result<Metadata, SystemError> {
+        return Ok(self.0.lock().metadata.clone());
+    }
+
+    /// Returns the filesystem this inode belongs to.
+    ///
+    /// NOTE(review): `LockedVmInode::new()` leaves `fs` as an empty `Weak`; unless it is
+    /// assigned somewhere outside this file, the `unwrap()` below panics.
+    fn fs(&self) -> Arc<dyn FileSystem> {
+        return self.0.lock().fs.upgrade().unwrap();
+    }
+
+    /// Listing is not supported on this device.
+    fn list(&self) -> Result<Vec<String>, SystemError> {
+        Err(SystemError::EOPNOTSUPP_OR_ENOTSUP)
+    }
+
+    /// Copies the timestamps, mode, uid and gid from `metadata`; all other
+    /// metadata fields are left unchanged.
+    fn set_metadata(&self, metadata: &Metadata) -> Result<(), SystemError> {
+        let mut inode = self.0.lock();
+        inode.metadata.atime = metadata.atime;
+        inode.metadata.mtime = metadata.mtime;
+        inode.metadata.ctime = metadata.ctime;
+        inode.metadata.mode = metadata.mode;
+        inode.metadata.uid = metadata.uid;
+        inode.metadata.gid = metadata.gid;
+
+        return Ok(());
+    }
+
+    /// Always reports the device as readable and writable.
+    fn poll(&self) -> Result<PollStatus, SystemError> {
+        return Ok(PollStatus::READ | PollStatus::WRITE);
+    }
+
+    /// @brief io control interface
+    ///
+    /// @param cmd command; `KVM_CREATE_VCPU` and `KVM_SET_USER_MEMORY_REGION` are handled,
+    ///            `KVM_GET_DIRTY_LOG`, `KVM_IRQFD`, `KVM_IOEVENTFD` and `KVM_IRQ_LINE_STATUS`
+    ///            are rejected as unsupported
+    /// @param data command argument: the vcpu id for `KVM_CREATE_VCPU`, the user-space
+    ///             address of a `KvmUserspaceMemoryRegion` for `KVM_SET_USER_MEMORY_REGION`
+    ///
+    /// @return success: Ok(value)
+    ///         failure: Err(error code)
+    fn ioctl(&self, cmd: u32, data: usize) -> Result<usize, SystemError> {
+        match cmd {
+            // Debug command: only logs and returns 0.
+            0xdeadbeef => {
+                kdebug!("kvm_vm ioctl");
+                Ok(0)
+            }
+            KVM_CREATE_VCPU => {
+                kdebug!("kvm_vcpu ioctl KVM_CREATE_VCPU");
+                // `data` is truncated to u32 and used as the vcpu id.
+                kvm_vm_ioctl_create_vcpu(data as u32)
+            }
+            KVM_SET_USER_MEMORY_REGION => {
+                kdebug!("kvm_vcpu ioctl KVM_SET_USER_MEMORY_REGION data={:x}", data);
+                // Copy the request structure from the user-space address `data`
+                // into a kernel-side value before using it.
+                let mut kvm_userspace_mem = KvmUserspaceMemoryRegion::default(); // = unsafe { (data as *const KvmUserspaceMemoryRegion).as_ref().unwrap() };
+                unsafe {
+                    copy_from_user(
+                        core::slice::from_raw_parts_mut(
+                            (&mut kvm_userspace_mem as *mut _) as *mut u8,
+                            core::mem::size_of::<KvmUserspaceMemoryRegion>(),
+                        ),
+                        VirtAddr::new(data),
+                    )?;
+                }
+                kdebug!(
+                    "slot={}, flag={}, memory_size={:x}, guest_phys_addr={}, userspace_addr={}",
+                    kvm_userspace_mem.slot,
+                    kvm_userspace_mem.flags,
+                    kvm_userspace_mem.memory_size,
+                    kvm_userspace_mem.guest_phys_addr, // starting at physical address guest_phys_addr (from the guest’s perspective)
+                    kvm_userspace_mem.userspace_addr // using memory at linear address userspace_addr (from the host’s perspective)
+                );
+
+                // NOTE(review): VM id 0 is hard-coded, and `unwrap()` panics if `vm(0)`
+                // yields no VM. The modified copy is written back with `update_vm`.
+                let mut current_vm = vm(0).unwrap();
+                current_vm.set_user_memory_region(&kvm_userspace_mem)?;
+                update_vm(0, current_vm);
+                Ok(0)
+            }
+            KVM_GET_DIRTY_LOG | KVM_IRQFD | KVM_IOEVENTFD | KVM_IRQ_LINE_STATUS => {
+                Err(SystemError::EOPNOTSUPP_OR_ENOTSUP)
+            }
+            // NOTE(review): unknown commands are logged and reported as
+            // `Ok(usize::MAX)` rather than as an `Err` — confirm this is intended.
+            _ => {
+                kdebug!("kvm_vm ioctl");
+                Ok(usize::MAX)
+            }
+        }
+    }
+    /// Read the device - device I/O should go through the device's own functions,
+    /// not through the filesystem read path, so this is not supported.
+    fn read_at(
+        &self,
+        _offset: usize,
+        _len: usize,
+        _buf: &mut [u8],
+        _data: &mut FilePrivateData,
+    ) -> Result<usize, SystemError> {
+        Err(SystemError::EOPNOTSUPP_OR_ENOTSUP)
+    }
+
+    /// Write the device - device I/O should go through the device's own functions,
+    /// not through the filesystem write path, so this is not supported.
+    fn write_at(
+        &self,
+        _offset: usize,
+        _len: usize,
+        _buf: &[u8],
+        _data: &mut FilePrivateData,
+    ) -> Result<usize, SystemError> {
+        Err(SystemError::EOPNOTSUPP_OR_ENOTSUP)
+    }
+}
+
+/// Handles `KVM_CREATE_VCPU`: creates and sets up a vcpu, registers it with
+/// VM 0 and hands a vcpu file descriptor back to the calling process.
+///
+/// @param id identifier passed to `KVMArch::kvm_arch_vcpu_create`
+///
+/// @return success: Ok(fd) with the descriptor allocated in the current process
+///         failure: Err(error code) from vcpu creation/setup, `File::new` or `alloc_fd`
+fn kvm_vm_ioctl_create_vcpu(id: u32) -> Result<usize, SystemError> {
+    // This runs on behalf of a user ioctl, so a creation failure is reported
+    // to the caller instead of panicking the kernel.
+    let vcpu = KVMArch::kvm_arch_vcpu_create(id)?;
+    KVMArch::kvm_arch_vcpu_setup(vcpu.as_ref())?;
+
+    // NOTE(review): VM id 0 is hard-coded; `unwrap()` still panics if that VM does not exist.
+    let mut current_vm = vm(0).unwrap();
+    current_vm.vcpu.push(vcpu);
+    current_vm.nr_vcpus += 1;
+    update_vm(0, current_vm);
+
+    // Expose the vcpu to user space as a new file descriptor.
+    let vcpu_inode = LockedVcpuInode::new();
+    let file: File = File::new(vcpu_inode, FileMode::O_RDWR)?;
+    ProcessManager::current_pcb()
+        .fd_table()
+        .write()
+        .alloc_fd(file, None)
+        .map(|fd| fd as usize)
+}

+ 1 - 0
kernel/src/virt/mod.rs

@@ -0,0 +1 @@
+pub mod kvm;

+ 1 - 1
tools/.gdbinit

@@ -1,3 +1,3 @@
 target remote localhost:1234
 file bin/kernel/kernel.elf
-set follow-fork-mode child
+set follow-fork-mode child

+ 2 - 2
tools/run-qemu.sh

@@ -54,7 +54,7 @@ QEMU_MEMORY="512M"
 QEMU_SMP="2,cores=2,threads=1,sockets=1"
 QEMU_MONITOR="stdio"
 QEMU_TRACE="${qemu_trace_std}"
-QEMU_CPU_FEATURES="IvyBridge,apic,x2apic,+fpu,check,${allflags}"
+QEMU_CPU_FEATURES="IvyBridge,apic,x2apic,+fpu,check,+vmx,${allflags}"
 QEMU_RTC_CLOCK="clock=host,base=localtime"
 QEMU_SERIAL="file:../serial_opt.txt"
 QEMU_DRIVE="id=disk,file=${QEMU_DISK_IMAGE},if=none"
@@ -66,7 +66,7 @@ QEMU_DRIVE="id=disk,file=${QEMU_DISK_IMAGE},if=none"
 QEMU_DEVICES="-device ahci,id=ahci -device ide-hd,drive=disk,bus=ahci.0 -netdev user,id=hostnet0,hostfwd=tcp::12580-:12580 -device virtio-net-pci,vectors=5,netdev=hostnet0,id=net0 -usb -device qemu-xhci,id=xhci,p2=8,p3=4 -machine accel=${qemu_accel} -machine q35 "
 QEMU_ARGUMENT="-d ${QEMU_DISK_IMAGE} -m ${QEMU_MEMORY} -smp ${QEMU_SMP} -boot order=d -monitor ${QEMU_MONITOR} -d ${qemu_trace_std} "
 
-QEMU_ARGUMENT+="-s -S -cpu ${QEMU_CPU_FEATURES} -rtc ${QEMU_RTC_CLOCK} -serial ${QEMU_SERIAL} -drive ${QEMU_DRIVE} ${QEMU_DEVICES}"
+QEMU_ARGUMENT+="-s -S -enable-kvm -cpu ${QEMU_CPU_FEATURES} -rtc ${QEMU_RTC_CLOCK} -serial ${QEMU_SERIAL} -drive ${QEMU_DRIVE} ${QEMU_DEVICES}"
 
 if [ $flag_can_run -eq 1 ]; then
   while true;do

+ 1 - 1
user/apps/Makefile

@@ -1,5 +1,5 @@
 
-user_apps_sub_dirs=shell about
+user_apps_sub_dirs=shell about test_kvm
 
 ECHO:
 	@echo "$@"

+ 9 - 0
user/apps/test_kvm/Makefile

@@ -0,0 +1,9 @@
+# Install path of old_libc inside the sysroot; test_kvm links against its libc.a.
+OLD_LIBC_INSTALL_PATH=$(ROOT_PATH)/bin/sysroot/usr/old_libc
+
+# Link every object file found under this directory with libc.a using link.lds,
+# then copy the result to $(output_dir)/test_kvm.elf without .eh_frame/.comment.
+all: main.o
+
+	$(LD) -b elf64-x86-64 -z muldefs -o $(tmp_output_dir)/test_kvm  $(shell find . -name "*.o") $(OLD_LIBC_INSTALL_PATH)/lib/libc.a -T link.lds
+
+	$(OBJCOPY) -I elf64-x86-64 -R ".eh_frame" -R ".comment" -O elf64-x86-64 $(tmp_output_dir)/test_kvm $(output_dir)/test_kvm.elf
+# Compile the test program.
+main.o: main.c
+	$(CC) $(CFLAGS) -c main.c  -o main.o

+ 3 - 0
user/apps/test_kvm/bootstrap/Makefile

@@ -0,0 +1,3 @@
+# Assemble boot.s into boot.bin with nasm and write a hex dump of it to boot.hex.
+boot.bin: boot.s
+	nasm boot.s -o boot.bin
+	xxd boot.bin > boot.hex

BIN
user/apps/test_kvm/bootstrap/boot.bin


+ 32 - 0
user/apps/test_kvm/bootstrap/boot.hex

@@ -0,0 +1,32 @@
+00000000: 8cc8 8ed8 8ec0 e802 00eb feb8 1e00 89c5  ................
+00000010: b910 00b8 0113 bb0c 00b2 00cd 10c3 4865  ..............He
+00000020: 6c6c 6f2c 204f 5320 776f 726c 6421 0000  llo, OS world!..
+00000030: 0000 0000 0000 0000 0000 0000 0000 0000  ................
+00000040: 0000 0000 0000 0000 0000 0000 0000 0000  ................
+00000050: 0000 0000 0000 0000 0000 0000 0000 0000  ................
+00000060: 0000 0000 0000 0000 0000 0000 0000 0000  ................
+00000070: 0000 0000 0000 0000 0000 0000 0000 0000  ................
+00000080: 0000 0000 0000 0000 0000 0000 0000 0000  ................
+00000090: 0000 0000 0000 0000 0000 0000 0000 0000  ................
+000000a0: 0000 0000 0000 0000 0000 0000 0000 0000  ................
+000000b0: 0000 0000 0000 0000 0000 0000 0000 0000  ................
+000000c0: 0000 0000 0000 0000 0000 0000 0000 0000  ................
+000000d0: 0000 0000 0000 0000 0000 0000 0000 0000  ................
+000000e0: 0000 0000 0000 0000 0000 0000 0000 0000  ................
+000000f0: 0000 0000 0000 0000 0000 0000 0000 0000  ................
+00000100: 0000 0000 0000 0000 0000 0000 0000 0000  ................
+00000110: 0000 0000 0000 0000 0000 0000 0000 0000  ................
+00000120: 0000 0000 0000 0000 0000 0000 0000 0000  ................
+00000130: 0000 0000 0000 0000 0000 0000 0000 0000  ................
+00000140: 0000 0000 0000 0000 0000 0000 0000 0000  ................
+00000150: 0000 0000 0000 0000 0000 0000 0000 0000  ................
+00000160: 0000 0000 0000 0000 0000 0000 0000 0000  ................
+00000170: 0000 0000 0000 0000 0000 0000 0000 0000  ................
+00000180: 0000 0000 0000 0000 0000 0000 0000 0000  ................
+00000190: 0000 0000 0000 0000 0000 0000 0000 0000  ................
+000001a0: 0000 0000 0000 0000 0000 0000 0000 0000  ................
+000001b0: 0000 0000 0000 0000 0000 0000 0000 0000  ................
+000001c0: 0000 0000 0000 0000 0000 0000 0000 0000  ................
+000001d0: 0000 0000 0000 0000 0000 0000 0000 0000  ................
+000001e0: 0000 0000 0000 0000 0000 0000 0000 0000  ................
+000001f0: 0000 0000 0000 0000 0000 0000 0000 55aa  ..............U.

+ 54 - 0
user/apps/test_kvm/link.lds

@@ -0,0 +1,54 @@
+
+OUTPUT_FORMAT("elf64-x86-64","elf64-x86-64","elf64-x86-64")
+OUTPUT_ARCH(i386:x86-64)
+ENTRY(_start)
+
+SECTIONS
+{
+
+	. = 0x800000;
+	
+	
+	.text :
+	{
+		_text = .;
+		
+		*(.text)
+		*(.text.*)
+		
+		_etext = .;
+	}
+	. = ALIGN(8);
+	
+	.data :
+	{
+		_data = .;
+		*(.data)
+		*(.data.*)
+		
+		_edata = .;
+	}
+
+
+	rodata_start_pa = .;
+	.rodata :
+	{
+		_rodata = .;	
+		*(.rodata)
+		*(.rodata.*)
+		_erodata = .;
+	}
+
+	
+	.bss :
+	{
+		_bss = .;
+		*(.bss)
+		*(.bss.*)
+		_ebss = .;
+	}
+
+	_end = .;
+
+
+}

+ 114 - 0
user/apps/test_kvm/main.c

@@ -0,0 +1,114 @@
+/**
+ * @file main.c
+ * @author xiaoyez ([email protected])
+ * @brief Test program for kvm
+ * @version 0.1
+ * @date 2023-07-13
+ *
+ * @copyright Copyright (c) 2023
+ *
+ */
+
+/**
+ * How to test the kvm commands:
+ * 1. Enter `exec bin/test_kvm.elf` in the DragonOS console
+ *
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <fcntl.h>
+
+#define KVM_CREATE_VCPU 0x00
+#define KVM_SET_USER_MEMORY_REGION 0x01
+
+#define KVM_RUN 0x00
+#define KVM_GET_REGS 0x01
+#define KVM_SET_REGS 0x02
+
+/* Argument of the KVM_SET_USER_MEMORY_REGION ioctl: describes one guest memory slot. */
+struct kvm_userspace_memory_region {
+    uint32_t slot; // slot on which the memory region is registered
+    // flags can take two values, KVM_MEM_LOG_DIRTY_PAGES and KVM_MEM_READONLY, telling kvm how to treat this memory.
+    // KVM_MEM_LOG_DIRTY_PAGES enables dirty-page logging; KVM_MEM_READONLY makes the memory read-only.
+    uint32_t flags;
+    uint64_t guest_phys_addr; // guest-physical start address of the region
+    uint64_t memory_size;     // size of the guest memory region
+    uint64_t userspace_addr;  // host virtual address backing the region
+};
+
+/* General-purpose register set exchanged through KVM_GET_REGS / KVM_SET_REGS. */
+struct kvm_regs {
+	/* out (KVM_GET_REGS) / in (KVM_SET_REGS) */
+	uint64_t rax, rbx, rcx, rdx;
+	uint64_t rsi, rdi, rsp, rbp;
+	uint64_t r8,  r9,  r10, r11;
+	uint64_t r12, r13, r14, r15;
+	uint64_t rip, rflags;
+};
+
+/**
+ * @brief Guest payload sketch: executes CPUID with RAX = 0 and RCX = 0 in an endless loop.
+ *
+ * Not referenced by main(). The loop never exits, so the trailing return is unreachable.
+ *
+ * @return int never returns
+ */
+int guest_code(){
+    while (1)
+    {
+        // printf("guest code\n");
+        /*
+         * GCC inline assembly uses AT&T operand order: source first, destination
+         * second, immediates prefixed with `$`. Zeroing a register is therefore
+         * "mov $0, %reg"; "mov %reg, 0" would store the register to address 0.
+         * CPUID overwrites RAX, RBX, RCX and RDX, so all four are listed as clobbers.
+         */
+        __asm__ __volatile__ (
+            "mov $0, %%rax\n\t"
+            "mov $0, %%rcx\n\t"
+            "cpuid\n\t"
+            :
+            :
+            : "rax", "rbx", "rcx", "rdx"
+        );
+    }
+    return 0;
+}
+
+/**
+ * @brief Exercises the kvm device: opens /dev/kvm, obtains a VM fd, registers one
+ *        guest memory region holding a tiny real-mode program, creates a vcpu,
+ *        sets its registers and runs it.
+ *
+ * @return int 0 on success, 1 if any step fails
+ */
+int main()
+{
+    printf("Test kvm running...\n");
+    printf("Open /dev/kvm\n");
+    int kvm_fd = open("/dev/kvm", O_RDWR|O_CLOEXEC);
+    if (kvm_fd < 0)
+    {
+        printf("Failed to open /dev/kvm\n");
+        return 1;
+    }
+
+    // NOTE(review): 0x01 is presumably the create-VM command of /dev/kvm; its
+    // definition is not part of this file, so the raw value is kept.
+    int vmfd = ioctl(kvm_fd, 0x01, 0);
+    printf("vmfd=%d\n", vmfd);
+    if (vmfd < 0)
+    {
+        printf("Failed to create vm\n");
+        return 1;
+    }
+
+    const uint8_t code[] = {
+        0xba, 0xf8, 0x03, /* mov $0x3f8, %dx */
+        0x00, 0xd8,       /* add %bl, %al */
+        0x04, '0',        /* add $'0', %al */
+        0xee,             /* out %al, (%dx) */
+        0xb0, '\n',       /* mov $'\n', %al */
+        0xee,             /* out %al, (%dx) */
+        0xf4,             /* hlt */
+    };
+
+    /*
+     * Backing store for the guest memory. The region handed to the kernel must
+     * really be memory_size bytes long and start on a page boundary; registering
+     * the 12-byte `code` array itself would expose unrelated stack memory to the
+     * guest and place the program at a non-zero offset inside its page.
+     */
+    static uint8_t guest_mem[0x4000] __attribute__((aligned(0x1000)));
+    size_t mem_size = sizeof(guest_mem); // size of user memory you want to assign
+    for (size_t i = 0; i < sizeof(code); ++i)
+    {
+        guest_mem[i] = code[i]; // the program sits at guest-physical address 0
+    }
+    printf("code=%p\n", guest_mem);
+
+    struct kvm_userspace_memory_region region = {
+        .slot = 0,
+        .flags = 0,
+        .guest_phys_addr = 0,
+        .memory_size = mem_size,
+        .userspace_addr = (uint64_t)guest_mem
+    };
+    // ioctl() takes its argument as uint64_t, so pointers are converted explicitly.
+    if (ioctl(vmfd, KVM_SET_USER_MEMORY_REGION, (uint64_t)&region) < 0)
+    {
+        printf("Failed to set user memory region\n");
+        return 1;
+    }
+
+    int vcpufd = ioctl(vmfd, KVM_CREATE_VCPU, 0);
+    printf("vcpufd=%d\n", vcpufd);
+    if (vcpufd < 0)
+    {
+        printf("Failed to create vcpu\n");
+        return 1;
+    }
+    int user_entry = 0x0;
+
+    struct kvm_regs regs = {0};
+    regs.rip = user_entry;
+    regs.rsp = 0x3000; // stack address
+    regs.rflags = 0x2; // in x86 the 0x2 bit should always be set
+    if (ioctl(vcpufd, KVM_SET_REGS, (uint64_t)&regs) < 0) // set registers
+    {
+        printf("Failed to set vcpu registers\n");
+        return 1;
+    }
+
+    if (ioctl(vcpufd, KVM_RUN, 0) < 0)
+    {
+        printf("Failed to run vcpu\n");
+        return 1;
+    }
+
+    return 0;
+}
+
+
+
+

+ 13 - 0
user/libs/libc/src/fcntl.c

@@ -12,4 +12,17 @@
 int open(const char *path, int options, ...)
 {
     return syscall_invoke(SYS_OPEN, (uint64_t)path, options, 0, 0, 0, 0, 0, 0);
+}
+
+/**
+ * @brief ioctl interface
+ *
+ * @param fd file descriptor
+ * @param cmd device-specific request code
+ * @param data request argument, passed through as the third SYS_IOCTL argument
+ * @param ... not read by this implementation
+ * @return int the value returned by syscall_invoke() for SYS_IOCTL
+ */
+int ioctl(int fd, int cmd, uint64_t data, ...)
+{
+    return syscall_invoke(SYS_IOCTL, fd, cmd, data, 0, 0, 0, 0, 0);
 }

+ 12 - 0
user/libs/libc/src/include/export/fcntl.h

@@ -14,6 +14,8 @@
 extern  "C"  { 
 
 #endif
+#include <stdint.h>
+
 #define O_RDONLY 00000000  // Open Read-only
 #define O_WRONLY 00000001  // Open Write-only
 #define O_RDWR 00000002    // Open read/write
@@ -72,6 +74,16 @@ extern  "C"  {
  */
 int open(const char * path, int options, ...);
 
+/**
+ * @brief ioctl interface
+ *
+ * @param fd file descriptor
+ * @param cmd device-specific request code
+ * @param data request argument (a plain value or a pointer converted to uint64_t)
+ * @param ... additional arguments
+ * @return int result of the ioctl system call
+ */
+int ioctl(int fd, int cmd, uint64_t data, ...);
+
 #if defined(__cplusplus) 
 }  /* extern "C" */ 
 #endif

+ 1 - 0
user/libs/libsystem/syscall.h

@@ -49,6 +49,7 @@
 #define SYS_ACCEPT 40     // 接受一个socket连接
 #define SYS_GETSOCKNAME 41 // 获取socket的名字
 #define SYS_GETPEERNAME 42 // 获取socket的对端名字
+#define SYS_IOCTL 54
 
 #define SYS_GETCWD 48