
feat(virtualization): kernel virtualization support (#1073)

* Add several structs

* Get through vmx_init; parts of create_vm and create_vcpu are still TODO

* kvm_run half finished
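
For context, the userspace side of the create_vm / create_vcpu / kvm_run flow (the path exercised by user/apps/test_kvm/main.c) normally follows the standard KVM ioctl sequence. The sketch below is only an illustration: it assumes a Linux-compatible /dev/kvm device and the constants from <linux/kvm.h>, and it omits error handling; DragonOS's actual device path and supported ioctls may differ.

/* Minimal KVM userspace flow (illustrative; assumes a Linux-compatible /dev/kvm). */
#include <fcntl.h>
#include <linux/kvm.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
    int kvm = open("/dev/kvm", O_RDWR);              /* hypervisor handle */
    int vm = ioctl(kvm, KVM_CREATE_VM, 0);           /* create_vm */

    /* Back guest physical memory [0, 64KiB) with an anonymous mapping. */
    void *mem = mmap(NULL, 0x10000, PROT_READ | PROT_WRITE,
                     MAP_SHARED | MAP_ANONYMOUS, -1, 0);
    struct kvm_userspace_memory_region region = {
        .slot = 0,
        .guest_phys_addr = 0,
        .memory_size = 0x10000,
        .userspace_addr = (unsigned long)mem,
    };
    ioctl(vm, KVM_SET_USER_MEMORY_REGION, &region);

    int vcpu = ioctl(vm, KVM_CREATE_VCPU, 0);        /* create_vcpu */
    int run_size = ioctl(kvm, KVM_GET_VCPU_MMAP_SIZE, 0);
    struct kvm_run *run = mmap(NULL, run_size, PROT_READ | PROT_WRITE,
                               MAP_SHARED, vcpu, 0);

    /* kvm_run: enter the guest until it exits back to userspace. */
    ioctl(vcpu, KVM_RUN, 0);
    switch (run->exit_reason) {
    case KVM_EXIT_HLT:
        break;                                       /* guest executed HLT */
    case KVM_EXIT_IO:
        break;                                       /* port I/O trapped to userspace */
    default:
        break;
    }

    close(vcpu);
    close(vm);
    close(kvm);
    return 0;
}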

* vmlaunch now succeeds, but some vmexit issues remain undiagnosed

* Fix the cpu_reset caused by vmlaunch

* Tidy up the code

* Temporarily push to the hyc repository

* Change how parameters are passed into the memory-virtualization code and fix a deadlock

* Initial EPT mapping, but EPT_VIOLATION keeps triggering

* EPT mapping mostly working, but reading/writing guest memory still has problems

* fixme

* Update some truncate usages to from_bits_unchecked

* Complete the EPT_VIOLATION mapping for memory virtualization

* fmt

* Remove /fixme from .gitignore

* Remove /fixme file

* Update kernel/src/init/init.rs

Co-authored-by: Samuel Dai <[email protected]>

* Update kernel/src/init/init.rs

Co-authored-by: Samuel Dai <[email protected]>

* Fix comment formatting and remove some incidental file operations

* feat(syscall): implement syscall restart (#1075)

When a system call returns ERESTARTSYS, it is now automatically restarted once signal handling completes.

TODO: implement restart for system calls such as wait that require restart_block

Signed-off-by: longjin <[email protected]>
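
From userspace, ERESTARTSYS-based restart behaves like the usual POSIX SA_RESTART semantics. The snippet below is a generic illustration, not DragonOS code: with SA_RESTART set, a blocking read interrupted by SIGALRM is transparently restarted instead of failing with EINTR.

/* Illustration of syscall-restart semantics via SA_RESTART (generic POSIX example). */
#include <signal.h>
#include <stdio.h>
#include <unistd.h>

static void on_alarm(int sig) { (void)sig; /* only purpose: interrupt the read */ }

int main(void)
{
    struct sigaction sa = {0};
    sa.sa_handler = on_alarm;
    sa.sa_flags = SA_RESTART;      /* kernel restarts the interrupted syscall */
    sigaction(SIGALRM, &sa, NULL);

    alarm(1);                      /* deliver SIGALRM while read() is blocked */

    char buf[16];
    ssize_t n = read(STDIN_FILENO, buf, sizeof(buf));
    /* With SA_RESTART the read resumes after the handler returns;
       without it, read() would return -1 with errno == EINTR. */
    printf("read returned %zd\n", n);
    return 0;
}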

* chore: update docker image version in script && update doc (#1076)

* chore: update docker image version in script

* chore: replace lots of spaces with newline in doc

* fix: fix cases where the wait4 syscall's semantics diverged from Linux (#1080)

* fix: fix cases where the wait4 syscall's semantics diverged from Linux

Fixes the bugs where wait failed to block and where the process hung after wait.

---------

Signed-off-by: longjin <[email protected]>

* feat(fs/syscall): implement the fchdir system call (#1081)

Signed-off-by: longjin <[email protected]>
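
For reference, typical fchdir usage from C looks like the sketch below (standard POSIX usage, shown only to illustrate the call being implemented):

/* fchdir usage sketch: change the working directory via an already-open directory fd. */
#include <fcntl.h>
#include <unistd.h>

int chdir_by_fd(const char *path)
{
    int fd = open(path, O_RDONLY | O_DIRECTORY);  /* obtain a directory fd */
    if (fd < 0)
        return -1;
    int ret = fchdir(fd);                         /* cwd now refers to fd's directory */
    close(fd);                                    /* cwd stays valid after close */
    return ret;
}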

* fix(mm): fix PageCache synchronization for the FAT filesystem (#1005)


---------

Co-authored-by: longjin <[email protected]>

* fix: fix console logs not being written to the log file when booting with nographic (#1082)

Signed-off-by: longjin <[email protected]>

* fix(process): fix several copy_process bugs & support passing arguments to the default init process (#1083)

- Fix a bug where `copy_process` mishandled the flag bits
- Support passing arguments to the default init program in the init search list

Signed-off-by: longjin <[email protected]>

* feat: improve sys_reboot (#1084)

* fix(process): fix several copy_process bugs & support passing arguments to the default init process

- Fix a bug where `copy_process` mishandled the flag bits
- Support passing arguments to the default init program in the init search list

Signed-off-by: longjin <[email protected]>

* feat: improve sys_reboot

- Validate the magic numbers
- Accept multiple cmd values (their behavior is not implemented yet)

Signed-off-by: longjin <[email protected]>

---------

Signed-off-by: longjin <[email protected]>
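
Because the commit validates the magic numbers, the call shape from userspace presumably mirrors Linux's reboot(2). The constants below are the Linux values and are assumed, not confirmed here, to be what DragonOS checks.

/* reboot(2) call sketch with Linux-style magic numbers (assumed to match DragonOS's checks). */
#include <sys/syscall.h>
#include <unistd.h>

#define LINUX_REBOOT_MAGIC1      0xfee1dead
#define LINUX_REBOOT_MAGIC2      672274793      /* 0x28121969 */
#define LINUX_REBOOT_CMD_RESTART 0x01234567

int main(void)
{
    /* The kernel rejects the request unless both magic values match. */
    return syscall(SYS_reboot, LINUX_REBOOT_MAGIC1, LINUX_REBOOT_MAGIC2,
                   LINUX_REBOOT_CMD_RESTART, NULL);
}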

* fix: fix a bug where do_wait went to sleep without releasing the lock when waiting on all child processes (#1089)

Signed-off-by: longjin <[email protected]>

* Pull mainline and run fmt

---------

Signed-off-by: longjin <[email protected]>
Co-authored-by: GnoCiYeH <[email protected]>
Co-authored-by: Samuel Dai <[email protected]>
Co-authored-by: LoGin <[email protected]>
Co-authored-by: LIU Yuwei <[email protected]>
Co-authored-by: MemoryShore <[email protected]>
Z Fan 1 week ago
parent
commit
597315b04d
50 changed files with 13674 additions and 125 deletions
  1. 1 2
      .gitignore
  2. 2 1
      .vscode/settings.json
  3. 1 0
      build-scripts/kernel_build/src/cfiles/arch/x86_64.rs
  4. 5 1
      kernel/crates/bitmap/src/alloc_bitmap.rs
  5. 15 15
      kernel/crates/bitmap/src/bitmap_core.rs
  6. 1 0
      kernel/crates/bitmap/src/lib.rs
  7. 1 1
      kernel/src/arch/x86_64/kvm/vmx/mmu.rs
  8. 1 1
      kernel/src/arch/x86_64/kvm/vmx/vcpu.rs
  9. 1 1
      kernel/src/arch/x86_64/kvm/vmx/vmexit.rs
  10. 9 0
      kernel/src/arch/x86_64/mm/mod.rs
  11. 10 0
      kernel/src/arch/x86_64/mod.rs
  12. 592 0
      kernel/src/arch/x86_64/vm/asm.rs
  13. 59 0
      kernel/src/arch/x86_64/vm/cpuid.rs
  14. 1 0
      kernel/src/arch/x86_64/vm/exit.rs
  15. 62 0
      kernel/src/arch/x86_64/vm/kvm_host/lapic.rs
  16. 463 0
      kernel/src/arch/x86_64/vm/kvm_host/mod.rs
  17. 1 0
      kernel/src/arch/x86_64/vm/kvm_host/page.rs
  18. 1697 0
      kernel/src/arch/x86_64/vm/kvm_host/vcpu.rs
  19. 24 0
      kernel/src/arch/x86_64/vm/mem.rs
  20. 648 0
      kernel/src/arch/x86_64/vm/mmu/kvm_mmu.rs
  21. 396 0
      kernel/src/arch/x86_64/vm/mmu/mmu_internal.rs
  22. 3 0
      kernel/src/arch/x86_64/vm/mmu/mod.rs
  23. 219 0
      kernel/src/arch/x86_64/vm/mmu/tdp_iter.rs
  24. 640 0
      kernel/src/arch/x86_64/vm/mod.rs
  25. 37 0
      kernel/src/arch/x86_64/vm/mtrr.rs
  26. 102 0
      kernel/src/arch/x86_64/vm/uapi.rs
  27. 19 0
      kernel/src/arch/x86_64/vm/vmx/asm.rs
  28. 591 0
      kernel/src/arch/x86_64/vm/vmx/capabilities.rs
  29. 466 0
      kernel/src/arch/x86_64/vm/vmx/ept/mod.rs
  30. 426 0
      kernel/src/arch/x86_64/vm/vmx/exit.rs
  31. 3775 0
      kernel/src/arch/x86_64/vm/vmx/mod.rs
  32. 160 0
      kernel/src/arch/x86_64/vm/vmx/vmcs/feat.rs
  33. 451 0
      kernel/src/arch/x86_64/vm/vmx/vmcs/mod.rs
  34. 179 0
      kernel/src/arch/x86_64/vm/vmx/vmenter.S
  35. 4 1
      kernel/src/init/init.rs
  36. 1 0
      kernel/src/lib.rs
  37. 9 0
      kernel/src/libs/rbtree.rs
  38. 2 2
      kernel/src/mm/mod.rs
  39. 1 0
      kernel/src/mm/page.rs
  40. 1 1
      kernel/src/namespaces/syscall.rs
  41. 1 0
      kernel/src/virt/mod.rs
  42. 491 0
      kernel/src/virt/vm/kvm_dev.rs
  43. 714 0
      kernel/src/virt/vm/kvm_host/mem.rs
  44. 268 0
      kernel/src/virt/vm/kvm_host/mod.rs
  45. 117 0
      kernel/src/virt/vm/kvm_host/vcpu.rs
  46. 3 0
      kernel/src/virt/vm/mod.rs
  47. 466 0
      kernel/src/virt/vm/user_api.rs
  48. 17 0
      package-lock.json
  49. 0 3
      tools/.gdbinit
  50. 521 96
      user/apps/test_kvm/main.c

+ 1 - 2
.gitignore

@@ -18,5 +18,4 @@ cppcheck.xml
 /target/
 Cargo.lock
 .cache
-compile_commands.json
-/logs/
+compile_commands.json

+ 2 - 1
.vscode/settings.json

@@ -144,7 +144,7 @@
     "rust-analyzer.checkOnSave.allTargets": false,
     "rust-analyzer.linkedProjects": [
         "./kernel/Cargo.toml",
-        "./tools/Cargo.toml",
+        //"./tools/Cargo.toml",
     
     ],
     // "rust-analyzer.cargo.target": "riscv64gc-unknown-none-elf",
@@ -154,4 +154,5 @@
         "check",
         
     ],
+    "makefile.configureOnOpen": false,
 }

+ 1 - 0
build-scripts/kernel_build/src/cfiles/arch/x86_64.rs

@@ -31,6 +31,7 @@ impl CFilesArch for X86_64CFilesArch {
         files.insert(PathBuf::from("src/arch/x86_64/asm/head.S"));
         files.insert(PathBuf::from("src/arch/x86_64/asm/entry.S"));
         files.insert(PathBuf::from("src/arch/x86_64/asm/apu_boot.S"));
+        files.insert(PathBuf::from("src/arch/x86_64/vm/vmx/vmenter.S"));
     }
 
     fn setup_global_flags(&self, c: &mut Build) {

+ 5 - 1
kernel/crates/bitmap/src/alloc_bitmap.rs

@@ -4,7 +4,7 @@ use alloc::vec::Vec;
 
 use crate::{bitmap_core::BitMapCore, traits::BitMapOps};
 
-#[derive(Clone)]
+#[derive(Debug, Clone)]
 pub struct AllocBitmap {
     elements: usize,
     data: Vec<usize>,
@@ -26,6 +26,10 @@ impl AllocBitmap {
             self.data[i] &= rhs.data[i];
         }
     }
+
+    pub fn data(&self) -> &[usize] {
+        &self.data
+    }
 }
 
 impl BitMapOps<usize> for AllocBitmap {

+ 15 - 15
kernel/crates/bitmap/src/bitmap_core.rs

@@ -3,7 +3,7 @@ use core::{intrinsics::unlikely, marker::PhantomData};
 use crate::traits::BitOps;
 
 #[derive(Debug, Clone)]
-pub(crate) struct BitMapCore<T: BitOps> {
+pub struct BitMapCore<T: BitOps> {
     phantom: PhantomData<T>,
 }
 
@@ -15,7 +15,7 @@ impl<T: BitOps> BitMapCore<T> {
     }
 
     /// 获取位图中的某一位
-    pub(crate) fn get(&self, n: usize, data: &[T], index: usize) -> Option<bool> {
+    pub fn get(&self, n: usize, data: &[T], index: usize) -> Option<bool> {
         if unlikely(index >= n) {
             return None;
         }
@@ -30,7 +30,7 @@ impl<T: BitOps> BitMapCore<T> {
     }
 
     /// 设置位图中的某一位
-    pub(crate) fn set(&self, n: usize, data: &mut [T], index: usize, value: bool) -> Option<bool> {
+    pub fn set(&self, n: usize, data: &mut [T], index: usize, value: bool) -> Option<bool> {
         if unlikely(index >= n) {
             return None;
         }
@@ -43,7 +43,7 @@ impl<T: BitOps> BitMapCore<T> {
         Some(bit)
     }
 
-    pub(crate) fn set_all(&self, n: usize, data: &mut [T], value: bool) {
+    pub fn set_all(&self, n: usize, data: &mut [T], value: bool) {
         let val = if value { T::max() } else { T::zero() };
         for element in data.iter_mut() {
             *element = val;
@@ -58,7 +58,7 @@ impl<T: BitOps> BitMapCore<T> {
     }
 
     /// 获取位图中第一个为1的位
-    pub(crate) fn first_index(&self, data: &[T]) -> Option<usize> {
+    pub fn first_index(&self, data: &[T]) -> Option<usize> {
         for (i, element) in data.iter().enumerate() {
             let bit = <T as BitOps>::first_index(element);
             if let Some(b) = bit {
@@ -70,7 +70,7 @@ impl<T: BitOps> BitMapCore<T> {
     }
 
     /// 获取位图中第一个为0的位
-    pub(crate) fn first_false_index(&self, n: usize, data: &[T]) -> Option<usize> {
+    pub fn first_false_index(&self, n: usize, data: &[T]) -> Option<usize> {
         for (i, element) in data.iter().enumerate() {
             if let Some(bit) = <T as BitOps>::first_false_index(element) {
                 return self.make_index(n, i * T::bit_size() + bit);
@@ -81,7 +81,7 @@ impl<T: BitOps> BitMapCore<T> {
     }
 
     /// 获取位图中最后一个为1的位
-    pub(crate) fn last_index(&self, n: usize, data: &[T]) -> Option<usize> {
+    pub fn last_index(&self, n: usize, data: &[T]) -> Option<usize> {
         for (i, element) in data.iter().enumerate().rev() {
             if let Some(bit) = <T as BitOps>::last_index(element) {
                 return self.make_index(n, i * T::bit_size() + bit);
@@ -97,7 +97,7 @@ impl<T: BitOps> BitMapCore<T> {
     ///
     /// - `data`:位图数据
     /// - `n`:位图有效位数
-    pub(crate) fn last_false_index(&self, n: usize, data: &[T]) -> Option<usize> {
+    pub fn last_false_index(&self, n: usize, data: &[T]) -> Option<usize> {
         let mut iter = data.iter().rev();
         let mut last_element = *iter.next()?;
 
@@ -123,7 +123,7 @@ impl<T: BitOps> BitMapCore<T> {
     }
 
     /// 获取位图中下一个为1的位
-    pub(crate) fn next_index(&self, n: usize, data: &[T], index: usize) -> Option<usize> {
+    pub fn next_index(&self, n: usize, data: &[T], index: usize) -> Option<usize> {
         if unlikely(index >= n) {
             return None;
         }
@@ -146,7 +146,7 @@ impl<T: BitOps> BitMapCore<T> {
     }
 
     /// 获取位图中下一个为0的位
-    pub(crate) fn next_false_index(&self, n: usize, data: &[T], index: usize) -> Option<usize> {
+    pub fn next_false_index(&self, n: usize, data: &[T], index: usize) -> Option<usize> {
         if unlikely(index >= n) {
             return None;
         }
@@ -169,7 +169,7 @@ impl<T: BitOps> BitMapCore<T> {
     }
 
     /// 获取位图中上一个为1的位
-    pub(crate) fn prev_index(&self, n: usize, data: &[T], index: usize) -> Option<usize> {
+    pub fn prev_index(&self, n: usize, data: &[T], index: usize) -> Option<usize> {
         if unlikely(index >= n) {
             return None;
         }
@@ -190,7 +190,7 @@ impl<T: BitOps> BitMapCore<T> {
         None
     }
 
-    pub(crate) fn prev_false_index(&self, n: usize, data: &[T], index: usize) -> Option<usize> {
+    pub fn prev_false_index(&self, n: usize, data: &[T], index: usize) -> Option<usize> {
         let element_index = index / T::bit_size();
         let bit_index = index % T::bit_size();
 
@@ -208,7 +208,7 @@ impl<T: BitOps> BitMapCore<T> {
         None
     }
 
-    pub(crate) fn invert(&self, n: usize, data: &mut [T]) {
+    pub fn invert(&self, n: usize, data: &mut [T]) {
         for element in data.iter_mut() {
             <T as BitOps>::invert(element);
         }
@@ -222,7 +222,7 @@ impl<T: BitOps> BitMapCore<T> {
         }
     }
 
-    pub(crate) fn is_full(&self, n: usize, data: &[T]) -> bool {
+    pub fn is_full(&self, n: usize, data: &[T]) -> bool {
         let mut iter = data.iter().peekable();
         while let Some(element) = iter.next() {
             if iter.peek().is_none() {
@@ -245,7 +245,7 @@ impl<T: BitOps> BitMapCore<T> {
         return false;
     }
 
-    pub(crate) fn is_empty(&self, data: &[T]) -> bool {
+    pub fn is_empty(&self, data: &[T]) -> bool {
         for element in data.iter() {
             if element != &T::zero() {
                 return false;

+ 1 - 0
kernel/crates/bitmap/src/lib.rs

@@ -13,4 +13,5 @@ mod bitmap_core;
 mod static_bitmap;
 pub mod traits;
 pub use alloc_bitmap::AllocBitmap;
+pub use bitmap_core::BitMapCore;
 pub use static_bitmap::StaticBitmap;

+ 1 - 1
kernel/src/arch/x86_64/kvm/vmx/mmu.rs

@@ -88,7 +88,7 @@ fn tdp_get_cr3(_vcpu: &VmxVcpu) -> u64 {
     return guest_cr3;
 }
 
-fn tdp_set_eptp(root_hpa: u64) -> Result<(), SystemError> {
+pub fn tdp_set_eptp(root_hpa: u64) -> Result<(), SystemError> {
     // 设置权限位,目前是写死的,可读可写可执行
     //  EPT paging-structure memory type: Uncacheable
     let mut eptp = 0x0_u64;

+ 1 - 1
kernel/src/arch/x86_64/kvm/vmx/vcpu.rs

@@ -501,7 +501,7 @@ pub fn get_segment_base(gdt_base: *const u64, gdt_size: u16, segment_selector: u
 // }
 pub fn adjust_vmx_controls(ctl_min: u32, ctl_opt: u32, msr: u32, result: &mut u32) {
     let vmx_msr_low: u32 = unsafe { (msr::rdmsr(msr) & 0x0000_0000_FFFF_FFFF) as u32 };
-    let vmx_msr_high: u32 = unsafe { (msr::rdmsr(msr) << 32) as u32 };
+    let vmx_msr_high: u32 = unsafe { (msr::rdmsr(msr) >> 32) as u32 };
     let mut ctl: u32 = ctl_min | ctl_opt;
     ctl &= vmx_msr_high; /* bit == 0 in high word ==> must be zero */
     ctl |= vmx_msr_low; /* bit == 1 in low word  ==> must be one  */
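
The one-line fix above matters because the VMX capability MSR stores the allowed 0-settings in its low 32 bits and the allowed 1-settings in its high 32 bits, so the high word must be extracted with a right shift; shifting left simply discards it. A minimal sketch of the intended adjustment, written as a plain C helper rather than the kernel's Rust code:

/* Illustrative VMX control adjustment: low dword = bits that must be 1,
   high dword = bits that may be 1 (per the IA32_VMX_*_CTLS MSR layout). */
#include <stdint.h>

static uint32_t adjust_controls(uint32_t ctl_min, uint32_t ctl_opt, uint64_t msr_value)
{
    uint32_t must_be_one = (uint32_t)(msr_value & 0xFFFFFFFFu); /* low dword */
    uint32_t may_be_one  = (uint32_t)(msr_value >> 32);         /* high dword */

    uint32_t ctl = ctl_min | ctl_opt;
    ctl &= may_be_one;   /* clear bits the CPU does not allow to be 1 */
    ctl |= must_be_one;  /* force bits the CPU requires to be 1 */
    return ctl;
}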

+ 1 - 1
kernel/src/arch/x86_64/kvm/vmx/vmexit.rs

@@ -264,7 +264,7 @@ extern "C" fn vmexit_handler() {
 }
 
 #[no_mangle]
-fn adjust_rip(rip: u64) -> Result<(), SystemError> {
+pub fn adjust_rip(rip: u64) -> Result<(), SystemError> {
     let instruction_length = vmx_vmread(VmcsFields::VMEXIT_INSTR_LEN as u32)?;
     vmx_vmwrite(VmcsFields::GUEST_RIP as u32, rip + instruction_length)?;
     Ok(())

+ 9 - 0
kernel/src/arch/x86_64/mm/mod.rs

@@ -439,6 +439,15 @@ impl X86_64MMArch {
         // 不支持的原因是,目前好像没有能正确的设置page-level的xd位,会触发page fault
         return true;
     }
+
+    pub unsafe fn read_array<T>(addr: VirtAddr, count: usize) -> Vec<T> {
+        // 实现读取数组逻辑
+        let mut vec = Vec::with_capacity(count);
+        for i in 0..count {
+            vec.push(Self::read(addr + i * core::mem::size_of::<T>()));
+        }
+        vec
+    }
 }
 
 impl VirtAddr {

+ 10 - 0
kernel/src/arch/x86_64/mod.rs

@@ -20,6 +20,7 @@ pub mod sched;
 pub mod smp;
 pub mod syscall;
 pub mod time;
+pub mod vm;
 
 pub use self::pci::pci::X86_64PciArch as PciArch;
 
@@ -40,3 +41,12 @@ pub use crate::arch::elf::X86_64ElfArch as CurrentElfArch;
 pub use crate::arch::smp::X86_64SMPArch as CurrentSMPArch;
 
 pub use crate::arch::sched::X86_64SchedArch as CurrentSchedArch;
+
+pub use crate::arch::vm::KvmArchManager as CurrentKvmManager;
+
+pub use crate::arch::vm::kvm_host::X86KvmArch as KvmArch;
+
+pub use crate::arch::vm::x86_kvm_ops as kvm_arch_ops;
+
+pub use crate::arch::vm::kvm_host::vcpu::X86VcpuArch as VirtCpuArch;
+pub use crate::arch::vm::kvm_host::KvmVcpuStat as VirtCpuStat;

+ 592 - 0
kernel/src/arch/x86_64/vm/asm.rs

@@ -0,0 +1,592 @@
+use core::arch::asm;
+
+use alloc::slice;
+use log::{debug, error};
+use raw_cpuid::CpuId;
+use system_error::SystemError;
+use x86::{
+    bits64::vmx::vmxon,
+    controlregs::{cr0, cr0_write, cr4, cr4_write, Cr0, Cr4},
+    msr::{
+        rdmsr, wrmsr, IA32_FEATURE_CONTROL, IA32_VMX_CR0_FIXED0, IA32_VMX_CR0_FIXED1,
+        IA32_VMX_CR4_FIXED0, IA32_VMX_CR4_FIXED1,
+    },
+    vmx::vmcs::ro,
+};
+
+use crate::{
+    arch::{mm::barrier, MMArch},
+    mm::{MemoryManagementArch, PhysAddr},
+};
+
+use super::vmx::vmx_info;
+
+pub struct KvmX86Asm;
+
+impl KvmX86Asm {
+    pub fn read_pkru() -> u32 {
+        let cpuid = CpuId::new();
+        if let Some(feat) = cpuid.get_extended_feature_info() {
+            if feat.has_ospke() {
+                return Self::rdpkru();
+            }
+        }
+        return 0;
+    }
+
+    pub fn write_pkru(_val: u32) {
+        let cpuid = CpuId::new();
+        if let Some(feat) = cpuid.get_extended_feature_info() {
+            if feat.has_ospke() {
+                todo!();
+            }
+        }
+    }
+
+    fn rdpkru() -> u32 {
+        let ecx: u32 = 0;
+        let pkru: u32;
+        let _edx: u32;
+
+        unsafe {
+            asm!(
+                "rdpkru",
+                out("eax") pkru,
+                out("edx") _edx,
+                in("ecx") ecx,
+            );
+        }
+
+        pkru
+    }
+
+    pub fn get_segment_base(gdt_base: *const u64, gdt_size: u16, segment_selector: u16) -> u64 {
+        let table = segment_selector & 0x0004; // get table indicator in selector
+        let index = (segment_selector >> 3) as usize; // get index in selector
+        if table == 0 && index == 0 {
+            return 0;
+        }
+        let descriptor_table = unsafe { slice::from_raw_parts(gdt_base, gdt_size.into()) };
+        let descriptor = descriptor_table[index];
+
+        let base_high = (descriptor & 0xFF00_0000_0000_0000) >> 32;
+        let base_mid = (descriptor & 0x0000_00FF_0000_0000) >> 16;
+        let base_low = (descriptor & 0x0000_0000_FFFF_0000) >> 16;
+        let segment_base = (base_high | base_mid | base_low) & 0xFFFFFFFF;
+        let virtaddr = unsafe {
+            MMArch::phys_2_virt(PhysAddr::new(segment_base as usize))
+                .unwrap()
+                .data() as u64
+        };
+        return virtaddr;
+    }
+}
+
+pub struct VmxAsm;
+
+impl VmxAsm {
+    pub fn vmclear(phys_addr: PhysAddr) {
+        debug!("vmclear addr {phys_addr:?}");
+        match unsafe { x86::bits64::vmx::vmclear(phys_addr.data() as u64) } {
+            Ok(_) => {}
+            Err(e) => {
+                panic!("[VMX] vmclear failed! reason: {e:?}");
+            }
+        }
+    }
+
+    pub fn vmcs_load(phys_addr: PhysAddr) {
+        match unsafe { x86::bits64::vmx::vmptrld(phys_addr.data() as u64) } {
+            Ok(_) => {}
+            Err(e) => {
+                panic!("[VMX] vmptrld failed! reason: {e:?}");
+            }
+        }
+    }
+
+    /// vmwrite the current VMCS.
+    pub fn vmx_vmwrite(vmcs_field: u32, value: u64) {
+        unsafe {
+            x86::bits64::vmx::vmwrite(vmcs_field, value)
+                .unwrap_or_else(|_| panic!("vmcs_field: {:x} vmx_write fail", vmcs_field))
+        }
+    }
+
+    /// vmread the current VMCS.
+    pub fn vmx_vmread(vmcs_field: u32) -> u64 {
+        unsafe { x86::bits64::vmx::vmread(vmcs_field).expect("vmx_read fail: ") }
+    }
+
+    pub fn kvm_cpu_vmxon(phys_addr: PhysAddr) -> Result<(), SystemError> {
+        unsafe {
+            let mut cr4 = cr4();
+            cr4.insert(Cr4::CR4_ENABLE_VMX);
+            cr4_write(cr4);
+
+            Self::vmx_set_lock_bit()?;
+            Self::vmx_set_cr0_bits();
+            Self::vmx_set_cr4_bits();
+            debug!("vmxon addr {phys_addr:?}");
+
+            vmxon(phys_addr.data() as u64).expect("[VMX] vmxon failed! reason");
+
+            barrier::mfence();
+
+            Ok(())
+        }
+    }
+
+    #[allow(dead_code)]
+    const VMX_VPID_EXTENT_INDIVIDUAL_ADDR: u64 = 0;
+    const VMX_VPID_EXTENT_SINGLE_CONTEXT: u64 = 1;
+    #[allow(dead_code)]
+    const VMX_VPID_EXTENT_ALL_CONTEXT: u64 = 2;
+    #[allow(dead_code)]
+    const VMX_VPID_EXTENT_SINGLE_NON_GLOBAL: u64 = 3;
+    #[allow(dead_code)]
+    const VMX_EPT_EXTENT_CONTEXT: u64 = 1;
+    const VMX_EPT_EXTENT_GLOBAL: u64 = 2;
+    #[allow(dead_code)]
+    const VMX_EPT_EXTENT_SHIFT: u64 = 24;
+
+    pub fn ept_sync_global() {
+        Self::invept(Self::VMX_EPT_EXTENT_GLOBAL, 0, 0);
+    }
+    #[allow(dead_code)]
+    pub fn ept_sync_context(eptp: u64) {
+        if vmx_info().has_vmx_invept_context() {
+            Self::invept(Self::VMX_EPT_EXTENT_CONTEXT, eptp, 0);
+        } else {
+            Self::ept_sync_global();
+        }
+    }
+
+    pub fn sync_vcpu_single(vpid: u16) {
+        if vpid == 0 {
+            return;
+        }
+
+        Self::invvpid(Self::VMX_VPID_EXTENT_SINGLE_CONTEXT, vpid, 0)
+    }
+
+    pub fn sync_vcpu_global() {
+        Self::invvpid(Self::VMX_VPID_EXTENT_ALL_CONTEXT, 0, 0);
+    }
+
+    #[inline(always)]
+    fn invept(ext: u64, eptp: u64, gpa: u64) {
+        #[repr(C)]
+        struct InveptDescriptor {
+            eptp: u64,
+            gpa: u64,
+        }
+
+        let descriptor = InveptDescriptor { eptp, gpa };
+
+        unsafe {
+            asm!(
+                "invept {0}, [{1}]",
+                in(reg) ext,
+                in(reg) &descriptor,
+                options(nostack)
+            );
+        }
+    }
+
+    #[inline(always)]
+    fn invvpid(ext: u64, vpid: u16, gva: u64) {
+        #[repr(C)]
+        struct InvvpidDescriptor {
+            vpid: u16,
+            rsvd: u64,
+            gva: u64,
+        }
+
+        let descriptor = InvvpidDescriptor { vpid, rsvd: 0, gva };
+
+        unsafe {
+            asm!(
+                "invvpid {0}, [{1}]",
+                in(reg) ext,
+                in(reg) &descriptor,
+                options(nostack)
+            );
+        }
+    }
+
+    /// Set the mandatory bits in CR4 and clear bits that are mandatory zero
+    /// (Intel Manual: 24.8 Restrictions on VMX Operation)
+    fn vmx_set_cr4_bits() {
+        let ia32_vmx_cr4_fixed0 = unsafe { rdmsr(IA32_VMX_CR4_FIXED0) };
+        let ia32_vmx_cr4_fixed1 = unsafe { rdmsr(IA32_VMX_CR4_FIXED1) };
+
+        let mut cr4 = unsafe { cr4() };
+
+        cr4 |= Cr4::from_bits_truncate(ia32_vmx_cr4_fixed0 as usize);
+        cr4 &= Cr4::from_bits_truncate(ia32_vmx_cr4_fixed1 as usize);
+
+        unsafe { cr4_write(cr4) };
+    }
+
+    /// Check if we need to set bits in IA32_FEATURE_CONTROL
+    // (Intel Manual: 24.7 Enabling and Entering VMX Operation)
+    fn vmx_set_lock_bit() -> Result<(), SystemError> {
+        const VMX_LOCK_BIT: u64 = 1 << 0;
+        const VMXON_OUTSIDE_SMX: u64 = 1 << 2;
+
+        let ia32_feature_control = unsafe { rdmsr(IA32_FEATURE_CONTROL) };
+
+        if (ia32_feature_control & VMX_LOCK_BIT) == 0 {
+            unsafe {
+                wrmsr(
+                    IA32_FEATURE_CONTROL,
+                    VMXON_OUTSIDE_SMX | VMX_LOCK_BIT | ia32_feature_control,
+                )
+            };
+        } else if (ia32_feature_control & VMXON_OUTSIDE_SMX) == 0 {
+            return Err(SystemError::EPERM);
+        }
+
+        Ok(())
+    }
+
+    /// Set the mandatory bits in CR0 and clear bits that are mandatory zero
+    /// (Intel Manual: 24.8 Restrictions on VMX Operation)
+    fn vmx_set_cr0_bits() {
+        let ia32_vmx_cr0_fixed0 = unsafe { rdmsr(IA32_VMX_CR0_FIXED0) };
+        let ia32_vmx_cr0_fixed1 = unsafe { rdmsr(IA32_VMX_CR0_FIXED1) };
+
+        let mut cr0 = unsafe { cr0() };
+
+        cr0 |= Cr0::from_bits_truncate(ia32_vmx_cr0_fixed0 as usize);
+        cr0 &= Cr0::from_bits_truncate(ia32_vmx_cr0_fixed1 as usize);
+
+        unsafe { cr0_write(cr0) };
+    }
+}
+
+#[no_mangle]
+unsafe extern "C" fn vmx_vmlaunch() {
+    if let Err(e) = x86::bits64::vmx::vmlaunch() {
+        error!(
+            "vmx_launch fail: {:?}, err code {}",
+            e,
+            VmxAsm::vmx_vmread(ro::VM_INSTRUCTION_ERROR)
+        );
+    }
+}
+
+bitflags! {
+    pub struct IntrInfo: u32 {
+        const INTR_INFO_VECTOR_MASK = 0xff;
+        const INTR_INFO_INTR_TYPE_MASK = 0x700;
+        const INTR_INFO_DELIVER_CODE_MASK = 0x800;
+        const INTR_INFO_UNBLOCK_NMI = 0x1000;
+        const INTR_INFO_VALID_MASK = 0x80000000;
+        const INTR_INFO_RESVD_BITS_MASK = 0x7ffff000;
+    }
+
+    pub struct IntrType: u32 {
+        /// external interrupt
+        const INTR_TYPE_EXT_INTR = (0 << 8);
+        /// reserved
+        const INTR_TYPE_RESERVED = (1 << 8);
+        /// NMI
+        const INTR_TYPE_NMI_INTR = (2 << 8);
+        /// processor exception
+        const INTR_TYPE_HARD_EXCEPTION = (3 << 8);
+        /// software interrupt
+        const INTR_TYPE_SOFT_INTR = (4 << 8);
+        /// ICE breakpoint - undocumented
+        const INTR_TYPE_PRIV_SW_EXCEPTION = (5 << 8);
+        /// software exception
+        const INTR_TYPE_SOFT_EXCEPTION = (6 << 8);
+        /// other event
+        const INTR_TYPE_OTHER_EVENT = (7 << 8);
+    }
+
+    pub struct MiscEnable: u64 {
+        const MSR_IA32_MISC_ENABLE_FAST_STRING = 1 << 0;
+        const MSR_IA32_MISC_ENABLE_TCC = 1 << 1;
+        const MSR_IA32_MISC_ENABLE_EMON = 1 << 7;
+        const MSR_IA32_MISC_ENABLE_BTS_UNAVAIL = 1 << 11;
+        const MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL = 1 << 12;
+        const MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP = 1 << 16;
+        const MSR_IA32_MISC_ENABLE_MWAIT = 1 << 18;
+        const MSR_IA32_MISC_ENABLE_LIMIT_CPUID= 1 << 22;
+        const MSR_IA32_MISC_ENABLE_XTPR_DISABLE = 1 << 23;
+        const MSR_IA32_MISC_ENABLE_XD_DISABLE = 1 << 34;
+    }
+
+    pub struct ArchCapabilities: u64 {
+        /// Not susceptible to Meltdown
+        const ARCH_CAP_RDCL_NO = 1 << 0;
+        /// Enhanced IBRS support
+        const ARCH_CAP_IBRS_ALL = 1 << 1;
+        /// RET may use alternative branch predictors
+        const ARCH_CAP_RSBA	= 1 << 2;
+        /// Skip L1D flush on vmentry
+        const ARCH_CAP_SKIP_VMENTRY_L1DFLUSH = 1 << 3;
+        ///
+        /// Not susceptible to Speculative Store Bypass
+        /// attack, so no Speculative Store Bypass
+        /// control required.
+        ///
+        const ARCH_CAP_SSB_NO = 1 << 4;
+        /// Not susceptible to
+        /// Microarchitectural Data
+        /// Sampling (MDS) vulnerabilities.
+        const ARCH_CAP_MDS_NO = 1 << 5;
+        /// The processor is not susceptible to a
+        /// machine check error due to modifying the
+        /// code page size along with either the
+        /// physical address or cache type
+        /// without TLB invalidation.
+        const ARCH_CAP_PSCHANGE_MC_NO = 1 << 6;
+        /// MSR for TSX control is available.
+        const ARCH_CAP_TSX_CTRL_MSR = 1 << 7;
+        /// Not susceptible to
+        /// TSX Async Abort (TAA) vulnerabilities.
+        const ARCH_CAP_TAA_NO = 1 << 8;
+        /// Not susceptible to SBDR and SSDP
+        /// variants of Processor MMIO stale data
+        /// vulnerabilities.
+        const ARCH_CAP_SBDR_SSDP_NO = 1 << 13;
+        /// Not susceptible to FBSDP variant of
+        /// Processor MMIO stale data
+        /// vulnerabilities.
+        const ARCH_CAP_FBSDP_NO = 1 << 14;
+        /// Not susceptible to PSDP variant of
+        /// Processor MMIO stale data
+        /// vulnerabilities.
+        const ARCH_CAP_PSDP_NO = 1 << 15;
+        /// VERW clears CPU fill buffer
+        /// even on MDS_NO CPUs.
+        const ARCH_CAP_FB_CLEAR = 1 << 17;
+        /// MSR_IA32_MCU_OPT_CTRL[FB_CLEAR_DIS]
+        /// bit available to control VERW
+        /// behavior.
+        const ARCH_CAP_FB_CLEAR_CTRL = 1 << 18;
+        /// Indicates RET may use predictors
+        /// other than the RSB. With eIBRS
+        /// enabled predictions in kernel mode
+        /// are restricted to targets in
+        /// kernel.
+        const ARCH_CAP_RRSBA = 1 << 19;
+        /// Not susceptible to Post-Barrier
+        /// Return Stack Buffer Predictions.
+        const ARCH_CAP_PBRSB_NO = 1 << 24;
+        /// CPU is vulnerable to Gather
+        /// Data Sampling (GDS) and
+        /// has controls for mitigation.
+        const ARCH_CAP_GDS_CTRL = 1 << 25;
+        /// CPU is not vulnerable to Gather
+        /// Data Sampling (GDS).
+        const ARCH_CAP_GDS_NO = 1 << 26;
+        /// IA32_XAPIC_DISABLE_STATUS MSR
+        /// supported
+        const ARCH_CAP_XAPIC_DISABLE = 1 << 21;
+
+        const KVM_SUPPORTED_ARCH_CAP = ArchCapabilities::ARCH_CAP_RDCL_NO.bits
+        | ArchCapabilities::ARCH_CAP_IBRS_ALL.bits
+        | ArchCapabilities::ARCH_CAP_RSBA.bits
+        | ArchCapabilities::ARCH_CAP_SKIP_VMENTRY_L1DFLUSH.bits
+        | ArchCapabilities::ARCH_CAP_SSB_NO.bits
+        | ArchCapabilities::ARCH_CAP_MDS_NO.bits
+        | ArchCapabilities::ARCH_CAP_PSCHANGE_MC_NO.bits
+        | ArchCapabilities::ARCH_CAP_TSX_CTRL_MSR.bits
+        | ArchCapabilities::ARCH_CAP_TAA_NO.bits
+        | ArchCapabilities::ARCH_CAP_SBDR_SSDP_NO.bits
+        | ArchCapabilities::ARCH_CAP_FBSDP_NO.bits
+        | ArchCapabilities::ARCH_CAP_PSDP_NO.bits
+        | ArchCapabilities::ARCH_CAP_FB_CLEAR.bits
+        | ArchCapabilities::ARCH_CAP_RRSBA.bits
+        | ArchCapabilities::ARCH_CAP_PBRSB_NO.bits
+        | ArchCapabilities::ARCH_CAP_GDS_NO.bits;
+    }
+}
+
+#[derive(Debug, Default, Clone)]
+pub struct MsrData {
+    pub host_initiated: bool,
+    pub index: u32,
+    pub data: u64,
+}
+
+#[repr(C, align(16))]
+#[derive(Debug, Default, Copy, Clone)]
+pub struct VmxMsrEntry {
+    pub index: u32,
+    pub reserved: u32,
+    pub data: u64,
+}
+
+#[allow(dead_code)]
+pub mod hyperv {
+    /* Hyper-V specific model specific registers (MSRs) */
+
+    /* MSR used to identify the guest OS. */
+    pub const HV_X64_MSR_GUEST_OS_ID: u32 = 0x40000000;
+
+    /* MSR used to setup pages used to communicate with the hypervisor. */
+    pub const HV_X64_MSR_HYPERCALL: u32 = 0x40000001;
+
+    /* MSR used to provide vcpu index */
+    pub const HV_REGISTER_VP_INDEX: u32 = 0x40000002;
+
+    /* MSR used to reset the guest OS. */
+    pub const HV_X64_MSR_RESET: u32 = 0x40000003;
+
+    /* MSR used to provide vcpu runtime in 100ns units */
+    pub const HV_X64_MSR_VP_RUNTIME: u32 = 0x40000010;
+
+    /* MSR used to read the per-partition time reference counter */
+    pub const HV_REGISTER_TIME_REF_COUNT: u32 = 0x40000020;
+
+    /* A partition's reference time stamp counter (TSC) page */
+    pub const HV_REGISTER_REFERENCE_TSC: u32 = 0x40000021;
+
+    /* MSR used to retrieve the TSC frequency */
+    pub const HV_X64_MSR_TSC_FREQUENCY: u32 = 0x40000022;
+
+    /* MSR used to retrieve the local APIC timer frequency */
+    pub const HV_X64_MSR_APIC_FREQUENCY: u32 = 0x40000023;
+
+    /* Define the virtual APIC registers */
+    pub const HV_X64_MSR_EOI: u32 = 0x40000070;
+    pub const HV_X64_MSR_ICR: u32 = 0x40000071;
+    pub const HV_X64_MSR_TPR: u32 = 0x40000072;
+    pub const HV_X64_MSR_VP_ASSIST_PAGE: u32 = 0x40000073;
+
+    /* Define synthetic interrupt controller model specific registers. */
+    pub const HV_REGISTER_SCONTROL: u32 = 0x40000080;
+    pub const HV_REGISTER_SVERSION: u32 = 0x40000081;
+    pub const HV_REGISTER_SIEFP: u32 = 0x40000082;
+    pub const HV_REGISTER_SIMP: u32 = 0x40000083;
+    pub const HV_REGISTER_EOM: u32 = 0x40000084;
+    pub const HV_REGISTER_SINT0: u32 = 0x40000090;
+    pub const HV_REGISTER_SINT1: u32 = 0x40000091;
+    pub const HV_REGISTER_SINT2: u32 = 0x40000092;
+    pub const HV_REGISTER_SINT3: u32 = 0x40000093;
+    pub const HV_REGISTER_SINT4: u32 = 0x40000094;
+    pub const HV_REGISTER_SINT5: u32 = 0x40000095;
+    pub const HV_REGISTER_SINT6: u32 = 0x40000096;
+    pub const HV_REGISTER_SINT7: u32 = 0x40000097;
+    pub const HV_REGISTER_SINT8: u32 = 0x40000098;
+    pub const HV_REGISTER_SINT9: u32 = 0x40000099;
+    pub const HV_REGISTER_SINT10: u32 = 0x4000009A;
+    pub const HV_REGISTER_SINT11: u32 = 0x4000009B;
+    pub const HV_REGISTER_SINT12: u32 = 0x4000009C;
+    pub const HV_REGISTER_SINT13: u32 = 0x4000009D;
+    pub const HV_REGISTER_SINT14: u32 = 0x4000009E;
+    pub const HV_REGISTER_SINT15: u32 = 0x4000009F;
+
+    /*
+     * Define synthetic interrupt controller model specific registers for
+     * nested hypervisor.
+     */
+    pub const HV_REGISTER_NESTED_SCONTROL: u32 = 0x40001080;
+    pub const HV_REGISTER_NESTED_SVERSION: u32 = 0x40001081;
+    pub const HV_REGISTER_NESTED_SIEFP: u32 = 0x40001082;
+    pub const HV_REGISTER_NESTED_SIMP: u32 = 0x40001083;
+    pub const HV_REGISTER_NESTED_EOM: u32 = 0x40001084;
+    pub const HV_REGISTER_NESTED_SINT0: u32 = 0x40001090;
+
+    /*
+     * Synthetic Timer MSRs. Four timers per vcpu.
+     */
+    pub const HV_REGISTER_STIMER0_CONFIG: u32 = 0x400000B0;
+    pub const HV_REGISTER_STIMER0_COUNT: u32 = 0x400000B1;
+    pub const HV_REGISTER_STIMER1_CONFIG: u32 = 0x400000B2;
+    pub const HV_REGISTER_STIMER1_COUNT: u32 = 0x400000B3;
+    pub const HV_REGISTER_STIMER2_CONFIG: u32 = 0x400000B4;
+    pub const HV_REGISTER_STIMER2_COUNT: u32 = 0x400000B5;
+    pub const HV_REGISTER_STIMER3_CONFIG: u32 = 0x400000B6;
+    pub const HV_REGISTER_STIMER3_COUNT: u32 = 0x400000B7;
+
+    /* Hyper-V guest idle MSR */
+    pub const HV_X64_MSR_GUEST_IDLE: u32 = 0x400000F0;
+
+    /* Hyper-V guest crash notification MSR's */
+    pub const HV_REGISTER_CRASH_P0: u32 = 0x40000100;
+    pub const HV_REGISTER_CRASH_P1: u32 = 0x40000101;
+    pub const HV_REGISTER_CRASH_P2: u32 = 0x40000102;
+    pub const HV_REGISTER_CRASH_P3: u32 = 0x40000103;
+    pub const HV_REGISTER_CRASH_P4: u32 = 0x40000104;
+    pub const HV_REGISTER_CRASH_CTL: u32 = 0x40000105;
+
+    /* TSC emulation after migration */
+    pub const HV_X64_MSR_REENLIGHTENMENT_CONTROL: u32 = 0x40000106;
+    pub const HV_X64_MSR_TSC_EMULATION_CONTROL: u32 = 0x40000107;
+    pub const HV_X64_MSR_TSC_EMULATION_STATUS: u32 = 0x40000108;
+
+    /* TSC invariant control */
+    pub const HV_X64_MSR_TSC_INVARIANT_CONTROL: u32 = 0x40000118;
+
+    /*
+     * The defines related to the synthetic debugger are required by KDNet, but
+     * they are not documented in the Hyper-V TLFS because the synthetic debugger
+     * functionality has been deprecated and is subject to removal in future
+     * versions of Windows.
+     */
+    pub const HYPERV_CPUID_SYNDBG_VENDOR_AND_MAX_FUNCTIONS: u32 = 0x40000080;
+    pub const HYPERV_CPUID_SYNDBG_INTERFACE: u32 = 0x40000081;
+    pub const HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES: u32 = 0x40000082;
+
+    /*
+     * Hyper-V synthetic debugger platform capabilities
+     * These are HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES.EAX bits.
+     */
+    pub const HV_X64_SYNDBG_CAP_ALLOW_KERNEL_DEBUGGING: u32 = 1 << 1;
+
+    /* Hyper-V Synthetic debug options MSR */
+    pub const HV_X64_MSR_SYNDBG_CONTROL: u32 = 0x400000F1;
+    pub const HV_X64_MSR_SYNDBG_STATUS: u32 = 0x400000F2;
+    pub const HV_X64_MSR_SYNDBG_SEND_BUFFER: u32 = 0x400000F3;
+    pub const HV_X64_MSR_SYNDBG_RECV_BUFFER: u32 = 0x400000F4;
+    pub const HV_X64_MSR_SYNDBG_PENDING_BUFFER: u32 = 0x400000F5;
+    pub const HV_X64_MSR_SYNDBG_OPTIONS: u32 = 0x400000FF;
+}
+
+#[allow(dead_code)]
+pub mod kvm_msr {
+    pub const MSR_KVM_WALL_CLOCK: u32 = 0x11;
+    pub const MSR_KVM_SYSTEM_TIME: u32 = 0x12;
+
+    /* Custom MSRs falls in the range 0x4b564d00-0x4b564dff */
+    pub const MSR_KVM_WALL_CLOCK_NEW: u32 = 0x4b564d00;
+    pub const MSR_KVM_SYSTEM_TIME_NEW: u32 = 0x4b564d01;
+    pub const MSR_KVM_ASYNC_PF_EN: u32 = 0x4b564d02;
+    pub const MSR_KVM_STEAL_TIME: u32 = 0x4b564d03;
+    pub const MSR_KVM_PV_EOI_EN: u32 = 0x4b564d04;
+    pub const MSR_KVM_POLL_CONTROL: u32 = 0x4b564d05;
+    pub const MSR_KVM_ASYNC_PF_INT: u32 = 0x4b564d06;
+    pub const MSR_KVM_ASYNC_PF_ACK: u32 = 0x4b564d07;
+    pub const MSR_KVM_MIGRATION_CONTROL: u32 = 0x4b564d08;
+
+    pub const PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR: u64 = 0x00000016;
+    pub const CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR: u64 = 0x0401e172;
+    pub const VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR: u64 = 0x00036dff;
+    pub const VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR: u64 = 0x000011ff;
+}
+
+#[derive(Debug, PartialEq, Clone, Copy)]
+pub enum VcpuSegment {
+    ES,
+    CS,
+    SS,
+    DS,
+    FS,
+    GS,
+    TR,
+    LDTR,
+}
+
+#[derive(Debug, PartialEq, Clone, Copy)]
+pub enum SegmentCacheField {
+    SEL = 0,
+    BASE = 1,
+    LIMIT = 2,
+    AR = 3,
+    NR = 4,
+}

+ 59 - 0
kernel/src/arch/x86_64/vm/cpuid.rs

@@ -0,0 +1,59 @@
+use alloc::vec::Vec;
+
+#[derive(Debug, Default, Clone, Copy)]
+#[allow(dead_code)]
+pub struct KvmCpuidEntry2 {
+    pub function: u32,
+    pub index: u32,
+    pub flags: KvmCpuidFlag,
+    pub eax: u32,
+    pub ebx: u32,
+    pub ecx: u32,
+    pub edx: u32,
+    padding: [u32; 3],
+}
+
+impl KvmCpuidEntry2 {
+    pub fn find(
+        entries: &Vec<KvmCpuidEntry2>,
+        function: u32,
+        index: Option<u32>,
+    ) -> Option<KvmCpuidEntry2> {
+        for e in entries {
+            if e.function != function {
+                continue;
+            }
+
+            if !e
+                .flags
+                .contains(KvmCpuidFlag::KVM_CPUID_FLAG_SIGNIFCANT_INDEX)
+                || Some(e.index) == index
+            {
+                return Some(*e);
+            }
+
+            if index.is_none() {
+                return Some(*e);
+            }
+        }
+
+        None
+    }
+}
+
+bitflags! {
+    pub struct KvmCpuidFlag: u32 {
+        /// 表示CPUID函数的输入索引值是重要的,它会影响CPUID函数的行为或返回值
+        const KVM_CPUID_FLAG_SIGNIFCANT_INDEX = 1 << 0;
+        /// 表示CPUID函数是有状态的,即它的行为可能受到先前CPUID函数调用的影响
+        const KVM_CPUID_FLAG_STATEFUL_FUNC = 1 << 1;
+        /// 表示CPUID函数的状态应该在下一次CPUID函数调用中读取
+        const KVM_CPUID_FLAG_STATE_READ_NEXT = 1 << 2;
+    }
+}
+
+impl Default for KvmCpuidFlag {
+    fn default() -> Self {
+        Self::empty()
+    }
+}

+ 1 - 0
kernel/src/arch/x86_64/vm/exit.rs

@@ -0,0 +1 @@
+

+ 62 - 0
kernel/src/arch/x86_64/vm/kvm_host/lapic.rs

@@ -0,0 +1,62 @@
+use alloc::boxed::Box;
+
+use crate::{
+    arch::kvm_arch_ops,
+    virt::vm::kvm_host::{vcpu::VirtCpu, Vm},
+};
+
+const APIC_DEFAULT_PHYS_BASE: u64 = 0xfee00000;
+#[allow(dead_code)]
+const MSR_IA32_APICBASE: u64 = 0x0000001b;
+const MSR_IA32_APICBASE_BSP: u64 = 1 << 8;
+const MSR_IA32_APICBASE_ENABLE: u64 = 1 << 11;
+#[allow(dead_code)]
+const MSR_IA32_APICBASE_BASE: u64 = 0xfffff << 12;
+
+#[derive(Debug)]
+pub struct KvmLapic {
+    pub apicv_active: bool,
+    pub regs: Box<[u8]>,
+}
+
+impl VirtCpu {
+    pub fn lapic_reset(&mut self, vm: &Vm, init_event: bool) {
+        kvm_arch_ops().apicv_pre_state_restore(self);
+
+        if !init_event {
+            let mut msr_val = APIC_DEFAULT_PHYS_BASE | MSR_IA32_APICBASE_ENABLE;
+            if vm.arch.bsp_vcpu_id == self.vcpu_id {
+                msr_val |= MSR_IA32_APICBASE_BSP;
+            }
+
+            self.lapic_set_base(msr_val);
+        }
+
+        if self.arch.apic.is_none() {
+            return;
+        }
+
+        todo!()
+    }
+
+    fn lapic_set_base(&mut self, value: u64) {
+        let old_val = self.arch.apic_base;
+        let apic = self.arch.apic.as_ref();
+
+        self.arch.apic_base = value;
+
+        if (old_val ^ value) & MSR_IA32_APICBASE_ENABLE != 0 {
+            // TODO: kvm_update_cpuid_runtime(vcpu);
+        }
+
+        if apic.is_none() {
+            return;
+        }
+
+        if (old_val ^ value) & MSR_IA32_APICBASE_ENABLE != 0 {
+            // if value & MSR_IA32_APICBASE_ENABLE != 0 {}
+        }
+
+        todo!()
+    }
+}

+ 463 - 0
kernel/src/arch/x86_64/vm/kvm_host/mod.rs

@@ -0,0 +1,463 @@
+use core::{fmt::Debug, sync::atomic::AtomicU32};
+
+use alloc::{boxed::Box, vec::Vec};
+use bit_field::BitField;
+use bitmap::{traits::BitMapOps, AllocBitmap};
+use system_error::SystemError;
+use x86::{
+    bits64::rflags::RFlags,
+    controlregs::{Cr0, Cr4},
+    dtables::DescriptorTablePointer,
+};
+use x86_64::registers::control::EferFlags;
+
+use crate::{
+    smp::cpu::ProcessorId,
+    virt::vm::{
+        kvm_host::{
+            vcpu::VirtCpu, Vm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID, KVM_USERSAPCE_IRQ_SOURCE_ID,
+        },
+        user_api::UapiKvmSegment,
+    },
+};
+
+use crate::arch::VirtCpuArch;
+
+use super::{
+    asm::{MsrData, VcpuSegment, VmxMsrEntry},
+    vmx::{exit::ExitFastpathCompletion, vmx_info},
+    x86_kvm_manager, x86_kvm_ops,
+};
+
+pub mod lapic;
+pub mod page;
+pub mod vcpu;
+#[allow(dead_code)]
+pub const TSS_IOPB_BASE_OFFSET: usize = 0x66;
+pub const TSS_BASE_SIZE: usize = 0x68;
+pub const TSS_IOPB_SIZE: usize = 65536 / 8;
+pub const TSS_REDIRECTION_SIZE: usize = 256 / 8;
+pub const RMODE_TSS_SIZE: usize = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE + TSS_IOPB_SIZE + 1;
+
+pub const KVM_PFN_NOSLOT: u64 = 0x1 << 63;
+
+#[allow(dead_code)]
+#[derive(Debug, Default)]
+pub struct X86KvmArch {
+    /// 中断芯片模式
+    pub irqchip_mode: KvmIrqChipMode,
+    /// 负责引导(bootstrap)kvm的vcpu_id
+    bsp_vcpu_id: usize,
+    pub pause_in_guest: bool,
+    pub cstate_in_guest: bool,
+    pub mwait_in_guest: bool,
+    pub hlt_in_guest: bool,
+    pub bus_lock_detection_enabled: bool,
+    irq_sources_bitmap: u64,
+    default_tsc_khz: u64,
+    guest_can_read_msr_platform_info: bool,
+    apicv_inhibit_reasons: usize,
+
+    pub max_vcpu_ids: usize,
+
+    pub notify_vmexit_flags: NotifyVmExitFlags,
+    pub notify_window: u32,
+
+    msr_fliter: Option<Box<KvmX86MsrFilter>>,
+
+    pub noncoherent_dma_count: AtomicU32,
+
+    pub active_mmu_pages: Vec<u64>,
+
+    pub n_max_mmu_pages: usize,
+    pub n_used_mmu_pages: usize,
+}
+
+impl X86KvmArch {
+    pub fn init(kvm_type: usize) -> Result<Self, SystemError> {
+        if kvm_type != 0 {
+            return Err(SystemError::EINVAL);
+        }
+        let mut arch = x86_kvm_ops().vm_init();
+
+        // 设置中断源位图
+        arch.irq_sources_bitmap
+            .set_bit(KVM_USERSAPCE_IRQ_SOURCE_ID, true)
+            .set_bit(KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID, true);
+
+        arch.default_tsc_khz = x86_kvm_manager().max_tsc_khz;
+        arch.guest_can_read_msr_platform_info = true;
+
+        arch.apicv_init();
+        Ok(arch)
+    }
+
+    fn apicv_init(&mut self) {
+        self.apicv_inhibit_reasons
+            .set_bit(KvmApicvInhibit::ABSENT, true);
+
+        if !vmx_info().enable_apicv {
+            self.apicv_inhibit_reasons
+                .set_bit(KvmApicvInhibit::DISABLE, true);
+        }
+    }
+
+    pub fn msr_allowed(&self, msr: u32, ftype: MsrFilterType) -> bool {
+        // x2APIC MSRs
+        if (0x800..=0x8ff).contains(&msr) {
+            return true;
+        }
+
+        if let Some(msr_filter) = &self.msr_fliter {
+            let mut allowed = msr_filter.default_allow;
+
+            for i in 0..msr_filter.count as usize {
+                let range = &msr_filter.ranges[i];
+                let start = range.base;
+                let end = start + range.nmsrs;
+                let flags = range.flags;
+                let bitmap = &range.bitmap;
+                if msr >= start && msr < end && flags.contains(ftype) {
+                    allowed = bitmap.get((msr - start) as usize).unwrap_or(false);
+                    break;
+                }
+            }
+
+            return allowed;
+        } else {
+            return true;
+        }
+    }
+}
+
+#[derive(Debug, Clone, Copy, PartialEq)]
+#[allow(dead_code)]
+pub enum KvmIrqChipMode {
+    None,
+    Kernel,
+    Split,
+}
+
+impl Default for KvmIrqChipMode {
+    fn default() -> Self {
+        Self::None
+    }
+}
+#[allow(dead_code)]
+pub trait KvmInitFunc {
+    fn hardware_setup(&self) -> Result<(), SystemError>;
+    fn handle_intel_pt_intr(&self) -> u32;
+    fn runtime_funcs(&self) -> &'static dyn KvmFunc;
+}
+
+pub trait KvmFunc: Send + Sync + Debug {
+    /// 返回该硬件支持的名字,例如“Vmx”
+    fn name(&self) -> &'static str;
+
+    /// 启用硬件支持
+    fn hardware_enable(&self) -> Result<(), SystemError>;
+
+    fn vm_init(&self) -> X86KvmArch;
+
+    fn vcpu_precreate(&self, vm: &mut Vm) -> Result<(), SystemError>;
+
+    fn vcpu_create(&self, vcpu: &mut VirtCpu, vm: &Vm);
+
+    fn vcpu_load(&self, vcpu: &mut VirtCpu, cpu: ProcessorId);
+
+    fn load_mmu_pgd(&self, vcpu: &mut VirtCpu, vm: &Vm, root_hpa: u64, root_level: u32);
+
+    fn cache_reg(&self, vcpu: &mut VirtCpuArch, reg: KvmReg);
+
+    fn apicv_pre_state_restore(&self, vcpu: &mut VirtCpu);
+
+    fn set_msr(&self, vcpu: &mut VirtCpu, msr: MsrData) -> Result<(), SystemError>;
+
+    fn set_rflags(&self, vcpu: &mut VirtCpu, rflags: RFlags);
+
+    fn get_rflags(&self, vcpu: &mut VirtCpu) -> RFlags;
+
+    fn set_cr0(&self, vm: &Vm, vcpu: &mut VirtCpu, cr0: Cr0);
+
+    fn is_vaild_cr0(&self, vcpu: &VirtCpu, cr0: Cr0) -> bool;
+
+    fn set_cr4(&self, vcpu: &mut VirtCpu, cr4: Cr4);
+
+    fn post_set_cr3(&self, vcpu: &VirtCpu, cr3: u64);
+
+    fn is_vaild_cr4(&self, vcpu: &VirtCpu, cr4: Cr4) -> bool;
+
+    fn set_efer(&self, vcpu: &mut VirtCpu, efer: EferFlags);
+
+    fn set_segment(&self, vcpu: &mut VirtCpu, var: &mut UapiKvmSegment, seg: VcpuSegment);
+
+    fn get_segment(
+        &self,
+        vcpu: &mut VirtCpu,
+        var: UapiKvmSegment,
+        seg: VcpuSegment,
+    ) -> UapiKvmSegment;
+
+    /// 这个函数不会用到VCPU,这里拿到只是为了确保上一层拿到锁
+    fn get_idt(&self, _vcpu: &mut VirtCpu, dt: &mut DescriptorTablePointer<u8>);
+
+    fn set_idt(&self, _vcpu: &mut VirtCpu, dt: &DescriptorTablePointer<u8>);
+
+    fn get_gdt(&self, _vcpu: &mut VirtCpu, dt: &mut DescriptorTablePointer<u8>);
+
+    fn set_gdt(&self, _vcpu: &mut VirtCpu, dt: &DescriptorTablePointer<u8>);
+
+    fn update_exception_bitmap(&self, vcpu: &mut VirtCpu);
+
+    fn vcpu_reset(&self, vcpu: &mut VirtCpu, vm: &Vm, init_event: bool);
+
+    fn has_emulated_msr(&self, msr: u32) -> bool;
+
+    fn get_msr_feature(&self, msr: &mut VmxMsrEntry) -> bool;
+
+    fn prepare_switch_to_guest(&self, vcpu: &mut VirtCpu);
+
+    fn flush_tlb_all(&self, vcpu: &mut VirtCpu);
+
+    fn vcpu_run(&self, vcpu: &mut VirtCpu) -> ExitFastpathCompletion;
+
+    fn handle_exit_irqoff(&self, vcpu: &mut VirtCpu);
+
+    fn handle_exit(
+        &self,
+        vcpu: &mut VirtCpu,
+        vm: &Vm,
+        fastpath: ExitFastpathCompletion,
+    ) -> Result<i32, SystemError>;
+}
+
+/// ## 中断抑制的原因位
+#[derive(Debug)]
+pub struct KvmApicvInhibit;
+
+#[allow(dead_code)]
+impl KvmApicvInhibit {
+    // Intel与AMD共用
+
+    /// APIC 加速功能被模块参数禁用,或者硬件不支持
+    pub const DISABLE: usize = 0;
+
+    /// Hyper-V 客户机正在使用 AutoEOI 功能,导致 APIC 加速被禁用。
+    pub const HYPERV: usize = 1;
+
+    /// 因为用户空间尚未启用内核或分裂的中断控制器,导致 APIC 加速被禁用。
+    pub const ABSENT: usize = 2;
+
+    /// KVM_GUESTDBG_BLOCKIRQ(一种调试措施,用于阻止该 vCPU 上的所有中断)被启用,以避免 AVIC/APICv 绕过此功能。
+    pub const BLOCKIRQ: usize = 3;
+
+    /// 当所有 vCPU 的 APIC ID 和 vCPU 的 1:1 映射被更改且 KVM 未应用其 x2APIC 热插拔修补程序时,APIC 加速被禁用。
+    pub const PHYSICAL_ID_ALIASED: usize = 4;
+
+    /// 当 vCPU 的 APIC ID 或 APIC 基址从其复位值更改时,首次禁用 APIC 加速。
+    pub const APIC_ID_MODIFIED: usize = 5;
+    /// 当 vCPU 的 APIC ID 或 APIC 基址从其复位值更改时,首次禁用 APIC 加速。
+    pub const APIC_BASE_MODIFIED: usize = 6;
+
+    // 仅仅对AMD适用
+
+    /// 当 vCPU 运行嵌套客户机时,AVIC 被禁用。因为与 APICv 不同,当 vCPU 运行嵌套时,该 vCPU 的同级无法使用门铃机制通过 AVIC 信号中断。
+    pub const NESTED: usize = 7;
+
+    ///  在 SVM 上,等待 IRQ 窗口的实现使用挂起的虚拟中断,而在 KVM 等待 IRQ 窗口时无法注入这些虚拟中断,因此在等待 IRQ 窗口时 AVIC 被禁用。
+    pub const IRQWIN: usize = 8;
+
+    /// PIT(i8254)的“重新注入”模式依赖于 EOI 拦截,而 AVIC 不支持边沿触发中断的 EOI 拦截。
+    pub const PIT_REINJ: usize = 9;
+
+    /// SEV 不支持 AVIC,因此 AVIC 被禁用。
+    pub const SEV: usize = 10;
+
+    /// 当所有带有有效 LDR 的 vCPU 之间的逻辑 ID 和 vCPU 的 1:1 映射被更改时,AVIC 被禁用。
+    pub const LOGICAL_ID_ALIASED: usize = 11;
+}
+
+#[derive(Debug)]
+pub struct KvmX86MsrFilter {
+    count: u8,
+    default_allow: bool,
+    ranges: Vec<KernelMsrRange>,
+}
+
+#[derive(Debug)]
+pub struct KernelMsrRange {
+    pub flags: MsrFilterType,
+    pub nmsrs: u32,
+    pub base: u32,
+    pub bitmap: AllocBitmap,
+}
+
+#[repr(C)]
+#[allow(dead_code)]
+pub struct PosixMsrFilterRange {
+    pub flags: u32,
+    pub nmsrs: u32,
+    pub base: u32,
+    pub bitmap: *const u8,
+}
+
+bitflags! {
+    pub struct MsrFilterType: u8 {
+        const KVM_MSR_FILTER_READ  = 1 << 0;
+        const KVM_MSR_FILTER_WRITE = 1 << 1;
+    }
+
+    pub struct NotifyVmExitFlags: u8 {
+        const KVM_X86_NOTIFY_VMEXIT_ENABLED = 1 << 0;
+        const KVM_X86_NOTIFY_VMEXIT_USER = 1 << 1;
+    }
+}
+
+impl Default for NotifyVmExitFlags {
+    fn default() -> Self {
+        NotifyVmExitFlags::empty()
+    }
+}
+
+#[derive(Debug, Clone, Copy)]
+pub enum KvmReg {
+    VcpuRegsRax = 0,
+    VcpuRegsRcx = 1,
+    VcpuRegsRdx = 2,
+    VcpuRegsRbx = 3,
+    VcpuRegsRsp = 4,
+    VcpuRegsRbp = 5,
+    VcpuRegsRsi = 6,
+    VcpuRegsRdi = 7,
+
+    VcpuRegsR8 = 8,
+    VcpuRegsR9 = 9,
+    VcpuRegsR10 = 10,
+    VcpuRegsR11 = 11,
+    VcpuRegsR12 = 12,
+    VcpuRegsR13 = 13,
+    VcpuRegsR14 = 14,
+    VcpuRegsR15 = 15,
+
+    VcpuRegsRip = 16,
+    NrVcpuRegs = 17,
+
+    //VcpuExregPdptr = NrVcpuRegs,
+    VcpuExregCr0,
+    VcpuExregCr3,
+    VcpuExregCr4,
+    VcpuExregRflags,
+    VcpuExregSegments,
+    VcpuExregExitInfo1, //EXITINFO1 provides the linear address of the memory operand.
+    VcpuExregExitInfo2, //EXITINFO2 provides the contents of the register operand.
+}
+
+bitflags! {
+    pub struct HFlags: u8 {
+        const HF_GUEST_MASK = 1 << 0; /* VCPU is in guest-mode */
+        const HF_SMM_MASK = 1 << 1;
+        const HF_SMM_INSIDE_NMI_MASK = 1 << 2;
+    }
+}
+
+/// ### 虚拟机的通用寄存器
+#[derive(Debug, Default, Clone, Copy)]
+#[repr(C)]
+pub struct KvmCommonRegs {
+    rax: u64,
+    rbx: u64,
+    rcx: u64,
+    rdx: u64,
+    rsi: u64,
+    rdi: u64,
+    rsp: u64,
+    rbp: u64,
+    r8: u64,
+    r9: u64,
+    r10: u64,
+    r11: u64,
+    r12: u64,
+    r13: u64,
+    r14: u64,
+    r15: u64,
+    rip: u64,
+    rflags: u64,
+}
+
+impl Vm {
+    pub fn vcpu_precreate(&mut self, id: usize) -> Result<(), SystemError> {
+        if self.arch.max_vcpu_ids == 0 {
+            self.arch.max_vcpu_ids = 1024 * 4;
+        }
+
+        if id >= self.arch.max_vcpu_ids {
+            return Err(SystemError::EINVAL);
+        }
+
+        return x86_kvm_ops().vcpu_precreate(self);
+    }
+}
+bitflags! {
+    pub struct EmulType: u32 {
+        const NO_DECODE = 1 << 0;
+        const TRAP_UD = 1 << 1;
+        const SKIP = 1 << 2;
+        const ALLOW_RETRY_PF = 1 << 3;
+        const TRAP_UD_FORCED = 1 << 4;
+        const VMWARE_GP = 1 << 5;
+        const PF = 1 << 6;
+        const COMPLETE_USER_EXIT = 1 << 7;
+        const WRITE_PF_TO_SP = 1 << 8;
+    }
+}
+#[allow(dead_code)]
+#[derive(Default, Debug)]
+///用于跟踪和记录VCPU的各种统计信息。
+pub struct KvmVcpuStat {
+    //pub generic: KvmVcpuStatGeneric,
+    pub pf_taken: u64,
+    pub pf_fixed: u64,
+    pub pf_emulate: u64,
+    pub pf_spurious: u64,
+    pub pf_fast: u64,
+    pub pf_mmio_spte_created: u64,
+    pub pf_guest: u64,
+    pub tlb_flush: u64,
+    pub invlpg: u64,
+    pub exits: u64,
+    pub io_exits: u64,
+    pub mmio_exits: u64,
+    pub signal_exits: u64,
+    pub irq_window_exits: u64,
+    pub nmi_window_exits: u64,
+    pub l1d_flush: u64,
+    pub halt_exits: u64,
+    pub request_irq_exits: u64,
+    pub irq_exits: u64,
+    pub host_state_reload: u64,
+    pub fpu_reload: u64,
+    pub insn_emulation: u64,
+    pub insn_emulation_fail: u64,
+    pub hypercalls: u64,
+    pub irq_injections: u64,
+    pub nmi_injections: u64,
+    pub req_event: u64,
+    pub nested_run: u64,
+    pub directed_yield_attempted: u64,
+    pub directed_yield_successful: u64,
+    pub preemption_reported: u64,
+    pub preemption_other: u64,
+    pub guest_mode: u64,
+    pub notify_window_exits: u64,
+}
+#[inline]
+/// 将 GFN 转换为 GPA
+pub fn gfn_to_gpa(gfn: u64) -> u64 {
+    gfn << 12
+}
+#[allow(dead_code)]
+#[inline]
+/// 将 GPA 转换为 GFN
+pub fn gpa_to_gfn(gfn: u64) -> u64 {
+    gfn >> 12
+}

+ 1 - 0
kernel/src/arch/x86_64/vm/kvm_host/page.rs

@@ -0,0 +1 @@
+pub const KVM_MIN_FREE_MMU_PAGES: usize = 5;

+ 1697 - 0
kernel/src/arch/x86_64/vm/kvm_host/vcpu.rs

@@ -0,0 +1,1697 @@
+use core::intrinsics::likely;
+use core::{arch::x86_64::_xsetbv, intrinsics::unlikely};
+
+use alloc::{boxed::Box, sync::Arc, vec::Vec};
+use bitmap::{traits::BitMapOps, AllocBitmap, BitMapCore};
+use log::warn;
+use raw_cpuid::CpuId;
+use system_error::SystemError;
+use x86::vmx::vmcs::guest;
+use x86::{
+    bits64::rflags::RFlags,
+    controlregs::{Cr0, Cr4, Xcr0},
+    dtables::DescriptorTablePointer,
+    msr::{self, wrmsr},
+    vmx::vmcs::control::SecondaryControls,
+};
+use x86_64::registers::control::EferFlags;
+
+use crate::arch::vm::asm::VmxAsm;
+use crate::arch::vm::vmx::exit::ExitFastpathCompletion;
+use crate::virt::vm::kvm_host::mem::KvmMmuMemoryCache;
+use crate::virt::vm::kvm_host::vcpu::VcpuMode;
+use crate::{
+    arch::{
+        kvm_arch_ops,
+        mm::barrier,
+        vm::{
+            asm::{hyperv, kvm_msr, KvmX86Asm, MiscEnable, MsrData, VcpuSegment},
+            cpuid::KvmCpuidEntry2,
+            kvm_host::KvmReg,
+            mmu::kvm_mmu::LockedKvmMmu,
+            uapi::{UapiKvmSegmentRegs, KVM_SYNC_X86_VALID_FIELDS},
+            vmx::{vmcs::ControlsType, vmx_info},
+            x86_kvm_manager, x86_kvm_manager_mut, x86_kvm_ops,
+        },
+    },
+    mm::VirtAddr,
+    smp::{core::smp_get_processor_id, cpu::ProcessorId},
+    virt::vm::{
+        kvm_host::{
+            mem::GfnToHvaCache,
+            vcpu::{GuestDebug, VirtCpu},
+            MutilProcessorState, Vm,
+        },
+        user_api::{UapiKvmRun, UapiKvmSegment},
+    },
+};
+
+use super::{lapic::KvmLapic, HFlags, KvmCommonRegs, KvmIrqChipMode};
+const MSR_IA32_CR_PAT_DEFAULT: u64 = 0x0007_0406_0007_0406;
+#[allow(dead_code)]
+#[derive(Debug)]
+pub struct X86VcpuArch {
+    /// 最近一次尝试进入虚拟机的主机cpu
+    pub last_vmentry_cpu: ProcessorId,
+    /// 可用寄存器位图
+    pub regs_avail: AllocBitmap,
+    /// 脏寄存器位图
+    pub regs_dirty: AllocBitmap,
+    /// 多处理器状态
+    mp_state: MutilProcessorState,
+    pub apic_base: u64,
+    /// apic
+    pub apic: Option<KvmLapic>,
+    /// 主机pkru寄存器
+    host_pkru: u32,
+    pkru: u32,
+    /// hflag
+    hflags: HFlags,
+
+    pub microcode_version: u64,
+
+    arch_capabilities: u64,
+
+    perf_capabilities: u64,
+
+    ia32_xss: u64,
+
+    pub guest_state_protected: bool,
+
+    pub cpuid_entries: Vec<KvmCpuidEntry2>,
+
+    pub exception: KvmQueuedException,
+    pub exception_vmexit: KvmQueuedException,
+    pub apf: KvmAsyncPageFault,
+
+    pub emulate_regs_need_sync_from_vcpu: bool,
+    pub emulate_regs_need_sync_to_vcpu: bool,
+
+    pub smbase: u64,
+
+    pub interrupt: KvmQueuedInterrupt,
+
+    pub tsc_offset_adjustment: u64,
+
+    pub mmu: Option<Arc<LockedKvmMmu>>,
+    pub root_mmu: Option<Arc<LockedKvmMmu>>,
+    pub guset_mmu: Option<Arc<LockedKvmMmu>>,
+    pub walk_mmu: Option<Arc<LockedKvmMmu>>,
+    pub nested_mmu: Option<Arc<LockedKvmMmu>>,
+
+    pub mmu_pte_list_desc_cache: KvmMmuMemoryCache,
+    pub mmu_shadow_page_cache: KvmMmuMemoryCache,
+    pub mmu_shadowed_info_cache: KvmMmuMemoryCache,
+    pub mmu_page_header_cache: KvmMmuMemoryCache,
+
+    pub max_phyaddr: usize,
+
+    pub pat: u64,
+
+    pub regs: [u64; KvmReg::NrVcpuRegs as usize],
+
+    pub cr0: Cr0,
+    pub cr0_guest_owned_bits: Cr0,
+    pub cr2: u64,
+    pub cr3: u64,
+    pub cr4: Cr4,
+    pub cr4_guest_owned_bits: Cr4,
+    pub cr4_guest_rsvd_bits: Cr4,
+    pub cr8: u64,
+    pub efer: EferFlags,
+
+    pub xcr0: Xcr0,
+
+    pub dr6: usize,
+    pub dr7: usize,
+
+    pub single_step_rip: usize,
+
+    pub msr_misc_features_enables: u64,
+    pub ia32_misc_enable_msr: MiscEnable,
+
+    pub smi_pending: bool,
+    pub smi_count: u64,
+    pub nmi_queued: usize,
+    /// 待注入的 NMI 数量,不包括硬件 vNMI。
+    pub nmi_pending: u32,
+    pub nmi_injected: bool,
+
+    pub handling_intr_from_guest: KvmIntrType,
+
+    pub xfd_no_write_intercept: bool,
+
+    pub l1tf_flush_l1d: bool,
+
+    pub at_instruction_boundary: bool,
+
+    pub db: [usize; Self::KVM_NR_DB_REGS],
+
+    /* set at EPT violation at this point */
+    pub exit_qual: u64,
+}
+
+impl X86VcpuArch {
+    const KVM_NR_DB_REGS: usize = 4;
+
+    #[inline(never)]
+    pub fn new() -> Self {
+        let mut ret: Box<X86VcpuArch> = unsafe { Box::new_zeroed().assume_init() };
+        ret.last_vmentry_cpu = ProcessorId::INVALID;
+        ret.regs_avail = AllocBitmap::new(32);
+        ret.regs_dirty = AllocBitmap::new(32);
+        ret.mp_state = MutilProcessorState::Runnable;
+
+        ret.apic = None;
+        //max_phyaddr=?? fztodo
+        *ret
+    }
+
+    pub fn clear_dirty(&mut self) {
+        self.regs_dirty.set_all(false);
+    }
+
+    pub fn vcpu_apicv_active(&self) -> bool {
+        self.lapic_in_kernel() && self.lapic().apicv_active
+    }
+
+    pub fn lapic_in_kernel(&self) -> bool {
+        if x86_kvm_manager().has_noapic_vcpu {
+            return self.apic.is_some();
+        }
+        true
+    }
+
+    pub fn is_bsp(&self) -> bool {
+        return self.apic_base & msr::IA32_APIC_BASE as u64 != 0;
+    }
+
+    #[inline]
+    pub fn lapic(&self) -> &KvmLapic {
+        self.apic.as_ref().unwrap()
+    }
+
+    pub fn queue_interrupt(&mut self, vec: u8, soft: bool) {
+        self.interrupt.injected = true;
+        self.interrupt.soft = soft;
+        self.interrupt.nr = vec;
+    }
+
+    pub fn read_cr0_bits(&mut self, mask: Cr0) -> Cr0 {
+        let tmask = mask & (Cr0::CR0_TASK_SWITCHED | Cr0::CR0_WRITE_PROTECT);
+        if tmask.contains(self.cr0_guest_owned_bits)
+            && !self
+                .regs_avail
+                .get(KvmReg::VcpuExregCr0 as usize)
+                .unwrap_or_default()
+        {
+            x86_kvm_ops().cache_reg(self, KvmReg::VcpuExregCr0);
+        }
+
+        return self.cr0 & mask;
+    }
+
+    pub fn read_cr4_bits(&mut self, mask: Cr4) -> Cr4 {
+        let tmask = mask
+            & (Cr4::CR4_VIRTUAL_INTERRUPTS
+                | Cr4::CR4_DEBUGGING_EXTENSIONS
+                | Cr4::CR4_ENABLE_PPMC
+                | Cr4::CR4_ENABLE_SSE
+                | Cr4::CR4_UNMASKED_SSE
+                | Cr4::CR4_ENABLE_GLOBAL_PAGES
+                | Cr4::CR4_TIME_STAMP_DISABLE
+                | Cr4::CR4_ENABLE_FSGSBASE);
+
+        if tmask.intersects(self.cr4_guest_owned_bits)
+            && !self
+                .regs_avail
+                .get(KvmReg::VcpuExregCr4 as usize)
+                .unwrap_or_default()
+        {
+            x86_kvm_ops().cache_reg(self, KvmReg::VcpuExregCr4)
+        }
+
+        return self.cr4 & mask;
+    }
+
+    pub fn get_cr8(&self) -> u64 {
+        if self.lapic_in_kernel() {
+            todo!()
+        } else {
+            return self.cr8;
+        }
+    }
+
+    #[inline]
+    pub fn is_smm(&self) -> bool {
+        self.hflags.contains(HFlags::HF_SMM_MASK)
+    }
+
+    #[inline]
+    pub fn is_guest_mode(&self) -> bool {
+        self.hflags.contains(HFlags::HF_GUEST_MASK)
+    }
+
+    #[inline]
+    pub fn is_long_mode(&self) -> bool {
+        self.efer.contains(EferFlags::LONG_MODE_ACTIVE)
+    }
+
+    #[inline]
+    #[allow(dead_code)]
+    pub fn is_pae_paging(&mut self) -> bool {
+        let flag1 = self.is_long_mode();
+        let flag2 = self.is_pae();
+        let flag3 = self.is_paging();
+
+        !flag1 && flag2 && flag3
+    }
+
+    #[inline]
+    pub fn is_pae(&mut self) -> bool {
+        !self.read_cr4_bits(Cr4::CR4_ENABLE_PAE).is_empty()
+    }
+    #[inline]
+    pub fn is_paging(&mut self) -> bool {
+        //return likely(kvm_is_cr0_bit_set(vcpu, X86_CR0_PG));
+        !self.read_cr0_bits(Cr0::CR0_ENABLE_PAGING).is_empty()
+    }
+
+    #[inline]
+    pub fn is_portected_mode(&mut self) -> bool {
+        !self.read_cr0_bits(Cr0::CR0_PROTECTED_MODE).is_empty()
+    }
+
+    #[inline]
+    fn clear_interrupt_queue(&mut self) {
+        self.interrupt.injected = false;
+    }
+
+    #[inline]
+    fn clear_exception_queue(&mut self) {
+        self.exception.pending = false;
+        self.exception.injected = false;
+        self.exception_vmexit.pending = false;
+    }
+
+    #[allow(dead_code)]
+    pub fn update_cpuid_runtime(&mut self, entries: &Vec<KvmCpuidEntry2>) {
+        let cpuid = CpuId::new();
+        let feat = cpuid.get_feature_info().unwrap();
+        let base = KvmCpuidEntry2::find(entries, 1, None);
+        if let Some(_base) = base {
+            if feat.has_xsave() {}
+        }
+
+        todo!()
+    }
+
+    #[inline]
+    pub fn test_and_mark_available(&mut self, reg: KvmReg) -> bool {
+        let old = self.regs_avail.get(reg as usize).unwrap_or_default();
+        self.regs_avail.set(reg as usize, true);
+        return old;
+    }
+
+    #[inline]
+    pub fn mark_register_dirty(&mut self, reg: KvmReg) {
+        self.regs_avail.set(reg as usize, true);
+        self.regs_dirty.set(reg as usize, true);
+    }
+
+    #[inline]
+    pub fn mark_register_available(&mut self, reg: KvmReg) {
+        self.regs_avail.set(reg as usize, true);
+    }
+
+    #[inline]
+    pub fn is_register_dirty(&self, reg: KvmReg) -> bool {
+        self.regs_dirty.get(reg as usize).unwrap()
+    }
+
+    #[inline]
+    pub fn is_register_available(&self, reg: KvmReg) -> bool {
+        self.regs_avail.get(reg as usize).unwrap()
+    }
+
+    #[inline]
+    pub fn write_reg(&mut self, reg: KvmReg, data: u64) {
+        self.regs[reg as usize] = data;
+    }
+
+    #[inline]
+    pub fn write_reg_raw(&mut self, reg: KvmReg, data: u64) {
+        self.regs[reg as usize] = data;
+        self.mark_register_dirty(reg);
+    }
+
+    #[inline]
+    pub fn read_reg(&self, reg: KvmReg) -> u64 {
+        return self.regs[reg as usize];
+    }
+
+    #[inline]
+    pub fn read_reg_raw(&mut self, reg: KvmReg) -> u64 {
+        // Fetch the register from hardware only if it has not been cached yet.
+        if self.regs_avail.get(reg as usize) != Some(true) {
+            kvm_arch_ops().cache_reg(self, reg);
+        }
+
+        return self.regs[reg as usize];
+    }
+
+    #[inline]
+    fn get_linear_rip(&mut self) -> u64 {
+        if self.guest_state_protected {
+            return 0;
+        }
+        return self.read_reg_raw(KvmReg::VcpuRegsRip);
+    }
+
+    pub fn set_msr_common(&mut self, msr_info: &MsrData) {
+        let msr = msr_info.index;
+        let data = msr_info.data;
+
+        match msr {
+            // MSR_AMD64_NB_CFG
+            0xc001001f => {
+                return;
+            }
+            // MSR_VM_HSAVE_PA
+            0xc0010117 => {
+                return;
+            }
+            // MSR_AMD64_PATCH_LOADER
+            0xc0010020 => {
+                return;
+            }
+            // MSR_AMD64_BU_CFG2
+            0xc001102a => {
+                return;
+            }
+            // MSR_AMD64_DC_CFG
+            0xc0011022 => {
+                return;
+            }
+            // MSR_AMD64_TW_CFG
+            0xc0011023 => {
+                return;
+            }
+            // MSR_F15H_EX_CFG
+            0xc001102c => {
+                return;
+            }
+            msr::IA32_BIOS_UPDT_TRIG => {
+                return;
+            }
+            msr::IA32_BIOS_SIGN_ID => {
+                // MSR_IA32_UCODE_REV
+                if msr_info.host_initiated {
+                    self.microcode_version = data;
+                }
+                return;
+            }
+            // MSR_IA32_ARCH_CAPABILITIES
+            0x0000010a => {
+                if !msr_info.host_initiated {
+                    return;
+                }
+
+                self.arch_capabilities = data;
+            }
+            msr::MSR_PERF_CAPABILITIES => {
+                if !msr_info.host_initiated {
+                    return;
+                }
+
+                if data & (!x86_kvm_manager().kvm_caps.supported_perf_cap) != 0 {
+                    return;
+                }
+
+                if self.perf_capabilities == data {
+                    return;
+                }
+
+                self.perf_capabilities = data;
+                // todo: kvm_pmu_refresh
+                return;
+            }
+            // MSR_IA32_FLUSH_CMD
+            0x0000010b => {
+                todo!()
+            }
+            msr::IA32_EFER => {
+                todo!()
+            }
+            // MSR_K7_HWCR
+            0xc0010015 => {
+                todo!()
+            }
+            // MSR_FAM10H_MMIO_CONF_BASE
+            0xc0010058 => {
+                todo!()
+            }
+            msr::IA32_PAT => {
+                todo!()
+            }
+            // MTRRphysBase_MSR(0) ... MSR_MTRRfix4K_F8000 | MSR_MTRRdefType
+            0x200..=0x26f | 0x2ff => {
+                todo!()
+            }
+            msr::APIC_BASE => {
+                todo!()
+            }
+            // APIC_BASE_MSR ... APIC_BASE_MSR + 0xff
+            0x800..=0x8ff => {
+                todo!()
+            }
+            msr::IA32_TSC_DEADLINE => {
+                todo!()
+            }
+            msr::IA32_TSC_ADJUST => {
+                todo!()
+            }
+            msr::IA32_MISC_ENABLE => {
+                todo!()
+            }
+            msr::IA32_SMBASE => {
+                todo!()
+            }
+            msr::TSC => {
+                todo!()
+            }
+            // MSR_IA32_XSS
+            msr::MSR_C5_PMON_BOX_CTRL => {
+                if !msr_info.host_initiated {
+                    return;
+                }
+                if data & (!x86_kvm_manager().kvm_caps.supported_xss) != 0 {
+                    return;
+                }
+
+                self.ia32_xss = data;
+                // TODO:kvm_update_cpuid_runtime
+                return;
+            }
+            msr::MSR_SMI_COUNT => {
+                todo!()
+            }
+            kvm_msr::MSR_KVM_WALL_CLOCK_NEW => {
+                todo!()
+            }
+            kvm_msr::MSR_KVM_WALL_CLOCK => {
+                todo!()
+            }
+            kvm_msr::MSR_KVM_SYSTEM_TIME => {
+                todo!()
+            }
+            kvm_msr::MSR_KVM_ASYNC_PF_EN => {
+                todo!()
+            }
+            kvm_msr::MSR_KVM_ASYNC_PF_INT => {
+                todo!()
+            }
+            kvm_msr::MSR_KVM_ASYNC_PF_ACK => {
+                todo!()
+            }
+            kvm_msr::MSR_KVM_STEAL_TIME => {
+                todo!()
+            }
+            kvm_msr::MSR_KVM_PV_EOI_EN => {
+                todo!()
+            }
+            kvm_msr::MSR_KVM_POLL_CONTROL => {
+                todo!()
+            }
+            msr::MCG_CTL
+            | msr::MCG_STATUS
+            | msr::MC0_CTL..=msr::MSR_MC26_MISC
+            | msr::IA32_MC0_CTL2..=msr::IA32_MC21_CTL2 => {
+                todo!()
+            }
+            // MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3
+            // MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1
+            // MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3
+            // MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL1
+            0xc0010004..=0xc0010007
+            | 0xc1..=0xc2
+            | 0xc0010000..=0xc0010003
+            | 0x00000186..=0x00000187 => {
+                todo!()
+            }
+
+            // MSR_K7_CLK_CTL
+            0xc001001b => {
+                return;
+            }
+
+            hyperv::HV_X64_MSR_GUEST_OS_ID..=hyperv::HV_REGISTER_SINT15
+            | hyperv::HV_X64_MSR_SYNDBG_CONTROL..=hyperv::HV_X64_MSR_SYNDBG_PENDING_BUFFER
+            | hyperv::HV_X64_MSR_SYNDBG_OPTIONS
+            | hyperv::HV_REGISTER_CRASH_P0..=hyperv::HV_REGISTER_CRASH_P4
+            | hyperv::HV_REGISTER_CRASH_CTL
+            | hyperv::HV_REGISTER_STIMER0_CONFIG..=hyperv::HV_REGISTER_STIMER3_COUNT
+            | hyperv::HV_X64_MSR_REENLIGHTENMENT_CONTROL
+            | hyperv::HV_X64_MSR_TSC_EMULATION_CONTROL
+            | hyperv::HV_X64_MSR_TSC_EMULATION_STATUS
+            | hyperv::HV_X64_MSR_TSC_INVARIANT_CONTROL => {
+                todo!()
+            }
+
+            msr::MSR_BBL_CR_CTL3 => {
+                todo!()
+            }
+
+            // MSR_AMD64_OSVW_ID_LENGTH
+            0xc0010140 => {
+                todo!()
+            }
+            // MSR_AMD64_OSVW_STATUS
+            0xc0010141 => {
+                todo!()
+            }
+
+            msr::MSR_PLATFORM_INFO => {
+                todo!()
+            }
+            // MSR_MISC_FEATURES_ENABLES
+            0x00000140 => {
+                todo!()
+            }
+            // MSR_IA32_XFD
+            0x000001c4 => {
+                todo!()
+            }
+            // MSR_IA32_XFD_ERR
+            0x000001c5 => {
+                todo!()
+            }
+            _ => {
+                todo!()
+            }
+        }
+    }
+
+    pub fn kvm_before_interrupt(&mut self, intr: KvmIntrType) {
+        barrier::mfence();
+        self.handling_intr_from_guest = intr;
+        barrier::mfence();
+    }
+
+    pub fn kvm_after_interrupt(&mut self) {
+        barrier::mfence();
+        self.handling_intr_from_guest = KvmIntrType::None;
+        barrier::mfence();
+    }
+}
+
+impl VirtCpu {
+    pub fn init_arch(&mut self, vm: &mut Vm, id: usize) -> Result<(), SystemError> {
+        //kvm_arch_vcpu_create
+        vm.vcpu_precreate(id)?;
+
+        self.arch.last_vmentry_cpu = ProcessorId::INVALID;
+        self.arch.regs_avail.set_all(true);
+        self.arch.regs_dirty.set_all(true);
+
+        if vm.arch.irqchip_mode != KvmIrqChipMode::None || vm.arch.bsp_vcpu_id == self.vcpu_id {
+            self.arch.mp_state = MutilProcessorState::Runnable;
+        } else {
+            self.arch.mp_state = MutilProcessorState::Uninitialized;
+        }
+
+        self.arch.vcpu_arch_mmu_create();
+
+        if vm.arch.irqchip_mode != KvmIrqChipMode::None {
+            todo!()
+        } else {
+            x86_kvm_manager_mut().has_noapic_vcpu = true;
+        }
+
+        x86_kvm_ops().vcpu_create(self, vm);
+
+        // TODO: a lot of the remaining kvm_arch_vcpu_create() work is still missing here.
+
+        self.arch.pat = MSR_IA32_CR_PAT_DEFAULT;
+
+        self.load();
+        self.vcpu_reset(vm, false)?;
+        self.arch.kvm_init_mmu();
+
+        Ok(())
+    }
+
+    #[inline]
+    pub fn kvm_run(&self) -> &UapiKvmRun {
+        self.run.as_ref().unwrap()
+    }
+
+    #[inline]
+    pub fn kvm_run_mut(&mut self) -> &mut Box<UapiKvmRun> {
+        self.run.as_mut().unwrap()
+    }
+
+    pub fn run(&mut self) -> Result<usize, SystemError> {
+        self.load();
+
+        if unlikely(self.arch.mp_state == MutilProcessorState::Uninitialized) {
+            todo!()
+        }
+
+        if self.kvm_run().kvm_valid_regs & !KVM_SYNC_X86_VALID_FIELDS != 0
+            || self.kvm_run().kvm_dirty_regs & !KVM_SYNC_X86_VALID_FIELDS != 0
+        {
+            return Err(SystemError::EINVAL);
+        }
+
+        if self.kvm_run().kvm_dirty_regs != 0 {
+            todo!()
+        }
+
+        if !self.arch.lapic_in_kernel() {
+            self.kvm_set_cr8(self.kvm_run().cr8);
+        }
+
+        // TODO: https://code.dragonos.org.cn/xref/linux-6.6.21/arch/x86/kvm/x86.c#11174 - 11196
+
+        if self.kvm_run().immediate_exit != 0 {
+            return Err(SystemError::EINTR);
+        }
+
+        // vmx_vcpu_pre_run
+
+        self.vcpu_run(&self.kvm().lock())?;
+
+        Ok(0)
+    }
+
+    fn vcpu_run(&mut self, vm: &Vm) -> Result<(), SystemError> {
+        self.arch.l1tf_flush_l1d = true;
+
+        loop {
+            self.arch.at_instruction_boundary = false;
+            if self.can_running() {
+                self.enter_guest(vm)?;
+            } else {
+                todo!()
+            };
+        }
+    }
+
+    fn enter_guest(&mut self, vm: &Vm) -> Result<(), SystemError> {
+        let req_immediate_exit = false;
+
+        warn!("request {:?}", self.request);
+        if !self.request.is_empty() {
+            if self.check_request(VirtCpuRequest::KVM_REQ_VM_DEAD) {
+                return Err(SystemError::EIO);
+            }
+
+            // TODO: kvm_dirty_ring_check_request
+
+            if self.check_request(VirtCpuRequest::KVM_REQ_MMU_FREE_OBSOLETE_ROOTS) {
+                todo!()
+            }
+
+            if self.check_request(VirtCpuRequest::KVM_REQ_MIGRATE_TIMER) {
+                todo!()
+            }
+
+            if self.check_request(VirtCpuRequest::KVM_REQ_MASTERCLOCK_UPDATE) {
+                todo!()
+            }
+
+            if self.check_request(VirtCpuRequest::KVM_REQ_GLOBAL_CLOCK_UPDATE) {
+                todo!()
+            }
+
+            if self.check_request(VirtCpuRequest::KVM_REQ_CLOCK_UPDATE) {
+                todo!()
+            }
+
+            if self.check_request(VirtCpuRequest::KVM_REQ_MMU_SYNC) {
+                todo!()
+            }
+
+            if self.check_request(VirtCpuRequest::KVM_REQ_LOAD_MMU_PGD) {
+                todo!()
+            }
+
+            if self.check_request(VirtCpuRequest::KVM_REQ_TLB_FLUSH) {
+                self.flush_tlb_all();
+            }
+
+            self.service_local_tlb_flush_requests();
+
+            // TODO: KVM_REQ_HV_TLB_FLUSH) && kvm_hv_vcpu_flush_tlb(vcpu)
+
+            if self.check_request(VirtCpuRequest::KVM_REQ_REPORT_TPR_ACCESS) {
+                todo!()
+            }
+
+            if self.check_request(VirtCpuRequest::KVM_REQ_TRIPLE_FAULT) {
+                todo!()
+            }
+
+            if self.check_request(VirtCpuRequest::KVM_REQ_STEAL_UPDATE) {
+                // todo!()
+                warn!("VirtCpuRequest::KVM_REQ_STEAL_UPDATE TODO!");
+            }
+
+            if self.check_request(VirtCpuRequest::KVM_REQ_SMI) {
+                todo!()
+            }
+
+            if self.check_request(VirtCpuRequest::KVM_REQ_NMI) {
+                todo!()
+            }
+
+            if self.check_request(VirtCpuRequest::KVM_REQ_PMU) {
+                todo!()
+            }
+
+            if self.check_request(VirtCpuRequest::KVM_REQ_PMI) {
+                todo!()
+            }
+
+            if self.check_request(VirtCpuRequest::KVM_REQ_IOAPIC_EOI_EXIT) {
+                todo!()
+            }
+
+            if self.check_request(VirtCpuRequest::KVM_REQ_SCAN_IOAPIC) {
+                todo!()
+            }
+
+            if self.check_request(VirtCpuRequest::KVM_REQ_LOAD_EOI_EXITMAP) {
+                todo!()
+            }
+
+            if self.check_request(VirtCpuRequest::KVM_REQ_APIC_PAGE_RELOAD) {
+                // todo!()
+                warn!("VirtCpuRequest::KVM_REQ_APIC_PAGE_RELOAD TODO!");
+            }
+
+            if self.check_request(VirtCpuRequest::KVM_REQ_HV_CRASH) {
+                todo!()
+            }
+
+            if self.check_request(VirtCpuRequest::KVM_REQ_HV_RESET) {
+                todo!()
+            }
+
+            if self.check_request(VirtCpuRequest::KVM_REQ_HV_EXIT) {
+                todo!()
+            }
+
+            if self.check_request(VirtCpuRequest::KVM_REQ_HV_STIMER) {
+                todo!()
+            }
+
+            if self.check_request(VirtCpuRequest::KVM_REQ_APICV_UPDATE) {
+                todo!()
+            }
+
+            if self.check_request(VirtCpuRequest::KVM_REQ_APF_READY) {
+                todo!()
+            }
+
+            if self.check_request(VirtCpuRequest::KVM_REQ_MSR_FILTER_CHANGED) {
+                todo!()
+            }
+
+            if self.check_request(VirtCpuRequest::KVM_REQ_UPDATE_CPU_DIRTY_LOGGING) {
+                todo!()
+            }
+        }
+
+        // TODO: https://code.dragonos.org.cn/xref/linux-6.6.21/arch/x86/kvm/x86.c#10661
+        if self.check_request(VirtCpuRequest::KVM_REQ_EVENT) {
+            // TODO
+        }
+
+        self.kvm_mmu_reload(vm)?;
+
+        x86_kvm_ops().prepare_switch_to_guest(self);
+        // warn!(
+        //     "mode {:?} req {:?} mode_cond {} !is_empty {} cond {}",
+        //     self.mode,
+        //     self.request,
+        //     self.mode == VcpuMode::ExitingGuestMode,
+        //     !self.request.is_empty(),
+        //     (self.mode == VcpuMode::ExitingGuestMode) || (!self.request.is_empty())
+        // );
+        warn!(
+            "req bit {} empty bit {}",
+            self.request.bits,
+            VirtCpuRequest::empty().bits
+        );
+        // TODO: https://code.dragonos.org.cn/xref/linux-6.6.21/arch/x86/kvm/x86.c#10730
+        if self.mode == VcpuMode::ExitingGuestMode || !self.request.is_empty() {
+            self.mode = VcpuMode::OutsideGuestMode;
+            return Err(SystemError::EINVAL);
+        }
+
+        if req_immediate_exit {
+            self.request(VirtCpuRequest::KVM_REQ_EVENT);
+            todo!();
+        }
+
+        // TODO: https://code.dragonos.org.cn/xref/linux-6.6.21/arch/x86/kvm/x86.c#10749 - 10766
+
+        let exit_fastpath;
+        loop {
+            exit_fastpath = x86_kvm_ops().vcpu_run(self);
+            if likely(exit_fastpath != ExitFastpathCompletion::ExitHandled) {
+                break;
+            }
+
+            todo!();
+        }
+
+        // TODO: https://code.dragonos.org.cn/xref/linux-6.6.21/arch/x86/kvm/x86.c#10799 - 10814
+
+        self.arch.last_vmentry_cpu = self.cpu;
+
+        // TODO: last_guest_tsc
+
+        self.mode = VcpuMode::OutsideGuestMode;
+
+        barrier::mfence();
+
+        // TODO: xfd
+
+        x86_kvm_ops().handle_exit_irqoff(self);
+
+        // todo: xfd
+
+        // TODO: 一些中断或者tsc操作
+
+        match x86_kvm_ops().handle_exit(self, vm, exit_fastpath) {
+            Err(err) => return Err(err),
+            Ok(_) => Ok(()),
+        }
+    }
+
+    fn flush_tlb_all(&mut self) {
+        x86_kvm_ops().flush_tlb_all(self);
+        self.clear_request(VirtCpuRequest::KVM_REQ_TLB_FLUSH_CURRENT);
+    }
+
+    fn service_local_tlb_flush_requests(&mut self) {
+        if self.check_request(VirtCpuRequest::KVM_REQ_TLB_FLUSH_CURRENT) {
+            todo!()
+        }
+
+        if self.check_request(VirtCpuRequest::KVM_REQ_TLB_FLUSH_GUEST) {
+            todo!()
+        }
+    }
+
+    pub fn request(&mut self, req: VirtCpuRequest) {
+        // self.request.set(
+        //     (req.bits() & VirtCpuRequest::KVM_REQUEST_MASK.bits()) as usize,
+        //     true,
+        // );
+        self.request.insert(req);
+    }
+
+    fn check_request(&mut self, req: VirtCpuRequest) -> bool {
+        if self.test_request(req) {
+            self.clear_request(req);
+
+            barrier::mfence();
+            return true;
+        }
+
+        return false;
+    }
+
+    fn test_request(&self, req: VirtCpuRequest) -> bool {
+        // self.request
+        //     .get((req.bits & VirtCpuRequest::KVM_REQUEST_MASK.bits) as usize)
+        //     .unwrap_or_default()
+        self.request.contains(req)
+    }
+
+    fn clear_request(&mut self, req: VirtCpuRequest) {
+        // self.request.set(
+        //     (req.bits & VirtCpuRequest::KVM_REQUEST_MASK.bits) as usize,
+        //     false,
+        // );
+        self.request.remove(req);
+    }
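+
+    // Illustrative usage (not part of the upstream change): a request is raised once
+    // with request() and consumed exactly once with check_request(), which clears the
+    // bit and issues a fence so the handler observes data published before the request
+    // was set, e.g.:
+    //
+    //     vcpu.request(VirtCpuRequest::KVM_REQ_TLB_FLUSH);
+    //     ...
+    //     if vcpu.check_request(VirtCpuRequest::KVM_REQ_TLB_FLUSH) {
+    //         vcpu.flush_tlb_all();
+    //     }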
+
+    pub fn can_running(&self) -> bool {
+        return self.arch.mp_state == MutilProcessorState::Runnable && !self.arch.apf.halted;
+    }
+
+    #[inline]
+    fn load(&mut self) {
+        self.arch_vcpu_load(smp_get_processor_id())
+    }
+
+    fn arch_vcpu_load(&mut self, cpu: ProcessorId) {
+        x86_kvm_ops().vcpu_load(self, cpu);
+
+        self.arch.host_pkru = KvmX86Asm::read_pkru();
+
+        // The two TODOs below deal with clock/TSC bookkeeping.
+        if unlikely(self.arch.tsc_offset_adjustment != 0) {
+            todo!()
+        }
+
+        if unlikely(self.cpu != cpu) {
+            // TODO: 设置tsc
+            self.cpu = cpu;
+        }
+
+        self.request(VirtCpuRequest::KVM_REQ_STEAL_UPDATE)
+    }
+
+    pub fn set_msr(
+        &mut self,
+        index: u32,
+        data: u64,
+        host_initiated: bool,
+    ) -> Result<(), SystemError> {
+        match index {
+            msr::IA32_FS_BASE
+            | msr::IA32_GS_BASE
+            | msr::IA32_KERNEL_GSBASE
+            | msr::IA32_CSTAR
+            | msr::IA32_LSTAR => {
+                // Reject non-canonical addresses; canonical values fall through to the
+                // arch-specific set_msr() below.
+                if !VirtAddr::new(data as usize).is_canonical() {
+                    return Err(SystemError::EINVAL);
+                }
+            }
+
+            msr::IA32_SYSENTER_EIP | msr::IA32_SYSENTER_ESP => {
+                // TODO: the value should be canonicalized; for now just assert that it already is.
+                assert!(VirtAddr::new(data as usize).is_canonical());
+            }
+            msr::IA32_TSC_AUX => {
+                if x86_kvm_manager()
+                    .find_user_return_msr_idx(msr::IA32_TSC_AUX)
+                    .is_none()
+                {
+                    return Ok(());
+                }
+
+                todo!()
+            }
+            _ => {}
+        }
+
+        let msr_data = MsrData {
+            host_initiated,
+            index,
+            data,
+        };
+
+        return kvm_arch_ops().set_msr(self, msr_data);
+    }
+
+    pub fn vcpu_reset(&mut self, vm: &Vm, init_event: bool) -> Result<(), SystemError> {
+        let old_cr0 = self.arch.read_cr0_bits(Cr0::all());
+
+        if self.arch.is_guest_mode() {
+            todo!()
+        }
+
+        self.lapic_reset(vm, init_event);
+
+        self.arch.hflags = HFlags::empty();
+
+        self.arch.smi_pending = false;
+        self.arch.smi_count = 0;
+        self.arch.nmi_queued = 0;
+        self.arch.nmi_pending = 0;
+        self.arch.nmi_injected = false;
+
+        self.arch.clear_exception_queue();
+        self.arch.clear_interrupt_queue();
+
+        for i in &mut self.arch.db {
+            *i = 0;
+        }
+
+        // TODO: kvm_update_dr0123(vcpu);
+
+        // DR6_ACTIVE_LOW
+        self.arch.dr6 = 0xffff0ff0;
+        // DR7_FIXED_1
+        self.arch.dr7 = 0x00000400;
+
+        // TODO: kvm_update_dr7(vcpu);
+
+        self.arch.cr2 = 0;
+
+        self.request(VirtCpuRequest::KVM_REQ_EVENT);
+
+        self.arch.apf.msr_en_val = 0;
+        self.arch.apf.msr_int_val = 0;
+        // TODO:st
+
+        // TODO: kvmclock_reset(vcpu);
+
+        // TODO: kvm_clear_async_pf_completion_queue(vcpu);
+
+        for i in &mut self.arch.apf.gfns {
+            *i = u64::MAX;
+        }
+
+        self.arch.apf.halted = false;
+
+        // TODO: fpu
+
+        if !init_event {
+            // TODO:pmu
+            self.arch.smbase = 0x30000;
+
+            self.arch.msr_misc_features_enables = 0;
+            self.arch.ia32_misc_enable_msr = MiscEnable::MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL
+                | MiscEnable::MSR_IA32_MISC_ENABLE_BTS_UNAVAIL;
+
+            // TODO: __kvm_set_xcr(vcpu, 0, XFEATURE_MASK_FP);
+            // 0xda0: MSR_IA32_XSS
+            self.set_msr(0xda0, 0, true)?;
+        }
+
+        for reg in &mut self.arch.regs {
+            *reg = 0;
+        }
+
+        self.arch.mark_register_dirty(KvmReg::VcpuRegsRsp);
+
+        let cpuid_0x1 = KvmCpuidEntry2::find(&self.arch.cpuid_entries, 1, None);
+        let val = if let Some(cpuid) = cpuid_0x1 {
+            cpuid.eax
+        } else {
+            0x600
+        };
+        self.arch.write_reg(KvmReg::VcpuRegsRdx, val as u64);
+
+        kvm_arch_ops().vcpu_reset(self, vm, init_event);
+
+        self.set_rflags(RFlags::FLAGS_A1);
+        self.arch.write_reg_raw(KvmReg::VcpuRegsRip, 0xfff0);
+
+        self.arch.cr3 = 0;
+        self.arch.mark_register_dirty(KvmReg::VcpuExregCr3);
+
+        let mut new_cr0 = Cr0::CR0_EXTENSION_TYPE;
+        if init_event {
+            new_cr0.insert(old_cr0 & (Cr0::CR0_NOT_WRITE_THROUGH | Cr0::CR0_CACHE_DISABLE));
+        } else {
+            new_cr0.insert(Cr0::CR0_NOT_WRITE_THROUGH | Cr0::CR0_CACHE_DISABLE);
+        }
+
+        kvm_arch_ops().set_cr0(vm, self, new_cr0);
+        kvm_arch_ops().set_cr4(self, Cr4::empty());
+        kvm_arch_ops().set_efer(self, EferFlags::empty());
+        kvm_arch_ops().update_exception_bitmap(self);
+
+        if old_cr0.contains(Cr0::CR0_ENABLE_PAGING) {
+            self.request(VirtCpuRequest::MAKE_KVM_REQ_TLB_FLUSH_GUEST);
+            self.arch.reset_mmu_context();
+        }
+
+        if init_event {
+            self.request(VirtCpuRequest::MAKE_KVM_REQ_TLB_FLUSH_GUEST);
+        }
+
+        Ok(())
+    }
+
+    fn set_rflags(&mut self, rflags: RFlags) {
+        self._set_rflags(rflags);
+        self.request(VirtCpuRequest::KVM_REQ_EVENT);
+    }
+
+    fn _set_rflags(&mut self, mut rflags: RFlags) {
+        if self.guest_debug.contains(GuestDebug::SINGLESTEP)
+            && self.is_linear_rip(self.arch.single_step_rip)
+        {
+            rflags.insert(RFlags::FLAGS_TF);
+        }
+
+        kvm_arch_ops().set_rflags(self, rflags);
+    }
+
+    fn get_rflags(&mut self) -> RFlags {
+        let mut rflags = kvm_arch_ops().get_rflags(self);
+        if self.guest_debug.contains(GuestDebug::SINGLESTEP) {
+            rflags.insert(RFlags::FLAGS_TF);
+        }
+        return rflags;
+    }
+
+    fn is_linear_rip(&mut self, linear_rip: usize) -> bool {
+        return self.arch.get_linear_rip() == linear_rip as u64;
+    }
+
+    pub fn get_regs(&mut self) -> KvmCommonRegs {
+        self.load();
+        return self._get_regs();
+    }
+
+    fn _get_regs(&mut self) -> KvmCommonRegs {
+        KvmCommonRegs {
+            rax: self.arch.read_reg(KvmReg::VcpuRegsRax),
+            rbx: self.arch.read_reg(KvmReg::VcpuRegsRbx),
+            rcx: self.arch.read_reg(KvmReg::VcpuRegsRcx),
+            rdx: self.arch.read_reg(KvmReg::VcpuRegsRdx),
+            rsi: self.arch.read_reg(KvmReg::VcpuRegsRsi),
+            rdi: self.arch.read_reg(KvmReg::VcpuRegsRdi),
+            rsp: self.arch.read_reg(KvmReg::VcpuRegsRsp),
+            rbp: self.arch.read_reg(KvmReg::VcpuRegsRbp),
+            r8: self.arch.read_reg(KvmReg::VcpuRegsR8),
+            r9: self.arch.read_reg(KvmReg::VcpuRegsR9),
+            r10: self.arch.read_reg(KvmReg::VcpuRegsR10),
+            r11: self.arch.read_reg(KvmReg::VcpuRegsR11),
+            r12: self.arch.read_reg(KvmReg::VcpuRegsR12),
+            r13: self.arch.read_reg(KvmReg::VcpuRegsR13),
+            r14: self.arch.read_reg(KvmReg::VcpuRegsR14),
+            r15: self.arch.read_reg(KvmReg::VcpuRegsR15),
+            rip: self.arch.read_reg_raw(KvmReg::VcpuRegsRip),
+            rflags: self.get_rflags().bits(),
+        }
+    }
+
+    pub fn get_segment_regs(&mut self) -> UapiKvmSegmentRegs {
+        self.load();
+        return self._get_segment_regs();
+    }
+
+    fn _get_segment_regs(&mut self) -> UapiKvmSegmentRegs {
+        let mut sregs = self._get_segment_regs_common();
+
+        if self.arch.guest_state_protected {
+            return sregs;
+        }
+
+        if self.arch.interrupt.injected && !self.arch.interrupt.soft {
+            BitMapCore::new().set(
+                sregs.interrupt_bitmap.len() * core::mem::size_of::<u64>(),
+                &mut sregs.interrupt_bitmap,
+                self.arch.interrupt.nr as usize,
+                true,
+            );
+        }
+
+        return sregs;
+    }
+
+    fn read_cr3(&mut self) -> u64 {
+        if !self.arch.is_register_available(KvmReg::VcpuExregCr3) {
+            x86_kvm_ops().cache_reg(&mut self.arch, KvmReg::VcpuExregCr3);
+        }
+        return self.arch.cr3;
+    }
+
+    fn kvm_get_segment(&mut self, segment: &mut UapiKvmSegment, seg: VcpuSegment) {
+        *segment = x86_kvm_ops().get_segment(self, *segment, seg);
+    }
+
+    fn _get_segment_regs_common(&mut self) -> UapiKvmSegmentRegs {
+        let mut sregs = UapiKvmSegmentRegs::default();
+
+        if !self.arch.guest_state_protected {
+            let mut dt = DescriptorTablePointer::default();
+
+            self.kvm_get_segment(&mut sregs.cs, VcpuSegment::CS);
+            self.kvm_get_segment(&mut sregs.ds, VcpuSegment::DS);
+            self.kvm_get_segment(&mut sregs.es, VcpuSegment::ES);
+            self.kvm_get_segment(&mut sregs.fs, VcpuSegment::FS);
+            self.kvm_get_segment(&mut sregs.gs, VcpuSegment::GS);
+            self.kvm_get_segment(&mut sregs.ss, VcpuSegment::SS);
+
+            self.kvm_get_segment(&mut sregs.tr, VcpuSegment::TR);
+            self.kvm_get_segment(&mut sregs.ldt, VcpuSegment::LDTR);
+
+            x86_kvm_ops().get_idt(self, &mut dt);
+            sregs.idt.limit = dt.limit;
+            sregs.idt.base = dt.base as usize as u64;
+
+            x86_kvm_ops().get_gdt(self, &mut dt);
+            sregs.gdt.limit = dt.limit;
+            sregs.gdt.base = dt.base as usize as u64;
+
+            sregs.cr2 = self.arch.cr2;
+            sregs.cr3 = self.read_cr3();
+        }
+
+        sregs.cr0 = self.arch.read_cr0_bits(Cr0::all()).bits() as u64;
+        sregs.cr4 = self.arch.read_cr4_bits(Cr4::all()).bits() as u64;
+        sregs.cr8 = self.arch.get_cr8();
+        sregs.efer = self.arch.efer.bits();
+        sregs.apic_base = self.arch.apic_base;
+
+        return sregs;
+    }
+
+    pub fn set_segment_regs(&mut self, sregs: &mut UapiKvmSegmentRegs) -> Result<(), SystemError> {
+        self.load();
+        self._set_segmenet_regs(&self.kvm().lock(), sregs)?;
+        Ok(())
+    }
+
+    fn _set_segmenet_regs(
+        &mut self,
+        vm: &Vm,
+        sregs: &mut UapiKvmSegmentRegs,
+    ) -> Result<(), SystemError> {
+        let mut mmu_reset_needed = false;
+        self._set_segmenet_regs_common(vm, sregs, &mut mmu_reset_needed, true)?;
+
+        if mmu_reset_needed {
+            todo!()
+        }
+
+        // KVM_NR_INTERRUPTS
+        let max_bits = 256;
+
+        let pending_vec = BitMapCore::new().first_index(&sregs.interrupt_bitmap);
+        if let Some(pending) = pending_vec {
+            if pending < max_bits {
+                self.arch.queue_interrupt(pending as u8, false);
+
+                self.request(VirtCpuRequest::KVM_REQ_EVENT);
+            }
+        }
+
+        Ok(())
+    }
+
+    /// Set the guest segment registers.
+    fn _set_segmenet_regs_common(
+        &mut self,
+        vm: &Vm,
+        sregs: &mut UapiKvmSegmentRegs,
+        mmu_reset_needed: &mut bool,
+        update_pdptrs: bool,
+    ) -> Result<(), SystemError> {
+        let mut apic_base_msr = MsrData::default();
+
+        if !self.is_valid_segment_regs(sregs) {
+            return Err(SystemError::EINVAL);
+        }
+
+        apic_base_msr.data = sregs.apic_base;
+        apic_base_msr.host_initiated = true;
+
+        // TODO: kvm_set_apic_base
+
+        if self.arch.guest_state_protected {
+            return Ok(());
+        }
+
+        let mut dt: DescriptorTablePointer<u8> = DescriptorTablePointer {
+            limit: sregs.idt.limit,
+            base: sregs.idt.base as usize as *const u8,
+        };
+
+        x86_kvm_ops().set_idt(self, &dt);
+
+        dt.limit = sregs.gdt.limit;
+        dt.base = sregs.gdt.base as usize as *const u8;
+        x86_kvm_ops().set_gdt(self, &dt);
+
+        self.arch.cr2 = sregs.cr2;
+        *mmu_reset_needed |= self.read_cr3() != sregs.cr3;
+
+        self.arch.cr3 = sregs.cr3;
+
+        self.arch.mark_register_dirty(KvmReg::VcpuExregCr3);
+
+        x86_kvm_ops().post_set_cr3(self, sregs.cr3);
+
+        //debug!("_set_segmenet_regs_common 2:: cr3: {:#x}", self.arch.cr3);
+
+        self.kvm_set_cr8(sregs.cr8);
+
+        let efer = EferFlags::from_bits_truncate(sregs.efer);
+        *mmu_reset_needed |= self.arch.efer != efer;
+        x86_kvm_ops().set_efer(self, efer);
+
+        let cr0 = Cr0::from_bits_truncate(sregs.cr0 as usize);
+        *mmu_reset_needed |= self.arch.cr0 != cr0;
+        x86_kvm_ops().set_cr0(vm, self, cr0);
+        self.arch.cr0 = cr0;
+
+        let cr4 = Cr4::from_bits_truncate(sregs.cr4 as usize);
+        *mmu_reset_needed |= self.arch.read_cr4_bits(Cr4::all()) != cr4;
+        x86_kvm_ops().set_cr4(self, cr4);
+
+        if update_pdptrs {
+            //todo!()
+        }
+
+        x86_kvm_ops().set_segment(self, &mut sregs.cs, VcpuSegment::CS);
+        x86_kvm_ops().set_segment(self, &mut sregs.ds, VcpuSegment::DS);
+        x86_kvm_ops().set_segment(self, &mut sregs.es, VcpuSegment::ES);
+        x86_kvm_ops().set_segment(self, &mut sregs.fs, VcpuSegment::FS);
+        x86_kvm_ops().set_segment(self, &mut sregs.gs, VcpuSegment::GS);
+        x86_kvm_ops().set_segment(self, &mut sregs.ss, VcpuSegment::SS);
+
+        x86_kvm_ops().set_segment(self, &mut sregs.tr, VcpuSegment::TR);
+        x86_kvm_ops().set_segment(self, &mut sregs.ldt, VcpuSegment::LDTR);
+
+        // TODO: update_cr8_intercept
+
+        if self.arch.is_bsp()
+            && self.arch.read_reg_raw(KvmReg::VcpuRegsRip) == 0xfff0
+            && sregs.cs.selector == 0xf000
+            && sregs.cs.base == 0xffff0000
+            && !self.arch.is_portected_mode()
+        {
+            self.arch.mp_state = MutilProcessorState::Runnable;
+        }
+
+        Ok(())
+    }
+
+    pub fn kvm_set_cr8(&mut self, cr8: u64) {
+        // Simplified handling for now.
+        self.arch.cr8 = cr8;
+    }
+
+    fn is_valid_segment_regs(&self, sregs: &UapiKvmSegmentRegs) -> bool {
+        let efer = EferFlags::from_bits_truncate(sregs.efer);
+        let cr4 = Cr4::from_bits_truncate(sregs.cr4 as usize);
+        let cr0 = Cr0::from_bits_truncate(sregs.cr0 as usize);
+
+        if efer.contains(EferFlags::LONG_MODE_ENABLE) && cr0.contains(Cr0::CR0_ENABLE_PAGING) {
+            if !cr4.contains(Cr4::CR4_ENABLE_PAE) || !efer.contains(EferFlags::LONG_MODE_ACTIVE) {
+                return false;
+            }
+
+            // TODO: legal gpa?
+        } else if efer.contains(EferFlags::LONG_MODE_ACTIVE) || sregs.cs.l != 0 {
+            return false;
+        }
+        let ret = self.kvm_is_vaild_cr0(cr0) && self.kvm_is_vaild_cr4(cr4);
+        return ret;
+    }
+
+    fn kvm_is_vaild_cr0(&self, cr0: Cr0) -> bool {
+        if cr0.contains(Cr0::CR0_NOT_WRITE_THROUGH) && !cr0.contains(Cr0::CR0_CACHE_DISABLE) {
+            return false;
+        }
+
+        if cr0.contains(Cr0::CR0_ENABLE_PAGING) && !cr0.contains(Cr0::CR0_PROTECTED_MODE) {
+            return false;
+        }
+        let ret = x86_kvm_ops().is_vaild_cr0(self, cr0);
+        return ret;
+    }
+
+    fn __kvm_is_valid_cr4(&self, cr4: Cr4) -> bool {
+        if cr4.contains(self.arch.cr4_guest_rsvd_bits) {
+            //debug!("__kvm_is_valid_cr4::here");
+            //return false;
+        }
+
+        return true;
+    }
+
+    fn kvm_is_vaild_cr4(&self, cr4: Cr4) -> bool {
+        return self.__kvm_is_valid_cr4(cr4) && x86_kvm_ops().is_vaild_cr4(self, cr4);
+    }
+
+    pub fn is_unrestricted_guest(&self) -> bool {
+        let guard = self.vmx().loaded_vmcs();
+        return vmx_info().enable_unrestricted_guest
+            && (!self.arch.is_guest_mode()
+                || SecondaryControls::from_bits_truncate(
+                    guard.controls_get(ControlsType::SecondaryExec) as u32,
+                )
+                .contains(SecondaryControls::UNRESTRICTED_GUEST));
+    }
+
+    pub fn set_regs(&mut self, regs: &KvmCommonRegs) -> Result<(), SystemError> {
+        self.load();
+        self._set_regs(regs);
+        Ok(())
+    }
+
+    fn _set_regs(&mut self, regs: &KvmCommonRegs) {
+        self.arch.emulate_regs_need_sync_from_vcpu = true;
+        self.arch.emulate_regs_need_sync_to_vcpu = false;
+
+        self.arch.write_reg(KvmReg::VcpuRegsRax, regs.rax);
+        self.arch.write_reg(KvmReg::VcpuRegsRbx, regs.rbx);
+        self.arch.write_reg(KvmReg::VcpuRegsRcx, regs.rcx);
+        self.arch.write_reg(KvmReg::VcpuRegsRdx, regs.rdx);
+        self.arch.write_reg(KvmReg::VcpuRegsRsi, regs.rsi);
+        self.arch.write_reg(KvmReg::VcpuRegsRdi, regs.rdi);
+        self.arch.write_reg(KvmReg::VcpuRegsRsp, regs.rsp);
+        self.arch.write_reg(KvmReg::VcpuRegsRbp, regs.rbp);
+
+        self.arch.write_reg(KvmReg::VcpuRegsR8, regs.r8);
+        self.arch.write_reg(KvmReg::VcpuRegsR9, regs.r9);
+        self.arch.write_reg(KvmReg::VcpuRegsR10, regs.r10);
+        self.arch.write_reg(KvmReg::VcpuRegsR11, regs.r11);
+        self.arch.write_reg(KvmReg::VcpuRegsR12, regs.r12);
+        self.arch.write_reg(KvmReg::VcpuRegsR13, regs.r13);
+        self.arch.write_reg(KvmReg::VcpuRegsR14, regs.r14);
+        self.arch.write_reg(KvmReg::VcpuRegsR15, regs.r15);
+
+        self.arch.write_reg_raw(KvmReg::VcpuRegsRip, regs.rip);
+
+        self.set_rflags(RFlags::from_bits_truncate(regs.rflags) | RFlags::FLAGS_A1);
+
+        self.arch.exception.pending = false;
+        self.arch.exception_vmexit.pending = false;
+
+        self.request(VirtCpuRequest::KVM_REQ_EVENT);
+    }
+
+    pub fn load_guest_xsave_state(&mut self) {
+        if self.arch.guest_state_protected {
+            return;
+        }
+
+        if !self.arch.read_cr4_bits(Cr4::CR4_ENABLE_OS_XSAVE).is_empty() {
+            if self.arch.xcr0 != x86_kvm_manager().host_xcr0 {
+                unsafe { _xsetbv(0, self.arch.xcr0.bits()) };
+            }
+
+            if self.arch.ia32_xss != x86_kvm_manager().host_xss {
+                // XSS
+                unsafe { wrmsr(0xda0, self.arch.ia32_xss) };
+            }
+        }
+
+        if CpuId::new().get_extended_feature_info().unwrap().has_pku()
+            && self.arch.pkru != self.arch.host_pkru
+            && (self.arch.xcr0.contains(Xcr0::XCR0_PKRU_STATE)
+                || !self
+                    .arch
+                    .read_cr4_bits(Cr4::CR4_ENABLE_PROTECTION_KEY)
+                    .is_empty())
+        {
+            KvmX86Asm::write_pkru(self.arch.pkru);
+        }
+    }
+
+    pub fn load_pdptrs(&mut self) {
+        //let mmu = self.arch.mmu();
+        if !self.arch.is_register_dirty(KvmReg::VcpuExregCr3) {
+            return;
+        }
+        //if self.arch.is_pae_paging(){
+        let mmu = self.arch.mmu();
+
+        VmxAsm::vmx_vmwrite(guest::PDPTE0_FULL, mmu.pdptrs[0]);
+        VmxAsm::vmx_vmwrite(guest::PDPTE1_FULL, mmu.pdptrs[1]);
+        VmxAsm::vmx_vmwrite(guest::PDPTE2_FULL, mmu.pdptrs[2]);
+        VmxAsm::vmx_vmwrite(guest::PDPTE3_FULL, mmu.pdptrs[3]);
+        //}else{
+        // debug!("load_pdptrs: not pae paging");
+        //}
+    }
+}
+
+bitflags! {
+    // pub struct VirtCpuRequest: u64 {
+    //     const KVM_REQUEST_MASK = 0xFF;
+
+    //     const KVM_REQ_TLB_FLUSH = 0 | Self::KVM_REQUEST_WAIT.bits | Self::KVM_REQUEST_NO_WAKEUP.bits;
+    //     const KVM_REQ_VM_DEAD = 1 | Self::KVM_REQUEST_WAIT.bits | Self::KVM_REQUEST_NO_WAKEUP.bits;
+
+    //     const KVM_REQUEST_NO_WAKEUP = 1 << 8;
+    //     const KVM_REQUEST_WAIT = 1 << 9;
+    //     const KVM_REQUEST_NO_ACTION = 1 << 10;
+
+    //     const KVM_REQ_MIGRATE_TIMER = kvm_arch_req(0);
+    //     const KVM_REQ_REPORT_TPR_ACCESS = kvm_arch_req(1);
+    //     const KVM_REQ_TRIPLE_FAULT = kvm_arch_req(2);
+    //     const KVM_REQ_MMU_SYNC = kvm_arch_req(3);
+    //     const KVM_REQ_CLOCK_UPDATE = kvm_arch_req(4);
+    //     const KVM_REQ_LOAD_MMU_PGD = kvm_arch_req(5);
+    //     const KVM_REQ_EVENT = kvm_arch_req(6);
+    //     const KVM_REQ_APF_HALT = kvm_arch_req(7);
+    //     const KVM_REQ_STEAL_UPDATE = kvm_arch_req(8);
+    //     const KVM_REQ_NMI = kvm_arch_req(9);
+    //     const KVM_REQ_PMU = kvm_arch_req(10);
+    //     const KVM_REQ_PMI = kvm_arch_req(11);
+    //     const KVM_REQ_SMI = kvm_arch_req(12);
+
+    //     const KVM_REQ_MASTERCLOCK_UPDATE = kvm_arch_req(13);
+    //     const KVM_REQ_MCLOCK_INPROGRESS = kvm_arch_req_flags(14, Self::KVM_REQUEST_WAIT.bits | Self::KVM_REQUEST_NO_WAKEUP.bits);
+    //     const KVM_REQ_SCAN_IOAPIC = kvm_arch_req_flags(15, Self::KVM_REQUEST_WAIT.bits | Self::KVM_REQUEST_NO_WAKEUP.bits);
+    //     const KVM_REQ_GLOBAL_CLOCK_UPDATE = kvm_arch_req(16);
+    //     const KVM_REQ_APIC_PAGE_RELOAD = kvm_arch_req_flags(17, Self::KVM_REQUEST_WAIT.bits | Self::KVM_REQUEST_NO_WAKEUP.bits);
+    //     const KVM_REQ_HV_CRASH = kvm_arch_req(18);
+    //     const KVM_REQ_IOAPIC_EOI_EXIT = kvm_arch_req(19);
+    //     const KVM_REQ_HV_RESET = kvm_arch_req(20);
+    //     const KVM_REQ_HV_EXIT = kvm_arch_req(21);
+    //     const KVM_REQ_HV_STIMER = kvm_arch_req(22);
+    //     const KVM_REQ_LOAD_EOI_EXITMAP = kvm_arch_req(23);
+    //     const KVM_REQ_GET_NESTED_STATE_PAGES = kvm_arch_req(24);
+    //     const KVM_REQ_APICV_UPDATE = kvm_arch_req_flags(25, Self::KVM_REQUEST_WAIT.bits | Self::KVM_REQUEST_NO_WAKEUP.bits);
+    //     const KVM_REQ_TLB_FLUSH_CURRENT = kvm_arch_req(26);
+
+    //     const KVM_REQ_TLB_FLUSH_GUEST = kvm_arch_req_flags(27, Self::KVM_REQUEST_WAIT.bits | Self::KVM_REQUEST_NO_WAKEUP.bits);
+    //     const KVM_REQ_APF_READY = kvm_arch_req(28);
+    //     const KVM_REQ_MSR_FILTER_CHANGED = kvm_arch_req(29);
+    //     const KVM_REQ_UPDATE_CPU_DIRTY_LOGGING  = kvm_arch_req_flags(30, Self::KVM_REQUEST_WAIT.bits | Self::KVM_REQUEST_NO_WAKEUP.bits);
+    //     const KVM_REQ_MMU_FREE_OBSOLETE_ROOTS = kvm_arch_req_flags(31, Self::KVM_REQUEST_WAIT.bits | Self::KVM_REQUEST_NO_WAKEUP.bits);
+    //     const KVM_REQ_HV_TLB_FLUSH = kvm_arch_req_flags(32, Self::KVM_REQUEST_WAIT.bits | Self::KVM_REQUEST_NO_WAKEUP.bits);
+    // }
+
+    pub struct VirtCpuRequest: u64 {
+        // const KVM_REQUEST_MASK = 0xFF;
+
+        const KVM_REQ_TLB_FLUSH = Self::KVM_REQUEST_WAIT.bits | Self::KVM_REQUEST_NO_WAKEUP.bits;
+        const KVM_REQ_VM_DEAD = 1;
+
+        const KVM_REQUEST_NO_WAKEUP = 1 << 8;
+        const KVM_REQUEST_WAIT = 1 << 9;
+        const KVM_REQUEST_NO_ACTION = 1 << 10;
+
+        const KVM_REQ_MIGRATE_TIMER = kvm_arch_req(0);
+        const KVM_REQ_REPORT_TPR_ACCESS = kvm_arch_req(1);
+        const KVM_REQ_TRIPLE_FAULT = kvm_arch_req(2);
+        const KVM_REQ_MMU_SYNC = kvm_arch_req(3);
+        const KVM_REQ_CLOCK_UPDATE = kvm_arch_req(4);
+        const KVM_REQ_LOAD_MMU_PGD = kvm_arch_req(5);
+        const KVM_REQ_EVENT = kvm_arch_req(6);
+        const KVM_REQ_APF_HALT = kvm_arch_req(7);
+        const KVM_REQ_STEAL_UPDATE = kvm_arch_req(8);
+        const KVM_REQ_NMI = kvm_arch_req(9);
+        const KVM_REQ_PMU = kvm_arch_req(10);
+        const KVM_REQ_PMI = kvm_arch_req(11);
+        const KVM_REQ_SMI = kvm_arch_req(12);
+
+        const KVM_REQ_MASTERCLOCK_UPDATE = kvm_arch_req(13);
+
+        const KVM_REQ_MCLOCK_INPROGRESS = kvm_arch_req(14);
+        const MAKE_KVM_REQ_MCLOCK_INPROGRESS = kvm_arch_req_flags(14, Self::KVM_REQUEST_WAIT.bits | Self::KVM_REQUEST_NO_WAKEUP.bits);
+
+        const KVM_REQ_SCAN_IOAPIC = kvm_arch_req(15);
+        const MAKE_KVM_REQ_SCAN_IOAPIC = kvm_arch_req_flags(15, Self::KVM_REQUEST_WAIT.bits | Self::KVM_REQUEST_NO_WAKEUP.bits);
+
+
+        const KVM_REQ_GLOBAL_CLOCK_UPDATE = kvm_arch_req(16);
+
+        const KVM_REQ_APIC_PAGE_RELOAD = kvm_arch_req(17);
+        const MAKE_KVM_REQ_APIC_PAGE_RELOAD = kvm_arch_req_flags(17, Self::KVM_REQUEST_WAIT.bits | Self::KVM_REQUEST_NO_WAKEUP.bits);
+
+        const KVM_REQ_HV_CRASH = kvm_arch_req(18);
+        const KVM_REQ_IOAPIC_EOI_EXIT = kvm_arch_req(19);
+        const KVM_REQ_HV_RESET = kvm_arch_req(20);
+        const KVM_REQ_HV_EXIT = kvm_arch_req(21);
+        const KVM_REQ_HV_STIMER = kvm_arch_req(22);
+        const KVM_REQ_LOAD_EOI_EXITMAP = kvm_arch_req(23);
+        const KVM_REQ_GET_NESTED_STATE_PAGES = kvm_arch_req(24);
+
+        const KVM_REQ_APICV_UPDATE = kvm_arch_req(25);
+        const MAKE_KVM_REQ_APICV_UPDATE = kvm_arch_req_flags(25, Self::KVM_REQUEST_WAIT.bits | Self::KVM_REQUEST_NO_WAKEUP.bits);
+
+        const KVM_REQ_TLB_FLUSH_CURRENT = kvm_arch_req(26);
+
+        const KVM_REQ_TLB_FLUSH_GUEST = kvm_arch_req(27);
+        const MAKE_KVM_REQ_TLB_FLUSH_GUEST = kvm_arch_req_flags(27, Self::KVM_REQUEST_WAIT.bits | Self::KVM_REQUEST_NO_WAKEUP.bits);
+
+        const KVM_REQ_APF_READY = kvm_arch_req(28);
+        const KVM_REQ_MSR_FILTER_CHANGED = kvm_arch_req(29);
+
+        const KVM_REQ_UPDATE_CPU_DIRTY_LOGGING  = kvm_arch_req(30);
+        const MAKE_KVM_REQ_UPDATE_CPU_DIRTY_LOGGING  = kvm_arch_req_flags(30, Self::KVM_REQUEST_WAIT.bits | Self::KVM_REQUEST_NO_WAKEUP.bits);
+
+        const KVM_REQ_MMU_FREE_OBSOLETE_ROOTS = kvm_arch_req(31);
+        const MAKE_KVM_REQ_MMU_FREE_OBSOLETE_ROOTS = kvm_arch_req_flags(31, Self::KVM_REQUEST_WAIT.bits | Self::KVM_REQUEST_NO_WAKEUP.bits);
+
+        const KVM_REQ_HV_TLB_FLUSH = kvm_arch_req(32);
+        const MAKE_KVM_REQ_HV_TLB_FLUSH = kvm_arch_req_flags(32, Self::KVM_REQUEST_WAIT.bits | Self::KVM_REQUEST_NO_WAKEUP.bits);
+    }
+}
+
+// const KVM_REQUEST_ARCH_BASE: u64 = 8;
+const KVM_REQUEST_ARCH_BASE: u64 = 11;
+
+const fn kvm_arch_req(nr: u64) -> u64 {
+    return kvm_arch_req_flags(nr, 0);
+}
+
+const fn kvm_arch_req_flags(nr: u64, flags: u64) -> u64 {
+    1 << (nr + KVM_REQUEST_ARCH_BASE) | flags
+}
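+
+// Illustrative compile-time checks (not part of the upstream change), assuming the
+// constants above: an arch-specific request number becomes a dedicated bit above
+// KVM_REQUEST_ARCH_BASE, and kvm_arch_req_flags() only ORs the generic WAIT /
+// NO_WAKEUP flag bits on top of that request bit.
+const _: () = assert!(kvm_arch_req(0) == 1 << KVM_REQUEST_ARCH_BASE);
+const _: () = assert!(kvm_arch_req_flags(3, 1 << 9) == (1 << (3 + KVM_REQUEST_ARCH_BASE)) | (1 << 9));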
+
+#[derive(Debug, Default)]
+pub struct KvmQueuedInterrupt {
+    pub injected: bool,
+    pub soft: bool,
+    pub nr: u8,
+}
+
+#[derive(Debug, Default)]
+#[allow(dead_code)]
+pub struct KvmQueuedException {
+    pending: bool,
+    injected: bool,
+    has_error_code: bool,
+    vector: u8,
+    error_code: u32,
+    payload: usize,
+    has_payload: bool,
+}
+
+#[derive(Debug)]
+#[allow(dead_code)]
+pub struct KvmAsyncPageFault {
+    /// Whether the vCPU is halted waiting for an async page fault.
+    halted: bool,
+    /// GFNs (Guest Frame Numbers) with outstanding async page faults.
+    gfns: [u64; Self::ASYNC_PF_PER_VCPU],
+    /// Cache for GFN to HVA (Host Virtual Address) translations.
+    data: GfnToHvaCache,
+    /// Value of the MSR_KVM_ASYNC_PF_EN register.
+    msr_en_val: u64,
+    /// Value of the MSR_KVM_ASYNC_PF_INT register.
+    msr_int_val: u64,
+    /// Interrupt vector used for async page faults.
+    vec: u16,
+    /// ID of the async page fault.
+    id: u32,
+    /// Whether notifications are delivered to user space only.
+    send_user_only: bool,
+    /// Host async page fault flags.
+    host_apf_flags: u32,
+    /// Whether the fault is delivered as a page-fault VM exit.
+    delivery_as_pf_vmexit: bool,
+    /// Whether a "page ready" notification is pending.
+    pageready_pending: bool,
+}
+
+impl KvmAsyncPageFault {
+    pub const ASYNC_PF_PER_VCPU: usize = 64;
+}
+
+#[derive(Debug)]
+pub enum KvmIntrType {
+    None,
+    Irq,
+    // Nmi,
+}

+ 24 - 0
kernel/src/arch/x86_64/vm/mem.rs

@@ -0,0 +1,24 @@
+use alloc::sync::Arc;
+use log::warn;
+use system_error::SystemError;
+
+use crate::virt::vm::kvm_host::{
+    mem::{KvmMemoryChangeMode, LockedKvmMemSlot},
+    Vm,
+};
+
+#[allow(dead_code)]
+pub struct KvmArchMemorySlot {}
+
+impl Vm {
+    pub fn arch_prepare_memory_region(
+        &self,
+        _old: Option<&Arc<LockedKvmMemSlot>>,
+        _new: Option<&Arc<LockedKvmMemSlot>>,
+        _change: KvmMemoryChangeMode,
+    ) -> Result<(), SystemError> {
+        // todo
+        warn!("arch_prepare_memory_region TODO");
+        Ok(())
+    }
+}

+ 648 - 0
kernel/src/arch/x86_64/vm/mmu/kvm_mmu.rs

@@ -0,0 +1,648 @@
+use crate::arch::mm::X86_64MMArch;
+use crate::arch::vm::asm::VmxAsm;
+use crate::arch::vm::kvm_host::page::KVM_MIN_FREE_MMU_PAGES;
+use crate::mm::PhysAddr;
+use crate::virt::kvm::host_mem::PAGE_SHIFT;
+use crate::{
+    arch::{mm::LockedFrameAllocator, MMArch, VirtCpuArch},
+    libs::spinlock::{SpinLock, SpinLockGuard},
+    mm::{page::PageMapper, MemoryManagementArch, PageTableKind},
+    virt::vm::kvm_host::{vcpu::VirtCpu, Vm},
+};
+use alloc::{sync::Arc, vec::Vec};
+use bitfield_struct::bitfield;
+use core::intrinsics::likely;
+use core::ops::{Add, Sub};
+use log::{debug, error, warn};
+use raw_cpuid::CpuId;
+use system_error::SystemError;
+use x86::controlregs::{Cr0, Cr4};
+use x86::vmx::vmcs::guest;
+use x86_64::registers::control::EferFlags;
+
+use super::super::{vmx::vmx_info, x86_kvm_ops};
+use super::mmu_internal::KvmPageFault;
+
+const PT64_ROOT_5LEVEL: usize = 5;
+const PT64_ROOT_4LEVEL: usize = 4;
+const PT32_ROOT_LEVEL: usize = 2;
+const PT32E_ROOT_LEVEL: usize = 3;
+
+static mut TDP_ENABLED: bool = false;
+static mut TDP_MMU_ENABLED: bool = true;
+static mut TDP_MMU_ALLOWED: bool = unsafe { TDP_MMU_ENABLED };
+
+static mut TDP_ROOT_LEVEL: usize = 0;
+static mut MAX_TDP_LEVEL: usize = 0;
+static mut SHADOW_ACCESSED_MASK: usize = 0;
+
+static mut MAX_HUGE_PAGE_LEVEL: PageLevel = PageLevel::None;
+
+pub const PAGE_SIZE: u64 = 1 << PAGE_SHIFT;
+
+pub fn is_tdp_mmu_enabled() -> bool {
+    unsafe { TDP_MMU_ENABLED }
+}
+
+#[allow(dead_code)]
+#[repr(u8)]
+#[derive(Debug, PartialEq, Eq, Clone, Copy)]
+pub enum PageLevel {
+    None,
+    Level4K,
+    Level2M,
+    Level1G,
+    Level512G,
+    LevelNum,
+}
+// Add implementation: step the page level up by `other`, saturating at LevelNum.
+impl Add<usize> for PageLevel {
+    type Output = Self;
+
+    fn add(self, other: usize) -> Self {
+        let result = self as usize + other;
+        match result {
+            0 => PageLevel::None,
+            1 => PageLevel::Level4K,
+            2 => PageLevel::Level2M,
+            3 => PageLevel::Level1G,
+            4 => PageLevel::Level512G,
+            5 => PageLevel::LevelNum,
+            _ => PageLevel::LevelNum, // out of range: saturate to LevelNum
+        }
+    }
+}
+// Sub implementation: step the page level down by `other`, saturating at None.
+impl Sub<usize> for PageLevel {
+    type Output = Self;
+
+    fn sub(self, other: usize) -> Self {
+        let result = self as isize - other as isize;
+        match result {
+            0 => PageLevel::None,
+            1 => PageLevel::Level4K,
+            2 => PageLevel::Level2M,
+            3 => PageLevel::Level1G,
+            4 => PageLevel::Level512G,
+            5 => PageLevel::LevelNum,
+            _ => PageLevel::None, // out of range: saturate to None
+        }
+    }
+}
+impl PageLevel {
+    fn kvm_hpage_gfn_shift(level: u8) -> u32 {
+        ((level - 1) * 9) as u32
+    }
+
+    fn kvm_hpage_shift(level: u8) -> u32 {
+        PAGE_SHIFT + Self::kvm_hpage_gfn_shift(level)
+    }
+
+    fn kvm_hpage_size(level: u8) -> u64 {
+        1 << Self::kvm_hpage_shift(level)
+    }
+    /// Compute the number of base pages covered by one huge page.
+    ///
+    /// # Arguments
+    /// - `level`: page level
+    ///
+    /// # Returns
+    /// The number of 4 KiB pages per huge page at this level.
+    pub fn kvm_pages_per_hpage(level: u8) -> u64 {
+        Self::kvm_hpage_size(level) / PAGE_SIZE
+    }
+}
+/// Round the given GFN (Guest Frame Number) down to the boundary of the given page level.
+pub fn gfn_round_for_level(gfn: u64, level: u8) -> u64 {
+    gfn & !(PageLevel::kvm_pages_per_hpage(level) - 1)
+}
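+
+// Worked example (illustrative, not part of the upstream change): at level 2 a huge
+// page spans 2 MiB, i.e. kvm_pages_per_hpage(2) == 512 base pages, so
+// gfn_round_for_level() clears the low 9 bits of the GFN. Assumes the 4 KiB
+// PAGE_SHIFT used above; only compiled for tests.
+#[cfg(test)]
+mod gfn_level_tests {
+    use super::{gfn_round_for_level, PageLevel};
+
+    #[test]
+    fn round_gfn_to_2m_boundary() {
+        assert_eq!(PageLevel::kvm_pages_per_hpage(2), 512);
+        assert_eq!(gfn_round_for_level(0x12345, 2), 0x12200);
+    }
+}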
+
+#[derive(Debug)]
+pub struct LockedKvmMmu {
+    inner: SpinLock<KvmMmu>,
+}
+
+impl LockedKvmMmu {
+    pub fn new(mmu: KvmMmu) -> Arc<Self> {
+        Arc::new(Self {
+            inner: SpinLock::new(mmu),
+        })
+    }
+
+    pub fn lock(&self) -> SpinLockGuard<KvmMmu> {
+        self.inner.lock()
+    }
+}
+
+pub type KvmMmuPageFaultHandler =
+    fn(vcpu: &mut VirtCpu, page_fault: &KvmPageFault) -> Result<i32, SystemError>;
+
+#[derive(Debug, Default)]
+#[allow(dead_code)]
+pub struct KvmMmu {
+    pub root: KvmMmuRootInfo,
+    pub cpu_role: KvmCpuRole,
+    pub root_role: KvmMmuPageRole,
+    pub page_fault: Option<KvmMmuPageFaultHandler>,
+
+    pkru_mask: u32,
+
+    prev_roots: [KvmMmuRootInfo; Self::KVM_MMU_NUM_PREV_ROOTS],
+
+    pae_root: Vec<u64>,
+
+    pub pdptrs: [u64; 4],
+}
+
+impl KvmMmu {
+    pub fn _save_pdptrs(&mut self) {
+        self.pdptrs[0] = VmxAsm::vmx_vmread(guest::PDPTE0_FULL);
+        self.pdptrs[1] = VmxAsm::vmx_vmread(guest::PDPTE1_FULL);
+        self.pdptrs[2] = VmxAsm::vmx_vmread(guest::PDPTE2_FULL);
+        self.pdptrs[3] = VmxAsm::vmx_vmread(guest::PDPTE3_FULL);
+    }
+    const KVM_MMU_NUM_PREV_ROOTS: usize = 3;
+    pub const INVALID_PAGE: u64 = u64::MAX;
+
+    #[inline]
+    pub fn tdp_enabled() -> bool {
+        unsafe { TDP_ENABLED }
+    }
+
+    #[inline]
+    pub fn tdp_root_level() -> usize {
+        unsafe { TDP_ROOT_LEVEL }
+    }
+
+    #[inline]
+    pub fn max_tdp_level() -> usize {
+        unsafe { MAX_TDP_LEVEL }
+    }
+
+    #[inline]
+    pub fn ad_enabled() -> bool {
+        unsafe { SHADOW_ACCESSED_MASK != 0 }
+    }
+
+    /// Initialize the global MMU configuration. The statics touched here are not protected by any lock, so this function must only be called during VMX initialization.
+    pub fn kvm_configure_mmu(
+        enable_tdp: bool,
+        tdp_forced_root_level: usize,
+        tdp_max_root_level: usize,
+        tdp_huge_page_level: PageLevel,
+    ) {
+        unsafe {
+            TDP_ENABLED = enable_tdp;
+            TDP_ROOT_LEVEL = tdp_forced_root_level;
+            MAX_TDP_LEVEL = tdp_max_root_level;
+
+            TDP_MMU_ENABLED = TDP_MMU_ALLOWED && TDP_ENABLED;
+
+            if TDP_ENABLED {
+                MAX_HUGE_PAGE_LEVEL = tdp_huge_page_level;
+            } else if CpuId::new()
+                .get_extended_processor_and_feature_identifiers()
+                .unwrap()
+                .has_1gib_pages()
+            {
+                MAX_HUGE_PAGE_LEVEL = PageLevel::Level1G;
+            } else {
+                MAX_HUGE_PAGE_LEVEL = PageLevel::Level2M;
+            }
+        }
+    }
+}
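+
+// Illustrative call site (the actual VMX init code elsewhere in this change may pass
+// different values): with EPT enabled the TDP MMU is configured once, before any vCPU
+// runs, e.g.
+//     KvmMmu::kvm_configure_mmu(true, 0, PT64_ROOT_4LEVEL, PageLevel::Level1G);
+// Passing enable_tdp = false instead falls back to the shadow-paging huge-page limit
+// derived from CPUID above.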
+
+#[derive(Debug, Default)]
+pub struct KvmMmuRootInfo {
+    pub pgd: u64,
+    pub hpa: u64,
+}
+
+#[derive(Debug, Default, Clone, Copy)]
+pub struct KvmCpuRole {
+    base: KvmMmuPageRole,
+    extend: KvmMmuExtenedRole,
+}
+
+impl PartialEq for KvmCpuRole {
+    fn eq(&self, other: &Self) -> bool {
+        self.base.0 == other.base.0 && self.extend.0 == other.extend.0
+    }
+}
+
+/// ### Tracks the properties of shadow pages (including TDP pages), used to decide whether a page can be reused in a given MMU context.
+#[bitfield(u32)]
+pub struct KvmMmuPageRole {
+    /// Page table level, 4 bits. For ordinary page tables the value is 2 (two-level), 3 (three-level), 4 (four-level) or 5 (five-level).
+    #[bits(4)]
+    pub level: u32,
+    /// Whether guest PTEs are 4 bytes, 1 bit. Set for non-PAE paging.
+    has_4_byte_gpte: bool,
+    /// Quadrant the page table entry belongs to, 2 bits. Only meaningful when has_4_byte_gpte is set.
+    #[bits(2)]
+    quadrant: u32,
+    /// Whether the page is directly mapped.
+    direct: bool,
+    /// Access permissions of the page.
+    #[bits(3)]
+    access: u32,
+    /// Whether the page is invalid.
+    invalid: bool,
+    /// Whether the NX (no-execute) bit is in effect for this page.
+    efer_nx: bool,
+    /// Whether the write-protect bit (WP) in CR0 is set.
+    cr0_wp: bool,
+    /// SMEP (Supervisor Mode Execution Protection) enabled while WP is clear.
+    smep_andnot_wp: bool,
+    /// SMAP (Supervisor Mode Access Prevention) enabled while WP is clear.
+    smap_andnot_wp: bool,
+    /// Whether the accessed bit is disabled for this page.
+    ad_disabled: bool,
+    /// Whether the page was created while the vCPU was in guest (nested) mode.
+    guest_mode: bool,
+    /// Whether this page is passed through to the guest.
+    passthrough: bool,
+    /// Unused bits.
+    #[bits(5)]
+    unused: u32,
+    /// SMM (System Management Mode) indicator.
+    #[bits(8)]
+    pub smm: u32,
+}
+
+impl KvmMmuPageRole {
+    pub fn is_cr0_pg(&self) -> bool {
+        self.level() > 0
+    }
+
+    pub fn is_cr4_pae(&self) -> bool {
+        !self.has_4_byte_gpte()
+    }
+    pub fn get_direct(&self) -> bool {
+        self.direct()
+    }
+}
+
+#[bitfield(u32)]
+pub struct KvmMmuExtenedRole {
+    valid: bool,
+    execonly: bool,
+    cr4_pse: bool,
+    cr4_pke: bool,
+    cr4_smap: bool,
+    cr4_smep: bool,
+    cr4_la57: bool,
+    efer_lma: bool,
+    #[bits(24)]
+    unused: u32,
+}
+
+pub struct KvmMmuRoleRegs {
+    pub cr0: Cr0,
+    pub cr4: Cr4,
+    pub efer: EferFlags,
+}
+
+/// Return values for page fault handling, used by handle_mmio_page_fault(),
+/// mmu.page_fault(), fast_page_fault(), kvm_mmu_do_page_fault(), and friends.
+#[derive(Debug, Eq, PartialEq, FromPrimitive, Clone)]
+#[repr(u32)]
+pub enum PFRet {
+    Continue,       // RET_PF_CONTINUE: everything is fine so far, keep handling the fault.
+    Retry,          // RET_PF_RETRY: let the CPU fault again on the same address.
+    Emulate,        // RET_PF_EMULATE: MMIO page fault, emulate the instruction directly.
+    Invalid,        // RET_PF_INVALID: the SPTE is invalid, let the real fault path update it.
+    Fixed,          // RET_PF_FIXED: the faulting entry has been fixed.
+    Spurious,       // RET_PF_SPURIOUS: the faulting entry was already fixed, e.g. by another vCPU.
+    Err = u32::MAX, // error
+}
+impl From<PFRet> for i32 {
+    fn from(pf_ret: PFRet) -> Self {
+        pf_ret as i32
+    }
+}
+impl From<i32> for PFRet {
+    fn from(value: i32) -> Self {
+        match value {
+            0 => PFRet::Continue,
+            1 => PFRet::Retry,
+            2 => PFRet::Emulate,
+            3 => PFRet::Invalid,
+            4 => PFRet::Fixed,
+            5 => PFRet::Spurious,
+            _ => PFRet::Err, // any other value maps to Err
+        }
+    }
+}
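A quick illustration of the conversion pair above (a hypothetical test, not part of the patch): in-range discriminants round-trip through i32, and anything else, including the -1 produced by PFRet::Err, collapses back to Err.

#[cfg(test)]
mod pf_ret_conversion {
    use super::PFRet;

    #[test]
    fn round_trip() {
        // Emulate has discriminant 2, so it survives the i32 round trip.
        assert_eq!(PFRet::from(i32::from(PFRet::Emulate)), PFRet::Emulate);
        // Values outside 0..=5 (e.g. the -1 that PFRet::Err converts to) map to Err.
        assert_eq!(PFRet::from(-1), PFRet::Err);
    }
}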
+impl VirtCpuArch {
+    pub fn kvm_init_mmu(&mut self) {
+        let regs = self.role_regs();
+        let cpu_role = self.calc_cpu_role(&regs);
+
+        if self.walk_mmu.is_some()
+            && self.nested_mmu.is_some()
+            && Arc::ptr_eq(
+                self.walk_mmu.as_ref().unwrap(),
+                self.nested_mmu.as_ref().unwrap(),
+            )
+        {
+            todo!()
+        } else if KvmMmu::tdp_enabled() {
+            self.init_tdp_mmu(cpu_role);
+        } else {
+            todo!()
+        }
+    }
+
+    fn unload_mmu(&mut self) {
+        // TODO
+    }
+
+    pub fn reset_mmu_context(&mut self) {
+        self.unload_mmu();
+        self.kvm_init_mmu();
+    }
+
+    fn role_regs(&mut self) -> KvmMmuRoleRegs {
+        KvmMmuRoleRegs {
+            cr0: self.read_cr0_bits(Cr0::CR0_ENABLE_PAGING | Cr0::CR0_WRITE_PROTECT),
+            cr4: self.read_cr4_bits(
+                Cr4::CR4_ENABLE_PSE
+                    | Cr4::CR4_ENABLE_PAE
+                    | Cr4::CR4_ENABLE_LA57
+                    | Cr4::CR4_ENABLE_SMEP
+                    | Cr4::CR4_ENABLE_SMAP
+                    | Cr4::CR4_ENABLE_PROTECTION_KEY,
+            ),
+            efer: self.efer,
+        }
+    }
+
+    fn calc_cpu_role(&self, regs: &KvmMmuRoleRegs) -> KvmCpuRole {
+        let mut role = KvmCpuRole::default();
+        let base = &mut role.base;
+        let ext = &mut role.extend;
+        base.set_access(0b111);
+        base.set_smm(self.is_smm() as u32);
+        base.set_guest_mode(self.is_guest_mode());
+        ext.set_valid(true);
+
+        if !regs.cr0.contains(Cr0::CR0_ENABLE_PAGING) {
+            base.set_direct(true);
+            return role;
+        }
+
+        base.set_efer_nx(regs.efer.contains(EferFlags::NO_EXECUTE_ENABLE));
+        base.set_cr0_wp(regs.cr0.contains(Cr0::CR0_WRITE_PROTECT));
+        base.set_smep_andnot_wp(
+            regs.cr4.contains(Cr4::CR4_ENABLE_SMEP) && !regs.cr0.contains(Cr0::CR0_WRITE_PROTECT),
+        );
+        base.set_smap_andnot_wp(
+            regs.cr4.contains(Cr4::CR4_ENABLE_SMAP) && !regs.cr0.contains(Cr0::CR0_WRITE_PROTECT),
+        );
+
+        base.set_has_4_byte_gpte(!regs.cr4.contains(Cr4::CR4_ENABLE_PAE));
+
+        if regs.efer.contains(EferFlags::LONG_MODE_ACTIVE) {
+            let level = if regs.cr4.contains(Cr4::CR4_ENABLE_LA57) {
+                PT64_ROOT_5LEVEL as u32
+            } else {
+                PT64_ROOT_4LEVEL as u32
+            };
+            base.set_level(level);
+        } else if regs.cr4.contains(Cr4::CR4_ENABLE_PAE) {
+            base.set_level(PT32E_ROOT_LEVEL as u32);
+        } else {
+            base.set_level(PT32_ROOT_LEVEL as u32);
+        }
+
+        ext.set_cr4_smep(regs.cr4.contains(Cr4::CR4_ENABLE_SMEP));
+        ext.set_cr4_smap(regs.cr4.contains(Cr4::CR4_ENABLE_SMAP));
+        ext.set_cr4_pse(regs.cr4.contains(Cr4::CR4_ENABLE_PSE));
+        ext.set_cr4_pke(
+            regs.efer.contains(EferFlags::LONG_MODE_ACTIVE)
+                && regs.cr4.contains(Cr4::CR4_ENABLE_PROTECTION_KEY),
+        );
+        ext.set_cr4_la57(
+            regs.efer.contains(EferFlags::LONG_MODE_ACTIVE)
+                && regs.cr4.contains(Cr4::CR4_ENABLE_LA57),
+        );
+        ext.set_efer_lma(regs.efer.contains(EferFlags::LONG_MODE_ACTIVE));
+
+        role
+    }
+
+    /// https://code.dragonos.org.cn/xref/linux-6.6.21/arch/x86/kvm/mmu/mmu.c#6019
+    pub fn vcpu_arch_mmu_create(&mut self) {
+        if vmx_info().tdp_enabled() {
+            self.guset_mmu = Some(self._mmu_create());
+        }
+
+        self.root_mmu = Some(self._mmu_create());
+        self.mmu = self.root_mmu.clone();
+        self.walk_mmu = self.root_mmu.clone();
+    }
+
+    fn _mmu_create(&self) -> Arc<LockedKvmMmu> {
+        let mut mmu = KvmMmu::default();
+
+        mmu.root.hpa = KvmMmu::INVALID_PAGE;
+        mmu.root.pgd = 0;
+
+        for role in &mut mmu.prev_roots {
+            role.hpa = KvmMmu::INVALID_PAGE;
+            role.pgd = KvmMmu::INVALID_PAGE;
+        }
+
+        if KvmMmu::tdp_enabled() && self.mmu_get_tdp_level() > PT32E_ROOT_LEVEL {
+            return LockedKvmMmu::new(mmu);
+        }
+
+        mmu.pae_root
+            .resize(MMArch::PAGE_SIZE / core::mem::size_of::<u64>(), 0);
+
+        return LockedKvmMmu::new(mmu);
+    }
+
+    fn mmu_get_tdp_level(&self) -> usize {
+        if KvmMmu::tdp_root_level() != 0 {
+            return KvmMmu::tdp_root_level();
+        }
+
+        if KvmMmu::max_tdp_level() == 5 && self.max_phyaddr <= 48 {
+            return 4;
+        }
+
+        return KvmMmu::max_tdp_level();
+    }
+
+    pub fn init_tdp_mmu(&mut self, cpu_role: KvmCpuRole) {
+        let context = self.root_mmu();
+        let mut context = context.lock();
+        let root_role = self.calc_tdp_mmu_root_page_role(cpu_role);
+
+        if cpu_role == context.cpu_role && root_role.0 == context.root_role.0 {
+            return;
+        }
+
+        context.cpu_role = cpu_role;
+        context.root_role = root_role;
+
+        // TODO: install the paging callback set (gva_to_gpa etc.)
+
+        if !context.cpu_role.base.is_cr0_pg() {
+            // todo: context->gva_to_gpa = nonpaging_gva_to_gpa;
+            warn!("context->gva_to_gpa = nonpaging_gva_to_gpa todo!");
+        } else if context.cpu_role.base.is_cr4_pae() {
+            // todo: context->gva_to_gpa = paging64_gva_to_gpa;
+            warn!("context->gva_to_gpa = paging64_gva_to_gpa todo!");
+        } else {
+            // todo: context->gva_to_gpa = paging32_gva_to_gpa;
+            warn!("context->gva_to_gpa = paging32_gva_to_gpa todo!");
+        }
+
+        // todo:
+        // reset_guest_paging_metadata(vcpu, context);
+        // reset_tdp_shadow_zero_bits_mask(context);
+    }
+
+    #[inline]
+    pub fn root_mmu(&self) -> &Arc<LockedKvmMmu> {
+        self.root_mmu.as_ref().unwrap()
+    }
+
+    #[inline]
+    pub fn mmu(&self) -> SpinLockGuard<KvmMmu> {
+        self.mmu.as_ref().unwrap().lock()
+    }
+
+    fn calc_tdp_mmu_root_page_role(&self, cpu_role: KvmCpuRole) -> KvmMmuPageRole {
+        let mut role = KvmMmuPageRole::default();
+
+        role.set_access(0b111);
+        role.set_cr0_wp(true);
+        role.set_efer_nx(true);
+        role.set_smm(cpu_role.base.smm());
+        role.set_guest_mode(cpu_role.base.guest_mode());
+        role.set_ad_disabled(!KvmMmu::ad_enabled());
+        role.set_level(self.mmu_get_tdp_level() as u32);
+        role.set_direct(true);
+        role.set_has_4_byte_gpte(false);
+
+        role
+    }
+}
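To make the root-level selection in mmu_get_tdp_level concrete, here is a standalone re-statement of the same decision with hypothetical values: a forced root level wins, otherwise a 5-level-capable host still uses 4-level EPT when the guest MAXPHYADDR fits in 48 bits.

// Illustrative re-statement of mmu_get_tdp_level(), not used by the kernel itself.
fn pick_tdp_level(forced_root_level: usize, max_tdp_level: usize, guest_maxphyaddr: usize) -> usize {
    if forced_root_level != 0 {
        return forced_root_level; // explicit override from kvm_configure_mmu
    }
    if max_tdp_level == 5 && guest_maxphyaddr <= 48 {
        return 4; // 4-level EPT already covers the guest physical address space
    }
    max_tdp_level
}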
+
+impl VirtCpu {
+    pub fn kvm_mmu_reload(&mut self, vm: &Vm) -> Result<(), SystemError> {
+        if likely(self.arch.mmu().root.hpa != KvmMmu::INVALID_PAGE) {
+            return Ok(());
+        }
+
+        return self.kvm_mmu_load(vm);
+    }
+
+    pub fn kvm_mmu_load(&mut self, vm: &Vm) -> Result<(), SystemError> {
+        let direct = self.arch.mmu().root_role.direct();
+        self.mmu_topup_memory_caches(!direct)?;
+        self.mmu_alloc_special_roots()?;
+
+        if direct {
+            self.mmu_alloc_direct_roots(vm)?;
+        } else {
+            self.mmu_alloc_shadow_roots(vm)?;
+        }
+
+        // TODO: kvm_mmu_sync_roots
+
+        self.kvm_mmu_load_pgd(vm);
+
+        Ok(())
+    }
+
+    pub fn kvm_mmu_load_pgd(&mut self, vm: &Vm) {
+        let root_hpa = self.arch.mmu().root.hpa;
+        debug!("kvm_mmu_load_pgd::root_hpa = {:#x}", root_hpa);
+        if root_hpa == KvmMmu::INVALID_PAGE {
+            return;
+        }
+
+        let level = self.arch.mmu().root_role.level();
+        x86_kvm_ops().load_mmu_pgd(self, vm, root_hpa, level);
+    }
+
+    fn mmu_topup_memory_caches(&mut self, _maybe_indirect: bool) -> Result<(), SystemError> {
+        // TODO
+        Ok(())
+    }
+
+    fn mmu_alloc_special_roots(&mut self) -> Result<(), SystemError> {
+        // TODO
+        Ok(())
+    }
+
+    fn mmu_alloc_direct_roots(&mut self, vm: &Vm) -> Result<(), SystemError> {
+        let shadow_root_level = self.arch.mmu().root_role.level();
+        let _r: Result<(), SystemError> = self.make_mmu_pages_available(vm);
+        let root: PhysAddr;
+        if KvmMmu::tdp_enabled() {
+            root = self.kvm_tdp_mmu_get_vcpu_root_hpa().unwrap();
+            let mut mmu = self.arch.mmu();
+            mmu.root.hpa = root.data() as u64;
+        } else if shadow_root_level >= PT64_ROOT_4LEVEL as u32 {
+            todo!()
+        } else if shadow_root_level == PT32E_ROOT_LEVEL as u32 {
+            todo!()
+        } else {
+            error!("Bad TDP root level = {}", shadow_root_level);
+            return Err(SystemError::EIO);
+        }
+        /* root.pgd is ignored for direct MMUs. */
+        self.arch.mmu().root.pgd = 0;
+        Ok(())
+    }
+
+    fn mmu_alloc_shadow_roots(&mut self, _vm: &Vm) -> Result<(), SystemError> {
+        todo!();
+    }
+    fn make_mmu_pages_available(&mut self, vm: &Vm) -> Result<(), SystemError> {
+        let avail = Self::kvm_mmu_available_pages(vm);
+        if likely(avail >= KVM_MIN_FREE_MMU_PAGES) {
+            return Ok(());
+        }
+        //kvm_mmu_zap_oldest_mmu_pages(vm, KVM_REFILL_PAGES - avail);
+        if Self::kvm_mmu_available_pages(vm) == 0 {
+            return Err(SystemError::ENOSPC);
+        }
+        Ok(())
+    }
+    fn kvm_mmu_available_pages(vm: &Vm) -> usize {
+        if vm.arch.n_max_mmu_pages > vm.arch.n_used_mmu_pages {
+            return vm.arch.n_max_mmu_pages - vm.arch.n_used_mmu_pages;
+        }
+        return 0;
+    }
+    fn kvm_tdp_mmu_get_vcpu_root_hpa(&self) -> Result<PhysAddr, SystemError> {
+        //todo Check for an existing root before allocating a new one.  Note, the
+        // role check prevents consuming an invalid root.
+        let root = self.tdp_mmu_alloc_sp().unwrap();
+        Ok(PhysAddr::new(root as usize))
+    }
+    fn tdp_mmu_alloc_sp(&self) -> Result<u64, SystemError> {
+        // Allocate and create a fresh (EPT) page table
+        let mapper: crate::mm::page::PageMapper<X86_64MMArch, LockedFrameAllocator> = unsafe {
+            PageMapper::create(PageTableKind::EPT, LockedFrameAllocator)
+                .ok_or(SystemError::ENOMEM)?
+        };
+
+        let ept_root_hpa = mapper.table().phys();
+
+        self.arch.mmu().root.hpa = ept_root_hpa.data() as u64;
+
+        debug!("ept_root_hpa:{:x}!", ept_root_hpa.data() as u64);
+
+        return Ok(self.arch.mmu().root.hpa);
+    }
+}
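A rough sketch of where this load path is expected to sit in the run loop; the vcpu_enter_guest name is borrowed from the Linux flow and is an assumption, not code in this patch. The EPT root is rebuilt lazily right before entering the guest.

// Hypothetical caller, mirroring the Linux vcpu_enter_guest() ordering.
fn vcpu_enter_guest(vcpu: &mut VirtCpu, vm: &Vm) -> Result<(), SystemError> {
    // No-op if the root is still valid; otherwise allocate roots and load the PGD.
    vcpu.kvm_mmu_reload(vm)?;
    // ... inject pending events, then VMLAUNCH/VMRESUME ...
    Ok(())
}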

+ 396 - 0
kernel/src/arch/x86_64/vm/mmu/mmu_internal.rs

@@ -0,0 +1,396 @@
+use crate::mm::page::EntryFlags;
+use alloc::sync::Arc;
+use core::{intrinsics::unlikely, ops::Index};
+use log::{debug, warn};
+use x86::vmx::vmcs::{guest, host};
+
+use system_error::SystemError;
+
+use crate::{
+    arch::{
+        vm::{
+            asm::VmxAsm,
+            kvm_host::{EmulType, KVM_PFN_NOSLOT},
+            mmu::kvm_mmu::{PFRet, PageLevel},
+            mtrr::kvm_mtrr_check_gfn_range_consistency,
+            vmx::{ept::EptPageMapper, PageFaultErr},
+        },
+        MMArch,
+    },
+    mm::PhysAddr,
+    virt::{
+        kvm::host_mem::PAGE_SHIFT,
+        vm::kvm_host::{
+            mem::{LockedKvmMemSlot, LockedVmMemSlotSet, UserMemRegionFlag, __gfn_to_pfn_memslot},
+            search_memslots,
+            vcpu::VirtCpu,
+            Vm,
+        },
+    },
+};
+
+use super::kvm_mmu::{gfn_round_for_level, is_tdp_mmu_enabled, KvmMmuPageRole};
+
+#[allow(dead_code)]
+#[derive(Debug, Default)]
+pub struct KvmMmuPage {
+    pub tdp_mmu_page: bool, // whether this is a TDP (Two-Dimensional Paging) page-table page
+    pub gfn: u64,           // guest frame number
+
+    /*
+     * The following two entries are used to key the shadow page in the
+     * hash table. (Exact usage not yet fully understood.)
+     */
+    pub role: KvmMmuPageRole,
+    pub spt: u64, // pointer to the shadow page table entries (SPTEs)
+    pub mmu_seq: u64,
+    pub map_writable: bool,
+    pub write_fault_to_shadow_pgtable: bool,
+}
+#[allow(dead_code)]
+#[derive(Debug, Default)]
+pub struct KvmPageFault {
+    // Arguments passed in to vcpu.do_page_fault
+
+    // addr is the GPA handed to us by the guest OS
+    addr: PhysAddr,
+    error_code: u32,
+    prefetch: bool,
+
+    // Derived from error_code
+    exec: bool,
+    write: bool,
+    present: bool,
+    rsvd: bool,
+    user: bool,
+
+    // Derived from the mmu and global state
+    is_tdp: bool,
+    nx_huge_page_workaround_enabled: bool,
+
+    // Whether a mapping larger than 4KB may be created, or is forbidden because of NX huge pages
+    huge_page_disallowed: bool,
+
+    // Maximum page size that can be created for this fault
+    max_level: u8,
+
+    // Page size that can be created based on max_level and the page size used by the host mapping
+    req_level: u8,
+
+    // Page size that will actually be created based on req_level and huge_page_disallowed
+    goal_level: u8,
+
+    // The shifted addr, or the result of the guest page table walk if addr is a GVA
+    gfn: u64, // gfn_t is normally a 64-bit value
+
+    // The memslot containing gfn. May be None
+    slot: Option<Arc<LockedKvmMemSlot>>,
+
+    // Outputs of kvm_faultin_pfn
+    mmu_seq: u64,
+
+    // kvm_pfn_t is normally a 64-bit value; once known, it effectively gives us the HPA
+    pfn: u64,
+    hva: u64, // hva_t is normally a 64-bit value
+    map_writable: bool,
+
+    // Indicates the guest is trying to write a gfn that contains one or more PTEs used to translate the write itself
+    write_fault_to_shadow_pgtable: bool,
+}
+#[allow(dead_code)]
+impl KvmPageFault {
+    pub fn pfn(&self) -> u64 {
+        self.pfn
+    }
+    pub fn gfn(&self) -> u64 {
+        self.gfn
+    }
+    pub fn gpa(&self) -> u64 {
+        self.addr.data() as u64
+    }
+    pub fn hva(&self) -> u64 {
+        self.hva
+    }
+}
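The accessors above only expose cached values; the gpa-to-gfn relationship itself is a plain shift by PAGE_SHIFT. A hypothetical example with 4 KiB pages:

// gpa 0xdead_b000 -> gfn 0xdeadb; the in-page offset is gpa & 0xfff.
fn gpa_to_gfn_example(gpa: u64) -> u64 {
    gpa >> 12 // PAGE_SHIFT for 4 KiB pages
}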
+
+impl VirtCpu {
+    #[inline(never)]
+    pub fn page_fault(
+        &mut self,
+        vm: &Vm,
+        cr2_or_gpa: u64,
+        mut error_code: u64,
+        _insn: Option<u64>,
+        _insn_len: usize,
+    ) -> Result<i32, SystemError> {
+        let emulation_type = EmulType::PF;
+        let _direct = self.arch.mmu().root_role.get_direct();
+        if error_code & PageFaultErr::PFERR_IMPLICIT_ACCESS.bits() != 0 {
+            warn!("Implicit access error code detected");
+            error_code &= !PageFaultErr::PFERR_IMPLICIT_ACCESS.bits();
+        }
+
+        //if self.arch.mmu().root.hpa != KvmMmu::INVALID_PAGE {
+        //    return Ok(PFRet::Retry as u64);
+        //}
+
+        let mut r = PFRet::Invalid;
+        if unlikely(error_code & PageFaultErr::PFERR_RSVD.bits() != 0) {
+            todo!();
+            // r = self.handle_mmio_page_fault(cr2_or_gpa, direct)?;
+            // if r == PFRet::Emulate {
+            //     return x86_emulate_instruction(vcpu, cr2_or_gpa, emulation_type, insn, insn_len);
+            // }
+        }
+
+        if r == PFRet::Invalid {
+            r = self
+                .do_page_fault(
+                    vm,
+                    cr2_or_gpa,
+                    (error_code & 0xFFFFFFFF) as u32,
+                    false,
+                    emulation_type,
+                )?
+                .into();
+            if r == PFRet::Invalid {
+                return Err(SystemError::EIO);
+            }
+        }
+
+        if i32::from(r.clone()) < 0 {
+            return Ok(i32::from(r));
+        }
+        if r != PFRet::Emulate {
+            return Ok(1);
+        }
+
+        // Before emulating the instruction, check whether the error code was caused by a
+        // read-only (RO) violation while translating a guest page. This can happen with nested
+        // virtualization plus nested paging; if so, we simply unprotect the page and resume the guest.
+        let pferr_nested_guest_page = PageFaultErr::PFERR_GUEST_PAGE
+            | PageFaultErr::PFERR_WRITE
+            | PageFaultErr::PFERR_PRESENT;
+        if self.arch.mmu().root_role.get_direct()
+            && (error_code & pferr_nested_guest_page.bits()) == pferr_nested_guest_page.bits()
+        {
+            todo!()
+        }
+
+        // self.arch.mmu.page_fault returned RET_PF_EMULATE, but we can still optimistically try to
+        // unprotect the page and let the processor re-execute the faulting instruction. Retrying MMIO
+        // emulation is not allowed: it is not just pointless, it can loop forever because the processor
+        // would keep faulting on the non-existent MMIO address. Retrying an instruction from a nested
+        // guest is also pointless and dangerous, because we only explicitly shadow L1's page tables,
+        // i.e. unprotecting on behalf of L1 will not magically fix whatever made L2 fail.
+        // if !self.mmio_info_in_cache(cr2_or_gpa, direct) && !self.arch.is_guest_mode() {
+        //     emulation_type |= EmulType::ALLOW_RETRY_PF;
+        // }
+
+        // self.emulate_instruction(cr2_or_gpa, emulation_type, insn, insn_len)
+        todo!("emulate_instruction")
+    }
+
+    fn do_page_fault(
+        &mut self,
+        vm: &Vm,
+        cr2_or_gpa: u64,
+        error_code: u32,
+        prefetch: bool,
+        mut emultype: EmulType,
+    ) -> Result<i32, SystemError> {
+        // Initialize the page fault descriptor
+        let mut page_fault = KvmPageFault {
+            addr: PhysAddr::new(cr2_or_gpa as usize),
+            error_code,
+            exec: error_code & PageFaultErr::PFERR_FETCH.bits() as u32 != 0,
+            write: error_code & PageFaultErr::PFERR_WRITE.bits() as u32 != 0,
+            present: error_code & PageFaultErr::PFERR_PRESENT.bits() as u32 != 0,
+            rsvd: error_code & PageFaultErr::PFERR_RSVD.bits() as u32 != 0,
+            user: error_code & PageFaultErr::PFERR_USER.bits() as u32 != 0,
+            prefetch,
+            is_tdp: true,
+            nx_huge_page_workaround_enabled: false, //todo
+            max_level: PageLevel::Level1G as u8,
+            req_level: PageLevel::Level4K as u8,
+            goal_level: PageLevel::Level4K as u8,
+            ..Default::default()
+        };
+        // Handle the direct-map case
+        if self.arch.mmu().root_role.get_direct() {
+            page_fault.gfn = (page_fault.addr.data() >> PAGE_SHIFT) as u64;
+            debug!("page_fault.addr.data() : 0x{:x}", page_fault.addr.data());
+            debug!("do_page_fault : gfn = 0x{:x}", page_fault.gfn);
+            page_fault.slot = self.gfn_to_memslot(page_fault.gfn, vm); // kvm_vcpu_gfn_to_memslot(vcpu, fault.gfn); not fully implemented yet
+        }
+        // Async page faults (Async #PF), also called prefetch faults, are not faults from the
+        // guest's point of view and have already been counted when the original fault occurred.
+        if !prefetch {
+            self.stat.pf_taken += 1;
+        }
+
+        let r = if page_fault.is_tdp {
+            self.tdp_page_fault(vm, &mut page_fault).unwrap()
+        } else {
+            // tdp_page_fault is currently the only page-fault handler, so this branch never runs
+            let handle = self.arch.mmu().page_fault.unwrap();
+            handle(self, &page_fault).unwrap()
+        };
+
+        if page_fault.write_fault_to_shadow_pgtable {
+            emultype |= EmulType::WRITE_PF_TO_SP;
+        }
+        // Similar to the case above: prefetch faults are not truly spurious, and the async page
+        // fault path does not do emulation. However, faults fixed by the async #PF handler do need
+        // to be counted, otherwise they would never show up in the statistics.
+        match PFRet::from(r) {
+            PFRet::Fixed => self.stat.pf_fixed += 1,
+            PFRet::Emulate => self.stat.pf_emulate += 1,
+            PFRet::Spurious => self.stat.pf_spurious += 1,
+            _ => {}
+        }
+        debug!("do_page_fault return r = {}", r);
+        Ok(r)
+    }
+
+    pub fn gfn_to_memslot(&self, gfn: u64, vm: &Vm) -> Option<Arc<LockedKvmMemSlot>> {
+        let slot_set: Arc<LockedVmMemSlotSet> = self.kvm_vcpu_memslots(vm);
+        //...todo
+
+        search_memslots(slot_set, gfn)
+    }
+    pub fn kvm_vcpu_memslots(&self, vm: &Vm) -> Arc<LockedVmMemSlotSet> {
+        vm.memslots.index(0).clone()
+    }
+    fn tdp_page_fault(
+        &mut self,
+        vm: &Vm,
+        page_fault: &mut KvmPageFault,
+    ) -> Result<i32, SystemError> {
+        // If shadow_memtype_mask is non-zero and the VM has non-coherent DMA:
+        //if shadow_memtype_mask != 0 && self.kvm().lock().arch.noncoherent_dma_count > 0 {
+        while page_fault.max_level > PageLevel::Level4K as u8 {
+            let page_num = PageLevel::kvm_pages_per_hpage(page_fault.max_level);
+
+            // Align down to the start of the huge-page range
+            let base = gfn_round_for_level(page_fault.gfn, page_fault.max_level);
+
+            // Check that the memory type is consistent across the GFN range (not implemented yet)
+            if kvm_mtrr_check_gfn_range_consistency(self, base, page_num) {
+                break;
+            }
+
+            page_fault.max_level -= 1;
+        }
+        //}
+
+        if is_tdp_mmu_enabled() {
+            return self.kvm_tdp_mmu_page_fault(vm, page_fault);
+        }
+
+        // Normally unreachable, since we support EPT
+        self.direct_page_fault(page_fault)
+    }
+    fn kvm_tdp_mmu_page_fault(
+        &self,
+        vm: &Vm,
+        page_fault: &mut KvmPageFault,
+    ) -> Result<i32, SystemError> {
+        //page_fault_handle_page_track(page_fault)
+        //fast_page_fault(page_fault);
+        //mmu_topup_memory_caches(false);
+        let mut r = self
+            .kvm_faultin_pfn(vm, page_fault, 1 | 1 << 1 | 1 << 2)
+            .unwrap();
+        if r != PFRet::Continue {
+            return Ok(r.into());
+        }
+
+        //r = PFRet::Retry;
+
+        //if self.is_page_fault_stale(page_fault) {return;}
+
+        // Do the actual mapping
+        r = self.tdp_mmu_map(page_fault)?.into();
+
+        Ok(r.into())
+    }
+    // Huge-page handling is not implemented yet
+    fn tdp_mmu_map(&self, page_fault: &mut KvmPageFault) -> Result<i32, SystemError> {
+        // let ret = PFRet::Retry; // the logic below differs from Linux, so return-value checks may need care
+        let mut mapper = EptPageMapper::lock();
+        debug!("{:?}", &page_fault);
+        // flags: rwx. The raw value 0xb77 appears to set the EPT read/write/execute bits,
+        // a write-back memory type (plus IPAT), the accessed/dirty bits and the bit-11
+        // "shadow present" marker (see the constants in vmx/asm.rs).
+        let page_flags: EntryFlags<MMArch> = unsafe { EntryFlags::from_data(0xb77) };
+        mapper.map(PhysAddr::new(page_fault.gpa() as usize), page_flags);
+        //debug_eptp();
+
+        debug!("The ept_root_addr is {:?}", EptPageMapper::root_page_addr());
+        // TODO: update some remaining state
+        Ok(PFRet::Fixed.into())
+        //todo!()
+    }
+
+    fn direct_page_fault(&self, _page_fault: &KvmPageFault) -> Result<i32, SystemError> {
+        todo!()
+    }
+
+    fn kvm_faultin_pfn(
+        &self,
+        vm: &Vm,
+        page_fault: &mut KvmPageFault,
+        _access: u32,
+    ) -> Result<PFRet, SystemError> {
+        page_fault.mmu_seq = vm.mmu_invalidate_seq;
+        self.__kvm_faultin_pfn(page_fault)
+    }
+    fn __kvm_faultin_pfn(&self, page_fault: &mut KvmPageFault) -> Result<PFRet, SystemError> {
+        let slot = &page_fault.slot;
+        let mut is_async = false;
+        if slot.is_none() {
+            return Err(SystemError::KVM_HVA_ERR_BAD);
+        }
+        let slot = slot.as_ref().unwrap().read();
+
+        if slot.get_flags().bits() & UserMemRegionFlag::KVM_MEMSLOT_INVALID.bits() != 0 {
+            return Ok(PFRet::Retry);
+        }
+        if !slot.is_visible() {
+            /* Do not expose private memslots to L2. */
+            if self.arch.is_guest_mode() {
+                drop(slot);
+                page_fault.slot = None;
+                page_fault.pfn = KVM_PFN_NOSLOT;
+                page_fault.map_writable = false;
+                return Ok(PFRet::Continue);
+            }
+        }
+
+        // Try to translate the GFN into a PFN
+        let guest_cr3 = VmxAsm::vmx_vmread(guest::CR3);
+        let host_cr3 = VmxAsm::vmx_vmread(host::CR3);
+        debug!("guest_cr3={:x}, host_cr3={:x}", guest_cr3, host_cr3);
+        page_fault.pfn = __gfn_to_pfn_memslot(
+            Some(&slot),
+            page_fault.gfn,
+            (false, &mut is_async),
+            false,
+            page_fault.write,
+            &mut page_fault.map_writable,
+            &mut page_fault.hva,
+        )?;
+
+        if !is_async {
+            return Ok(PFRet::Continue); /* *pfn already holds the correct page */
+        }
+
+        // if !page_fault.prefetch && self.kvm_can_do_async_pf() {
+        //     self.trace_kvm_try_async_get_page(page_fault.addr, page_fault.gfn);
+        //     if self.kvm_find_async_pf_gfn(page_fault.gfn) {
+        //         self.trace_kvm_async_pf_repeated_fault(page_fault.addr, page_fault.gfn);
+        //         self.kvm_make_request(KVM_REQ_APF_HALT);
+        //         return Ok(PFRet::Retry);
+        //     } else if self.kvm_arch_setup_async_pf(page_fault.addr, page_fault.gfn) {
+        //         return Ok(PFRet::Retry);
+        //     }
+        // }
+        Ok(PFRet::Continue)
+    }
+}
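For reference, a small worked example of the error-code decoding that do_page_fault performs above, assuming the standard x86 #PF error-code layout (PRESENT = bit 0, WRITE = bit 1, USER = bit 2, RSVD = bit 3, FETCH = bit 4); the concrete PageFaultErr bit values live in the vmx module and are not restated here.

// Hypothetical decoding of a guest write fault to a present, user-accessible page.
fn decode_error_code_example() {
    let error_code: u32 = 0b0_0111; // PRESENT | WRITE | USER
    let present = error_code & (1 << 0) != 0;
    let write = error_code & (1 << 1) != 0;
    let exec = error_code & (1 << 4) != 0;
    assert!(present && write && !exec); // -> page_fault.present / .write / .exec
}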

+ 3 - 0
kernel/src/arch/x86_64/vm/mmu/mod.rs

@@ -0,0 +1,3 @@
+pub mod kvm_mmu;
+pub mod mmu_internal;
+pub mod tdp_iter;

+ 219 - 0
kernel/src/arch/x86_64/vm/mmu/tdp_iter.rs

@@ -0,0 +1,219 @@
+// use crate::{
+//     arch::vm::mmu::mmu::gfn_round_for_level,
+//     mm::{virt_2_phys, PhysAddr, VirtAddr},
+//     time::sleep,
+//     virt::kvm::host_mem::PAGE_SHIFT,
+// };
+
+// use super::{
+//     mmu::{PageLevel, PAGE_SIZE},
+//     mmu_internal::KvmMmuPage,
+// };
+
+// pub const PT64_ROOT_MAX_LEVEL: usize = 5; //通常只用到4级,但是确实有5级的情况
+// pub const PT_LEVEL_BITS: u8 = 9; // 每个页表级别的位数
+// pub const PT64_ENT_PER_PAGE: u32 = 1 << 9;
+// pub const PTE_LEN: usize = 64;
+
+// //Bits 51:12 are from the EPT PDPTE
+// pub const PT64_BASE_ADDR_MASK: u64 = ((1u64 << 52) - 1) & !(PAGE_SIZE - 1);
+
+// pub fn shadow_pt_index(addr: u64, level: u8) -> u64 {
+//     (addr >> (PAGE_SHIFT as u8 + (level - 1) * PT_LEVEL_BITS)) & ((1 << PT_LEVEL_BITS) - 1)
+// }
+// pub fn is_last_spte(pte: u64, level: u8) -> bool {
+//     level == PageLevel::Level4K as u8 || is_large_pte(pte)
+// }
+// pub fn is_shadow_present_pte(pte: u64) -> bool {
+//     pte & 1 << 11 != 0 //在intel手冊中:ept PTE:11 Ignored.不是很懂
+// }
+// pub fn is_large_pte(pte: u64) -> bool {
+//     pte & 1 << 7 != 0 //在intel手冊中:ept PTE:7 Ignored.
+// }
+// ///Bits 51:12 are from the EPT PDPTE
+// pub fn spte_to_pfn(pte: u64) -> u64 {
+//     (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT
+// }
+
+// #[derive(Default)]
+// pub struct TdpIter {
+//     inner: TdpIterInner,
+// }
+
+// impl TdpIter {
+//     pub fn start(
+//         &self,
+//         root_pt: usize,
+//         root_level: u8,
+//         min_level: u8,
+//         next_last_level_gfn: u64,
+//     ) -> Self {
+//         let mut inner = self.inner.clone();
+//         inner.start(root_pt, root_level, min_level, next_last_level_gfn);
+//         TdpIter { inner }
+//     }
+// }
+// ///迭代器将遍历分页结构,直到找到此 GFN 的映射。
+// #[derive(Default, Clone)]
+// pub struct TdpIterInner {
+//     next_last_level_gfn: u64,
+//     /// 线程上次让出时的 next_last_level_gfn。
+//     /// 仅当 next_last_level_gfn != yielded_gfn 时让出,有助于确保前进。
+//     pub yielded_gfn: u64,
+
+//     ///指向遍历到当前 SPTE 的页表的指针
+//     pt_path: [u64; PT64_ROOT_MAX_LEVEL],
+
+//     ///指向当前 SPTE 的指针  是hva吗?
+//     sptep: PhysAddr,
+
+//     /// 当前 SPTE 映射的最低 GFN  hpa>>shift?
+//     pub gfn: u64,
+
+//     ///给迭代器的根页级别
+//     pub root_level: u8,
+
+//     ///迭代器应遍历到的最低级别
+//     pub min_level: u8,
+
+//     ///迭代器在分页结构中的当前级别
+//     pub level: u8,
+
+//     ///sptep 处值的快照
+//     pub old_spte: u64,
+
+//     ///迭代器是否具有有效状态。如果迭代器走出分页结构的末端,则为 false。
+//     ///
+//     pub valid: bool,
+// }
+// impl TdpIterInner {
+//     ///初始化ept iter
+//     #[inline(never)]
+//     pub fn start(
+//         &mut self,
+//         root_pt: usize,
+//         root_level: u8,
+//         min_level: u8,
+//         next_last_level_gfn: u64,
+//     ) {
+//         // if root_pt.role.level() == 0 || root_pt.role.level() > PT64_ROOT_MAX_LEVEL as u32  {
+//         //     self.valid = false;
+//         //     return;
+//         // }
+
+//         if root_level < 1 || root_level > PT64_ROOT_MAX_LEVEL as u8 {
+//             self.valid = false;
+//             return;
+//         }
+//         self.next_last_level_gfn = next_last_level_gfn;
+//         self.root_level = root_level as u8;
+//         self.min_level = min_level as u8;
+//         self.pt_path[(self.root_level - 1) as usize] = root_pt as u64;
+//         self.yielded_gfn = self.next_last_level_gfn;
+//         self.level = self.root_level;
+
+//         self.gfn = gfn_round_for_level(self.next_last_level_gfn, self.level);
+//         self.tdp_iter_refresh_sptep();
+//         self.valid = true;
+//     }
+
+//     /*
+//      * 重新计算当前GFN和level和SPTE指针,并重新读取SPTE。
+//      */
+//     fn tdp_iter_refresh_sptep(&mut self) {
+//         // self.sptep = PhysAddr::new(
+//         //     (self.pt_path[self.level as usize - 1]
+//         //         + shadow_pt_index(self.gfn << PAGE_SHIFT, self.level)) as usize,
+//         // );
+//         // self.old_spte = read_sptep(self.sptep);
+//     }
+
+//     pub fn _next(&mut self) {
+//         if self.try_step_down() {
+//             return;
+//         }
+//         loop {
+//             if self.try_step_side() {
+//                 return;
+//             }
+//             if !self.try_step_up() {
+//                 break;
+//             }
+//         }
+//         self.valid = false;
+//     }
+//     ///在分页结构中向目标GFN下降一级。如果迭代器能够下降一级,则返回true,否则返回false。
+//     fn try_step_down(&mut self) -> bool {
+//         if self.level == self.min_level {
+//             return false;
+//         }
+//         //在下降之前重新读取SPTE,以避免遍历到不再从此条目链接的页表中。
+//         self.old_spte = read_sptep(self.sptep);
+
+//         match spte_to_child_pt(self.old_spte, self.level) {
+//             Some(child_pt) => {
+//                 self.level -= 1;
+//                 self.pt_path[self.level as usize - 1] = child_pt.data() as u64;
+//                 self.gfn = gfn_round_for_level(self.gfn, self.level);
+//                 self.tdp_iter_refresh_sptep();
+//                 true
+//             }
+//             None => false,
+//         }
+//     }
+//     fn try_step_up(&mut self) -> bool {
+//         if self.level == self.root_level {
+//             return false;
+//         }
+//         self.level += 1;
+//         self.gfn = gfn_round_for_level(self.gfn, self.level);
+//         self.tdp_iter_refresh_sptep();
+//         true
+//     }
+//     ///在当前页表的当前级别中,移动到下一个条目。下一个条目可以指向一个page backing guest memory ,
+//     ///或者另一个页表,或者它可能是不存在的。如果迭代器能够移动到页表中的下一个条目,则返回true,
+//     ///如果迭代器已经在当前页表的末尾,则返回false。
+//     fn try_step_side(&mut self) -> bool {
+//         //检查迭代器是否已经在当前页表的末尾。
+//         if shadow_pt_index(self.gfn << PAGE_SHIFT, self.level) == (PT64_ENT_PER_PAGE - 1) as u64 {
+//             return false;
+//         }
+
+//         self.gfn += PageLevel::kvm_pages_per_hpage(self.level);
+//         self.next_last_level_gfn = self.gfn;
+//         self.sptep.add(PTE_LEN); //指向下一个spte,一个spte占64位
+//         self.old_spte = read_sptep(self.sptep);
+//         true
+//     }
+// }
+// impl Iterator for TdpIter {
+//     type Item = TdpIterInner; // 返回 (gfn, spte) 元组
+
+//     fn next(&mut self) -> Option<Self::Item> {
+//         let inner = &mut self.inner;
+//         if !inner.valid {
+//             return None;
+//         }
+//         inner._next();
+//         if inner.valid {
+//             Some(inner.clone())
+//         } else {
+//             None
+//         }
+//     }
+// }
+// ///给定一个 SPTE 及其级别,返回一个指针,该指针包含 SPTE 所引用的子页表的hva。
+// ///如果没有这样的条目,则返回 null。
+// ///
+// fn spte_to_child_pt(spte: u64, level: u8) -> Option<VirtAddr> {
+//     //没有子页表
+//     if !is_shadow_present_pte(spte) || is_last_spte(spte, level) {
+//         return None;
+//     }
+//     Some(VirtAddr::new(virt_2_phys//__va
+//         ((spte_to_pfn(spte)<<PAGE_SHIFT) as usize
+//     )))
+// }
+// pub fn read_sptep(sptep: PhysAddr) -> u64 {
+//     unsafe { *(sptep.data() as *const u64) }
+// }

+ 640 - 0
kernel/src/arch/x86_64/vm/mod.rs

@@ -0,0 +1,640 @@
+use alloc::vec::Vec;
+use log::{error, warn};
+use raw_cpuid::CpuId;
+use system_error::SystemError;
+use x86::{
+    controlregs::{cr4, xcr0, Cr0, Cr4, Xcr0},
+    msr::{self, rdmsr, wrmsr},
+};
+use x86_64::registers::control::{Efer, EferFlags};
+
+use crate::{
+    arch::vm::vmx::{VmxL1dFlushState, L1TF_VMX_MITIGATION},
+    libs::once::Once,
+    mm::percpu::{PerCpu, PerCpuVar},
+};
+
+use self::{
+    asm::{hyperv::*, kvm_msr::*, ArchCapabilities, VmxMsrEntry},
+    kvm_host::{KvmFunc, KvmInitFunc},
+};
+
+use super::driver::tsc::TSCManager;
+
+mod asm;
+mod cpuid;
+pub(super) mod exit;
+pub mod kvm_host;
+pub mod mem;
+pub mod mmu;
+pub mod mtrr;
+pub mod uapi;
+pub mod vmx;
+
+static mut KVM_X86_MANAGER: Option<KvmArchManager> = None;
+
+pub fn x86_kvm_ops() -> &'static dyn KvmFunc {
+    unsafe { KVM_X86_MANAGER.as_ref().unwrap().funcs() }
+}
+
+pub fn x86_kvm_manager() -> &'static KvmArchManager {
+    unsafe { KVM_X86_MANAGER.as_ref().unwrap() }
+}
+
+pub fn x86_kvm_manager_mut() -> &'static mut KvmArchManager {
+    unsafe { KVM_X86_MANAGER.as_mut().unwrap() }
+}
+
+pub fn init_kvm_arch() {
+    static ONCE: Once = Once::new();
+    ONCE.call_once(|| unsafe {
+        KVM_X86_MANAGER = Some(KvmArchManager::init());
+
+        let mut user_return_msrs = Vec::new();
+        user_return_msrs.resize(PerCpu::MAX_CPU_NUM as usize, KvmUserReturnMsrs::default());
+        USER_RETURN_MSRS = Some(PerCpuVar::new(user_return_msrs).unwrap());
+    })
+}
+
+/// FIXME: do these fields need locking?
+#[derive(Debug)]
+pub struct KvmArchManager {
+    funcs: Option<&'static dyn KvmFunc>,
+    host_xcr0: Xcr0,
+    host_efer: EferFlags,
+    host_xss: u64,
+    host_arch_capabilities: u64,
+    kvm_uret_msrs_list: Vec<u32>,
+    kvm_caps: KvmCapabilities,
+    max_tsc_khz: u64,
+    msrs_to_save: Vec<u32>,
+    emulated_msrs: Vec<u32>,
+    msr_based_features: Vec<u32>,
+
+    has_noapic_vcpu: bool,
+
+    enable_pmu: bool,
+
+    // read-only
+    possible_cr0_guest: Cr0,
+    possible_cr4_guest: Cr4,
+    cr4_tlbflush_bits: Cr4,
+    cr4_pdptr_bits: Cr4,
+}
+
+impl KvmArchManager {
+    pub fn init() -> Self {
+        Self {
+            possible_cr0_guest: Cr0::CR0_TASK_SWITCHED | Cr0::CR0_WRITE_PROTECT,
+            possible_cr4_guest: Cr4::CR4_VIRTUAL_INTERRUPTS
+                | Cr4::CR4_DEBUGGING_EXTENSIONS
+                | Cr4::CR4_ENABLE_PPMC
+                | Cr4::CR4_ENABLE_SSE
+                | Cr4::CR4_UNMASKED_SSE
+                | Cr4::CR4_ENABLE_GLOBAL_PAGES
+                | Cr4::CR4_TIME_STAMP_DISABLE
+                | Cr4::CR4_ENABLE_FSGSBASE,
+
+            cr4_tlbflush_bits: Cr4::CR4_ENABLE_GLOBAL_PAGES
+                | Cr4::CR4_ENABLE_PCID
+                | Cr4::CR4_ENABLE_PAE
+                | Cr4::CR4_ENABLE_SMEP,
+
+            cr4_pdptr_bits: Cr4::CR4_ENABLE_GLOBAL_PAGES
+                | Cr4::CR4_ENABLE_PSE
+                | Cr4::CR4_ENABLE_PAE
+                | Cr4::CR4_ENABLE_SMEP,
+
+            host_xcr0: Xcr0::empty(),
+
+            funcs: Default::default(),
+            host_efer: EferFlags::empty(),
+            host_xss: Default::default(),
+            host_arch_capabilities: Default::default(),
+            kvm_uret_msrs_list: Default::default(),
+            kvm_caps: Default::default(),
+            max_tsc_khz: Default::default(),
+            msrs_to_save: Default::default(),
+            emulated_msrs: Default::default(),
+            msr_based_features: Default::default(),
+            has_noapic_vcpu: Default::default(),
+            enable_pmu: Default::default(),
+        }
+    }
+
+    #[inline]
+    pub fn set_runtime_func(&mut self, funcs: &'static dyn KvmFunc) {
+        self.funcs = Some(funcs);
+    }
+
+    #[inline]
+    pub fn funcs(&self) -> &'static dyn KvmFunc {
+        self.funcs.unwrap()
+    }
+
+    pub fn find_user_return_msr_idx(&self, msr: u32) -> Option<usize> {
+        for (i, val) in self.kvm_uret_msrs_list.iter().enumerate() {
+            if *val == msr {
+                return Some(i);
+            }
+        }
+
+        None
+    }
+
+    pub fn mpx_supported(&self) -> bool {
+        self.kvm_caps.supported_xcr0 & (Xcr0::XCR0_BNDREG_STATE | Xcr0::XCR0_BNDCSR_STATE)
+            == (Xcr0::XCR0_BNDREG_STATE | Xcr0::XCR0_BNDCSR_STATE)
+    }
+
+    pub const KVM_MAX_VCPUS: usize = 1024;
+    pub const KVM_MAX_NR_USER_RETURN_MSRS: usize = 7;
+
+    const MSRS_TO_SAVE_BASE: &[u32] = &[
+        msr::IA32_SYSENTER_CS,
+        msr::IA32_SYSENTER_ESP,
+        msr::IA32_SYSENTER_EIP,
+        msr::IA32_STAR,
+        msr::IA32_CSTAR,
+        msr::IA32_KERNEL_GSBASE,
+        msr::IA32_FMASK,
+        msr::IA32_LSTAR,
+        msr::IA32_TIME_STAMP_COUNTER,
+        msr::IA32_PAT,
+        0xc0010117, // MSR_VM_HSAVE_PA?
+        msr::IA32_FEATURE_CONTROL,
+        msr::MSR_C1_PMON_EVNT_SEL0,
+        msr::IA32_TSC_AUX,
+        0x48, // MSR_IA32_SPEC_CTRL
+        msr::MSR_IA32_TSX_CTRL,
+        msr::MSR_IA32_RTIT_CTL,
+        msr::MSR_IA32_RTIT_STATUS,
+        msr::MSR_IA32_CR3_MATCH,
+        msr::MSR_IA32_RTIT_OUTPUT_BASE,
+        msr::MSR_IA32_RTIT_OUTPUT_MASK_PTRS,
+        msr::MSR_IA32_ADDR0_START,
+        msr::MSR_IA32_ADDR0_END,
+        msr::MSR_IA32_ADDR1_START,
+        msr::MSR_IA32_ADDR1_END,
+        msr::MSR_IA32_ADDR2_START,
+        msr::MSR_IA32_ADDR2_END,
+        msr::MSR_IA32_ADDR3_START,
+        msr::MSR_IA32_ADDR3_END,
+        0xe1,  // MSR_IA32_UMWAIT_CONTROL
+        0x1c4, // MSR_IA32_XFD
+        0x1c5, // MSR_IA32_XFD_ERR
+    ];
+
+    const EMULATED_MSRS_ALL: &[u32] = &[
+        MSR_KVM_SYSTEM_TIME,
+        MSR_KVM_WALL_CLOCK,
+        MSR_KVM_SYSTEM_TIME_NEW,
+        MSR_KVM_WALL_CLOCK_NEW,
+        HV_X64_MSR_GUEST_OS_ID,
+        HV_X64_MSR_HYPERCALL,
+        HV_REGISTER_TIME_REF_COUNT,
+        HV_REGISTER_REFERENCE_TSC,
+        HV_X64_MSR_TSC_FREQUENCY,
+        HV_X64_MSR_APIC_FREQUENCY,
+        HV_REGISTER_CRASH_P0,
+        HV_REGISTER_CRASH_P1,
+        HV_REGISTER_CRASH_P2,
+        HV_REGISTER_CRASH_P3,
+        HV_REGISTER_CRASH_P4,
+        HV_REGISTER_CRASH_CTL,
+        HV_X64_MSR_RESET,
+        HV_REGISTER_VP_INDEX,
+        HV_X64_MSR_VP_RUNTIME,
+        HV_REGISTER_SCONTROL,
+        HV_REGISTER_STIMER0_CONFIG,
+        HV_X64_MSR_VP_ASSIST_PAGE,
+        HV_X64_MSR_REENLIGHTENMENT_CONTROL,
+        HV_X64_MSR_TSC_EMULATION_CONTROL,
+        HV_X64_MSR_TSC_EMULATION_STATUS,
+        HV_X64_MSR_TSC_INVARIANT_CONTROL,
+        HV_X64_MSR_SYNDBG_OPTIONS,
+        HV_X64_MSR_SYNDBG_CONTROL,
+        HV_X64_MSR_SYNDBG_STATUS,
+        HV_X64_MSR_SYNDBG_SEND_BUFFER,
+        HV_X64_MSR_SYNDBG_RECV_BUFFER,
+        HV_X64_MSR_SYNDBG_PENDING_BUFFER,
+        MSR_KVM_ASYNC_PF_EN,
+        MSR_KVM_STEAL_TIME,
+        MSR_KVM_PV_EOI_EN,
+        MSR_KVM_ASYNC_PF_INT,
+        MSR_KVM_ASYNC_PF_ACK,
+        msr::IA32_TSC_ADJUST,
+        msr::IA32_TSC_DEADLINE,
+        msr::IA32_PERF_CAPABILITIES,
+        0x10a, // MSR_IA32_ARCH_CAPABILITIES,
+        msr::IA32_MISC_ENABLE,
+        msr::IA32_MCG_STATUS,
+        msr::IA32_MCG_CTL,
+        0x4d0, // MSR_IA32_MCG_EXT_CTL,
+        msr::IA32_SMBASE,
+        msr::MSR_SMI_COUNT,
+        msr::MSR_PLATFORM_INFO,
+        0x140,      // MSR_MISC_FEATURES_ENABLES,
+        0xc001011f, // MSR_AMD64_VIRT_SPEC_CTRL,
+        0xc0000104, // MSR_AMD64_TSC_RATIO,
+        msr::MSR_POWER_CTL,
+        msr::IA32_BIOS_SIGN_ID, // MSR_IA32_UCODE_REV,
+        /*
+         * KVM always supports the "true" VMX control MSRs, even if the host
+         * does not.  The VMX MSRs as a whole are considered "emulated" as KVM
+         * doesn't strictly require them to exist in the host (ignoring that
+         * KVM would refuse to load in the first place if the core set of MSRs
+         * aren't supported).
+         */
+        msr::IA32_VMX_BASIC,
+        msr::IA32_VMX_TRUE_PINBASED_CTLS,
+        msr::IA32_VMX_TRUE_PROCBASED_CTLS,
+        msr::IA32_VMX_TRUE_EXIT_CTLS,
+        msr::IA32_VMX_TRUE_ENTRY_CTLS,
+        msr::IA32_VMX_MISC,
+        msr::IA32_VMX_CR0_FIXED0,
+        msr::IA32_VMX_CR4_FIXED0,
+        msr::IA32_VMX_VMCS_ENUM,
+        msr::IA32_VMX_PROCBASED_CTLS2,
+        msr::IA32_VMX_EPT_VPID_CAP,
+        msr::IA32_VMX_VMFUNC,
+        0xc0010015, // MSR_K7_HWCR,
+        MSR_KVM_POLL_CONTROL,
+    ];
+
+    const MSR_BASED_FEATURES_ALL_EXCEPT_VMX: &[u32] = &[
+        0xc0011029,             // MSR_AMD64_DE_CFG
+        msr::IA32_BIOS_SIGN_ID, // MSR_IA32_UCODE_REV
+        0x10a,                  // MSR_IA32_ARCH_CAPABILITIES,
+        msr::IA32_PERF_CAPABILITIES,
+    ];
+
+    pub fn arch_hardware_enable(&self) -> Result<(), SystemError> {
+        self.online_user_return_msr();
+
+        x86_kvm_ops().hardware_enable()?;
+
+        // TODO: a series of TSC sanity checks is needed here
+
+        Ok(())
+    }
+
+    /// ## Initialize the KVM user-return MSRs for the current CPU
+    fn online_user_return_msr(&self) {
+        let user_return_msrs = user_return_msrs().get_mut();
+
+        for (idx, msr) in self.kvm_uret_msrs_list.iter().enumerate() {
+            let val = unsafe { rdmsr(*msr) };
+            user_return_msrs.values[idx].host = val;
+            user_return_msrs.values[idx].curr = val;
+        }
+    }
+
+    /// Vendor-specific init work
+    pub fn vendor_init(&mut self, init_ops: &'static dyn KvmInitFunc) -> Result<(), SystemError> {
+        let cpuid = CpuId::new();
+        let cpu_feature = cpuid.get_feature_info().ok_or(SystemError::ENOSYS)?;
+        let cpu_extend = cpuid.get_extended_state_info().ok_or(SystemError::ENOSYS)?;
+        let extend_features = cpuid
+            .get_extended_feature_info()
+            .ok_or(SystemError::ENOSYS)?;
+
+        let kvm_x86_ops = &self.funcs;
+
+        // Already set up?
+        if kvm_x86_ops.is_some() {
+            error!(
+                "[KVM] already loaded vendor module {}",
+                kvm_x86_ops.unwrap().name()
+            );
+            return Err(SystemError::EEXIST);
+        }
+
+        // Make sure the CPU supports the FPU (floating point unit)
+        if !cpu_feature.has_fpu() || !cpu_feature.has_fxsave_fxstor() {
+            error!("[KVM] inadequate fpu");
+            return Err(SystemError::ENOSYS);
+        }
+
+        // TODO: a real-time kernel needs additional TSC checks
+        // https://code.dragonos.org.cn/xref/linux-6.6.21/arch/x86/kvm/x86.c#9472
+
+        // Read the host Page Attribute Table (PAT)
+        let host_pat = unsafe { rdmsr(msr::IA32_PAT) };
+        // PAT[0] must be write-back, i.e. the low three bits must be 0b110 (0x06)
+        if host_pat & 0b111 != 0b110 {
+            error!("[KVM] host PAT[0] is not WB");
+            return Err(SystemError::EIO);
+        }
+
+        // TODO: MMU vendor init
+        if cpu_feature.has_xsave() && unsafe { cr4() }.contains(Cr4::CR4_ENABLE_OS_XSAVE) {
+            self.host_xcr0 = unsafe { xcr0() };
+            self.kvm_caps.supported_xcr0 = self.host_xcr0;
+        }
+
+        // Save EFER
+        self.host_efer = Efer::read();
+
+        // Save XSS
+        if cpu_extend.has_xsaves_xrstors() {
+            self.host_xss = unsafe { rdmsr(msr::MSR_C5_PMON_BOX_CTRL) };
+        }
+
+        // TODO: initialize the performance monitoring unit (PMU)
+        // https://code.dragonos.org.cn/xref/linux-6.6.21/arch/x86/kvm/x86.c#9518
+        if extend_features.has_sha() {
+            self.host_arch_capabilities = unsafe {
+                // MSR_IA32_ARCH_CAPABILITIES
+                rdmsr(0x10a)
+            }
+        }
+
+        init_ops.hardware_setup()?;
+
+        self.set_runtime_func(init_ops.runtime_funcs());
+
+        self.kvm_timer_init()?;
+
+        // TODO: https://code.dragonos.org.cn/xref/linux-6.6.21/arch/x86/kvm/x86.c#9544
+
+        let kvm_caps = &mut self.kvm_caps;
+        if !cpu_extend.has_xsaves_xrstors() {
+            kvm_caps.supported_xss = 0;
+        }
+
+        if kvm_caps.has_tsc_control {
+            kvm_caps.max_guest_tsc_khz = 0x7fffffff.min(
+                ((kvm_caps.max_tsc_scaling_ratio as i128 * TSCManager::tsc_khz() as i128)
+                    >> kvm_caps.tsc_scaling_ratio_frac_bits) as u32,
+            );
+        }
+
+        kvm_caps.default_tsc_scaling_ratio = 1 << kvm_caps.tsc_scaling_ratio_frac_bits;
+        self.kvm_init_msr_lists();
+
+        warn!("vendor init over");
+        Ok(())
+    }
+
+    fn kvm_init_msr_lists(&mut self) {
+        self.msrs_to_save.clear();
+        self.emulated_msrs.clear();
+        self.msr_based_features.clear();
+
+        for msr in Self::MSRS_TO_SAVE_BASE {
+            self.kvm_probe_msr_to_save(*msr);
+        }
+
+        if self.enable_pmu {
+            todo!()
+        }
+
+        for msr in Self::EMULATED_MSRS_ALL {
+            if !x86_kvm_ops().has_emulated_msr(*msr) {
+                continue;
+            }
+            self.emulated_msrs.push(*msr);
+        }
+
+        for msr in msr::IA32_VMX_BASIC..=msr::IA32_VMX_VMFUNC {
+            self.kvm_prove_feature_msr(msr)
+        }
+
+        for msr in Self::MSR_BASED_FEATURES_ALL_EXCEPT_VMX {
+            self.kvm_prove_feature_msr(*msr);
+        }
+    }
+
+    fn kvm_probe_msr_to_save(&mut self, msr: u32) {
+        let cpuid = CpuId::new();
+        let cpu_feat = cpuid.get_feature_info().unwrap();
+        let cpu_extend = cpuid.get_extended_feature_info().unwrap();
+
+        match msr {
+            msr::MSR_C1_PMON_EVNT_SEL0 => {
+                if !cpu_extend.has_mpx() {
+                    return;
+                }
+            }
+
+            msr::IA32_TSC_AUX => {
+                if !cpu_feat.has_tsc() {
+                    return;
+                }
+            }
+            // MSR_IA32_UMWAIT_CONTROL
+            0xe1 => {
+                if !cpu_extend.has_waitpkg() {
+                    return;
+                }
+            }
+            msr::MSR_IA32_RTIT_CTL | msr::MSR_IA32_RTIT_STATUS => {
+                if !cpu_extend.has_processor_trace() {
+                    return;
+                }
+            }
+            msr::MSR_IA32_CR3_MATCH => {
+                // TODO: also check intel_pt_validate_hw_cap(PT_CAP_cr3_filtering)
+                if !cpu_extend.has_processor_trace() {
+                    return;
+                }
+            }
+            msr::MSR_IA32_RTIT_OUTPUT_BASE | msr::MSR_IA32_RTIT_OUTPUT_MASK_PTRS => {
+                // TODO: also check !intel_pt_validate_hw_cap(PT_CAP_topa_output) && !intel_pt_validate_hw_cap(PT_CAP_single_range_output)
+                if !cpu_extend.has_processor_trace() {
+                    return;
+                }
+            }
+            msr::MSR_IA32_ADDR0_START..msr::MSR_IA32_ADDR3_END => {
+                // TODO: also check msr_index - MSR_IA32_RTIT_ADDR0_A >= intel_pt_validate_hw_cap(PT_CAP_num_address_ranges) * 2
+                if !cpu_extend.has_processor_trace() {
+                    return;
+                }
+            }
+            msr::IA32_PMC0..msr::IA32_PMC7 => {
+                // TODO: check whether this MSR is valid for the current configuration
+            }
+            msr::IA32_PERFEVTSEL0..msr::IA32_PERFEVTSEL7 => {
+                // TODO: check whether this MSR is valid for the current configuration
+            }
+            msr::MSR_PERF_FIXED_CTR0..msr::MSR_PERF_FIXED_CTR2 => {
+                // TODO: check whether this MSR is valid for the current configuration
+            }
+            msr::MSR_IA32_TSX_CTRL => {
+                // TODO: !(kvm_get_arch_capabilities() & ARCH_CAP_TSX_CTRL_MSR)
+                // This register is not supported yet; return early for now
+                // return;
+            }
+            _ => {}
+        }
+
+        self.msrs_to_save.push(msr);
+    }
+
+    fn kvm_prove_feature_msr(&mut self, index: u32) {
+        let mut msr = VmxMsrEntry {
+            index,
+            reserved: Default::default(),
+            data: Default::default(),
+        };
+
+        if self.get_msr_feature(&mut msr) {
+            return;
+        }
+
+        self.msr_based_features.push(index);
+    }
+
+    fn get_msr_feature(&self, msr: &mut VmxMsrEntry) -> bool {
+        match msr.index {
+            0x10a => {
+                // MSR_IA32_ARCH_CAPABILITIES,
+                msr.data = self.get_arch_capabilities();
+            }
+            msr::IA32_PERF_CAPABILITIES => {
+                msr.data = self.kvm_caps.supported_perf_cap;
+            }
+            msr::IA32_BIOS_SIGN_ID => {
+                // MSR_IA32_UCODE_REV
+                msr.data = unsafe { rdmsr(msr.index) };
+            }
+            _ => {
+                return x86_kvm_ops().get_msr_feature(msr);
+            }
+        }
+
+        return true;
+    }
+
+    fn get_arch_capabilities(&self) -> u64 {
+        let mut data = ArchCapabilities::from_bits_truncate(self.host_arch_capabilities)
+            & ArchCapabilities::KVM_SUPPORTED_ARCH_CAP;
+        data.insert(ArchCapabilities::ARCH_CAP_PSCHANGE_MC_NO);
+
+        if *L1TF_VMX_MITIGATION.read() != VmxL1dFlushState::Never {
+            data.insert(ArchCapabilities::ARCH_CAP_SKIP_VMENTRY_L1DFLUSH);
+        }
+
+        // FIXME: these bits are set unconditionally; we should really check whether the CPU is affected by the corresponding bugs
+
+        data.insert(
+            ArchCapabilities::ARCH_CAP_RDCL_NO
+                | ArchCapabilities::ARCH_CAP_SSB_NO
+                | ArchCapabilities::ARCH_CAP_MDS_NO
+                | ArchCapabilities::ARCH_CAP_GDS_NO,
+        );
+
+        return data.bits();
+    }
+
+    pub fn add_user_return_msr(&mut self, msr: u32) {
+        assert!(self.kvm_uret_msrs_list.len() < Self::KVM_MAX_NR_USER_RETURN_MSRS);
+        self.kvm_uret_msrs_list.push(msr)
+    }
+
+    fn kvm_timer_init(&mut self) -> Result<(), SystemError> {
+        let cpuid = CpuId::new();
+        let cpu_feature = cpuid.get_feature_info().ok_or(SystemError::ENOSYS)?;
+        if cpu_feature.has_tsc() {
+            self.max_tsc_khz = TSCManager::tsc_khz();
+        }
+
+        // TODO:此处未完成
+        Ok(())
+    }
+
+    pub fn kvm_set_user_return_msr(&self, slot: usize, mut value: u64, mask: u64) {
+        let msrs = user_return_msrs().get_mut();
+
+        value = (value & mask) | (msrs.values[slot].host & !mask);
+        if value == msrs.values[slot].curr {
+            return;
+        }
+
+        unsafe { wrmsr(self.kvm_uret_msrs_list[slot], value) };
+
+        msrs.values[slot].curr = value;
+
+        if !msrs.registered {
+            msrs.registered = true;
+        }
+    }
+}
+
+/// ### KVM capabilities
+#[derive(Debug)]
+pub struct KvmCapabilities {
+    /// Whether controlling the guest's TSC (time stamp counter) rate is supported
+    has_tsc_control: bool,
+    /// Maximum TSC rate the guest may use, in kHz
+    max_guest_tsc_khz: u32,
+    /// Number of fractional bits in the TSC scaling ratio
+    tsc_scaling_ratio_frac_bits: u8,
+    /// Maximum allowed TSC scaling ratio
+    max_tsc_scaling_ratio: u64,
+    /// Default TSC scaling ratio, equal to 1u64 << tsc_scaling_ratio_frac_bits
+    default_tsc_scaling_ratio: u64,
+    /// Whether bus-lock exits are supported
+    has_bus_lock_exit: bool,
+    /// Whether VM-exit notification is supported
+    has_notify_vmexit: bool,
+    /// Bitmask of supported MCE (machine check exception) capabilities
+    supported_mce_cap: McgCap,
+    /// Bitmask of supported XCR0 bits
+    supported_xcr0: Xcr0,
+    /// Bitmask of supported XSS (XSAVE extended state) bits
+    supported_xss: u64,
+    /// Bitmask of supported performance-monitoring capabilities
+    supported_perf_cap: u64,
+}
+
+impl Default for KvmCapabilities {
+    fn default() -> Self {
+        Self {
+            has_tsc_control: Default::default(),
+            max_guest_tsc_khz: Default::default(),
+            tsc_scaling_ratio_frac_bits: Default::default(),
+            max_tsc_scaling_ratio: Default::default(),
+            default_tsc_scaling_ratio: Default::default(),
+            has_bus_lock_exit: Default::default(),
+            has_notify_vmexit: Default::default(),
+            supported_mce_cap: McgCap::MCG_CTL_P | McgCap::MCG_SER_P,
+            supported_xcr0: Xcr0::empty(),
+            supported_xss: Default::default(),
+            supported_perf_cap: Default::default(),
+        }
+    }
+}
+
+bitflags! {
+    pub struct McgCap: u64 {
+        const MCG_BANKCNT_MASK	= 0xff;         /* Number of Banks */
+        const MCG_CTL_P		= 1 << 8;   /* MCG_CTL register available */
+        const MCG_EXT_P		= 1 << 9;   /* Extended registers available */
+        const MCG_CMCI_P	= 1 << 10;  /* CMCI supported */
+        const MCG_EXT_CNT_MASK	= 0xff0000;     /* Number of Extended registers */
+        const MCG_EXT_CNT_SHIFT	= 16;
+        const MCG_SER_P		= 1 << 24;  /* MCA recovery/new status bits */
+        const MCG_ELOG_P	= 1 << 26;  /* Extended error log supported */
+        const MCG_LMCE_P	= 1 << 27;  /* Local machine check supported */
+    }
+}
+
+static mut USER_RETURN_MSRS: Option<PerCpuVar<KvmUserReturnMsrs>> = None;
+
+fn user_return_msrs() -> &'static PerCpuVar<KvmUserReturnMsrs> {
+    unsafe { USER_RETURN_MSRS.as_ref().unwrap() }
+}
+
+#[derive(Debug, Default, Clone)]
+struct KvmUserReturnMsrs {
+    pub registered: bool,
+    pub values: [KvmUserReturnMsrsValues; KvmArchManager::KVM_MAX_NR_USER_RETURN_MSRS],
+}
+
+#[derive(Debug, Default, Clone)]
+struct KvmUserReturnMsrsValues {
+    pub host: u64,
+    pub curr: u64,
+}
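A sketch of how the user-return MSR helpers above are meant to be used; the two wrapper functions and the chosen MSRs are illustrative assumptions based on the Linux flow, not code in this patch. The vendor module registers the MSRs once during init, and on guest entry each per-CPU slot is only rewritten when the value actually changes.

// Hypothetical usage of the user-return MSR machinery.
fn register_uret_msrs(manager: &mut KvmArchManager) {
    // Registered once during vendor init; the push order defines the slot index.
    manager.add_user_return_msr(msr::IA32_STAR); // slot 0
    manager.add_user_return_msr(msr::IA32_LSTAR); // slot 1
}

fn load_guest_star(manager: &KvmArchManager, guest_star: u64) {
    // Slot 0 == IA32_STAR from above; an all-ones mask keeps every bit of the guest value.
    manager.kvm_set_user_return_msr(0, guest_star, u64::MAX);
}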

+ 37 - 0
kernel/src/arch/x86_64/vm/mtrr.rs

@@ -0,0 +1,37 @@
+use crate::virt::vm::kvm_host::vcpu::VirtCpu;
+
+use super::kvm_host::gfn_to_gpa;
+
+pub fn kvm_mtrr_check_gfn_range_consistency(_vcpu: &mut VirtCpu, gfn: u64, page_num: u64) -> bool {
+    // let mtrr_state = &vcpu.arch.mtrr_state;
+    // let mut iter = MtrrIter {
+    //     mem_type: -1,
+    //     mtrr_disabled: false,
+    //     partial_map: false,
+    // };
+    let _start = gfn_to_gpa(gfn);
+    let _end = gfn_to_gpa(gfn + page_num);
+
+    // mtrr_for_each_mem_type(&mut iter, mtrr_state, start, end, |iter| {
+    //     if iter.mem_type == -1 {
+    //         iter.mem_type = iter.mem_type;
+    //     } else if iter.mem_type != iter.mem_type {
+    //         return false;
+    //     }
+    // });
+
+    // if iter.mtrr_disabled {
+    //     return true;
+    // }
+
+    // if !iter.partial_map {
+    //     return true;
+    // }
+
+    // if iter.mem_type == -1 {
+    //     return true;
+    // }
+
+    // iter.mem_type == mtrr_default_type(mtrr_state)
+    true
+}

+ 102 - 0
kernel/src/arch/x86_64/vm/uapi.rs

@@ -0,0 +1,102 @@
+#![allow(dead_code)]
+
+use crate::virt::vm::user_api::UapiKvmSegment;
+
+pub const DE_VECTOR: usize = 0;
+pub const DB_VECTOR: usize = 1;
+pub const BP_VECTOR: usize = 3;
+pub const OF_VECTOR: usize = 4;
+pub const BR_VECTOR: usize = 5;
+pub const UD_VECTOR: usize = 6;
+pub const NM_VECTOR: usize = 7;
+pub const DF_VECTOR: usize = 8;
+pub const TS_VECTOR: usize = 10;
+pub const NP_VECTOR: usize = 11;
+pub const SS_VECTOR: usize = 12;
+pub const GP_VECTOR: usize = 13;
+pub const PF_VECTOR: usize = 14;
+pub const MF_VECTOR: usize = 16;
+pub const AC_VECTOR: usize = 17;
+pub const MC_VECTOR: usize = 18;
+pub const XM_VECTOR: usize = 19;
+pub const VE_VECTOR: usize = 20;
+
+pub const KVM_SYNC_X86_REGS: u64 = 1 << 0;
+pub const KVM_SYNC_X86_SREGS: u64 = 1 << 1;
+pub const KVM_SYNC_X86_EVENTS: u64 = 1 << 2;
+
+pub const KVM_SYNC_X86_VALID_FIELDS: u64 =
+    KVM_SYNC_X86_REGS | KVM_SYNC_X86_SREGS | KVM_SYNC_X86_EVENTS;
+
+#[repr(C)]
+#[derive(Debug, Default, Copy, Clone, PartialEq)]
+pub struct UapiKvmSegmentRegs {
+    pub cs: UapiKvmSegment,
+    pub ds: UapiKvmSegment,
+    pub es: UapiKvmSegment,
+    pub fs: UapiKvmSegment,
+    pub gs: UapiKvmSegment,
+    pub ss: UapiKvmSegment,
+    pub tr: UapiKvmSegment,
+    pub ldt: UapiKvmSegment,
+    pub gdt: UapiKvmDtable,
+    pub idt: UapiKvmDtable,
+    pub cr0: u64,
+    pub cr2: u64,
+    pub cr3: u64,
+    pub cr4: u64,
+    pub cr8: u64,
+    pub efer: u64,
+    pub apic_base: u64,
+    pub interrupt_bitmap: [u64; 4usize],
+}
+
+#[repr(C)]
+#[derive(Debug, Default, Copy, Clone, PartialEq)]
+pub struct UapiKvmDtable {
+    pub base: u64,
+    pub limit: u16,
+    pub padding: [u16; 3usize],
+}
+
+#[allow(dead_code)]
+pub mod kvm_exit {
+    pub const KVM_EXIT_UNKNOWN: u32 = 0;
+    pub const KVM_EXIT_EXCEPTION: u32 = 1;
+    pub const KVM_EXIT_IO: u32 = 2;
+    pub const KVM_EXIT_HYPERCALL: u32 = 3;
+    pub const KVM_EXIT_DEBUG: u32 = 4;
+    pub const KVM_EXIT_HLT: u32 = 5;
+    pub const KVM_EXIT_MMIO: u32 = 6;
+    pub const KVM_EXIT_IRQ_WINDOW_OPEN: u32 = 7;
+    pub const KVM_EXIT_SHUTDOWN: u32 = 8;
+    pub const KVM_EXIT_FAIL_ENTRY: u32 = 9;
+    pub const KVM_EXIT_INTR: u32 = 10;
+    pub const KVM_EXIT_SET_TPR: u32 = 11;
+    pub const KVM_EXIT_TPR_ACCESS: u32 = 12;
+    pub const KVM_EXIT_S390_SIEIC: u32 = 13;
+    pub const KVM_EXIT_S390_RESET: u32 = 14;
+    pub const KVM_EXIT_DCR: u32 = 15;
+    pub const KVM_EXIT_NMI: u32 = 16;
+    pub const KVM_EXIT_INTERNAL_ERROR: u32 = 17;
+    pub const KVM_EXIT_OSI: u32 = 18;
+    pub const KVM_EXIT_PAPR_HCALL: u32 = 19;
+    pub const KVM_EXIT_S390_UCONTROL: u32 = 20;
+    pub const KVM_EXIT_WATCHDOG: u32 = 21;
+    pub const KVM_EXIT_S390_TSCH: u32 = 22;
+    pub const KVM_EXIT_EPR: u32 = 23;
+    pub const KVM_EXIT_SYSTEM_EVENT: u32 = 24;
+    pub const KVM_EXIT_S390_STSI: u32 = 25;
+    pub const KVM_EXIT_IOAPIC_EOI: u32 = 26;
+    pub const KVM_EXIT_HYPERV: u32 = 27;
+    pub const KVM_EXIT_ARM_NISV: u32 = 28;
+    pub const KVM_EXIT_X86_RDMSR: u32 = 29;
+    pub const KVM_EXIT_X86_WRMSR: u32 = 30;
+    pub const KVM_EXIT_DIRTY_RING_FULL: u32 = 31;
+    pub const KVM_EXIT_AP_RESET_HOLD: u32 = 32;
+    pub const KVM_EXIT_X86_BUS_LOCK: u32 = 33;
+    pub const KVM_EXIT_XEN: u32 = 34;
+    pub const KVM_EXIT_RISCV_SBI: u32 = 35;
+    pub const KVM_EXIT_RISCV_CSR: u32 = 36;
+    pub const KVM_EXIT_NOTIFY: u32 = 37;
+}

+ 19 - 0
kernel/src/arch/x86_64/vm/vmx/asm.rs

@@ -0,0 +1,19 @@
+#![allow(dead_code)]
+
+pub const VMX_EPT_MT_EPTE_SHIFT: u64 = 3;
+pub const VMX_EPTP_PWL_MASK: u64 = 0x38;
+pub const VMX_EPTP_PWL_4: u64 = 0x18;
+pub const VMX_EPTP_PWL_5: u64 = 0x20;
+pub const VMX_EPTP_AD_ENABLE_BIT: u64 = 1 << 6;
+pub const VMX_EPTP_MT_MASK: u64 = 0x7;
+pub const VMX_EPTP_MT_WB: u64 = 0x6;
+pub const VMX_EPTP_MT_UC: u64 = 0x0;
+pub const VMX_EPT_READABLE_MASK: u64 = 0x1;
+pub const VMX_EPT_WRITABLE_MASK: u64 = 0x2;
+pub const VMX_EPT_EXECUTABLE_MASK: u64 = 0x4;
+pub const VMX_EPT_IPAT_BIT: u64 = 1 << 6;
+pub const VMX_EPT_ACCESS_BIT: u64 = 1 << 8;
+pub const VMX_EPT_DIRTY_BIT: u64 = 1 << 9;
+pub const VMX_EPT_RWX_MASK: u64 =
+    VMX_EPT_READABLE_MASK | VMX_EPT_WRITABLE_MASK | VMX_EPT_EXECUTABLE_MASK;
+pub const VMX_EPT_MT_MASK: u64 = 7 << VMX_EPT_MT_EPTE_SHIFT;
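+
+// Illustrative sketch (hypothetical helper, not used elsewhere in this patch): how
+// the constants above compose an EPTP value for a 4-level, write-back EPT with
+// accessed/dirty tracking, given a 4KiB-aligned PML4 physical address.
+pub const fn example_eptp_4level_wb(pml4_pa: u64) -> u64 {
+    pml4_pa | VMX_EPTP_MT_WB | VMX_EPTP_PWL_4 | VMX_EPTP_AD_ENABLE_BIT
+}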

+ 591 - 0
kernel/src/arch/x86_64/vm/vmx/capabilities.rs

@@ -0,0 +1,591 @@
+use raw_cpuid::CpuId;
+use x86::{
+    msr,
+    vmx::vmcs::control::{
+        EntryControls, ExitControls, PinbasedControls, PrimaryControls, SecondaryControls,
+    },
+};
+
+use crate::{
+    arch::vm::{
+        mmu::kvm_mmu::PageLevel, CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR,
+        PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR, VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR,
+        VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR,
+    },
+    virt::vm::kvm_host::vcpu::VirtCpu,
+};
+
+use super::{vmcs::feat::VmxFeat, Vmx};
+
+#[derive(Debug)]
+pub struct VmcsConfig {
+    pub size: u32,
+    pub basic_cap: u32,
+    pub revision_id: u32,
+    pub pin_based_exec_ctrl: PinbasedControls,
+    pub cpu_based_exec_ctrl: PrimaryControls,
+    pub cpu_based_2nd_exec_ctrl: SecondaryControls,
+    pub cpu_based_3rd_exec_ctrl: u32,
+    pub vmexit_ctrl: ExitControls,
+    pub vmentry_ctrl: EntryControls,
+    pub misc: u64,
+    pub nested: NestedVmxMsrs,
+}
+
+impl Default for VmcsConfig {
+    fn default() -> Self {
+        Self {
+            size: Default::default(),
+            basic_cap: Default::default(),
+            revision_id: Default::default(),
+            pin_based_exec_ctrl: PinbasedControls::empty(),
+            cpu_based_exec_ctrl: PrimaryControls::empty(),
+            cpu_based_2nd_exec_ctrl: SecondaryControls::empty(),
+            cpu_based_3rd_exec_ctrl: Default::default(),
+            vmexit_ctrl: ExitControls::empty(),
+            vmentry_ctrl: EntryControls::empty(),
+            misc: Default::default(),
+            nested: Default::default(),
+        }
+    }
+}
+
+#[derive(Debug, Default)]
+pub struct NestedVmxMsrs {
+    /// Primary processor-based VM-execution controls (low / high 32 bits)
+    pub procbased_ctls_low: u32,
+    /// Primary processor-based VM-execution controls (low / high 32 bits)
+    pub procbased_ctls_high: u32,
+    /// Secondary processor-based VM-execution controls (low / high 32 bits)
+    pub secondary_ctls_low: u32,
+    /// Secondary processor-based VM-execution controls (low / high 32 bits)
+    pub secondary_ctls_high: u32,
+    /// Pin-based VM-execution controls (low / high 32 bits)
+    pub pinbased_ctls_low: u32,
+    /// Pin-based VM-execution controls (low / high 32 bits)
+    pub pinbased_ctls_high: u32,
+    /// VM-exit controls (low / high 32 bits)
+    pub exit_ctls_low: u32,
+    /// VM-exit controls (low / high 32 bits)
+    pub exit_ctls_high: u32,
+    /// VM-entry controls (low / high 32 bits)
+    pub entry_ctls_low: u32,
+    /// VM-entry controls (low / high 32 bits)
+    pub entry_ctls_high: u32,
+    /// Miscellaneous VMX data (low / high 32 bits)
+    pub misc_low: u32,
+    /// Miscellaneous VMX data (low / high 32 bits)
+    pub misc_high: u32,
+    /// Extended Page Table (EPT) capabilities
+    pub ept_caps: u32,
+    /// Virtual Processor ID (VPID) capabilities
+    pub vpid_caps: u32,
+    /// Basic VMX capabilities (IA32_VMX_BASIC)
+    pub basic: u64,
+    /// Bits of CR0 that are fixed in VMX operation
+    pub cr0_fixed0: u64,
+    /// Bits of CR0 that are fixed in VMX operation
+    pub cr0_fixed1: u64,
+    /// Bits of CR4 that are fixed in VMX operation
+    pub cr4_fixed0: u64,
+    /// Bits of CR4 that are fixed in VMX operation
+    pub cr4_fixed1: u64,
+    /// VMCS enumeration (highest index of VMCS field encodings)
+    pub vmcs_enum: u64,
+    /// VM-function controls
+    pub vmfunc_controls: u64,
+}
+
+impl NestedVmxMsrs {
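+    /// Pack the low/high halves of a VMX capability MSR into one 64-bit value
+    /// (high dword in bits 63:32, low dword in bits 31:0).
+    ///
+    /// Illustrative example (values made up): `control_msr(0x0000_0016, 0xfff9_fffe)`
+    /// yields `0xfff9_fffe_0000_0016`.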
+    pub fn control_msr(low: u32, high: u32) -> u64 {
+        (high as u64) << 32 | low as u64
+    }
+
+    pub fn get_vmx_msr(&self, msr_index: u32) -> Option<u64> {
+        match msr_index {
+            msr::IA32_VMX_BASIC => {
+                return Some(self.basic);
+            }
+            msr::IA32_VMX_TRUE_PINBASED_CTLS | msr::IA32_VMX_PINBASED_CTLS => {
+                let mut data =
+                    NestedVmxMsrs::control_msr(self.pinbased_ctls_low, self.pinbased_ctls_high);
+                if msr_index == msr::IA32_VMX_PINBASED_CTLS {
+                    data |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
+                }
+                return Some(data);
+            }
+            msr::IA32_VMX_TRUE_PROCBASED_CTLS | msr::IA32_VMX_PROCBASED_CTLS => {
+                let mut data =
+                    NestedVmxMsrs::control_msr(self.procbased_ctls_low, self.procbased_ctls_high);
+                if msr_index == msr::IA32_VMX_PROCBASED_CTLS {
+                    data |= CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
+                }
+                return Some(data);
+            }
+            msr::IA32_VMX_TRUE_EXIT_CTLS | msr::IA32_VMX_EXIT_CTLS => {
+                let mut data = NestedVmxMsrs::control_msr(self.exit_ctls_low, self.exit_ctls_high);
+                if msr_index == msr::IA32_VMX_EXIT_CTLS {
+                    data |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
+                }
+                return Some(data);
+            }
+            msr::IA32_VMX_TRUE_ENTRY_CTLS | msr::IA32_VMX_ENTRY_CTLS => {
+                let mut data =
+                    NestedVmxMsrs::control_msr(self.entry_ctls_low, self.entry_ctls_high);
+                if msr_index == msr::IA32_VMX_ENTRY_CTLS {
+                    data |= VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
+                }
+                return Some(data);
+            }
+            msr::IA32_VMX_MISC => {
+                return Some(NestedVmxMsrs::control_msr(self.misc_low, self.misc_high));
+            }
+            msr::IA32_VMX_CR0_FIXED0 => {
+                return Some(self.cr0_fixed0);
+            }
+            msr::IA32_VMX_CR0_FIXED1 => {
+                return Some(self.cr0_fixed1);
+            }
+            msr::IA32_VMX_CR4_FIXED0 => {
+                return Some(self.cr4_fixed0);
+            }
+            msr::IA32_VMX_CR4_FIXED1 => {
+                return Some(self.cr4_fixed1);
+            }
+            msr::IA32_VMX_VMCS_ENUM => {
+                return Some(self.vmcs_enum);
+            }
+            msr::IA32_VMX_PROCBASED_CTLS2 => {
+                return Some(NestedVmxMsrs::control_msr(
+                    self.secondary_ctls_low,
+                    self.secondary_ctls_high,
+                ));
+            }
+            msr::IA32_VMX_EPT_VPID_CAP => {
+                return Some(self.ept_caps as u64 | ((self.vpid_caps as u64) << 32));
+            }
+            msr::IA32_VMX_VMFUNC => {
+                return Some(self.vmfunc_controls);
+            }
+            _ => {
+                return None;
+            }
+        }
+    }
+}
+
+#[derive(Debug, Default)]
+pub struct VmxCapability {
+    pub ept: EptFlag,
+    pub vpid: VpidFlag,
+}
+
+#[derive(Debug, PartialEq)]
+pub enum ProcessorTraceMode {
+    System,
+    HostGuest,
+}
+
+bitflags! {
+    #[derive(Default)]
+    pub struct VpidFlag: u32 {
+        /// The processor supports the INVVPID instruction
+        const INVVPID = 1 << 0; /* (32 - 32) */
+        /// INVVPID supports individual-address invalidation
+        const EXTENT_INDIVIDUAL_ADDR = 1 << 8; /* (40 - 32) */
+        /// INVVPID supports single-context invalidation
+        const EXTENT_SINGLE_CONTEXT = 1 << 9; /* (41 - 32) */
+        /// INVVPID supports all-context (global) invalidation
+        const EXTENT_GLOBAL_CONTEXT = 1 << 10; /* (42 - 32) */
+        /// INVVPID supports single-context invalidation that retains global translations
+        const EXTENT_SINGLE_NON_GLOBAL = 1 << 11; /* (43 - 32) */
+    }
+
+    #[derive(Default)]
+    pub struct EptFlag: u32 {
+        /// Execute-only EPT translations are supported
+        const EPT_EXECUTE_ONLY = 1;
+        /// A 4-level EPT page walk is supported
+        const EPT_PAGE_WALK_4 = 1 << 6;
+        /// A 5-level EPT page walk is supported
+        const EPT_PAGE_WALK_5 = 1 << 7;
+        /// The EPT paging structures may use the uncacheable (UC) memory type
+        const EPTP_UC = 1 << 8;
+        /// The EPT paging structures may use the write-back (WB) memory type
+        const EPTP_WB = 1 << 14;
+        /// 2MB EPT pages are supported
+        const EPT_2MB_PAGE = 1 << 16;
+        /// 1GB EPT pages are supported
+        const EPT_1GB_PAGE = 1 << 17;
+        /// The INVEPT instruction (flushing EPT TLB entries) is supported
+        const EPT_INVEPT = 1 << 20;
+        /// Accessed and dirty flags for EPT are supported
+        const EPT_AD = 1 << 21;
+        /// INVEPT supports single-context invalidation
+        const EPT_EXTENT_CONTEXT = 1 << 25;
+        /// INVEPT supports all-context (global) invalidation
+        const EPT_EXTENT_GLOBAL = 1 << 26;
+    }
+}
+
+impl VmxCapability {
+    pub fn set_val_from_msr_val(&mut self, val: u64) {
+        self.ept = EptFlag::from_bits_truncate(val as u32);
+        self.vpid = VpidFlag::from_bits_truncate((val >> 32) as u32);
+    }
+}
+
+impl Vmx {
+    /// Whether IA32_VMX_BASIC reports instruction information on VM exits due to INS/OUTS (VMX_BASIC_INOUT)
+    #[inline]
+    #[allow(dead_code)]
+    pub fn has_basic_inout(&self) -> bool {
+        return ((self.vmcs_config.basic_cap as u64) << 32) & VmxFeat::VMX_BASIC_INOUT != 0;
+    }
+
+    /// 检查处理器是否支持虚拟的非屏蔽中断(NMI)
+    #[inline]
+    pub fn has_virtual_nmis(&self) -> bool {
+        return self
+            .vmcs_config
+            .pin_based_exec_ctrl
+            .contains(PinbasedControls::VIRTUAL_NMIS)
+            && self
+                .vmcs_config
+                .cpu_based_exec_ctrl
+                .contains(PrimaryControls::NMI_WINDOW_EXITING);
+    }
+
+    /// 检查处理器是否支持VMX的抢占计时器功能
+    #[inline]
+    pub fn has_preemption_timer(&self) -> bool {
+        return self
+            .vmcs_config
+            .pin_based_exec_ctrl
+            .contains(PinbasedControls::VMX_PREEMPTION_TIMER);
+    }
+
+    /// 检查处理器是否支持VMX的posted interrupt功能
+    #[inline]
+    pub fn has_posted_intr(&self) -> bool {
+        return self
+            .vmcs_config
+            .pin_based_exec_ctrl
+            .contains(PinbasedControls::POSTED_INTERRUPTS);
+    }
+
+    /// 是否支持加载IA32_EFER寄存器
+    #[inline]
+    pub fn has_load_ia32_efer(&self) -> bool {
+        return self
+            .vmcs_config
+            .vmentry_ctrl
+            .contains(EntryControls::LOAD_IA32_EFER);
+    }
+
+    /// 是否支持加载IA32_PERF_GLOBAL_CTRL寄存器
+    #[inline]
+    pub fn has_load_perf_global_ctrl(&self) -> bool {
+        return self
+            .vmcs_config
+            .vmentry_ctrl
+            .contains(EntryControls::LOAD_IA32_PERF_GLOBAL_CTRL);
+    }
+
+    /// 是否支持加载边界检查配置寄存器(MPX)
+    #[inline]
+    pub fn has_mpx(&self) -> bool {
+        return self
+            .vmcs_config
+            .vmentry_ctrl
+            .contains(EntryControls::LOAD_IA32_BNDCFGS);
+    }
+
+    /// 是否支持虚拟处理器的任务优先级(TPR)影子
+    #[inline]
+    pub fn has_tpr_shadow(&self) -> bool {
+        return self
+            .vmcs_config
+            .cpu_based_exec_ctrl
+            .contains(PrimaryControls::USE_TPR_SHADOW);
+    }
+
+    /// 检查处理器是否支持 VMX中的 VPID(Virtual Processor ID)功能
+    ///
+    /// VPID 允许虚拟机监视器为每个虚拟处理器分配唯一的标识符,从而使得在不同的虚拟机之间进行快速的上下文切换和恢复成为可能。
+    ///
+    /// 通过使用 VPID,VMM 可以更快速地识别和恢复之前保存的虚拟处理器的状态,从而提高了虚拟化性能和效率。
+    #[inline]
+    pub fn has_vpid(&self) -> bool {
+        return self
+            .vmcs_config
+            .cpu_based_2nd_exec_ctrl
+            .contains(SecondaryControls::ENABLE_VPID);
+    }
+
+    /// 是否支持invvpid
+    ///
+    /// INVVPID 指令用于通知处理器无效化指定虚拟处理器标识符(VPID)相关的 TLB(Translation Lookaside Buffer)条目
+    #[inline]
+    pub fn has_invvpid(&self) -> bool {
+        return self.vmx_cap.vpid.contains(VpidFlag::INVVPID);
+    }
+
+    /// VPID 是否支持以单独地址方式进行范围
+    #[allow(dead_code)]
+    #[inline]
+    pub fn has_invvpid_individual_addr(&self) -> bool {
+        return self.vmx_cap.vpid.contains(VpidFlag::EXTENT_INDIVIDUAL_ADDR);
+    }
+
+    /// VPID 是否支持以单个上下文方式进行范围
+    #[inline]
+    pub fn has_invvpid_single(&self) -> bool {
+        return self.vmx_cap.vpid.contains(VpidFlag::EXTENT_SINGLE_CONTEXT);
+    }
+
+    /// VPID 是否支持以全局上下文方式进行范围
+    #[inline]
+    pub fn has_invvpid_global(&self) -> bool {
+        return self.vmx_cap.vpid.contains(VpidFlag::EXTENT_GLOBAL_CONTEXT);
+    }
+
+    /// 是否启用EPT(Extended Page Tables)
+    ///
+    /// EPT:EPT 是一种硬件虚拟化技术,允许虚拟机管理程序(例如 Hypervisor) 控制客户操作系统中虚拟地址和物理地址之间的映射。
+    ///
+    /// 通过启用 EPT,处理器可以将虚拟地址直接映射到物理地址,从而提高虚拟机的性能和安全性。
+    #[inline]
+    pub fn has_ept(&self) -> bool {
+        return self
+            .vmcs_config
+            .cpu_based_2nd_exec_ctrl
+            .contains(SecondaryControls::ENABLE_EPT);
+    }
+
+    /// 是否支持4级页表
+    #[inline]
+    pub fn has_ept_4levels(&self) -> bool {
+        return self.vmx_cap.ept.contains(EptFlag::EPT_PAGE_WALK_4);
+    }
+
+    /// 是否支持5级页表
+    #[inline]
+    pub fn has_ept_5levels(&self) -> bool {
+        return self.vmx_cap.ept.contains(EptFlag::EPT_PAGE_WALK_5);
+    }
+
+    pub fn get_max_ept_level(&self) -> usize {
+        if self.has_ept_5levels() {
+            return 5;
+        }
+        return 4;
+    }
+
+    pub fn ept_cap_to_lpage_level(&self) -> PageLevel {
+        if self.vmx_cap.ept.contains(EptFlag::EPT_1GB_PAGE) {
+            return PageLevel::Level1G;
+        }
+        if self.vmx_cap.ept.contains(EptFlag::EPT_2MB_PAGE) {
+            return PageLevel::Level2M;
+        }
+
+        return PageLevel::Level4K;
+    }
+
+    /// 判断mt(Memory type)是否为write back
+    #[inline]
+    pub fn has_ept_mt_wb(&self) -> bool {
+        return self.vmx_cap.ept.contains(EptFlag::EPTP_WB);
+    }
+
+    #[inline]
+    pub fn has_vmx_invept_context(&self) -> bool {
+        self.vmx_cap.ept.contains(EptFlag::EPT_EXTENT_CONTEXT)
+    }
+
+    /// EPT是否支持全局拓展
+    #[inline]
+    pub fn has_invept_global(&self) -> bool {
+        return self.vmx_cap.ept.contains(EptFlag::EPT_EXTENT_GLOBAL);
+    }
+
+    /// EPT是否支持访问位
+    #[inline]
+    pub fn has_ept_ad_bits(&self) -> bool {
+        return self.vmx_cap.ept.contains(EptFlag::EPT_AD);
+    }
+
+    /// Whether the VMX "unrestricted guest" feature is supported.
+    ///
+    /// Unrestricted guest allows the guest to run in real mode or in unpaged
+    /// protected mode, instead of being restricted to paged protected mode.
+    #[inline]
+    pub fn has_unrestricted_guest(&self) -> bool {
+        return self
+            .vmcs_config
+            .cpu_based_2nd_exec_ctrl
+            .contains(SecondaryControls::UNRESTRICTED_GUEST);
+    }
+
+    /// 是否支持 VMX 中的 FlexPriority 功能
+    ///
+    /// FlexPriority 是一种功能,可以在 TPR shadow 和虚拟化 APIC 访问同时可用时启用。
+    ///
+    /// TPR shadow 允许虚拟机管理程序(VMM)跟踪虚拟机中处理器的 TPR 值,并在需要时拦截和修改。
+    ///
+    /// 虚拟化 APIC 访问允许 VMM 控制虚拟机中的 APIC 寄存器访问。
+    #[inline]
+    pub fn has_flexproirity(&self) -> bool {
+        return self.has_tpr_shadow() && self.has_virtualize_apic_accesses();
+    }
+
+    /// 是否支持 VMX 中的虚拟化 APIC 访问功能。
+    ///
+    /// 当启用此功能时,虚拟机管理程序(VMM)可以控制虚拟机中的 APIC 寄存器访问。
+    #[inline]
+    pub fn has_virtualize_apic_accesses(&self) -> bool {
+        return self
+            .vmcs_config
+            .cpu_based_2nd_exec_ctrl
+            .contains(SecondaryControls::VIRTUALIZE_APIC);
+    }
+
+    /// 是否支持 VMX 中的 ENCLS 指令导致的 VM 退出功能
+    #[inline]
+    pub fn has_encls_vmexit(&self) -> bool {
+        return self
+            .vmcs_config
+            .cpu_based_2nd_exec_ctrl
+            .contains(SecondaryControls::ENCLS_EXITING);
+    }
+
+    /// 是否支持 VMX 中的 PLE (Pause Loop Exiting) 功能。
+    #[inline]
+    pub fn has_ple(&self) -> bool {
+        return self
+            .vmcs_config
+            .cpu_based_2nd_exec_ctrl
+            .contains(SecondaryControls::PAUSE_LOOP_EXITING);
+    }
+
+    /// 是否支持 VMX 中的 APICv 功能
+    #[inline]
+    pub fn has_apicv(&self) -> bool {
+        return self.has_apic_register_virt()
+            && self.has_posted_intr()
+            && self.has_virtual_intr_delivery();
+    }
+
+    /// 是否支持虚拟化的 APIC 寄存器功能
+    #[inline]
+    pub fn has_apic_register_virt(&self) -> bool {
+        return self
+            .vmcs_config
+            .cpu_based_2nd_exec_ctrl
+            .contains(SecondaryControls::VIRTUALIZE_APIC_REGISTER);
+    }
+
+    /// 是否支持虚拟化的中断传递功能
+    #[inline]
+    pub fn has_virtual_intr_delivery(&self) -> bool {
+        return self
+            .vmcs_config
+            .cpu_based_2nd_exec_ctrl
+            .contains(SecondaryControls::VIRTUAL_INTERRUPT_DELIVERY);
+    }
+
+    /// Whether IPI virtualization (IPIv) is supported; not wired up yet, so this always returns false
+    #[inline]
+    pub fn has_ipiv(&self) -> bool {
+        return false;
+    }
+
+    /// 是否支持虚拟化的 TSC 缩放功能
+    #[inline]
+    pub fn has_tsc_scaling(&self) -> bool {
+        return self
+            .vmcs_config
+            .cpu_based_2nd_exec_ctrl
+            .contains(SecondaryControls::USE_TSC_SCALING);
+    }
+
+    /// 是否支持虚拟化的页修改日志(Page Modification Logging)
+    #[inline]
+    pub fn has_pml(&self) -> bool {
+        return self
+            .vmcs_config
+            .cpu_based_2nd_exec_ctrl
+            .contains(SecondaryControls::ENABLE_PML);
+    }
+
+    /// 检查 CPU 是否支持使用 MSR 位图来控制 VMX
+    #[inline]
+    pub fn has_msr_bitmap(&self) -> bool {
+        return self
+            .vmcs_config
+            .cpu_based_exec_ctrl
+            .contains(PrimaryControls::USE_MSR_BITMAPS);
+    }
+
+    #[inline]
+    pub fn has_sceondary_exec_ctrls(&self) -> bool {
+        self.vmcs_config
+            .cpu_based_exec_ctrl
+            .contains(PrimaryControls::SECONDARY_CONTROLS)
+    }
+
+    #[inline]
+    pub fn has_rdtscp(&self) -> bool {
+        self.vmcs_config
+            .cpu_based_2nd_exec_ctrl
+            .contains(SecondaryControls::ENABLE_RDTSCP)
+    }
+
+    #[inline]
+    pub fn has_vmfunc(&self) -> bool {
+        self.vmcs_config
+            .cpu_based_2nd_exec_ctrl
+            .contains(SecondaryControls::ENABLE_VM_FUNCTIONS)
+    }
+
+    #[inline]
+    pub fn has_xsaves(&self) -> bool {
+        self.vmcs_config
+            .cpu_based_2nd_exec_ctrl
+            .contains(SecondaryControls::ENABLE_XSAVES_XRSTORS)
+    }
+
+    #[inline]
+    pub fn vmx_umip_emulated(&self) -> bool {
+        let feat = CpuId::new().get_extended_feature_info().unwrap().has_umip();
+
+        return !feat
+            && (self
+                .vmcs_config
+                .cpu_based_2nd_exec_ctrl
+                .contains(SecondaryControls::DTABLE_EXITING));
+    }
+
+    #[inline]
+    pub fn has_tertiary_exec_ctrls(&self) -> bool {
+        false
+    }
+
+    #[inline]
+    pub fn has_bus_lock_detection(&self) -> bool {
+        false
+    }
+
+    #[inline]
+    pub fn has_notify_vmexit(&self) -> bool {
+        false
+    }
+
+    /// 是否需要拦截页面故障
+    #[inline]
+    pub fn vmx_need_pf_intercept(&self, _vcpu: &VirtCpu) -> bool {
+        // if (!enable_ept)
+        // return true;
+        false
+    }
+}
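+
+// Illustrative sketch (hypothetical helper, not called by this patch): populate a
+// `VmxCapability` straight from the IA32_VMX_EPT_VPID_CAP MSR, whose low 32 bits
+// describe EPT capabilities and whose high 32 bits describe VPID capabilities.
+#[allow(dead_code)]
+fn example_read_ept_vpid_cap() -> VmxCapability {
+    let mut cap = VmxCapability::default();
+    cap.set_val_from_msr_val(unsafe { msr::rdmsr(msr::IA32_VMX_EPT_VPID_CAP) });
+    cap
+}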

+ 466 - 0
kernel/src/arch/x86_64/vm/vmx/ept/mod.rs

@@ -0,0 +1,466 @@
+use crate::arch::mm::LockedFrameAllocator;
+use crate::arch::vm::asm::VmxAsm;
+use crate::arch::vm::mmu::kvm_mmu::PageLevel;
+use crate::arch::vm::mmu::mmu_internal::KvmPageFault;
+use crate::arch::MMArch;
+use crate::libs::spinlock::SpinLockGuard;
+use crate::mm::allocator::page_frame::FrameAllocator;
+use crate::mm::page::{
+    page_manager_lock_irqsave, EntryFlags, PageEntry, PageFlags, PageFlush, PageManager, PageType,
+};
+use crate::mm::{MemoryManagementArch, PhysAddr, VirtAddr};
+use crate::smp::core::smp_get_processor_id;
+use crate::smp::cpu::AtomicProcessorId;
+use crate::smp::cpu::ProcessorId;
+use core::ops::Add;
+use core::sync::atomic::{compiler_fence, AtomicUsize, Ordering};
+use log::{debug, error, warn};
+use system_error::SystemError;
+use x86::msr;
+use x86::vmx::vmcs::control;
+
+// pub const VMX_EPT_MT_EPTE_SHIFT:u64 = 3;
+pub const VMX_EPT_RWX_MASK: u64 = 0x7;
+
+// Exit Qualifications for EPT Violations
+pub const EPT_VIOLATION_ACC_READ_BIT: u64 = 0;
+pub const EPT_VIOLATION_ACC_WRITE_BIT: u64 = 1;
+pub const EPT_VIOLATION_ACC_INSTR_BIT: u64 = 2;
+pub const EPT_VIOLATION_RWX_SHIFT: u64 = 3;
+pub const EPT_VIOLATION_GVA_IS_VALID_BIT: u64 = 7;
+pub const EPT_VIOLATION_GVA_TRANSLATED_BIT: u64 = 8;
+
+bitflags! {
+    pub struct EptViolationExitQual :u64{
+        const ACC_READ = 1 << EPT_VIOLATION_ACC_READ_BIT;
+        const ACC_WRITE = 1 << EPT_VIOLATION_ACC_WRITE_BIT;
+        const ACC_INSTR = 1 << EPT_VIOLATION_ACC_INSTR_BIT;
+        const RWX_MASK = VMX_EPT_RWX_MASK << EPT_VIOLATION_RWX_SHIFT;
+        const GVA_IS_VALID = 1 << EPT_VIOLATION_GVA_IS_VALID_BIT;
+        const GVA_TRANSLATED = 1 << EPT_VIOLATION_GVA_TRANSLATED_BIT;
+    }
+}
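+
+// Worked example: an exit qualification of 0x184 (the value noted while debugging
+// in exit.rs) decodes to ACC_INSTR | GVA_IS_VALID | GVA_TRANSLATED, i.e. an
+// instruction fetch through a valid guest-virtual address whose final
+// guest-physical translation is not yet present in the EPT.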
+
+// /// 全局EPT物理页信息管理器
+// pub static mut EPT_PAGE_MANAGER: Option<SpinLock<EptPageManager>> = None;
+
+// /// 初始化EPT_PAGE_MANAGER
+// pub fn ept_page_manager_init() {
+//     kinfo!("page_manager_init");
+//     let page_manager = SpinLock::new(EptPageManager::new());
+
+//     compiler_fence(Ordering::SeqCst);
+//     unsafe { EPT_PAGE_MANAGER = Some(page_manager) };
+//     compiler_fence(Ordering::SeqCst);
+
+//     kinfo!("page_manager_init done");
+// }
+
+// pub fn ept_page_manager_lock_irqsave() -> SpinLockGuard<'static, EptPageManager> {
+//     unsafe { EPT_PAGE_MANAGER.as_ref().unwrap().lock_irqsave() }
+// }
+
+/// EPT page-table data structure
+#[derive(Debug)]
+pub struct EptPageTable {
+    /// Start of the virtual address range covered by this table (the kernel itself
+    /// accesses EPT tables through its virtual address space)
+    base: VirtAddr,
+    /// Physical address of this page table
+    phys: PhysAddr,
+    /// Level of this page table (PageLevel::Level4K == 1)
+    level: PageLevel,
+}
+impl EptPageTable {
+    pub fn phys(&self) -> PhysAddr {
+        self.phys
+    }
+
+    /// 设置当前页表的第i个页表项
+    pub unsafe fn set_entry(&self, i: usize, entry: PageEntry<MMArch>) -> Option<()> {
+        let entry_virt = self.entry_virt(i)?;
+        MMArch::write::<PageEntry<MMArch>>(entry_virt, entry);
+        let page_entry = MMArch::read::<PageEntry<MMArch>>(entry_virt);
+        debug!("Set EPT entry: {:?} , index : {:?}", page_entry, i);
+        return Some(());
+    }
+    /// Check whether the i-th entry of this page table has already been filled in.
+    ///
+    /// ## Return value
+    /// - Some(true) if the entry is non-zero (already filled)
+    /// - Some(false) if the entry is still zero
+    /// - None if `i` is out of range
+    pub fn entry_mapped(&self, i: usize) -> Option<bool> {
+        let etv = unsafe { self.entry_virt(i) }?;
+        if unsafe { MMArch::read::<usize>(etv) } != 0 {
+            return Some(true);
+        } else {
+            return Some(false);
+        }
+    }
+
+    /// 获取当前页表的层级
+    #[inline(always)]
+    pub fn level(&self) -> PageLevel {
+        self.level
+    }
+
+    /// 获取第i个entry的虚拟内存空间
+    #[allow(dead_code)]
+    pub fn entry_base(&self, i: usize) -> Option<VirtAddr> {
+        if i < MMArch::PAGE_ENTRY_NUM {
+            let shift = (self.level as usize - 1) * MMArch::PAGE_ENTRY_SHIFT + MMArch::PAGE_SHIFT;
+            return Some(self.base.add(i << shift));
+        } else {
+            return None;
+        }
+    }
+    /// 获取当前页表自身所在的虚拟地址
+    #[inline(always)]
+    pub unsafe fn virt(&self) -> VirtAddr {
+        return MMArch::phys_2_virt(self.phys).unwrap();
+    }
+    /// 获取当前页表的第i个页表项所在的虚拟地址(注意与entry_base进行区分)
+    pub unsafe fn entry_virt(&self, i: usize) -> Option<VirtAddr> {
+        if i < MMArch::PAGE_ENTRY_NUM {
+            return Some(self.virt().add(i * MMArch::PAGE_ENTRY_SIZE));
+        } else {
+            return None;
+        }
+    }
+    /// 获取当前页表的第i个页表项
+    pub unsafe fn entry(&self, i: usize) -> Option<PageEntry<MMArch>> {
+        let entry_virt = self.entry_virt(i)?;
+        return Some(PageEntry::from_usize(MMArch::read::<usize>(entry_virt)));
+    }
+
+    pub fn new(base: VirtAddr, phys: PhysAddr, level: PageLevel) -> Self {
+        Self { base, phys, level }
+    }
+    /// Get the index, within this page table, of the entry that covers the given
+    /// guest-physical address.
+    ///
+    /// ## Parameters
+    ///
+    /// - gpa: guest-physical address
+    ///
+    /// ## Return value
+    ///
+    /// Index of the entry in this table. (The range check against `self.base` is
+    /// currently commented out, so this always returns `Some`.)
+    pub unsafe fn index_of(&self, gpa: PhysAddr) -> Option<usize> {
+        let addr = VirtAddr::new(gpa.data() & MMArch::PAGE_ADDRESS_MASK);
+        let shift = (self.level - 1) as usize * MMArch::PAGE_ENTRY_SHIFT + MMArch::PAGE_SHIFT;
+
+        //let mask = (MMArch::PAGE_ENTRY_NUM << shift) - 1;
+        // if addr < self.base || addr >= self.base.add(mask) {
+        //     return None;
+        // } else {
+        return Some((addr.data() >> shift) & MMArch::PAGE_ENTRY_MASK);
+        //}
+    }
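+
+    // Worked example: with a 4-level EPT rooted at PageLevel::Level512G, a gpa of
+    // 0x80_8060_4000 yields index 1 at the root level, 2 at the next level, 3 at
+    // the level below that and 4 at PageLevel::Level4K, since every level consumes
+    // MMArch::PAGE_ENTRY_SHIFT (9) bits above MMArch::PAGE_SHIFT (12).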
+
+    pub fn next_level_table(&self, index: usize) -> Option<EptPageTable> {
+        if self.level == PageLevel::Level4K {
+            return None;
+        }
+        // 返回下一级页表
+        let phys = unsafe { self.entry(index)?.address() };
+
+        let base;
+        if let Ok(phys) = phys {
+            base = unsafe { MMArch::phys_2_virt(PhysAddr::new(phys.data())).unwrap() };
+        } else {
+            base = unsafe { MMArch::phys_2_virt(PhysAddr::new(phys.unwrap_err().data())).unwrap() };
+        }
+
+        let level = self.level - 1;
+        if let Err(_phys) = phys {
+            debug!("EptPageTable::next_level_table: phys {:?}", phys);
+            // Not Present的情况下,返回None
+            // 这里之所以绕了一圈,是因为在虚拟机启动阶段的page_fault的addr是not_present的,但是也要进行映射
+            // 可能有点问题,但是先这么写
+            if _phys.data() & 0x7 == 0x000 {
+                return None;
+            }
+            return Some(EptPageTable::new(base, PhysAddr::new(_phys.data()), level));
+        }
+        return Some(EptPageTable::new(
+            base,
+            PhysAddr::new(phys.unwrap().data()),
+            level,
+        ));
+    }
+}
+
+// // EPT物理页管理器
+// pub struct EptPageManager {
+//     phys2page: HashMap<PhysAddr, EptPageTable>,
+// }
+
+// impl EptPageManager {
+//     pub fn new() -> Self {
+//         Self {
+//             phys2page: HashMap::new(),
+//         }
+//     }
+
+// }
+
+/// Check if MTRR is supported
+#[allow(dead_code)]
+pub fn check_ept_features() -> Result<(), SystemError> {
+    const MTRR_ENABLE_BIT: u64 = 1 << 11;
+    let ia32_mtrr_def_type = unsafe { msr::rdmsr(msr::IA32_MTRR_DEF_TYPE) };
+    if (ia32_mtrr_def_type & MTRR_ENABLE_BIT) == 0 {
+        return Err(SystemError::EOPNOTSUPP_OR_ENOTSUP);
+    }
+    Ok(())
+}
+
+/// Marker meaning "no processor currently holds the EPT mapper lock".
+/// A dedicated marker is needed because AtomicUsize::new(0) would otherwise be
+/// indistinguishable from processor id 0.
+const EPT_MAPPER_NO_PROCESSOR: ProcessorId = ProcessorId::INVALID;
+/// Processor that currently holds the EPT mapper lock
+static EPT_MAPPER_LOCK_OWNER: AtomicProcessorId = AtomicProcessorId::new(EPT_MAPPER_NO_PROCESSOR);
+/// Lock counter for the EPT mapper
+static EPT_MAPPER_LOCK_COUNT: AtomicUsize = AtomicUsize::new(0);
+
+pub struct EptPageMapper {
+    /// EPT页表映射器
+    //mapper: PageMapper,//PageTableKind::EPT, LockedFrameAllocator
+    /// 标记当前映射器是否为只读
+    readonly: bool,
+    // EPT页表根地址
+    root_page_addr: PhysAddr,
+    /// 页分配器
+    frame_allocator: LockedFrameAllocator,
+}
+
+impl EptPageMapper {
+    /// 返回最上层的ept页表
+    pub fn table(&self) -> EptPageTable {
+        EptPageTable::new(
+            unsafe { MMArch::phys_2_virt(self.root_page_addr).unwrap() },
+            self.root_page_addr,
+            PageLevel::Level512G,
+        )
+    }
+    pub fn root_page_addr() -> PhysAddr {
+        //PML4的物理地址
+        let eptp = VmxAsm::vmx_vmread(control::EPTP_FULL);
+        let addr = eptp & 0xFFFF_FFFF_FFFF_F000; //去除低12位
+        PhysAddr::new(addr as usize)
+    }
+
+    fn lock_cpu(cpuid: ProcessorId) -> Self {
+        loop {
+            match EPT_MAPPER_LOCK_OWNER.compare_exchange_weak(
+                EPT_MAPPER_NO_PROCESSOR,
+                cpuid,
+                Ordering::Acquire,
+                Ordering::Relaxed,
+            ) {
+                Ok(_) => break,
+                // 当前处理器已经持有了锁
+                Err(id) if id == cpuid => break,
+                // either CAS failed, or some other hardware thread holds the lock
+                Err(_) => core::hint::spin_loop(),
+            }
+        }
+
+        let prev_count = EPT_MAPPER_LOCK_COUNT.fetch_add(1, Ordering::Relaxed);
+        compiler_fence(Ordering::Acquire);
+
+        // 本地核心已经持有过锁,因此标记当前加锁获得的映射器为只读
+        let readonly = prev_count > 0;
+        let root_page_addr = Self::root_page_addr();
+        return Self {
+            readonly,
+            root_page_addr,
+            frame_allocator: LockedFrameAllocator,
+        };
+    }
+
+    /// Lock the EPT mapper and return a mapper object.
+    /// This is currently the only way to obtain an `EptPageMapper`.
+    #[inline(always)]
+    pub fn lock() -> Self {
+        //fixme:得到的是cpuid还是vcpuid?
+        let cpuid = smp_get_processor_id();
+        return Self::lock_cpu(cpuid);
+    }
+
+    /// 检查有无gpa->hpa的映射
+    #[no_mangle]
+    pub fn is_mapped(&self, page_fault: &mut KvmPageFault) -> bool {
+        let gpa = page_fault.gpa() as usize;
+        let mut page_table = self.table();
+        let mut next_page_table;
+        loop {
+            let index: usize = unsafe {
+                if let Some(i) = page_table.index_of(PhysAddr::new(gpa)) {
+                    debug!("ept page table index: {:?}", i);
+                    i
+                } else {
+                    error!("ept page table index_of failed");
+                    return false;
+                }
+            };
+            debug!("EPT table: index = {:?}, value =  {:?}", index, page_table);
+            if let Some(table) = page_table.next_level_table(index) {
+                if table.level() == PageLevel::Level4K {
+                    debug!("EPT table 4K: {:?}", table);
+                    return true;
+                }
+                debug!("table.level():  {:?}", table.level());
+                next_page_table = table;
+            } else {
+                return false;
+            }
+            page_table = next_page_table;
+        }
+    }
+
+    /// 从当前EptPageMapper的页分配器中分配一个物理页(hpa),并将其映射到指定的gpa
+    pub fn map(&mut self, gpa: PhysAddr, flags: EntryFlags<MMArch>) -> Option<PageFlush<MMArch>> {
+        let gpa = PhysAddr::new(gpa.data() & (!MMArch::PAGE_NEGATIVE_MASK) & !0xFFF);
+        self.map_gpa(gpa, flags)
+    }
+
+    ///映射一个hpa到指定的gpa
+    pub fn map_gpa(
+        &mut self,
+        gpa: PhysAddr,
+        flags: EntryFlags<MMArch>,
+    ) -> Option<PageFlush<MMArch>> {
+        // 验证虚拟地址和物理地址是否对齐
+        if !(gpa.check_aligned(MMArch::PAGE_SIZE)) {
+            error!("Try to map unaligned page: gpa={:?}", gpa);
+        }
+
+        // TODO: 验证flags是否合法
+
+        let mut table = self.table();
+        debug!("ept page table: {:?}", table);
+        loop {
+            let i = unsafe { table.index_of(gpa).unwrap() };
+            assert!(i < MMArch::PAGE_ENTRY_NUM);
+            if table.level() == PageLevel::Level4K {
+                //检查这个4K页面是否映射过
+                if table.entry_mapped(i).unwrap() {
+                    unsafe {
+                        let entry_virt = table.entry_virt(i)?;
+                        let _set_entry = MMArch::read::<PageEntry<MMArch>>(entry_virt);
+                        warn!(
+                            "index :: {:?} , Page gpa :: {:?} already mapped,content is: {:x}",
+                            i,
+                            gpa,
+                            _set_entry.data()
+                        );
+                        return None;
+                    };
+                }
+
+                //分配一个entry的物理页
+                compiler_fence(Ordering::SeqCst);
+                // let hpa: PhysAddr = unsafe { self.frame_allocator.allocate_one() }?;
+                // debug!("Allocate hpa: {:?}", hpa);
+                // 修改全局页管理器
+                let mut page_manager_guard: SpinLockGuard<'static, PageManager> =
+                    page_manager_lock_irqsave();
+                let page = page_manager_guard
+                    .create_one_page(
+                        PageType::Normal,
+                        PageFlags::empty(),
+                        &mut self.frame_allocator,
+                    )
+                    .ok()?;
+                let hpa = page.phys_address();
+                drop(page_manager_guard);
+                // 清空这个页帧
+                unsafe {
+                    MMArch::write_bytes(MMArch::phys_2_virt(hpa).unwrap(), 0, MMArch::PAGE_SIZE)
+                };
+                let entry = PageEntry::new(hpa, flags);
+                unsafe { table.set_entry(i, entry) };
+                compiler_fence(Ordering::SeqCst);
+
+                //打印页表项以进行验证
+                unsafe {
+                    let entry_virt = table.entry_virt(i)?;
+                    let _set_entry = MMArch::read::<PageEntry<MMArch>>(entry_virt);
+                }
+
+                return Some(PageFlush::new(unsafe { table.entry_virt(i)? }));
+            } else {
+                let next_table = table.next_level_table(i);
+                if let Some(next_table) = next_table {
+                    table = next_table;
+                    debug!("already next table: {:?}", table);
+                } else {
+                    // 分配下一级页表
+                    let frame = unsafe { self.frame_allocator.allocate_one() }?;
+
+                    // 清空这个页帧
+                    unsafe {
+                        MMArch::write_bytes(
+                            MMArch::phys_2_virt(frame).unwrap(),
+                            0,
+                            MMArch::PAGE_SIZE,
+                        )
+                    };
+
+                    // fixme::设置页表项的flags,可能有点问题
+                    let flags: EntryFlags<MMArch> = unsafe { EntryFlags::from_data(0x7) };
+
+                    // 把新分配的页表映射到当前页表
+                    unsafe { table.set_entry(i, PageEntry::new(frame, flags)) };
+
+                    // 获取新分配的页表
+                    table = table.next_level_table(i)?;
+                }
+            }
+        }
+    }
+}
+#[allow(dead_code)]
+/// Debug helper: walk the EPT tables and print their contents.
+pub fn debug_eptp() {
+    let pml4_hpa: PhysAddr = EptPageMapper::lock().table().phys();
+    debug!("Prepare to read EPTP address");
+    let pml4_hva = unsafe { MMArch::phys_2_virt(PhysAddr::new(pml4_hpa.data())).unwrap() };
+    debug!("PML4_hpa: 0x{:x}", pml4_hpa.data());
+    debug!("PML4_hva: 0x{:x}", pml4_hva.data()); //Level512G
+    unsafe {
+        let entry = MMArch::read::<u64>(pml4_hva);
+        debug!("Value at EPTP address: 0x{:x}", entry); //Level2M
+                                                        // 遍历并打印所有已分配的页面
+        traverse_ept_table(pml4_hva, 4);
+    }
+}
+unsafe fn traverse_ept_table(table_addr: VirtAddr, level: u8) {
+    if level == (u8::MAX) {
+        return;
+    }
+
+    // A page table holds 512 eight-byte entries.
+    let entries = MMArch::read_array::<u64>(table_addr, 512);
+    for (i, entry) in entries.iter().enumerate() {
+        //打印已分配的entry和4K页表的所有entry
+        if *entry & 0x7 != 0 || level == 0 {
+            let next_level_addr = if level != 0 {
+                MMArch::phys_2_virt(PhysAddr::new((*entry & 0xFFFFFFFFF000) as usize))
+            } else {
+                //暂未分配地址
+                if *entry == 0 {
+                    continue;
+                }
+                MMArch::phys_2_virt(PhysAddr::new((*entry & 0xFFFFFFFFF000) as usize))
+            };
+            let entry_value = MMArch::read::<u64>(next_level_addr.unwrap());
+            debug!(
+                "Level {} - index {}: HPA: 0x{:016x}, read_to: 0x{:016x}",
+                level, i, *entry, /*& 0xFFFFFFFFF000*/ entry_value,
+            );
+            // 递归遍历下一级页表
+            traverse_ept_table(next_level_addr.unwrap(), level - 1);
+        }
+    }
+}
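+
+// Sketch of a possible call site (illustrative only, not wired up by this patch):
+// install a 4KiB mapping for a faulting guest-physical page. The 0x7 flags mirror
+// the read/write/execute EPT permission bits that `map_gpa` already uses for
+// intermediate tables.
+#[allow(dead_code)]
+fn example_map_faulting_page(page_fault: &mut KvmPageFault) {
+    let gpa = PhysAddr::new(page_fault.gpa() as usize);
+    // Assumption: 0x7 = R|W|X. `map` aligns the address, allocates the backing
+    // frame and returns None if the page was already mapped.
+    let flags: EntryFlags<MMArch> = unsafe { EntryFlags::from_data(0x7) };
+    let _ = EptPageMapper::lock().map(gpa, flags);
+}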

+ 426 - 0
kernel/src/arch/x86_64/vm/vmx/exit.rs

@@ -0,0 +1,426 @@
+use bitfield_struct::bitfield;
+use system_error::SystemError;
+use x86::vmx::vmcs::{guest, ro};
+
+use crate::{
+    arch::vm::asm::{IntrInfo, VmxAsm},
+    virt::vm::kvm_host::{vcpu::VirtCpu, Vm},
+};
+
+use super::{ept::EptViolationExitQual, vmx_info, PageFaultErr};
+extern crate num_traits;
+
+#[bitfield(u32)]
+pub struct VmxExitReason {
+    pub basic: u16,
+    pub reserved16: bool,
+    pub reserved17: bool,
+    pub reserved18: bool,
+    pub reserved19: bool,
+    pub reserved20: bool,
+    pub reserved21: bool,
+    pub reserved22: bool,
+    pub reserved23: bool,
+    pub reserved24: bool,
+    pub reserved25: bool,
+    pub bus_lock_detected: bool,
+    pub enclave_mode: bool,
+    pub smi_pending_mtf: bool,
+    pub smi_from_vmx_root: bool,
+    pub reserved30: bool,
+    pub failed_vmentry: bool,
+}
+
+//#define VMX_EXIT_REASONS
+#[derive(FromPrimitive, PartialEq, Clone, Copy)]
+#[allow(non_camel_case_types)]
+pub enum VmxExitReasonBasic {
+    EXCEPTION_OR_NMI = 0,
+    EXTERNAL_INTERRUPT = 1,
+    TRIPLE_FAULT = 2,
+    INIT_SIGNAL = 3,
+    SIPI = 4,
+    IO_SMI = 5,
+    OTHER_SMI = 6,
+    INTERRUPT_WINDOW = 7,
+    NMI_WINDOW = 8,
+    TASK_SWITCH = 9,
+    CPUID = 10,
+    GETSEC = 11,
+    HLT = 12,
+    INVD = 13,
+    INVLPG = 14,
+    RDPMC = 15,
+    RDTSC = 16,
+    RSM = 17,
+    VMCALL = 18,
+    VMCLEAR = 19,
+    VMLAUNCH = 20,
+    VMPTRLD = 21,
+    VMPTRST = 22,
+    VMREAD = 23,
+    VMRESUME = 24,
+    VMWRITE = 25,
+    VMXOFF = 26,
+    VMXON = 27,
+    CR_ACCESS = 28,
+    DR_ACCESS = 29,
+    IO_INSTRUCTION = 30,
+    RDMSR = 31,
+    WRMSR = 32,
+    VM_ENTRY_FAILURE_INVALID_GUEST_STATE = 33,
+    VM_ENTRY_FAILURE_MSR_LOADING = 34,
+    MWAIT = 36,
+    MONITOR_TRAP_FLAG = 37,
+    MONITOR = 39,
+    PAUSE = 40,
+    VM_ENTRY_FAILURE_MACHINE_CHECK_EVENT = 41,
+    TPR_BELOW_THRESHOLD = 43,
+    APIC_ACCESS = 44,
+    VIRTUALIZED_EOI = 45, // "EOI_INDUCED"
+    ACCESS_GDTR_OR_IDTR = 46,
+    ACCESS_LDTR_OR_TR = 47,
+    EPT_VIOLATION = 48,
+    EPT_MISCONFIG = 49,
+    INVEPT = 50,
+    RDTSCP = 51,
+    VMX_PREEMPTION_TIMER_EXPIRED = 52,
+    INVVPID = 53,
+    WBINVD = 54,
+    XSETBV = 55,
+    APIC_WRITE = 56,
+    RDRAND = 57,
+    INVPCID = 58,
+    VMFUNC = 59,
+    ENCLS = 60,
+    RDSEED = 61,
+    PML_FULL = 62,
+    XSAVES = 63,
+    XRSTORS = 64,
+
+    UMWAIT = 67,
+    TPAUSE = 68,
+    BUS_LOCK = 74,
+    NOTIFY = 75,
+
+    UNKNOWN,
+}
+
+impl From<u16> for VmxExitReasonBasic {
+    fn from(num: u16) -> Self {
+        match num {
+            0 => VmxExitReasonBasic::EXCEPTION_OR_NMI,
+            1 => VmxExitReasonBasic::EXTERNAL_INTERRUPT,
+            2 => VmxExitReasonBasic::TRIPLE_FAULT,
+            3 => VmxExitReasonBasic::INIT_SIGNAL,
+            4 => VmxExitReasonBasic::SIPI,
+            5 => VmxExitReasonBasic::IO_SMI,
+            6 => VmxExitReasonBasic::OTHER_SMI,
+            7 => VmxExitReasonBasic::INTERRUPT_WINDOW,
+            8 => VmxExitReasonBasic::NMI_WINDOW,
+            9 => VmxExitReasonBasic::TASK_SWITCH,
+            10 => VmxExitReasonBasic::CPUID,
+            11 => VmxExitReasonBasic::GETSEC,
+            12 => VmxExitReasonBasic::HLT,
+            13 => VmxExitReasonBasic::INVD,
+            14 => VmxExitReasonBasic::INVLPG,
+            15 => VmxExitReasonBasic::RDPMC,
+            16 => VmxExitReasonBasic::RDTSC,
+            17 => VmxExitReasonBasic::RSM,
+            18 => VmxExitReasonBasic::VMCALL,
+            19 => VmxExitReasonBasic::VMCLEAR,
+            20 => VmxExitReasonBasic::VMLAUNCH,
+            21 => VmxExitReasonBasic::VMPTRLD,
+            22 => VmxExitReasonBasic::VMPTRST,
+            23 => VmxExitReasonBasic::VMREAD,
+            24 => VmxExitReasonBasic::VMRESUME,
+            25 => VmxExitReasonBasic::VMWRITE,
+            26 => VmxExitReasonBasic::VMXOFF,
+            27 => VmxExitReasonBasic::VMXON,
+            28 => VmxExitReasonBasic::CR_ACCESS,
+            29 => VmxExitReasonBasic::DR_ACCESS,
+            30 => VmxExitReasonBasic::IO_INSTRUCTION,
+            31 => VmxExitReasonBasic::RDMSR,
+            32 => VmxExitReasonBasic::WRMSR,
+            33 => VmxExitReasonBasic::VM_ENTRY_FAILURE_INVALID_GUEST_STATE,
+            34 => VmxExitReasonBasic::VM_ENTRY_FAILURE_MSR_LOADING,
+            36 => VmxExitReasonBasic::MWAIT,
+            37 => VmxExitReasonBasic::MONITOR_TRAP_FLAG,
+            39 => VmxExitReasonBasic::MONITOR,
+            40 => VmxExitReasonBasic::PAUSE,
+            41 => VmxExitReasonBasic::VM_ENTRY_FAILURE_MACHINE_CHECK_EVENT,
+            43 => VmxExitReasonBasic::TPR_BELOW_THRESHOLD,
+            44 => VmxExitReasonBasic::APIC_ACCESS,
+            45 => VmxExitReasonBasic::VIRTUALIZED_EOI,
+            46 => VmxExitReasonBasic::ACCESS_GDTR_OR_IDTR,
+            47 => VmxExitReasonBasic::ACCESS_LDTR_OR_TR,
+            48 => VmxExitReasonBasic::EPT_VIOLATION,
+            49 => VmxExitReasonBasic::EPT_MISCONFIG,
+            50 => VmxExitReasonBasic::INVEPT,
+            51 => VmxExitReasonBasic::RDTSCP,
+            52 => VmxExitReasonBasic::VMX_PREEMPTION_TIMER_EXPIRED,
+            53 => VmxExitReasonBasic::INVVPID,
+            54 => VmxExitReasonBasic::WBINVD,
+            55 => VmxExitReasonBasic::XSETBV,
+            56 => VmxExitReasonBasic::APIC_WRITE,
+            57 => VmxExitReasonBasic::RDRAND,
+            58 => VmxExitReasonBasic::INVPCID,
+            59 => VmxExitReasonBasic::VMFUNC,
+            60 => VmxExitReasonBasic::ENCLS,
+            61 => VmxExitReasonBasic::RDSEED,
+            62 => VmxExitReasonBasic::PML_FULL,
+            63 => VmxExitReasonBasic::XSAVES,
+            64 => VmxExitReasonBasic::XRSTORS,
+
+            67 => VmxExitReasonBasic::UMWAIT,
+            68 => VmxExitReasonBasic::TPAUSE,
+            74 => VmxExitReasonBasic::BUS_LOCK,
+            75 => VmxExitReasonBasic::NOTIFY,
+            _ => VmxExitReasonBasic::UNKNOWN,
+        }
+    }
+}
+
+#[derive(Debug, PartialEq)]
+#[allow(dead_code)]
+pub enum ExitFastpathCompletion {
+    None,
+    ReenterGuest,
+    ExitHandled,
+}
+pub struct VmxExitHandlers {}
+//     //name 代表暂时不懂含义的(name linux=name DragonOS)
+//     ExceptionNmi = VmxExitReasonBasic::EXCEPTION_OR_NMI as isize,
+//     ExternalInterrupt = VmxExitReasonBasic::EXTERNAL_INTERRUPT as isize,
+//     TripleFault = VmxExitReasonBasic::TRIPLE_FAULT as isize,
+//     NmiWindow = VmxExitReasonBasic::NMI_WINDOW as isize,
+//     IoInstruction = VmxExitReasonBasic::IO_INSTRUCTION as isize,
+//     CrAccess = VmxExitReasonBasic::CR_ACCESS as isize,
+//     DrAccess = VmxExitReasonBasic::DR_ACCESS as isize,
+//     Cpuid = VmxExitReasonBasic::CPUID as isize,
+//     MsrRead = VmxExitReasonBasic::RDMSR as isize,
+//     MsrWrite = VmxExitReasonBasic::WRMSR as isize,
+//     InterruptWindow = VmxExitReasonBasic::INTERRUPT_WINDOW as isize,
+//     Hlt = VmxExitReasonBasic::HLT as isize,
+//     Invd = VmxExitReasonBasic::INVD as isize,
+//     Invlpg = VmxExitReasonBasic::INVLPG as isize,
+//     Rdpmc = VmxExitReasonBasic::RDPMC as isize,
+//     Vmcall = VmxExitReasonBasic::VMCALL as isize,
+//     Vmclear = VmxExitReasonBasic::VMCLEAR as isize,
+//     Vmlaunch = VmxExitReasonBasic::VMLAUNCH as isize,
+//     Vmptrld = VmxExitReasonBasic::VMPTRLD as isize,
+//     Vmptrst = VmxExitReasonBasic::VMPTRST as isize,
+//     Vmread = VmxExitReasonBasic::VMREAD as isize,
+//     Vmresume = VmxExitReasonBasic::VMRESUME as isize,
+//     Vmwrite = VmxExitReasonBasic::VMWRITE as isize,
+//     Vmoff = VmxExitReasonBasic::VMXOFF as isize,
+//     Vmon = VmxExitReasonBasic::VMXON as isize,
+//     TprBelowThreshold = VmxExitReasonBasic::TPR_BELOW_THRESHOLD as isize,
+//     ApicAccess = VmxExitReasonBasic::APIC_ACCESS as isize,
+//     ApicWrite = VmxExitReasonBasic::APIC_WRITE as isize,
+//     EoiInduced = VmxExitReasonBasic::VIRTUALIZED_EOI as isize, //name
+//     Wbinvd = VmxExitReasonBasic::WBINVD as isize,
+//     Xsetbv = VmxExitReasonBasic::XSETBV as isize,
+//     TaskSwitch = VmxExitReasonBasic::TASK_SWITCH as isize,
+//     MceDuringVmentry = VmxExitReasonBasic::VM_ENTRY_FAILURE_MACHINE_CHECK_EVENT as isize, //name
+//     GdtrIdtr = VmxExitReasonBasic::ACCESS_GDTR_OR_IDTR as isize,
+//     LdtrTr = VmxExitReasonBasic::ACCESS_LDTR_OR_TR as isize,
+//     EptViolation = VmxExitReasonBasic::EPT_VIOLATION as isize,
+//     EptMisconfig = VmxExitReasonBasic::EPT_MISCONFIG as isize,
+//     PauseInstruction = VmxExitReasonBasic::PAUSE as isize,
+//     MwaitInstruction = VmxExitReasonBasic::MWAIT as isize,
+//     MonitorTrapFlag = VmxExitReasonBasic::MONITOR_TRAP_FLAG as isize,
+//     MonitorInstruction = VmxExitReasonBasic::MONITOR as isize,
+//     Invept = VmxExitReasonBasic::INVEPT as isize,
+//     Invvpid = VmxExitReasonBasic::INVVPID as isize,
+//     Rdrand = VmxExitReasonBasic::RDRAND as isize,
+//     Rdseed = VmxExitReasonBasic::RDSEED as isize,
+//     PmlFull = VmxExitReasonBasic::PML_FULL as isize,
+//     Invpcid = VmxExitReasonBasic::INVPCID as isize,
+//     Vmfunc = VmxExitReasonBasic::VMFUNC as isize,
+//     PreemptionTimer = VmxExitReasonBasic::VMX_PREEMPTION_TIMER_EXPIRED as isize,
+//     Encls = VmxExitReasonBasic::ENCLS as isize,
+//     BusLock = VmxExitReasonBasic::BUS_LOCK as isize,
+//     Notify = VmxExitReasonBasic::NOTIFY as isize,
+//     Unknown,
+
+impl VmxExitHandlers {
+    #[inline(never)]
+    pub fn try_handle_exit(
+        vcpu: &mut VirtCpu,
+        vm: &Vm,
+        basic: VmxExitReasonBasic,
+    ) -> Option<Result<i32, SystemError>> {
+        // let exit_reason = vmx_vmread(VmcsFields::VMEXIT_EXIT_REASON as u32).unwrap() as u32;
+        // let exit_basic_reason = exit_reason & 0x0000_ffff;
+        // let guest_rip = vmx_vmread(VmcsFields::GUEST_RIP as u32).unwrap();
+        // let _guest_rflags = vmx_vmread(VmcsFields::GUEST_RFLAGS as u32).unwrap();
+        match basic {
+            VmxExitReasonBasic::IO_INSTRUCTION => {
+                return Some(Self::handle_io(vcpu));
+            }
+            VmxExitReasonBasic::EPT_VIOLATION => {
+                let r = Some(Self::handle_ept_violation(vcpu, vm));
+                debug();
+                r
+            }
+            VmxExitReasonBasic::EXTERNAL_INTERRUPT => {
+                return Some(Self::handle_external_interrupt(vcpu));
+            }
+            VmxExitReasonBasic::EXCEPTION_OR_NMI => {
+                todo!()
+            }
+            _ => None,
+        }
+    }
+
+    fn handle_io(_vcpu: &mut VirtCpu) -> Result<i32, SystemError> {
+        todo!();
+    }
+
+    fn handle_external_interrupt(vcpu: &mut VirtCpu) -> Result<i32, SystemError> {
+        vcpu.stat.irq_exits += 1;
+        Ok(1)
+    }
+
+    fn handle_ept_violation(vcpu: &mut VirtCpu, vm: &Vm) -> Result<i32, SystemError> {
+        let exit_qualification = vcpu.get_exit_qual(); //0x184
+                                                       // EPT 违规发生在从 NMI 执行 iret 时,
+                                                       // 在下一次 VM 进入之前必须设置 "blocked by NMI" 位。
+                                                       // 有一些错误可能会导致该位未被设置:
+                                                       // AAK134, BY25。
+        let vmx = vcpu.vmx();
+        if vmx.idt_vectoring_info.bits() & IntrInfo::INTR_INFO_VALID_MASK.bits() != 0
+            && vmx_info().enable_vnmi
+            && exit_qualification & IntrInfo::INTR_INFO_UNBLOCK_NMI.bits() as u64 != 0
+        {
+            VmxAsm::vmx_vmwrite(guest::INTERRUPTIBILITY_STATE, 0x8); //GUEST_INTR_STATE_NMI
+        }
+        let gpa = VmxAsm::vmx_vmread(ro::GUEST_PHYSICAL_ADDR_FULL);
+        //let exit_qualification = VmxAsm::vmx_vmread(ro::EXIT_QUALIFICATION);
+        // trace_kvm_page_fault(vcpu, gpa, exit_qualification);//
+
+        // 根据故障类型确定错误代码
+        let mut error_code = if exit_qualification & (EptViolationExitQual::ACC_READ.bits()) != 0 {
+            //debug!("error_code::ACC_READ");
+            PageFaultErr::PFERR_USER.bits()
+        } else {
+            0
+        };
+        error_code |= if exit_qualification & (EptViolationExitQual::ACC_WRITE.bits()) != 0 {
+            //debug!("error_code::ACC_WRITE");
+            PageFaultErr::PFERR_WRITE.bits()
+        } else {
+            0
+        };
+        error_code |= if exit_qualification & (EptViolationExitQual::ACC_INSTR.bits()) != 0 {
+            //active
+            //debug!("error_code::ACC_INSTR");
+            PageFaultErr::PFERR_FETCH.bits()
+        } else {
+            0
+        };
+        error_code |= if exit_qualification & (EptViolationExitQual::RWX_MASK.bits()) != 0 {
+            //debug!("error_code::RWX_MASK");
+            PageFaultErr::PFERR_PRESENT.bits()
+        } else {
+            0
+        };
+        if exit_qualification & (EptViolationExitQual::GVA_IS_VALID.bits()) != 0 {
+            //调试用
+            //debug!("GVA is valid");
+        } else {
+            //debug!("GVA is invalid");
+        }
+        error_code |= if exit_qualification & (EptViolationExitQual::GVA_TRANSLATED.bits()) != 0 {
+            //debug!("error_code:GVA GVA_TRANSLATED");
+            PageFaultErr::PFERR_GUEST_FINAL.bits() //active
+        } else {
+            PageFaultErr::PFERR_GUEST_PAGE.bits()
+        };
+        //fixme:: 此时error_code为0x100000011
+
+        vcpu.arch.exit_qual = exit_qualification;
+
+        // 检查 GPA 是否超出物理内存限制,因为这是一个客户机页面错误。
+        // 我们必须在这里模拟指令,因为如果非法地址是分页结构的地址,
+        // 则会设置 EPT_VIOLATION_ACC_WRITE 位。
+        // 或者,如果支持,我们还可以使用 EPT 违规的高级 VM 退出信息来重建页面错误代码。
+        // if allow_smaller_maxphyaddr && kvm_vcpu_is_illegal_gpa(vcpu, gpa) {
+        //     return kvm_emulate_instruction(vcpu, 0);
+        // }
+        //debug!("EPT violation: error_code={:#x}", error_code);
+        vcpu.page_fault(vm, gpa, error_code, None, 0)
+    }
+}
+fn debug() {
+    //     // 3
+    //     let info = VmxAsm::vmx_vmread(VmcsFields::VMEXIT_INSTR_LEN as u32);
+    //     debug!("vmexit handler: VMEXIT_INSTR_LEN: 0x{:x}!", info);
+
+    //     //0
+    //     let info = VmxAsm::vmx_vmread(VmcsFields::VMEXIT_INSTR_INFO as u32);
+    //     debug!("vmexit handler: VMEXIT_INSTR_INFO: 0x{:x}!", info);
+
+    //     //0x64042
+    //     /*0x64042:
+
+    //     将其转换为二进制:0x64042 的二进制表示是 110010000001000010。
+    //     每个位代表一个异常向量(例如,除以零,调试,不可屏蔽中断,断点等)。
+
+    // 从 vmx_update_exception_bitmap 函数中,我们看到设置的特定异常:
+
+    //     PF_VECTOR:页面错误
+    //     UD_VECTOR:未定义操作码
+    //     MC_VECTOR:机器检查
+    //     DB_VECTOR:调试
+    //     AC_VECTOR:对齐检查
+
+    // 值 0x64042 设置了与这些异常相对应的位,这意味着当这些异常在来宾中发生时将导致 VM 退出。 */
+    //     let info = VmxAsm::vmx_vmread(control::EXCEPTION_BITMAP);
+    //     debug!("vmexit handler: EXCEPTION_BITMAP: 0x{:x}!", info);
+
+    //     //9
+    //     let info = VmxAsm::vmx_vmread(control::PAGE_FAULT_ERR_CODE_MASK);
+    //     debug!("vmexit handler: PAGE_FAULT_ERR_CODE_MASK: 0x{:x}!", info);
+
+    //     //1
+    //     let info = VmxAsm::vmx_vmread(control::PAGE_FAULT_ERR_CODE_MATCH);
+    //     debug!("vmexit handler: PAGE_FAULT_ERR_CODE_MATCH: 0x{:x}!", info);
+
+    //     //0
+    //     let info = VmxAsm::vmx_vmread(control::EPTP_LIST_ADDR_FULL);
+    //     debug!("vmexit handler: EPTP_LIST_ADDR_FULL: 0x{:x}!", info);
+
+    //     let info = VmxAsm::vmx_vmread(ro::VM_INSTRUCTION_ERROR);
+    //     debug!("vmexit handler: VM_INSTRUCTION_ERROR: 0x{:x}!", info);
+
+    // let info = VmxAsm::vmx_vmread(ro::EXIT_REASON);
+    // debug!("vmexit handler: EXIT_REASON:0x{:x}!", info);//EPT VIOLATION
+
+    // let info = VmxAsm::vmx_vmread(ro::VMEXIT_INTERRUPTION_INFO);
+    // debug!("vmexit handler: VMEXIT_INTERRUPTION_INFO: 0x{:x}!", info);
+
+    // let info = VmxAsm::vmx_vmread(ro::VMEXIT_INTERRUPTION_ERR_CODE);
+    // debug!("vmexit handler: VMEXIT_INTERRUPTION_ERR_CODE: 0x{:x}!", info);
+
+    // let info = VmxAsm::vmx_vmread(ro::IDT_VECTORING_INFO);
+    // debug!("vmexit handler: IDT_VECTORING_INFO: 0x{:x}!", info);
+
+    // let info = VmxAsm::vmx_vmread(ro::IDT_VECTORING_ERR_CODE);
+    // debug!("vmexit handler: IDT_VECTORING_ERR_CODE: 0x{:x}!", info);
+
+    // let info = VmxAsm::vmx_vmread(ro::VMEXIT_INSTRUCTION_LEN);
+    // debug!("vmexit handler: VMEXIT_INSTRUCTION_LEN: 0x{:x}!", info);
+
+    // let info = VmxAsm::vmx_vmread(ro::VMEXIT_INSTRUCTION_INFO);
+    // debug!("vmexit handler: VMEXIT_INSTRUCTION_INFO: 0x{:x}!", info);
+
+    //panic
+    // let info = VmxAsm::vmx_vmread(control::EPTP_INDEX);
+    // debug!("vmexit handler: EPTP_INDEX: 0x{:x}!", info);
+
+    //panic
+    // let info = VmxAsm::vmx_vmread(control::VIRT_EXCEPTION_INFO_ADDR_FULL);
+    // debug!("vmexit handler: VIRT_EXCEPTION_INFO_ADDR_FULL: 0x{:x}!", info);
+}
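+
+// Sketch (illustrative only): decode a raw value read from the VMCS exit-reason
+// field. The low 16 bits select the basic reason; bit 31 flags a failed VM entry.
+// Assumes the `From<u32>` conversion generated by `bitfield_struct`.
+#[allow(dead_code)]
+fn example_decode_exit_reason(raw: u32) -> (VmxExitReasonBasic, bool) {
+    let reason = VmxExitReason::from(raw);
+    // e.g. raw == 48 decodes to VmxExitReasonBasic::EPT_VIOLATION with no entry failure.
+    (
+        VmxExitReasonBasic::from(reason.basic()),
+        reason.failed_vmentry(),
+    )
+}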

+ 3775 - 0
kernel/src/arch/x86_64/vm/vmx/mod.rs

@@ -0,0 +1,3775 @@
+use core::intrinsics::likely;
+use core::intrinsics::unlikely;
+use core::sync::atomic::{AtomicBool, Ordering};
+use exit::VmxExitHandlers;
+use log::debug;
+use log::error;
+use log::warn;
+use x86_64::registers::control::Cr3Flags;
+use x86_64::structures::paging::PhysFrame;
+
+use crate::arch::process::table::USER_DS;
+use crate::arch::vm::mmu::kvm_mmu::KvmMmu;
+use crate::arch::vm::uapi::kvm_exit;
+use crate::arch::vm::uapi::{
+    AC_VECTOR, BP_VECTOR, DB_VECTOR, GP_VECTOR, MC_VECTOR, NM_VECTOR, PF_VECTOR, UD_VECTOR,
+};
+use crate::arch::vm::vmx::vmcs::VmcsIntrHelper;
+use crate::libs::spinlock::SpinLockGuard;
+use crate::mm::VirtAddr;
+use crate::process::ProcessManager;
+use crate::virt::vm::kvm_host::vcpu::GuestDebug;
+use crate::{
+    arch::{
+        vm::{
+            asm::KvmX86Asm,
+            kvm_host::{vcpu::VirtCpuRequest, X86KvmArch},
+            vmx::vmcs::vmx_area,
+        },
+        CurrentIrqArch, MMArch, VirtCpuArch,
+    },
+    exception::InterruptArch,
+    libs::spinlock::SpinLock,
+    mm::{
+        percpu::{PerCpu, PerCpuVar},
+        MemoryManagementArch,
+    },
+    smp::{core::smp_get_processor_id, cpu::ProcessorId},
+    virt::vm::{kvm_dev::kvm_init, kvm_host::vcpu::VirtCpu, user_api::UapiKvmSegment},
+};
+use alloc::{alloc::Global, boxed::Box, collections::LinkedList, sync::Arc, vec::Vec};
+use asm::VMX_EPTP_AD_ENABLE_BIT;
+use asm::VMX_EPTP_MT_WB;
+use asm::VMX_EPTP_PWL_4;
+use asm::VMX_EPTP_PWL_5;
+use bitfield_struct::bitfield;
+use bitmap::{traits::BitMapOps, AllocBitmap};
+use raw_cpuid::CpuId;
+use system_error::SystemError;
+use x86::controlregs::{cr2, cr2_write};
+use x86::dtables::ldtr;
+use x86::msr::wrmsr;
+use x86::segmentation::load_ds;
+use x86::segmentation::load_es;
+use x86::segmentation::{ds, es, fs, gs};
+use x86::vmx::vmcs::ro;
+use x86::{
+    bits64::rflags::RFlags,
+    controlregs::{cr0, cr4, Cr0, Cr4, Xcr0},
+    msr::{self, rdmsr},
+    segmentation::{self},
+    vmx::vmcs::{
+        control::{
+            self, EntryControls, ExitControls, PinbasedControls, PrimaryControls, SecondaryControls,
+        },
+        guest, host,
+    },
+};
+use x86_64::registers::control::Cr3;
+use x86_64::{instructions::tables::sidt, registers::control::EferFlags};
+
+use crate::{
+    arch::{
+        vm::{vmx::vmcs::feat::VmxFeat, x86_kvm_manager_mut, McgCap},
+        KvmArch,
+    },
+    libs::rwlock::RwLock,
+    virt::vm::kvm_host::Vm,
+};
+
+use self::exit::ExitFastpathCompletion;
+use self::exit::VmxExitReason;
+use self::exit::VmxExitReasonBasic;
+use self::vmcs::LoadedVmcs;
+use self::{
+    capabilities::{ProcessorTraceMode, VmcsConfig, VmxCapability},
+    vmcs::{
+        current_loaded_vmcs_list_mut, current_vmcs, current_vmcs_mut, ControlsType,
+        LockedLoadedVmcs, VMControlStructure, VmxMsrBitmapAccess, VmxMsrBitmapAction,
+        PERCPU_LOADED_VMCS_LIST, PERCPU_VMCS, VMXAREA,
+    },
+};
+
+use super::asm::IntrInfo;
+use super::asm::SegmentCacheField;
+use super::kvm_host::vcpu::KvmIntrType;
+use super::kvm_host::RMODE_TSS_SIZE;
+use super::x86_kvm_ops;
+use super::{
+    asm::{VcpuSegment, VmxAsm, VmxMsrEntry},
+    init_kvm_arch,
+    kvm_host::{KvmFunc, KvmInitFunc, KvmIrqChipMode, KvmReg, MsrFilterType, NotifyVmExitFlags},
+    x86_kvm_manager, KvmArchManager,
+};
+
+pub mod asm;
+pub mod capabilities;
+pub mod ept;
+pub mod exit;
+pub mod vmcs;
+
+extern "C" {
+    fn vmx_vmexit();
+}
+
+pub struct VmxKvmInitFunc;
+
+impl VmxKvmInitFunc {
+    pub fn setup_per_cpu(&self) {
+        let mut vmcs_areas = Vec::new();
+        vmcs_areas.resize(PerCpu::MAX_CPU_NUM as usize, VMControlStructure::new());
+        unsafe { VMXAREA = PerCpuVar::new(vmcs_areas) };
+
+        let mut percpu_current_vmcs = Vec::new();
+        percpu_current_vmcs.resize(PerCpu::MAX_CPU_NUM as usize, None);
+        unsafe { PERCPU_VMCS = PerCpuVar::new(percpu_current_vmcs) }
+
+        let mut percpu_loaded_vmcs_lists = Vec::new();
+        percpu_loaded_vmcs_lists.resize(PerCpu::MAX_CPU_NUM as usize, LinkedList::new());
+        unsafe { PERCPU_LOADED_VMCS_LIST = PerCpuVar::new(percpu_loaded_vmcs_lists) }
+    }
+}
+
+impl KvmInitFunc for VmxKvmInitFunc {
+    #[allow(clippy::borrow_interior_mutable_const)]
+    #[inline(never)]
+    fn hardware_setup(&self) -> Result<(), SystemError> {
+        let idt = sidt();
+        let cpuid = CpuId::new();
+        let cpu_extend_feature = cpuid
+            .get_extended_processor_and_feature_identifiers()
+            .ok_or(SystemError::ENOSYS)?;
+
+        let mut vmx_init: Box<Vmx> = unsafe {
+            Box::try_new_zeroed_in(Global)
+                .map_err(|_| SystemError::ENOMEM)?
+                .assume_init()
+        };
+
+        vmx_init.init();
+
+        vmx_init.host_idt_base = idt.base.as_u64();
+        Vmx::set_up_user_return_msrs();
+
+        Vmx::setup_vmcs_config(&mut vmx_init.vmcs_config, &mut vmx_init.vmx_cap)?;
+
+        let manager = x86_kvm_manager_mut();
+        let kvm_cap = &mut manager.kvm_caps;
+
+        if vmx_init.has_mpx() {
+            kvm_cap.supported_xcr0 &= !(Xcr0::XCR0_BNDREG_STATE | Xcr0::XCR0_BNDCSR_STATE);
+        }
+
+        // 判断是否启用vpid
+        if !vmx_init.has_vpid()
+            || !vmx_init.has_invvpid()
+            || !vmx_init.has_invvpid_single()
+            || !vmx_init.has_invvpid_global()
+        {
+            vmx_init.enable_vpid = false;
+        }
+
+        if !vmx_init.has_ept()
+            || !vmx_init.has_ept_4levels()
+            || !vmx_init.has_ept_mt_wb()
+            || !vmx_init.has_invept_global()
+        {
+            vmx_init.enable_ept = false;
+        }
+
+        // If EPT is disabled, the CPU must support Execute Disable (NX).
+        // NX is a CPU feature that prevents code from being executed out of data pages.
+        if !vmx_init.enable_ept && !cpu_extend_feature.has_execute_disable() {
+            error!("[KVM] NX (Execute Disable) not supported");
+            return Err(SystemError::ENOSYS);
+        }
+
+        if !vmx_init.has_ept_ad_bits() || !vmx_init.enable_ept {
+            vmx_init.enable_ept_ad = false;
+        }
+
+        if !vmx_init.has_unrestricted_guest() || !vmx_init.enable_ept {
+            vmx_init.enable_unrestricted_guest = false;
+        }
+
+        if !vmx_init.has_flexproirity() {
+            vmx_init.enable_flexpriority = false;
+        }
+
+        if !vmx_init.has_virtual_nmis() {
+            vmx_init.enable_vnmi = false;
+        }
+
+        if !vmx_init.has_encls_vmexit() {
+            vmx_init.enable_sgx = false;
+        }
+
+        if !vmx_init.enable_flexpriority {
+            VmxKvmFunc::CONFIG.write().have_set_apic_access_page_addr = false;
+        }
+
+        if !vmx_init.has_tpr_shadow() {
+            VmxKvmFunc::CONFIG.write().have_update_cr8_intercept = false;
+        }
+
+        // TODO:https://code.dragonos.org.cn/xref/linux-6.6.21/arch/x86/kvm/vmx/vmx.c#8501 - 8513
+
+        if !vmx_init.has_ple() {
+            vmx_init.ple_gap = 0;
+            vmx_init.ple_window = 0;
+            vmx_init.ple_window_grow = 0;
+            vmx_init.ple_window_max = 0;
+            vmx_init.ple_window_shrink = 0;
+        }
+
+        if !vmx_init.has_apicv() {
+            vmx_init.enable_apicv = false;
+        }
+
+        if !vmx_init.enable_apicv {
+            // TODO: set up sync_pir_to_irr
+        }
+
+        if !vmx_init.enable_apicv || !vmx_init.has_ipiv() {
+            vmx_init.enable_ipiv = false;
+        }
+
+        if vmx_init.has_tsc_scaling() {
+            kvm_cap.has_tsc_control = true;
+        }
+
+        kvm_cap.max_tsc_scaling_ratio = 0xffffffffffffffff;
+        kvm_cap.tsc_scaling_ratio_frac_bits = 48;
+        kvm_cap.has_bus_lock_exit = vmx_init.has_bus_lock_detection();
+        kvm_cap.has_notify_vmexit = vmx_init.has_notify_vmexit();
+
+        // vmx_init.vpid_bitmap.lock().set_all(false);
+
+        if vmx_init.enable_ept {
+            // TODO: mmu_set_ept_masks
+            warn!("mmu_set_ept_masks TODO!");
+        }
+
+        warn!("vmx_setup_me_spte_mask TODO!");
+
+        KvmMmu::kvm_configure_mmu(
+            vmx_init.enable_ept,
+            0,
+            vmx_init.get_max_ept_level(),
+            vmx_init.ept_cap_to_lpage_level(),
+        );
+
+        if !vmx_init.enable_ept || !vmx_init.enable_ept_ad || !vmx_init.has_pml() {
+            vmx_init.enable_pml = false;
+        }
+
+        if !vmx_init.enable_pml {
+            // TODO: Set cpu dirty log size
+        }
+
+        if !vmx_init.has_preemption_timer() {
+            vmx_init.enable_preemption_timer = false;
+        }
+
+        if vmx_init.enable_preemption_timer {
+            // TODO
+        }
+
+        if !vmx_init.enable_preemption_timer {
+            // TODO
+        }
+
+        kvm_cap
+            .supported_mce_cap
+            .insert(McgCap::MCG_LMCE_P | McgCap::MCG_CMCI_P);
+
+        // TODO: pt_mode
+
+        // TODO: setup_default_sgx_lepubkeyhash
+
+        // TODO: nested
+
+        // TODO: vmx_set_cpu_caps
+        init_vmx(vmx_init);
+        self.setup_per_cpu();
+
+        warn!("hardware setup finish");
+        Ok(())
+    }
+
+    fn handle_intel_pt_intr(&self) -> u32 {
+        todo!()
+    }
+
+    fn runtime_funcs(&self) -> &'static dyn super::kvm_host::KvmFunc {
+        &VmxKvmFunc
+    }
+}
+
+#[derive(Debug)]
+pub struct VmxKvmFunc;
+
+pub struct VmxKvmFuncConfig {
+    pub have_set_apic_access_page_addr: bool,
+    pub have_update_cr8_intercept: bool,
+}
+
+impl VmxKvmFunc {
+    #[allow(clippy::declare_interior_mutable_const)]
+    pub const CONFIG: RwLock<VmxKvmFuncConfig> = RwLock::new(VmxKvmFuncConfig {
+        have_set_apic_access_page_addr: true,
+        have_update_cr8_intercept: true,
+    });
+
+    pub fn vcpu_load_vmcs(
+        vcpu: &mut VirtCpu,
+        cpu: ProcessorId,
+        _buddy: Option<Arc<LockedLoadedVmcs>>,
+    ) {
+        let vmx = vcpu.vmx();
+        let already_loaded = vmx.loaded_vmcs.lock().cpu == cpu;
+
+        if !already_loaded {
+            Self::loaded_vmcs_clear(&vmx.loaded_vmcs);
+            let _irq_guard = unsafe { CurrentIrqArch::save_and_disable_irq() };
+
+            current_loaded_vmcs_list_mut().push_back(vmx.loaded_vmcs.clone());
+        }
+
+        if let Some(prev) = current_vmcs() {
+            let vmcs = vmx.loaded_vmcs.lock().vmcs.clone();
+            if !Arc::ptr_eq(&vmcs, prev) {
+                VmxAsm::vmcs_load(vmcs.phys_addr());
+                *current_vmcs_mut() = Some(vmcs);
+
+                // TODO:buddy barrier?
+            }
+        } else {
+            let vmcs = vmx.loaded_vmcs.lock().vmcs.clone();
+            VmxAsm::vmcs_load(vmcs.phys_addr());
+            *current_vmcs_mut() = Some(vmcs);
+
+            // TODO:buddy barrier?
+        }
+
+        if !already_loaded {
+            let mut pseudo_descriptor: x86::dtables::DescriptorTablePointer<u64> =
+                Default::default();
+            unsafe {
+                x86::dtables::sgdt(&mut pseudo_descriptor);
+            };
+
+            vmx.loaded_vmcs.lock().cpu = cpu;
+            let id = vmx.loaded_vmcs.lock().vmcs.lock().revision_id();
+            debug!(
+                "revision_id {id} req {:?}",
+                VirtCpuRequest::KVM_REQ_TLB_FLUSH
+            );
+            vcpu.request(VirtCpuRequest::KVM_REQ_TLB_FLUSH);
+
+            VmxAsm::vmx_vmwrite(
+                host::TR_BASE,
+                KvmX86Asm::get_segment_base(
+                    pseudo_descriptor.base,
+                    pseudo_descriptor.limit,
+                    unsafe { x86::task::tr().bits() },
+                ),
+            );
+
+            VmxAsm::vmx_vmwrite(host::GDTR_BASE, pseudo_descriptor.base as usize as u64);
+
+            VmxAsm::vmx_vmwrite(host::IA32_SYSENTER_ESP, unsafe {
+                rdmsr(msr::IA32_SYSENTER_ESP)
+            });
+        }
+    }
+
+    pub fn loaded_vmcs_clear(loaded_vmcs: &Arc<LockedLoadedVmcs>) {
+        let mut guard = loaded_vmcs.lock();
+        if guard.cpu == ProcessorId::INVALID {
+            return;
+        }
+
+        if guard.cpu == smp_get_processor_id() {
+            if let Some(vmcs) = current_vmcs() {
+                if Arc::ptr_eq(vmcs, &guard.vmcs) {
+                    *current_vmcs_mut() = None;
+                }
+            }
+
+            VmxAsm::vmclear(guard.vmcs.phys_addr());
+
+            if let Some(shadow) = &guard.shadow_vmcs {
+                if guard.launched {
+                    VmxAsm::vmclear(shadow.phys_addr());
+                }
+            }
+
+            let _ = current_loaded_vmcs_list_mut().extract_if(|x| Arc::ptr_eq(x, loaded_vmcs));
+
+            guard.cpu = ProcessorId::INVALID;
+            guard.launched = false;
+        } else {
+            // The VMCS is active on another CPU; the clear must be handed off to that CPU (not yet implemented)
+            todo!()
+        }
+    }
+
+    pub fn seg_setup(&self, seg: VcpuSegment) {
+        let seg_field = &KVM_VMX_SEGMENT_FIELDS[seg as usize];
+
+        VmxAsm::vmx_vmwrite(seg_field.selector, 0);
+        VmxAsm::vmx_vmwrite(seg_field.base, 0);
+        VmxAsm::vmx_vmwrite(seg_field.limit, 0xffff);
+
+        // Access rights 0x93: present, DPL 0, writable data segment (accessed).
+        // CS additionally gets the code/execute type bit, yielding 0x9b.
+        let mut ar = 0x93;
+        if seg == VcpuSegment::CS {
+            ar |= 0x08;
+        }
+        VmxAsm::vmx_vmwrite(seg_field.ar_bytes, ar);
+    }
+}
+
+impl KvmFunc for VmxKvmFunc {
+    fn name(&self) -> &'static str {
+        "VMX"
+    }
+
+    fn hardware_enable(&self) -> Result<(), SystemError> {
+        let vmcs = vmx_area().get().as_ref();
+
+        debug!("vmcs idx {}", vmcs.abort);
+
+        let phys_addr =
+            unsafe { MMArch::virt_2_phys(VirtAddr::new(vmcs as *const _ as usize)).unwrap() };
+
+        // TODO: intel_pt_handle_vmx(1);
+
+        VmxAsm::kvm_cpu_vmxon(phys_addr)?;
+
+        Ok(())
+    }
+
+    fn vm_init(&self) -> X86KvmArch {
+        let vmx_init = vmx_info();
+
+        let mut arch = X86KvmArch::default();
+        if vmx_init.ple_gap == 0 {
+            arch.pause_in_guest = true;
+        }
+
+        return arch;
+    }
+
+    fn vcpu_create(&self, vcpu: &mut VirtCpu, vm: &Vm) {
+        VmxVCpuPriv::init(vcpu, vm);
+    }
+
+    fn vcpu_load(&self, vcpu: &mut VirtCpu, cpu: crate::smp::cpu::ProcessorId) {
+        Self::vcpu_load_vmcs(vcpu, cpu, None);
+        // TODO: vmx_vcpu_pi_load
+    }
+
+    fn cache_reg(&self, vcpu: &mut VirtCpuArch, reg: KvmReg) {
+        vcpu.mark_register_available(reg);
+
+        match reg {
+            KvmReg::VcpuRegsRsp => {
+                vcpu.regs[reg as usize] = VmxAsm::vmx_vmread(guest::RSP);
+            }
+            KvmReg::VcpuRegsRip => {
+                vcpu.regs[reg as usize] = VmxAsm::vmx_vmread(guest::RIP);
+            }
+            // VCPU_EXREG_PDPTR
+            KvmReg::NrVcpuRegs => {
+                if vmx_info().enable_ept {
+                    todo!()
+                }
+            }
+            KvmReg::VcpuExregCr0 => {
+                let guest_owned = vcpu.cr0_guest_owned_bits;
+
+                vcpu.cr0.remove(guest_owned);
+                vcpu.cr0.insert(
+                    Cr0::from_bits_truncate(VmxAsm::vmx_vmread(guest::CR0) as usize) & guest_owned,
+                );
+            }
+            KvmReg::VcpuExregCr3 => {
+                // When CR3 loads are intercepted (e.g. for shadow paging), the CR3 loaded into
+                // hardware is KVM's rather than the guest's. For now we simply read the field back.
+                vcpu.cr3 = VmxAsm::vmx_vmread(guest::CR3);
+                //todo!()
+            }
+            KvmReg::VcpuExregCr4 => {
+                let guest_owned = vcpu.cr4_guest_owned_bits;
+
+                vcpu.cr4.remove(guest_owned);
+                vcpu.cr4.insert(
+                    Cr4::from_bits_truncate(VmxAsm::vmx_vmread(guest::CR4) as usize) & guest_owned,
+                );
+            }
+            _ => {
+                todo!()
+            }
+        }
+    }
+
+    fn apicv_pre_state_restore(&self, _vcpu: &mut VirtCpu) {
+        // https://code.dragonos.org.cn/xref/linux-6.6.21/arch/x86/kvm/vmx/vmx.c#6924
+        // TODO: pi
+        // todo!()
+    }
+
+    fn set_msr(&self, vcpu: &mut VirtCpu, msr: super::asm::MsrData) -> Result<(), SystemError> {
+        let vmx = vcpu.vmx_mut();
+        let msr_index = msr.index;
+        let data = msr.data;
+
+        match msr_index {
+            msr::IA32_EFER => {
+                todo!("IA32_EFER")
+            }
+
+            msr::IA32_FS_BASE => {
+                todo!("IA32_FS_BASE")
+            }
+
+            msr::IA32_GS_BASE => {
+                todo!("IA32_GS_BASE")
+            }
+
+            msr::IA32_KERNEL_GSBASE => {
+                todo!("IA32_KERNEL_GSBASE")
+            }
+
+            0x000001c4 => {
+                todo!("MSR_IA32_XFD")
+            }
+
+            msr::IA32_SYSENTER_CS => {
+                todo!("IA32_SYSENTER_CS")
+            }
+
+            msr::IA32_SYSENTER_EIP => {
+                todo!("IA32_SYSENTER_EIP")
+            }
+
+            msr::IA32_SYSENTER_ESP => {
+                todo!("IA32_SYSENTER_ESP")
+            }
+
+            msr::IA32_DEBUGCTL => {
+                todo!("IA32_DEBUGCTL")
+            }
+
+            msr::MSR_C1_PMON_EVNT_SEL0 => {
+                todo!("MSR_IA32_BNDCFGS")
+            }
+
+            0xe1 => {
+                todo!("MSR_IA32_UMWAIT_CONTROL	")
+            }
+
+            0x48 => {
+                todo!("MSR_IA32_SPEC_CTRL")
+            }
+
+            msr::MSR_IA32_TSX_CTRL => {
+                todo!("MSR_IA32_TSX_CTRL")
+            }
+
+            msr::IA32_PAT => {
+                todo!("IA32_PAT")
+            }
+
+            0x4d0 => {
+                todo!("MSR_IA32_MCG_EXT_CTL")
+            }
+
+            msr::IA32_FEATURE_CONTROL => {
+                todo!("IA32_FEATURE_CONTROL")
+            }
+
+            0x8c..=0x8f => {
+                todo!("MSR_IA32_SGXLEPUBKEYHASH0 ... MSR_IA32_SGXLEPUBKEYHASH3 {msr_index}")
+            }
+
+            msr::IA32_VMX_BASIC..=msr::IA32_VMX_VMFUNC => {
+                todo!("msr::IA32_VMX_BASIC..=msr::IA32_VMX_VMFUNC")
+            }
+
+            msr::MSR_IA32_RTIT_CTL => {
+                todo!("MSR_IA32_RTIT_CTL")
+            }
+
+            msr::MSR_IA32_RTIT_STATUS => {
+                todo!("MSR_IA32_RTIT_STATUS")
+            }
+
+            msr::MSR_IA32_RTIT_OUTPUT_BASE => {
+                todo!("MSR_IA32_RTIT_OUTPUT_BASE")
+            }
+
+            0x572 => {
+                todo!("MSR_IA32_RTIT_CR3_MATCH")
+            }
+
+            msr::MSR_IA32_RTIT_OUTPUT_MASK_PTRS => {
+                todo!("MSR_IA32_RTIT_OUTPUT_MASK_PTRS")
+            }
+
+            msr::MSR_IA32_ADDR0_START..=msr::MSR_IA32_ADDR3_END => {
+                todo!("msr::MSR_IA32_ADDR0_START..=msr::MSR_IA32_ADDR3_END")
+            }
+
+            msr::MSR_PERF_CAPABILITIES => {
+                todo!("MSR_PERF_CAPABILITIES")
+            }
+
+            _ => {
+                let uret_msr = vmx.find_uret_msr(msr_index);
+
+                if let Some((idx, _msr)) = uret_msr {
+                    vmx.set_guest_uret_msr(idx, data)?;
+                    vmx.set_uret_msr(msr_index, data);
+                } else {
+                    vcpu.arch.set_msr_common(&msr);
+                };
+            }
+        }
+
+        if msr_index == 0x10a {
+            // MSR_IA32_ARCH_CAPABILITIES
+            todo!()
+        }
+
+        Ok(())
+    }
+
+    fn vcpu_reset(&self, vcpu: &mut VirtCpu, vm: &Vm, init_event: bool) {
+        if !init_event {
+            vmx_info_mut().vmx_reset_vcpu(vcpu, vm)
+        }
+        vcpu.kvm_set_cr8(0);
+
+        let vmx = vcpu.vmx_mut();
+        vmx.rmode.vm86_active = false;
+        vmx.spec_ctrl = 0;
+        vmx.msr_ia32_umwait_control = 0;
+        vmx.hv_deadline_tsc = u64::MAX;
+
+        vmx.segment_cache_clear();
+
+        vcpu.arch.mark_register_available(KvmReg::VcpuExregSegments);
+
+        self.seg_setup(VcpuSegment::CS);
+        VmxAsm::vmx_vmwrite(guest::CS_SELECTOR, 0xf000);
+        VmxAsm::vmx_vmwrite(guest::CS_BASE, 0xffff0000);
+
+        self.seg_setup(VcpuSegment::DS);
+        self.seg_setup(VcpuSegment::ES);
+        self.seg_setup(VcpuSegment::FS);
+        self.seg_setup(VcpuSegment::GS);
+        self.seg_setup(VcpuSegment::SS);
+
+        VmxAsm::vmx_vmwrite(guest::TR_SELECTOR, 0);
+        VmxAsm::vmx_vmwrite(guest::TR_BASE, 0);
+        VmxAsm::vmx_vmwrite(guest::TR_LIMIT, 0xffff);
+        VmxAsm::vmx_vmwrite(guest::TR_ACCESS_RIGHTS, 0x008b);
+
+        VmxAsm::vmx_vmwrite(guest::LDTR_SELECTOR, 0);
+        VmxAsm::vmx_vmwrite(guest::LDTR_BASE, 0);
+        VmxAsm::vmx_vmwrite(guest::LDTR_LIMIT, 0xffff);
+        VmxAsm::vmx_vmwrite(guest::LDTR_ACCESS_RIGHTS, 0x00082);
+
+        VmxAsm::vmx_vmwrite(guest::GDTR_BASE, 0);
+        VmxAsm::vmx_vmwrite(guest::GDTR_LIMIT, 0xffff);
+
+        VmxAsm::vmx_vmwrite(guest::IDTR_BASE, 0);
+        VmxAsm::vmx_vmwrite(guest::IDTR_LIMIT, 0xffff);
+
+        VmxAsm::vmx_vmwrite(guest::ACTIVITY_STATE, 0);
+        VmxAsm::vmx_vmwrite(guest::INTERRUPTIBILITY_STATE, 0);
+        VmxAsm::vmx_vmwrite(guest::PENDING_DBG_EXCEPTIONS, 0);
+
+        if x86_kvm_manager().mpx_supported() {
+            VmxAsm::vmx_vmwrite(guest::IA32_BNDCFGS_FULL, 0);
+        }
+
+        VmxAsm::vmx_vmwrite(control::VMENTRY_INTERRUPTION_INFO_FIELD, 0);
+
+        vcpu.request(VirtCpuRequest::MAKE_KVM_REQ_APIC_PAGE_RELOAD);
+
+        vmx_info().vpid_sync_context(vcpu.vmx().vpid);
+
+        warn!("TODO: vmx_update_fb_clear_dis");
+    }
+
+    fn set_rflags(&self, vcpu: &mut VirtCpu, mut rflags: x86::bits64::rflags::RFlags) {
+        if vcpu.is_unrestricted_guest() {
+            vcpu.arch.mark_register_available(KvmReg::VcpuExregRflags);
+            vcpu.vmx_mut().rflags = rflags;
+            VmxAsm::vmx_vmwrite(guest::RFLAGS, rflags.bits());
+            return;
+        }
+
+        let old_rflags = self.get_rflags(vcpu);
+
+        let vmx = vcpu.vmx_mut();
+
+        vmx.rflags = rflags;
+        if vmx.rmode.vm86_active {
+            vmx.rmode.save_rflags = rflags;
+            rflags.insert(RFlags::FLAGS_IOPL3 | RFlags::FLAGS_VM);
+        }
+
+        VmxAsm::vmx_vmwrite(guest::RFLAGS, rflags.bits());
+
+        if (old_rflags ^ vmx.rflags).contains(RFlags::FLAGS_VM) {
+            let emulation_required = vmx_info().emulation_required(vcpu);
+            vcpu.vmx_mut().emulation_required = emulation_required;
+        }
+    }
+
+    fn set_cr0(&self, vm: &Vm, vcpu: &mut VirtCpu, cr0: x86::controlregs::Cr0) {
+        let old_cr0_pg = vcpu.arch.read_cr0_bits(Cr0::CR0_ENABLE_PAGING);
+        let mut hw_cr0 = cr0 & (!(Cr0::CR0_NOT_WRITE_THROUGH | Cr0::CR0_CACHE_DISABLE));
+
+        if vmx_info().enable_unrestricted_guest {
+            hw_cr0.insert(Cr0::CR0_NUMERIC_ERROR);
+        } else {
+            hw_cr0
+                .insert(Cr0::CR0_NUMERIC_ERROR | Cr0::CR0_ENABLE_PAGING | Cr0::CR0_PROTECTED_MODE);
+
+            if !vmx_info().enable_ept {
+                hw_cr0.insert(Cr0::CR0_WRITE_PROTECT);
+            }
+
+            if vcpu.vmx().rmode.vm86_active && cr0.contains(Cr0::CR0_PROTECTED_MODE) {
+                vmx_info().enter_pmode(vcpu);
+            }
+
+            if !vcpu.vmx().rmode.vm86_active && !cr0.contains(Cr0::CR0_PROTECTED_MODE) {
+                vmx_info().enter_rmode(vcpu, vm);
+            }
+        }
+
+        VmxAsm::vmx_vmwrite(control::CR0_READ_SHADOW, cr0.bits() as u64);
+        VmxAsm::vmx_vmwrite(guest::CR0, hw_cr0.bits() as u64);
+
+        vcpu.arch.cr0 = cr0;
+
+        vcpu.arch.mark_register_available(KvmReg::VcpuExregCr0);
+
+        if vcpu.arch.efer.contains(EferFlags::LONG_MODE_ENABLE) {
+            if old_cr0_pg.is_empty() && cr0.contains(Cr0::CR0_ENABLE_PAGING) {
+                todo!("enter lmode todo");
+            } else if !old_cr0_pg.is_empty() && !cr0.contains(Cr0::CR0_ENABLE_PAGING) {
+                todo!("exit lmode todo");
+            }
+        }
+
+        if vmx_info().enable_ept && !vmx_info().enable_unrestricted_guest {
+            todo!()
+        }
+
+        vcpu.vmx_mut().emulation_required = vmx_info().emulation_required(vcpu);
+    }
+
+    fn set_cr4(&self, vcpu: &mut VirtCpu, cr4_flags: x86::controlregs::Cr4) {
+        let old_cr4 = vcpu.arch.read_cr4_bits(Cr4::all());
+
+        let mut hw_cr4 = (unsafe { cr4() } & Cr4::CR4_ENABLE_MACHINE_CHECK)
+            | (cr4_flags & (!Cr4::CR4_ENABLE_MACHINE_CHECK));
+
+        if vmx_info().enable_unrestricted_guest {
+            hw_cr4.insert(Cr4::CR4_ENABLE_VMX);
+        } else if vcpu.vmx().rmode.vm86_active {
+            hw_cr4.insert(Cr4::CR4_ENABLE_PAE | Cr4::CR4_ENABLE_VMX | Cr4::CR4_ENABLE_VME);
+        } else {
+            hw_cr4.insert(Cr4::CR4_ENABLE_PAE | Cr4::CR4_ENABLE_VMX);
+        }
+
+        if vmx_info().vmx_umip_emulated() {
+            if cr4_flags.contains(Cr4::CR4_ENABLE_UMIP) {
+                vcpu.vmx().loaded_vmcs().controls_set(
+                    ControlsType::SecondaryExec,
+                    SecondaryControls::DTABLE_EXITING.bits() as u64,
+                );
+                hw_cr4.remove(Cr4::CR4_ENABLE_UMIP);
+            } else if !vcpu.arch.is_guest_mode() {
+                vcpu.vmx().loaded_vmcs().controls_clearbit(
+                    ControlsType::SecondaryExec,
+                    SecondaryControls::DTABLE_EXITING.bits() as u64,
+                );
+            }
+        }
+
+        vcpu.arch.cr4 = cr4_flags;
+        vcpu.arch.mark_register_available(KvmReg::VcpuExregCr4);
+
+        if !vmx_info().enable_unrestricted_guest {
+            if vmx_info().enable_ept {
+                if vcpu.arch.read_cr0_bits(Cr0::CR0_ENABLE_PAGING).is_empty() {
+                    hw_cr4.remove(Cr4::CR4_ENABLE_PAE);
+                    hw_cr4.insert(Cr4::CR4_ENABLE_PSE);
+                } else if !cr4_flags.contains(Cr4::CR4_ENABLE_PAE) {
+                    hw_cr4.remove(Cr4::CR4_ENABLE_PAE);
+                }
+            }
+
+            if vcpu.arch.read_cr0_bits(Cr0::CR0_ENABLE_PAGING).is_empty() {
+                hw_cr4.remove(
+                    Cr4::CR4_ENABLE_SMEP | Cr4::CR4_ENABLE_SMAP | Cr4::CR4_ENABLE_PROTECTION_KEY,
+                );
+            }
+        }
+
+        VmxAsm::vmx_vmwrite(control::CR4_READ_SHADOW, cr4_flags.bits() as u64);
+        VmxAsm::vmx_vmwrite(guest::CR4, hw_cr4.bits() as u64);
+
+        if (cr4_flags ^ old_cr4).contains(Cr4::CR4_ENABLE_OS_XSAVE | Cr4::CR4_ENABLE_PROTECTION_KEY)
+        {
+            // TODO: update_cpuid_runtime
+        }
+    }
+
+    fn set_efer(&self, vcpu: &mut VirtCpu, efer: x86_64::registers::control::EferFlags) {
+        if vcpu.vmx().find_uret_msr(msr::IA32_EFER).is_none() {
+            return;
+        }
+
+        vcpu.arch.efer = efer;
+        if efer.contains(EferFlags::LONG_MODE_ACTIVE) {
+            vcpu.vmx().loaded_vmcs().controls_setbit(
+                ControlsType::VmEntry,
+                EntryControls::IA32E_MODE_GUEST.bits().into(),
+            );
+        } else {
+            vcpu.vmx().loaded_vmcs().controls_clearbit(
+                ControlsType::VmEntry,
+                EntryControls::IA32E_MODE_GUEST.bits().into(),
+            );
+        }
+
+        vmx_info().setup_uret_msrs(vcpu);
+    }
+
+    fn update_exception_bitmap(&self, vcpu: &mut VirtCpu) {
+        let mut eb = (1u32 << PF_VECTOR)
+            | (1 << UD_VECTOR)
+            | (1 << MC_VECTOR)
+            | (1 << DB_VECTOR)
+            | (1 << AC_VECTOR);
+
+        if vmx_info().enable_vmware_backdoor {
+            eb |= 1 << GP_VECTOR;
+        }
+
+        if vcpu.guest_debug & (GuestDebug::ENABLE | GuestDebug::USE_SW_BP)
+            == (GuestDebug::ENABLE | GuestDebug::USE_SW_BP)
+        {
+            eb |= 1 << BP_VECTOR;
+        }
+
+        if vcpu.vmx().rmode.vm86_active {
+            eb = !0;
+        }
+
+        if !vmx_info().vmx_need_pf_intercept(vcpu) {
+            eb &= !(1 << PF_VECTOR);
+        }
+
+        if vcpu.arch.is_guest_mode() {
+            todo!()
+        } else {
+            let mut mask = PageFaultErr::empty();
+            let mut match_code = PageFaultErr::empty();
+            if vmx_info().enable_ept && (eb & (1 << PF_VECTOR) != 0) {
+                mask = PageFaultErr::PFERR_PRESENT | PageFaultErr::PFERR_RSVD;
+                match_code = PageFaultErr::PFERR_PRESENT;
+            }
+
+            VmxAsm::vmx_vmwrite(control::PAGE_FAULT_ERR_CODE_MASK, mask.bits);
+            VmxAsm::vmx_vmwrite(control::PAGE_FAULT_ERR_CODE_MATCH, match_code.bits);
+        }
+
+        if vcpu.arch.xfd_no_write_intercept {
+            eb |= 1 << NM_VECTOR;
+        }
+
+        VmxAsm::vmx_vmwrite(control::EXCEPTION_BITMAP, eb as u64);
+    }
+
+    fn has_emulated_msr(&self, msr: u32) -> bool {
+        match msr {
+            msr::IA32_SMBASE => {
+                return vmx_info().enable_unrestricted_guest
+                    || vmx_info().emulate_invalid_guest_state;
+            }
+
+            msr::IA32_VMX_BASIC..=msr::IA32_VMX_VMFUNC => {
+                return vmx_info().nested;
+            }
+
+            0xc001011f | 0xc0000104 => {
+                // MSR_AMD64_VIRT_SPEC_CTRL | MSR_AMD64_TSC_RATIO
+                return false;
+            }
+
+            _ => {
+                return true;
+            }
+        }
+    }
+
+    fn get_msr_feature(&self, msr: &mut super::asm::VmxMsrEntry) -> bool {
+        match msr.index {
+            msr::IA32_VMX_BASIC..=msr::IA32_VMX_VMFUNC => {
+                if !vmx_info().nested {
+                    return false;
+                }
+
+                match vmx_info().vmcs_config.nested.get_vmx_msr(msr.index) {
+                    Some(data) => {
+                        msr.data = data;
+                        return true;
+                    }
+                    None => {
+                        return false;
+                    }
+                }
+            }
+            _ => {
+                return false;
+            }
+        }
+    }
+
+    fn get_rflags(&self, vcpu: &mut VirtCpu) -> x86::bits64::rflags::RFlags {
+        if !vcpu.arch.is_register_available(KvmReg::VcpuExregRflags) {
+            vcpu.arch.mark_register_available(KvmReg::VcpuExregRflags);
+            let mut rflags = RFlags::from_bits_truncate(VmxAsm::vmx_vmread(guest::RFLAGS));
+            if vcpu.vmx_mut().rmode.vm86_active {
+                rflags.remove(RFlags::FLAGS_IOPL3 | RFlags::FLAGS_VM);
+                let save_rflags = vcpu.vmx_mut().rmode.save_rflags;
+                rflags.insert(save_rflags & !(RFlags::FLAGS_IOPL3 | RFlags::FLAGS_VM));
+            }
+
+            vcpu.vmx_mut().rflags = rflags;
+        }
+
+        return vcpu.vmx_mut().rflags;
+    }
+
+    fn vcpu_precreate(&self, vm: &mut Vm) -> Result<(), SystemError> {
+        if vm.arch.irqchip_mode != KvmIrqChipMode::None || !vmx_info().enable_ipiv {
+            return Ok(());
+        }
+
+        let kvm_vmx = vm.kvm_vmx_mut();
+
+        if kvm_vmx.pid_table.is_some() {
+            return Ok(());
+        }
+
+        kvm_vmx.pid_table = Some(unsafe { Box::new_zeroed().assume_init() });
+        Ok(())
+    }
+
+    fn set_segment(&self, vcpu: &mut VirtCpu, var: &mut UapiKvmSegment, seg: VcpuSegment) {
+        vcpu.vmx_mut().emulation_required = vmx_info().emulation_required(vcpu);
+        *var = vmx_info()._vmx_set_segment(vcpu, *var, seg);
+    }
+
+    fn get_segment(
+        &self,
+        vcpu: &mut VirtCpu,
+        var: UapiKvmSegment,
+        seg: VcpuSegment,
+    ) -> UapiKvmSegment {
+        return vmx_info().vmx_get_segment(vcpu, var, seg);
+    }
+
+    fn get_idt(&self, _vcpu: &mut VirtCpu, dt: &mut x86::dtables::DescriptorTablePointer<u8>) {
+        dt.limit = VmxAsm::vmx_vmread(guest::IDTR_LIMIT) as u16;
+        dt.base = VmxAsm::vmx_vmread(guest::IDTR_BASE) as usize as *const _;
+    }
+
+    fn set_idt(&self, _vcpu: &mut VirtCpu, dt: &x86::dtables::DescriptorTablePointer<u8>) {
+        VmxAsm::vmx_vmwrite(guest::IDTR_LIMIT, dt.limit as u64);
+        VmxAsm::vmx_vmwrite(guest::IDTR_BASE, dt.base as usize as u64);
+    }
+
+    fn get_gdt(&self, _vcpu: &mut VirtCpu, dt: &mut x86::dtables::DescriptorTablePointer<u8>) {
+        dt.limit = VmxAsm::vmx_vmread(guest::GDTR_LIMIT) as u16;
+        dt.base = VmxAsm::vmx_vmread(guest::GDTR_BASE) as usize as *const _;
+    }
+
+    fn set_gdt(&self, _vcpu: &mut VirtCpu, dt: &x86::dtables::DescriptorTablePointer<u8>) {
+        VmxAsm::vmx_vmwrite(guest::GDTR_LIMIT, dt.limit as u64);
+        VmxAsm::vmx_vmwrite(guest::GDTR_BASE, dt.base as usize as u64);
+    }
+
+    fn is_vaild_cr0(&self, vcpu: &VirtCpu, _cr0: Cr0) -> bool {
+        if vcpu.arch.is_guest_mode() {
+            todo!()
+        }
+
+        // TODO: check vmx->nested->vmxon
+
+        true
+    }
+
+    fn is_vaild_cr4(&self, vcpu: &VirtCpu, cr4: Cr4) -> bool {
+        if cr4.contains(Cr4::CR4_ENABLE_VMX) && vcpu.arch.is_smm() {
+            return false;
+        }
+
+        // TODO: check vmx->nested->vmxon
+
+        return true;
+    }
+
+    fn post_set_cr3(&self, _vcpu: &VirtCpu, _cr3: u64) {
+        // Do Nothing
+    }
+
+    fn vcpu_run(&self, vcpu: &mut VirtCpu) -> ExitFastpathCompletion {
+        if unlikely(vmx_info().enable_vnmi && vcpu.vmx().loaded_vmcs().soft_vnmi_blocked) {
+            todo!()
+        }
+
+        if unlikely(vcpu.vmx().emulation_required) {
+            todo!()
+        }
+
+        if vcpu.vmx().ple_window_dirty {
+            vcpu.vmx_mut().ple_window_dirty = false;
+            VmxAsm::vmx_vmwrite(control::PLE_WINDOW, vcpu.vmx().ple_window as u64);
+        }
+
+        if vcpu.arch.is_register_dirty(KvmReg::VcpuRegsRsp) {
+            VmxAsm::vmx_vmwrite(guest::RSP, vcpu.arch.regs[KvmReg::VcpuRegsRsp as usize]);
+        }
+        if vcpu.arch.is_register_dirty(KvmReg::VcpuRegsRip) {
+            VmxAsm::vmx_vmwrite(guest::RIP, vcpu.arch.regs[KvmReg::VcpuRegsRip as usize]);
+        }
+
+        vcpu.arch.clear_dirty();
+
+        let cr3: (PhysFrame, Cr3Flags) = Cr3::read();
+        if unlikely(cr3 != vcpu.vmx().loaded_vmcs().host_state.cr3) {
+            // host::CR3 takes the full CR3 value: the page-table base combined with the low flag bits
+            let cr3_combined: u64 =
+                (cr3.0.start_address().as_u64() & 0xFFFF_FFFF_FFFF_F000) | (cr3.1.bits() & 0xFFF);
+            VmxAsm::vmx_vmwrite(host::CR3, cr3_combined);
+            vcpu.vmx().loaded_vmcs().host_state.cr3 = cr3;
+        }
+
+        let cr4 = unsafe { cr4() };
+        if unlikely(cr4 != vcpu.vmx().loaded_vmcs().host_state.cr4) {
+            VmxAsm::vmx_vmwrite(host::CR4, cr4.bits() as u64);
+            vcpu.vmx().loaded_vmcs().host_state.cr4 = cr4;
+        }
+
+        // TODO: set_debugreg
+
+        if vcpu.guest_debug.contains(GuestDebug::SINGLESTEP) {
+            todo!()
+        }
+
+        vcpu.load_guest_xsave_state();
+
+        // TODO: pt_guest_enter
+
+        // TODO: atomic_switch_perf_msrs
+
+        if vmx_info().enable_preemption_timer {
+            // todo!()
+            warn!("vmx_update_hv_timer TODO");
+        }
+
+        Vmx::vmx_vcpu_enter_exit(vcpu, vcpu.vmx().vmx_vcpu_run_flags());
+
+        unsafe {
+            load_ds(USER_DS);
+            load_es(USER_DS);
+        };
+
+        // TODO: pt_guest_exit
+
+        // TODO: kvm_load_host_xsave_state
+
+        if vcpu.arch.is_guest_mode() {
+            todo!()
+        }
+
+        if unlikely(vcpu.vmx().fail != 0) {
+            return ExitFastpathCompletion::None;
+        }
+
+        if unlikely(
+            vcpu.vmx().exit_reason.basic()
+                == VmxExitReasonBasic::VM_ENTRY_FAILURE_MACHINE_CHECK_EVENT as u16,
+        ) {
+            todo!()
+        }
+
+        if unlikely(vcpu.vmx().exit_reason.failed_vmentry()) {
+            return ExitFastpathCompletion::None;
+        }
+
+        vcpu.vmx().loaded_vmcs().launched = true;
+
+        // TODO: handle pending interrupts
+
+        if vcpu.arch.is_guest_mode() {
+            return ExitFastpathCompletion::None;
+        }
+
+        return Vmx::vmx_exit_handlers_fastpath(vcpu);
+    }
+
+    fn prepare_switch_to_guest(&self, vcpu: &mut VirtCpu) {
+        // let cpu = smp_get_processor_id();
+        let vmx = vcpu.vmx_mut();
+        vmx.req_immediate_exit = false;
+
+        if !vmx.guest_uret_msrs_loaded {
+            vmx.guest_uret_msrs_loaded = true;
+
+            for (idx, msr) in vmx.guest_uret_msrs.iter().enumerate() {
+                if msr.load_into_hardware {
+                    x86_kvm_manager().kvm_set_user_return_msr(idx, msr.data, msr.mask);
+                }
+            }
+        }
+
+        // TODO: nested
+
+        if vmx.guest_state_loaded {
+            return;
+        }
+
+        // FIXME: this reads the current CPU's IA32_KERNEL_GSBASE directly; the proper, safe
+        // approach is to keep the host gs base in a per-CPU variable.
+        let gs_base = unsafe { rdmsr(msr::IA32_KERNEL_GSBASE) };
+
+        let current = ProcessManager::current_pcb();
+        let mut pcb_arch = current.arch_info_irqsave();
+
+        let fs_sel = fs().bits();
+        let gs_sel = gs().bits();
+
+        unsafe {
+            pcb_arch.save_fsbase();
+            pcb_arch.save_gsbase();
+        }
+
+        let fs_base = pcb_arch.fsbase();
+        vmx.msr_host_kernel_gs_base = pcb_arch.gsbase() as u64;
+
+        unsafe { wrmsr(msr::IA32_KERNEL_GSBASE, vmx.msr_guest_kernel_gs_base) };
+
+        let mut loaded_vmcs = vmx.loaded_vmcs();
+        let host_state = &mut loaded_vmcs.host_state;
+        host_state.ldt_sel = unsafe { ldtr() }.bits();
+
+        host_state.ds_sel = ds().bits();
+        host_state.es_sel = es().bits();
+
+        host_state.set_host_fsgs(fs_sel, gs_sel, fs_base, gs_base as usize);
+        drop(loaded_vmcs);
+
+        vmx.guest_state_loaded = true;
+    }
+
+    fn flush_tlb_all(&self, vcpu: &mut VirtCpu) {
+        if vmx_info().enable_ept {
+            VmxAsm::ept_sync_global();
+        } else if vmx_info().has_invvpid_global() {
+            VmxAsm::sync_vcpu_global();
+        } else {
+            VmxAsm::sync_vcpu_single(vcpu.vmx().vpid);
+            // TODO: nested: VmxAsm::sync_vcpu_single(vcpu.vmx().nested.vpid02);
+        }
+    }
+
+    fn handle_exit_irqoff(&self, vcpu: &mut VirtCpu) {
+        if vcpu.vmx().emulation_required {
+            return;
+        }
+
+        let basic = VmxExitReasonBasic::from(vcpu.vmx().exit_reason.basic());
+
+        if basic == VmxExitReasonBasic::EXTERNAL_INTERRUPT {
+            Vmx::handle_external_interrupt_irqoff(vcpu);
+        } else if basic == VmxExitReasonBasic::EXCEPTION_OR_NMI {
+            //todo!()
+        }
+    }
+
+    fn handle_exit(
+        //vmx_handle_exit
+        &self,
+        vcpu: &mut VirtCpu,
+        vm: &Vm,
+        fastpath: ExitFastpathCompletion,
+    ) -> Result<i32, SystemError> {
+        let r = vmx_info().vmx_handle_exit(vcpu, vm, fastpath);
+
+        if vcpu.vmx().exit_reason.bus_lock_detected() {
+            todo!()
+        }
+
+        r
+    }
+
+    fn load_mmu_pgd(&self, vcpu: &mut VirtCpu, _vm: &Vm, root_hpa: u64, root_level: u32) {
+        let guest_cr3;
+        let eptp;
+
+        if vmx_info().enable_ept {
+            eptp = vmx_info().construct_eptp(vcpu, root_hpa, root_level);
+
+            VmxAsm::vmx_vmwrite(control::EPTP_FULL, eptp);
+
+            if !vmx_info().enable_unrestricted_guest
+                && !vcpu.arch.cr0.contains(Cr0::CR0_ENABLE_PAGING)
+            {
+                todo!()
+            } else if vcpu.arch.is_register_dirty(KvmReg::VcpuExregCr3) {
+                guest_cr3 = vcpu.arch.cr3;
+                debug!("load_mmu_pgd: guest_cr3 = {:#x}", guest_cr3);
+            } else {
+                return;
+            }
+        } else {
+            todo!();
+        }
+        vcpu.load_pdptrs();
+        VmxAsm::vmx_vmwrite(guest::CR3, guest_cr3);
+    }
+}
+
+static mut VMX: Option<Vmx> = None;
+
+#[inline]
+pub fn vmx_info() -> &'static Vmx {
+    unsafe { VMX.as_ref().unwrap() }
+}
+
+#[inline]
+pub fn vmx_info_mut() -> &'static mut Vmx {
+    unsafe { VMX.as_mut().unwrap() }
+}
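+
+// NOTE: both accessors assume init_vmx() has already run during hardware_setup();
+// they panic via unwrap() if VMX support was never initialized.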
+
+#[inline(never)]
+pub fn init_vmx(vmx: Box<Vmx>) {
+    static INIT_ONCE: AtomicBool = AtomicBool::new(false);
+    if INIT_ONCE
+        .compare_exchange(false, true, Ordering::SeqCst, Ordering::SeqCst)
+        .is_ok()
+    {
+        unsafe { VMX = Some(*vmx) };
+    } else {
+        panic!("init_vmx can only be called once");
+    }
+}
+
+#[derive(Debug)]
+pub struct Vmx {
+    pub host_idt_base: u64,
+    pub vmcs_config: VmcsConfig,
+    pub vmx_cap: VmxCapability,
+    pub vpid_bitmap: SpinLock<AllocBitmap>,
+    pub enable_vpid: bool,
+    pub enable_ept: bool,
+    pub enable_ept_ad: bool,
+    pub enable_unrestricted_guest: bool,
+    pub emulate_invalid_guest_state: bool,
+    pub enable_flexpriority: bool,
+    pub enable_vnmi: bool,
+    pub enable_sgx: bool,
+    pub enable_apicv: bool,
+    pub enable_ipiv: bool,
+    pub enable_pml: bool,
+    pub enable_preemption_timer: bool,
+
+    pub enable_vmware_backdoor: bool,
+
+    pub nested: bool,
+
+    pub ple_gap: u32,
+    pub ple_window: u32,
+    pub ple_window_grow: u32,
+    pub ple_window_max: u32,
+    pub ple_window_shrink: u32,
+
+    pub pt_mode: ProcessorTraceMode,
+}
+
+impl Vmx {
+    fn init(&mut self) {
+        let mut bitmap = AllocBitmap::new(1 << 16);
+
+        // VPID 0 is reserved for the host and must never be handed to a guest
+        bitmap.set(0, true);
+
+        self.host_idt_base = Default::default();
+        self.vmcs_config = Default::default();
+        self.vmx_cap = Default::default();
+        self.vpid_bitmap = SpinLock::new(bitmap);
+        self.enable_vpid = true;
+        self.enable_ept = true;
+        self.enable_ept_ad = true;
+        self.enable_unrestricted_guest = true;
+        self.enable_flexpriority = true;
+        self.enable_vnmi = true;
+        self.enable_sgx = true;
+        self.ple_gap = 128;
+        self.ple_window = 4096;
+        self.ple_window_grow = 2;
+        self.ple_window_max = u32::MAX;
+        self.ple_window_shrink = 0;
+        self.enable_apicv = true;
+        self.enable_ipiv = true;
+        self.enable_pml = true;
+        self.enable_preemption_timer = true;
+        self.pt_mode = ProcessorTraceMode::System;
+        self.emulate_invalid_guest_state = true;
+
+        // Nested virtualization is not supported yet; to be implemented later
+        self.nested = false;
+        self.enable_vmware_backdoor = false;
+    }
+
+    /*
+     * Internal error codes that are used to indicate that MSR emulation encountered
+     * an error that should result in #GP in the guest, unless userspace
+     * handles it.
+     */
+    #[allow(dead_code)]
+    pub const KVM_MSR_RET_INVALID: u32 = 2; /* in-kernel MSR emulation #GP condition */
+    #[allow(dead_code)]
+    pub const KVM_MSR_RET_FILTERED: u32 = 3; /* #GP due to userspace MSR filter */
+
+    pub const MAX_POSSIBLE_PASSTHROUGH_MSRS: usize = 16;
+
+    pub const VMX_POSSIBLE_PASSTHROUGH_MSRS: [u32; Self::MAX_POSSIBLE_PASSTHROUGH_MSRS] = [
+        0x48,  // MSR_IA32_SPEC_CTRL
+        0x49,  // MSR_IA32_PRED_CMD
+        0x10b, // MSR_IA32_FLUSH_CMD
+        msr::IA32_TIME_STAMP_COUNTER,
+        msr::IA32_FS_BASE,
+        msr::IA32_GS_BASE,
+        msr::IA32_KERNEL_GSBASE,
+        0x1c4, // MSR_IA32_XFD
+        0x1c5, // MSR_IA32_XFD_ERR
+        msr::IA32_SYSENTER_CS,
+        msr::IA32_SYSENTER_ESP,
+        msr::IA32_SYSENTER_EIP,
+        msr::MSR_CORE_C1_RESIDENCY,
+        msr::MSR_CORE_C3_RESIDENCY,
+        msr::MSR_CORE_C6_RESIDENCY,
+        msr::MSR_CORE_C7_RESIDENCY,
+    ];
+
+    /// ### Check whether the CPU supports VMX virtualization
+    #[allow(dead_code)]
+    pub fn check_vmx_support() -> bool {
+        let cpuid = CpuId::new();
+        // Check to see if CPU is Intel (“GenuineIntel”).
+        if let Some(vi) = cpuid.get_vendor_info() {
+            if vi.as_str() != "GenuineIntel" {
+                return false;
+            }
+        }
+        // Check processor supports for Virtual Machine Extension (VMX) technology
+        // CPUID.1:ECX.VMX[bit 5] = 1 (Intel Manual: 24.6 Discovering Support for VMX)
+        if let Some(fi) = cpuid.get_feature_info() {
+            if !fi.has_vmx() {
+                return false;
+            }
+        }
+        return true;
+    }
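+
+    // Hypothetical call-site sketch for `check_vmx_support` above (it is currently unused):
+    // bail out early when VMX is unavailable, e.g.
+    //     if !Vmx::check_vmx_support() {
+    //         return Err(SystemError::ENOSYS);
+    //     }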
+
+    #[inline(never)]
+    pub fn set_up_user_return_msrs() {
+        const VMX_URET_MSRS_LIST: &[u32] = &[
+            msr::IA32_FMASK,
+            msr::IA32_LSTAR,
+            msr::IA32_CSTAR,
+            msr::IA32_EFER,
+            msr::IA32_TSC_AUX,
+            msr::IA32_STAR,
+            // Writing this MSR currently faults, so it is left commented out for now
+            // MSR_IA32_TSX_CTRL,
+        ];
+
+        let manager = x86_kvm_manager_mut();
+        for msr in VMX_URET_MSRS_LIST {
+            manager.add_user_return_msr(*msr);
+        }
+    }
+
+    /// Probe the VMX capability MSRs and initialize the global VMCS configuration
+    #[inline(never)]
+    pub fn setup_vmcs_config(
+        vmcs_config: &mut VmcsConfig,
+        vmx_cap: &mut VmxCapability,
+    ) -> Result<(), SystemError> {
+        const VMCS_ENTRY_EXIT_PAIRS: &[VmcsEntryExitPair] = &[
+            VmcsEntryExitPair::new(
+                EntryControls::LOAD_IA32_PERF_GLOBAL_CTRL,
+                ExitControls::LOAD_IA32_PERF_GLOBAL_CTRL,
+            ),
+            VmcsEntryExitPair::new(EntryControls::LOAD_IA32_PAT, ExitControls::LOAD_IA32_PAT),
+            VmcsEntryExitPair::new(EntryControls::LOAD_IA32_EFER, ExitControls::LOAD_IA32_EFER),
+            VmcsEntryExitPair::new(
+                EntryControls::LOAD_IA32_BNDCFGS,
+                ExitControls::CLEAR_IA32_BNDCFGS,
+            ),
+            VmcsEntryExitPair::new(
+                EntryControls::LOAD_IA32_RTIT_CTL,
+                ExitControls::CLEAR_IA32_RTIT_CTL,
+            ),
+        ];
+
+        let mut cpu_based_exec_control = VmxFeat::adjust_primary_controls()?;
+
+        let mut cpu_based_2nd_exec_control =
+            if cpu_based_exec_control.contains(PrimaryControls::SECONDARY_CONTROLS) {
+                VmxFeat::adjust_secondary_controls()?
+            } else {
+                SecondaryControls::empty()
+            };
+
+        if cpu_based_2nd_exec_control.contains(SecondaryControls::VIRTUALIZE_APIC) {
+            cpu_based_exec_control.remove(PrimaryControls::USE_TPR_SHADOW)
+        }
+
+        if !cpu_based_exec_control.contains(PrimaryControls::USE_TPR_SHADOW) {
+            cpu_based_2nd_exec_control.remove(
+                SecondaryControls::VIRTUALIZE_APIC_REGISTER
+                    | SecondaryControls::VIRTUALIZE_X2APIC
+                    | SecondaryControls::VIRTUAL_INTERRUPT_DELIVERY,
+            )
+        }
+
+        let cap = unsafe { rdmsr(msr::IA32_VMX_EPT_VPID_CAP) };
+        vmx_cap.set_val_from_msr_val(cap);
+
+        // EPT capability bits were reported even though EPT is unsupported
+        if !cpu_based_2nd_exec_control.contains(SecondaryControls::ENABLE_EPT)
+            && !vmx_cap.ept.is_empty()
+        {
+            warn!("EPT capabilities reported although the 1-setting of the 'enable EPT' VM-execution control is unsupported");
+            return Err(SystemError::EIO);
+        }
+
+        if !cpu_based_2nd_exec_control.contains(SecondaryControls::ENABLE_VPID)
+            && !vmx_cap.vpid.is_empty()
+        {
+            warn!("VPID CAP should not exist if not support. 1-setting enable VPID VM-execution control");
+            return Err(SystemError::EIO);
+        }
+
+        let cpuid = CpuId::new();
+        let cpu_extend_feat = cpuid
+            .get_extended_feature_info()
+            .ok_or(SystemError::ENOSYS)?;
+        if !cpu_extend_feat.has_sgx() {
+            cpu_based_2nd_exec_control.remove(SecondaryControls::ENCLS_EXITING);
+        }
+
+        let cpu_based_3rd_exec_control = 0;
+        // if cpu_based_exec_control.contains(SecondaryControls::TERTIARY_CONTROLS) {
+        //     // Self::adjust_vmx_controls64(VmxFeature::IPI_VIRT, IA32_CTLS3)
+        //     todo!()
+        // } else {
+        //     0
+        // };
+
+        let vmxexit_control = VmxFeat::adjust_exit_controls()?;
+
+        let pin_based_exec_control = VmxFeat::adjust_pin_based_controls()?;
+
+        // TODO: broken timer?
+        // https://code.dragonos.org.cn/xref/linux-6.6.21/arch/x86/kvm/vmx/vmx.c#2676
+
+        let vmentry_control = VmxFeat::adjust_entry_controls()?;
+
+        for pair in VMCS_ENTRY_EXIT_PAIRS {
+            let n_ctrl = pair.entry;
+            let x_ctrl = pair.exit;
+
+            // if !(vmentry_control.bits() & n_ctrl.bits) == !(vmxexit_control.bits() & x_ctrl.bits) {
+            //     continue;
+            // }
+            if (vmentry_control.contains(n_ctrl)) == (vmxexit_control.contains(x_ctrl)) {
+                continue;
+            }
+
+            warn!(
+                "Inconsistent VM-Entry/VM-Exit pair, entry = {:?}, exit = {:?}",
+                vmentry_control & n_ctrl,
+                vmxexit_control & x_ctrl,
+            );
+
+            return Err(SystemError::EIO);
+        }
+
+        let basic = unsafe { rdmsr(msr::IA32_VMX_BASIC) };
+        let vmx_msr_high = (basic >> 32) as u32;
+        let vmx_msr_low = basic as u32;
+
+        // On 64-bit CPUs, IA32_VMX_BASIC[48] must be 0 (VMXON/VMCS addresses are not limited to 32 bits)
+        if vmx_msr_high & (1 << 16) != 0 {
+            return Err(SystemError::EIO);
+        }
+
+        // The VMCS memory type (IA32_VMX_BASIC bits 53:50) must be write-back (WB = 6)
+        if (vmx_msr_high >> 18) & 15 != 6 {
+            return Err(SystemError::EIO);
+        }
+
+        let misc_msr = unsafe { rdmsr(msr::IA32_VMX_MISC) };
+
+        // VMCS region size (IA32_VMX_BASIC bits 44:32)
+        vmcs_config.size = vmx_msr_high & 0x1fff;
+        vmcs_config.basic_cap = vmx_msr_high & !0x1fff;
+        // VMCS revision identifier (low 32 bits of IA32_VMX_BASIC)
+        vmcs_config.revision_id = vmx_msr_low;
+        vmcs_config.pin_based_exec_ctrl = pin_based_exec_control;
+        vmcs_config.cpu_based_exec_ctrl = cpu_based_exec_control;
+        vmcs_config.cpu_based_2nd_exec_ctrl = cpu_based_2nd_exec_control;
+        vmcs_config.cpu_based_3rd_exec_ctrl = cpu_based_3rd_exec_control;
+        vmcs_config.vmentry_ctrl = vmentry_control;
+        vmcs_config.vmexit_ctrl = vmxexit_control;
+        vmcs_config.misc = misc_msr;
+
+        Ok(())
+    }
+
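+    /// Adjust a 32-bit VMX control against its capability MSR.
+    ///
+    /// Per the Intel SDM convention, the low 32 bits of a VMX control capability MSR are the
+    /// "allowed 0-settings" (bits reported as 1 must be set in the control) and the high
+    /// 32 bits are the "allowed 1-settings" (bits reported as 0 must stay clear).
+    /// Illustrative sketch with hypothetical values (not read from real hardware):
+    ///
+    /// ```text
+    /// ctl_min = 0b0001, ctl_opt = 0b0110            // required vs. optional bits
+    /// msr low = 0b0011 (must be 1), high = 0b0111 (may be 1)
+    /// ctl = ((ctl_min | ctl_opt) & high) | low = 0b0111
+    /// Err(EIO) only if some ctl_min bit was cleared by the `& high` step.
+    /// ```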
+    fn adjust_vmx_controls(ctl_min: u32, ctl_opt: u32, msr: u32) -> Result<u32, SystemError> {
+        let mut ctl = ctl_min | ctl_opt;
+        let val = unsafe { rdmsr(msr) };
+        let low = val as u32;
+        let high = (val >> 32) as u32;
+
+        ctl &= high;
+        ctl |= low;
+
+        if ctl_min & !ctl != 0 {
+            return Err(SystemError::EIO);
+        }
+
+        return Ok(ctl);
+    }
+    #[allow(dead_code)]
+    fn adjust_vmx_controls64(ctl_opt: u32, msr: u32) -> u32 {
+        let allow = unsafe { rdmsr(msr) } as u32;
+        ctl_opt & allow
+    }
+
+    pub fn alloc_vpid(&self) -> Option<usize> {
+        if !self.enable_vpid {
+            return None;
+        }
+
+        let mut bitmap_guard = self.vpid_bitmap.lock();
+
+        let idx = bitmap_guard.first_false_index();
+        if let Some(idx) = idx {
+            bitmap_guard.set(idx, true);
+        }
+
+        return idx;
+    }
+    #[allow(dead_code)]
+    pub fn free_vpid(&self, vpid: Option<usize>) {
+        if !self.enable_vpid || vpid.is_none() {
+            return;
+        }
+
+        self.vpid_bitmap.lock().set(vpid.unwrap(), false);
+    }
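+
+    // Illustrative alloc/free pairing (hypothetical caller): a vCPU takes a VPID at creation
+    // time and returns it on teardown, e.g.
+    //     let vpid = vmx_info().alloc_vpid();   // Some(n >= 1), or None when VPID is disabled
+    //     /* ... tag the vCPU's TLB entries with `vpid` ... */
+    //     vmx_info().free_vpid(vpid);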
+
+    pub fn is_valid_passthrough_msr(msr: u32) -> bool {
+        match msr {
+            0x800..=0x8ff => {
+                // x2APIC MSR range
+                return true;
+            }
+            msr::MSR_IA32_RTIT_STATUS
+            | msr::MSR_IA32_RTIT_OUTPUT_BASE
+            | msr::MSR_IA32_RTIT_OUTPUT_MASK_PTRS
+            | msr::MSR_IA32_CR3_MATCH
+            | msr::MSR_LBR_SELECT
+            | msr::MSR_LASTBRANCH_TOS => {
+                return true;
+            }
+            msr::MSR_IA32_ADDR0_START..=msr::MSR_IA32_ADDR3_END => {
+                return true;
+            }
+            0xdc0..=0xddf => {
+                // MSR_LBR_INFO_0 ..= MSR_LBR_INFO_0 + 31
+                return true;
+            }
+            0x680..=0x69f => {
+                // MSR_LBR_NHM_FROM ..= MSR_LBR_NHM_FROM + 31
+                return true;
+            }
+            0x6c0..=0x6df => {
+                // MSR_LBR_NHM_TO ..= MSR_LBR_NHM_TO + 31
+                return true;
+            }
+            0x40..=0x48 => {
+                // MSR_LBR_CORE_FROM ..= MSR_LBR_CORE_FROM + 8
+                return true;
+            }
+            0x60..=0x68 => {
+                // MSR_LBR_CORE_TO ..= MSR_LBR_CORE_TO + 8
+                return true;
+            }
+            _ => {
+                return Self::possible_passthrough_msr_slot(msr).is_some();
+            }
+        }
+    }
+
+    pub fn vpid_sync_context(&self, vpid: u16) {
+        if self.has_invvpid_single() {
+            // Flush only the TLB entries tagged with this VPID
+            VmxAsm::sync_vcpu_single(vpid);
+        } else if vpid != 0 {
+            // No single-context INVVPID: fall back to a global flush (VPID 0 belongs to the host)
+            VmxAsm::sync_vcpu_global();
+        }
+    }
+
+    pub fn possible_passthrough_msr_slot(msr: u32) -> Option<usize> {
+        for (idx, val) in Self::VMX_POSSIBLE_PASSTHROUGH_MSRS.iter().enumerate() {
+            if *val == msr {
+                return Some(idx);
+            }
+        }
+
+        return None;
+    }
+
+    pub fn tdp_enabled(&self) -> bool {
+        self.enable_ept
+    }
+
+    fn setup_l1d_flush(&self) {
+        // TODO: hard-coded for now; real L1TF mitigation selection is not implemented yet
+        *L1TF_VMX_MITIGATION.write() = VmxL1dFlushState::NotRequired;
+    }
+
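+    /// Build the EPT pointer (EPTP) for the given root table.
+    ///
+    /// Sketch of the assumed EPTP layout (Intel SDM; the VMX_EPTP_* constants used below are
+    /// expected to encode these fields):
+    ///   bits 2:0   memory type (6 = write-back)
+    ///   bits 5:3   page-walk length minus one (3 => 4-level, 4 => 5-level)
+    ///   bit  6     enable accessed/dirty flags
+    ///   bits 12+   physical address of the root EPT table (root_hpa, 4-KiB aligned)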
+    pub fn construct_eptp(&self, vcpu: &mut VirtCpu, root_hpa: u64, root_level: u32) -> u64 {
+        let mut eptp = VMX_EPTP_MT_WB;
+
+        eptp |= if root_level == 5 {
+            VMX_EPTP_PWL_5
+        } else {
+            VMX_EPTP_PWL_4
+        };
+
+        if self.enable_ept_ad && !vcpu.arch.is_guest_mode() {
+            eptp |= VMX_EPTP_AD_ENABLE_BIT;
+        }
+
+        eptp |= root_hpa;
+
+        return eptp;
+    }
+
+    fn vmx_reset_vcpu(&mut self, vcpu: &mut VirtCpu, vm: &Vm) {
+        self.init_vmcs(vcpu, vm);
+
+        if self.nested {
+            todo!()
+        }
+
+        // TODO: vcpu_setup_sgx_lepubkeyhash
+
+        // TODO: nested
+
+        vcpu.arch.microcode_version = 0x100000000;
+
+        let vmx = vcpu.vmx_mut();
+        vmx.msr_ia32_feature_control_valid_bits = 1 << 0;
+
+        vmx.post_intr_desc.control.set_nv(0xf2);
+        vmx.post_intr_desc.control.set_sn(true);
+    }
+
+    fn init_vmcs(&mut self, vcpu: &mut VirtCpu, vm: &Vm) {
+        let kvm_vmx = vm.kvm_vmx();
+        if vmx_info().nested {
+            todo!()
+        }
+
+        if vmx_info().has_msr_bitmap() {
+            debug!(
+                "msr_bitmap addr 0x{:x}",
+                vcpu.vmx().vmcs01.lock().msr_bitmap.phys_addr() as u64
+            );
+            VmxAsm::vmx_vmwrite(
+                control::MSR_BITMAPS_ADDR_FULL,
+                vcpu.vmx().vmcs01.lock().msr_bitmap.phys_addr() as u64,
+            )
+        }
+
+        VmxAsm::vmx_vmwrite(guest::LINK_PTR_FULL, u64::MAX);
+
+        let mut loaded_vmcs = vcpu.vmx().loaded_vmcs.lock();
+
+        loaded_vmcs.controls_set(
+            ControlsType::Pin,
+            self.get_pin_based_exec_controls(vcpu).bits() as u64,
+        );
+
+        loaded_vmcs.controls_set(
+            ControlsType::Exec,
+            self.get_exec_controls(vcpu, &vm.arch).bits() as u64,
+        );
+
+        if self.has_sceondary_exec_ctrls() {
+            loaded_vmcs.controls_set(
+                ControlsType::SecondaryExec,
+                self.get_secondary_exec_controls(vcpu, vm).bits() as u64,
+            )
+        }
+
+        if self.has_tertiary_exec_ctrls() {
+            todo!()
+        }
+
+        drop(loaded_vmcs);
+
+        if self.enable_apicv && vcpu.arch.lapic_in_kernel() {
+            VmxAsm::vmx_vmwrite(control::EOI_EXIT0_FULL, 0);
+            VmxAsm::vmx_vmwrite(control::EOI_EXIT1_FULL, 0);
+            VmxAsm::vmx_vmwrite(control::EOI_EXIT2_FULL, 0);
+            VmxAsm::vmx_vmwrite(control::EOI_EXIT3_FULL, 0);
+
+            VmxAsm::vmx_vmwrite(guest::INTERRUPT_STATUS, 0);
+
+            VmxAsm::vmx_vmwrite(control::POSTED_INTERRUPT_NOTIFICATION_VECTOR, 0xf2);
+            VmxAsm::vmx_vmwrite(control::POSTED_INTERRUPT_DESC_ADDR_FULL, unsafe {
+                MMArch::virt_2_phys(VirtAddr::new(
+                    &vcpu.vmx().post_intr_desc as *const _ as usize,
+                ))
+                .unwrap()
+                .data() as u64
+            })
+        }
+
+        if self.enable_apicv && vcpu.arch.lapic_in_kernel() {
+            // PID_POINTER_TABLE
+            VmxAsm::vmx_vmwrite(0x2042, unsafe {
+                MMArch::virt_2_phys(VirtAddr::new(kvm_vmx.pid_table().as_ptr() as usize))
+                    .unwrap()
+                    .data() as u64
+            });
+            // LAST_PID_POINTER_INDEX
+            VmxAsm::vmx_vmwrite(0x08, vm.arch.max_vcpu_ids as u64 - 1);
+        }
+
+        if !vm.arch.pause_in_guest {
+            VmxAsm::vmx_vmwrite(control::PLE_GAP, self.ple_gap as u64);
+            vcpu.vmx_mut().ple_window = self.ple_window;
+            vcpu.vmx_mut().ple_window_dirty = true;
+        }
+
+        if vm
+            .arch
+            .notify_vmexit_flags
+            .contains(NotifyVmExitFlags::KVM_X86_NOTIFY_VMEXIT_ENABLED)
+        {
+            // NOTIFY_WINDOW
+            VmxAsm::vmx_vmwrite(0x4024, vm.arch.notify_window as u64);
+        }
+
+        VmxAsm::vmx_vmwrite(control::PAGE_FAULT_ERR_CODE_MASK, 0);
+        VmxAsm::vmx_vmwrite(control::PAGE_FAULT_ERR_CODE_MATCH, 0);
+        VmxAsm::vmx_vmwrite(control::CR3_TARGET_COUNT, 0);
+
+        VmxAsm::vmx_vmwrite(host::FS_SELECTOR, 0);
+        VmxAsm::vmx_vmwrite(host::GS_SELECTOR, 0);
+        self.set_constant_host_state(vcpu);
+
+        VmxAsm::vmx_vmwrite(host::FS_BASE, 0);
+        VmxAsm::vmx_vmwrite(host::GS_BASE, 0);
+
+        if self.has_vmfunc() {
+            VmxAsm::vmx_vmwrite(control::VM_FUNCTION_CONTROLS_FULL, 0);
+        }
+
+        VmxAsm::vmx_vmwrite(control::VMEXIT_MSR_STORE_COUNT, 0);
+        VmxAsm::vmx_vmwrite(control::VMEXIT_MSR_LOAD_COUNT, 0);
+        VmxAsm::vmx_vmwrite(control::VMEXIT_MSR_LOAD_ADDR_FULL, unsafe {
+            MMArch::virt_2_phys(VirtAddr::new(
+                vcpu.vmx().msr_autoload.host.val.as_ptr() as *const _ as usize,
+            ))
+            .unwrap()
+            .data() as u64
+        });
+        VmxAsm::vmx_vmwrite(control::VMENTRY_MSR_LOAD_COUNT, 0);
+        VmxAsm::vmx_vmwrite(control::VMENTRY_MSR_LOAD_ADDR_FULL, unsafe {
+            MMArch::virt_2_phys(VirtAddr::new(
+                vcpu.vmx().msr_autoload.guest.val.as_ptr() as usize
+            ))
+            .unwrap()
+            .data() as u64
+        });
+
+        if self
+            .vmcs_config
+            .vmentry_ctrl
+            .contains(EntryControls::LOAD_IA32_PAT)
+        {
+            VmxAsm::vmx_vmwrite(guest::IA32_PAT_FULL, vcpu.arch.pat) //todo
+        }
+
+        let mut loaded_vmcs = vcpu.vmx().loaded_vmcs.lock();
+        loaded_vmcs.controls_set(
+            ControlsType::VmExit,
+            self.get_vmexit_controls().bits() as u64,
+        );
+
+        loaded_vmcs.controls_set(
+            ControlsType::VmEntry,
+            self.get_vmentry_controls().bits() as u64,
+        );
+
+        drop(loaded_vmcs);
+
+        vcpu.arch.cr0_guest_owned_bits = self.l1_guest_owned_cr0_bits();
+        VmxAsm::vmx_vmwrite(
+            control::CR0_GUEST_HOST_MASK,
+            (!vcpu.arch.cr0_guest_owned_bits).bits() as u64,
+        );
+
+        self.set_cr4_guest_host_mask(&mut vcpu.arch);
+
+        if vcpu.vmx().vpid != 0 {
+            VmxAsm::vmx_vmwrite(control::VPID, vcpu.vmx().vpid as u64);
+        }
+
+        if self.has_xsaves() {
+            VmxAsm::vmx_vmwrite(control::XSS_EXITING_BITMAP_FULL, 0);
+        }
+
+        if self.enable_pml {
+            VmxAsm::vmx_vmwrite(control::PML_ADDR_FULL, unsafe {
+                MMArch::virt_2_phys(VirtAddr::new(vcpu.vmx().pml_pg.as_ref().as_ptr() as usize))
+                    .unwrap()
+                    .data() as u64
+            });
+
+            VmxAsm::vmx_vmwrite(guest::PML_INDEX, VmxVCpuPriv::PML_ENTITY_NUM as u64 - 1);
+        }
+
+        // TODO: vmx_write_encls_bitmap
+
+        if self.pt_mode == ProcessorTraceMode::HostGuest {
+            todo!()
+        }
+
+        VmxAsm::vmx_vmwrite(guest::IA32_SYSENTER_CS, 0);
+        VmxAsm::vmx_vmwrite(guest::IA32_SYSENTER_ESP, 0);
+        VmxAsm::vmx_vmwrite(guest::IA32_SYSENTER_EIP, 0);
+        VmxAsm::vmx_vmwrite(guest::IA32_DEBUGCTL_FULL, 0);
+
+        if self.has_tpr_shadow() {
+            VmxAsm::vmx_vmwrite(control::VIRT_APIC_ADDR_FULL, 0);
+            if vcpu.arch.lapic_in_kernel() {
+                VmxAsm::vmx_vmwrite(control::VIRT_APIC_ADDR_FULL, unsafe {
+                    MMArch::virt_2_phys(VirtAddr::new(vcpu.arch.lapic().regs.as_ptr() as usize))
+                        .unwrap()
+                        .data() as u64
+                });
+            }
+
+            VmxAsm::vmx_vmwrite(control::TPR_THRESHOLD, 0);
+        }
+
+        self.setup_uret_msrs(vcpu);
+    }
+
+    /// Dump the current VMCS contents for debugging
+    pub fn dump_vmcs(&self, vcpu: &VirtCpu) {
+        let vmentry_ctl = unsafe {
+            EntryControls::from_bits_unchecked(self.vmread(control::VMENTRY_CONTROLS) as u32)
+        };
+
+        let vmexit_ctl = unsafe {
+            ExitControls::from_bits_unchecked(self.vmread(control::VMEXIT_CONTROLS) as u32)
+        };
+
+        let cpu_based_exec_ctl = PrimaryControls::from_bits_truncate(
+            self.vmread(control::PRIMARY_PROCBASED_EXEC_CONTROLS) as u32,
+        );
+
+        let pin_based_exec_ctl = PinbasedControls::from_bits_truncate(
+            self.vmread(control::PINBASED_EXEC_CONTROLS) as u32,
+        );
+
+        // let cr4 = Cr4::from_bits_truncate(self.vmread(guest::CR4) as usize);
+
+        let secondary_exec_control = if self.has_sceondary_exec_ctrls() {
+            unsafe {
+                SecondaryControls::from_bits_unchecked(
+                    self.vmread(control::SECONDARY_PROCBASED_EXEC_CONTROLS) as u32,
+                )
+            }
+        } else {
+            SecondaryControls::empty()
+        };
+
+        if self.has_tertiary_exec_ctrls() {
+            todo!()
+        }
+
+        error!(
+            "VMCS addr: 0x{:x}, last attempted VM-entry on CPU {:?}",
+            vcpu.vmx().loaded_vmcs().vmcs.lock().as_ref() as *const _ as usize,
+            vcpu.arch.last_vmentry_cpu
+        );
+
+        error!("--- GUEST STATE ---");
+        error!(
+            "CR0: actual = 0x{:x}, shadow = 0x{:x}, gh_mask = 0x{:x}",
+            self.vmread(guest::CR0),
+            self.vmread(control::CR0_READ_SHADOW),
+            self.vmread(control::CR0_GUEST_HOST_MASK)
+        );
+        error!(
+            "CR4: actual = 0x{:x}, shadow = 0x{:x}, gh_mask = 0x{:x}",
+            self.vmread(guest::CR4),
+            self.vmread(control::CR4_READ_SHADOW),
+            self.vmread(control::CR4_GUEST_HOST_MASK)
+        );
+        error!("CR3: actual = 0x{:x}", self.vmread(guest::CR3));
+
+        if self.has_ept() {
+            error!(
+                "PDPTR0 = 0x{:x}, PDPTR1 = 0x{:x}",
+                self.vmread(guest::PDPTE0_FULL),
+                self.vmread(guest::PDPTE1_FULL)
+            );
+            error!(
+                "PDPTR2 = 0x{:x}, PDPTR3 = 0x{:x}",
+                self.vmread(guest::PDPTE2_FULL),
+                self.vmread(guest::PDPTE3_FULL)
+            );
+        }
+        error!(
+            "RSP = 0x{:x}, RIP = 0x{:x}",
+            self.vmread(guest::RSP),
+            self.vmread(guest::RIP)
+        );
+        error!(
+            "RFLAGS = 0x{:x}, DR7 = 0x{:x}",
+            self.vmread(guest::RFLAGS),
+            self.vmread(guest::DR7)
+        );
+        error!(
+            "Sysenter RSP = 0x{:x}, CS:RIP = 0x{:x}:0x{:x}",
+            self.vmread(guest::IA32_SYSENTER_ESP),
+            self.vmread(guest::IA32_SYSENTER_CS),
+            self.vmread(guest::IA32_SYSENTER_EIP),
+        );
+
+        self.dump_sel("CS: ", guest::CS_SELECTOR);
+        self.dump_sel("DS: ", guest::DS_SELECTOR);
+        self.dump_sel("SS: ", guest::SS_SELECTOR);
+        self.dump_sel("ES: ", guest::ES_SELECTOR);
+        self.dump_sel("FS: ", guest::FS_SELECTOR);
+        self.dump_sel("GS: ", guest::GS_SELECTOR);
+
+        self.dump_dtsel("GDTR: ", guest::GDTR_LIMIT);
+        self.dump_sel("LDTR: ", guest::LDTR_SELECTOR);
+        self.dump_dtsel("IDTR: ", guest::IDTR_LIMIT);
+        self.dump_sel("TR: ", guest::TR_SELECTOR);
+
+        let efer_slot = vcpu
+            .vmx()
+            .msr_autoload
+            .guest
+            .find_loadstore_msr_slot(msr::IA32_EFER);
+
+        if vmentry_ctl.contains(EntryControls::LOAD_IA32_EFER) {
+            error!("EFER = 0x{:x}", self.vmread(guest::IA32_EFER_FULL));
+        } else if let Some(slot) = efer_slot {
+            error!(
+                "EFER = 0x{:x} (autoload)",
+                vcpu.vmx().msr_autoload.guest.val[slot].data
+            );
+        } else if vmentry_ctl.contains(EntryControls::IA32E_MODE_GUEST) {
+            error!(
+                "EFER = 0x{:x} (effective)",
+                vcpu.arch.efer | (EferFlags::LONG_MODE_ACTIVE | EferFlags::LONG_MODE_ENABLE)
+            );
+        } else {
+            error!(
+                "EFER = 0x{:x} (effective)",
+                vcpu.arch.efer & !(EferFlags::LONG_MODE_ACTIVE | EferFlags::LONG_MODE_ENABLE)
+            );
+        }
+
+        if vmentry_ctl.contains(EntryControls::LOAD_IA32_PAT) {
+            error!("PAT = 0x{:x}", self.vmread(guest::IA32_PAT_FULL));
+        }
+
+        error!(
+            "DebugCtl = 0x{:x}, DebugExceptions = 0x{:x}",
+            self.vmread(guest::IA32_DEBUGCTL_FULL),
+            self.vmread(guest::PENDING_DBG_EXCEPTIONS)
+        );
+
+        if self.has_load_perf_global_ctrl()
+            && vmentry_ctl.contains(EntryControls::LOAD_IA32_PERF_GLOBAL_CTRL)
+        {
+            error!(
+                "PerfGlobCtl = 0x{:x}",
+                self.vmread(guest::IA32_PERF_GLOBAL_CTRL_FULL)
+            );
+        }
+
+        if vmentry_ctl.contains(EntryControls::LOAD_IA32_BNDCFGS) {
+            error!("BndCfgS = 0x{:x}", self.vmread(guest::IA32_BNDCFGS_FULL));
+        }
+
+        error!(
+            "Interruptibility = 0x{:x}, ActivityState = 0x{:x}",
+            self.vmread(guest::INTERRUPT_STATUS),
+            self.vmread(guest::ACTIVITY_STATE)
+        );
+
+        if secondary_exec_control.contains(SecondaryControls::VIRTUAL_INTERRUPT_DELIVERY) {
+            error!(
+                "InterruptStatus = 0x{:x}",
+                self.vmread(guest::INTERRUPT_STATUS)
+            );
+        }
+
+        if self.vmread(control::VMENTRY_MSR_LOAD_COUNT) > 0 {
+            self.dump_msrs("guest autoload", &vcpu.vmx().msr_autoload.guest);
+        }
+        if self.vmread(control::VMEXIT_MSR_LOAD_COUNT) > 0 {
+            self.dump_msrs("guest autostore", &vcpu.vmx().msr_autostore);
+        }
+
+        error!("\n--- HOST STATE ---");
+        error!(
+            "RIP = 0x{:x}, RSP = 0x{:x}",
+            self.vmread(host::RIP),
+            self.vmread(host::RSP)
+        );
+        error!(
+            "CS = 0x{:x}, SS = 0x{:x}, DS = 0x{:x}, ES = 0x{:x}, FS = 0x{:x}, GS = 0x{:x}, TR = 0x{:x}",
+            self.vmread(host::CS_SELECTOR),
+            self.vmread(host::SS_SELECTOR),
+            self.vmread(host::DS_SELECTOR),
+            self.vmread(host::ES_SELECTOR),
+            self.vmread(host::FS_SELECTOR),
+            self.vmread(host::GS_SELECTOR),
+            self.vmread(host::TR_SELECTOR)
+        );
+        error!(
+            "FSBase = 0x{:x}, GSBase = 0x{:x}, TRBase = 0x{:x}",
+            self.vmread(host::FS_BASE),
+            self.vmread(host::GS_BASE),
+            self.vmread(host::TR_BASE),
+        );
+        error!(
+            "GDTBase = 0x{:x}, IDTBase = 0x{:x}",
+            self.vmread(host::GDTR_BASE),
+            self.vmread(host::IDTR_BASE),
+        );
+        error!(
+            "CR0 = 0x{:x}, CR3 = 0x{:x}, CR4 = 0x{:x}",
+            self.vmread(host::CR0),
+            self.vmread(host::CR3),
+            self.vmread(host::CR4),
+        );
+        error!(
+            "Sysenter RSP = 0x{:x}, CS:RIP=0x{:x}:0x{:x}",
+            self.vmread(host::IA32_SYSENTER_ESP),
+            self.vmread(host::IA32_SYSENTER_CS),
+            self.vmread(host::IA32_SYSENTER_EIP),
+        );
+
+        if vmexit_ctl.contains(ExitControls::LOAD_IA32_EFER) {
+            error!("EFER = 0x{:x}", self.vmread(host::IA32_EFER_FULL));
+        }
+
+        if vmexit_ctl.contains(ExitControls::LOAD_IA32_PAT) {
+            error!("PAT = 0x{:x}", self.vmread(host::IA32_PAT_FULL));
+        }
+
+        if self.has_load_perf_global_ctrl()
+            && vmexit_ctl.contains(ExitControls::LOAD_IA32_PERF_GLOBAL_CTRL)
+        {
+            error!(
+                "PerfGlobCtl = 0x{:x}",
+                self.vmread(host::IA32_PERF_GLOBAL_CTRL_FULL)
+            );
+        }
+
+        if self.vmread(control::VMEXIT_MSR_LOAD_COUNT) > 0 {
+            self.dump_msrs("host autoload", &vcpu.vmx().msr_autoload.host);
+        }
+
+        error!("\n--- CONTROL STATE ---");
+        error!(
+            "\nCPUBased = {:?},\nSecondaryExec = 0x{:x},\nTertiaryExec = 0(Unused)",
+            cpu_based_exec_ctl, secondary_exec_control,
+        );
+        error!(
+            "\nPinBased = {:?},\nEntryControls = {:?},\nExitControls = {:?}",
+            pin_based_exec_ctl, vmentry_ctl, vmexit_ctl,
+        );
+        error!(
+            "ExceptionBitmap = 0x{:x}, PFECmask = 0x{:x}, PFECmatch = 0x{:x}",
+            self.vmread(control::EXCEPTION_BITMAP),
+            self.vmread(control::PAGE_FAULT_ERR_CODE_MASK),
+            self.vmread(control::PAGE_FAULT_ERR_CODE_MATCH),
+        );
+        error!(
+            "VMEntry: intr_info = 0x{:x}, errcode = 0x{:x}, ilen = 0x{:x}",
+            self.vmread(control::VMENTRY_INTERRUPTION_INFO_FIELD),
+            self.vmread(control::VMENTRY_EXCEPTION_ERR_CODE),
+            self.vmread(control::VMENTRY_INSTRUCTION_LEN),
+        );
+        error!(
+            "VMExit: intr_info = 0x{:x}, errcode = 0x{:x}, ilen = 0x{:x}",
+            self.vmread(ro::VMEXIT_INSTRUCTION_INFO),
+            self.vmread(ro::VMEXIT_INTERRUPTION_ERR_CODE),
+            self.vmread(ro::VMEXIT_INSTRUCTION_LEN),
+        );
+        error!(
+            "        reason = 0x{:x}, qualification = 0x{:x}",
+            self.vmread(ro::EXIT_REASON),
+            self.vmread(ro::EXIT_QUALIFICATION),
+        );
+        error!(
+            "IDTVectoring: info = 0x{:x}, errcode = 0x{:x}",
+            self.vmread(ro::IDT_VECTORING_INFO),
+            self.vmread(ro::IDT_VECTORING_ERR_CODE),
+        );
+        error!("TSC Offset = 0x{:x}", self.vmread(control::TSC_OFFSET_FULL));
+
+        if secondary_exec_control.contains(SecondaryControls::USE_TSC_SCALING) {
+            error!(
+                "TSC Multiplier = 0x{:x}",
+                self.vmread(control::TSC_MULTIPLIER_FULL)
+            );
+        }
+
+        if cpu_based_exec_ctl.contains(PrimaryControls::USE_TPR_SHADOW) {
+            if secondary_exec_control.contains(SecondaryControls::VIRTUAL_INTERRUPT_DELIVERY) {
+                let status = self.vmread(guest::INTERRUPT_STATUS);
+                error!("SVI|RVI = 0x{:x}|0x{:x}", status >> 8, status & 0xff);
+            }
+
+            error!(
+                "TPR Threshold = 0x{:x}",
+                self.vmread(control::TPR_THRESHOLD)
+            );
+            if secondary_exec_control.contains(SecondaryControls::VIRTUALIZE_APIC) {
+                error!(
+                    "APIC-access addr = 0x{:x}",
+                    self.vmread(control::APIC_ACCESS_ADDR_FULL)
+                );
+            }
+            error!(
+                "virt-APIC addr = 0x{:x}",
+                self.vmread(control::VIRT_APIC_ADDR_FULL)
+            );
+        }
+
+        if pin_based_exec_ctl.contains(PinbasedControls::POSTED_INTERRUPTS) {
+            error!(
+                "PostedIntrVec = 0x{:x}",
+                self.vmread(control::POSTED_INTERRUPT_NOTIFICATION_VECTOR)
+            );
+        }
+
+        if secondary_exec_control.contains(SecondaryControls::ENABLE_EPT) {
+            error!("EPT pointer = 0x{:x}", self.vmread(control::EPTP_FULL));
+        }
+        if secondary_exec_control.contains(SecondaryControls::PAUSE_LOOP_EXITING) {
+            error!(
+                "PLE Gap = 0x{:x}, Window = 0x{:x}",
+                self.vmread(control::PLE_GAP),
+                self.vmread(control::PLE_WINDOW)
+            );
+        }
+        if secondary_exec_control.contains(SecondaryControls::ENABLE_VPID) {
+            error!("Virtual processor ID = 0x{:x}", self.vmread(control::VPID));
+        }
+    }
+
+    pub fn dump_sel(&self, name: &'static str, sel: u32) {
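+        // Guest segment VMCS fields share a fixed encoding stride, so the access-rights,
+        // limit and base encodings can be derived from any selector encoding by adding
+        // the ES-field deltas below.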
+        error!(
+            "{name} sel = 0x{:x}, attr = 0x{:x}, limit = 0x{:x}, base = 0x{:x}",
+            self.vmread(sel),
+            self.vmread(sel + guest::ES_ACCESS_RIGHTS - guest::ES_SELECTOR),
+            self.vmread(sel + guest::ES_LIMIT - guest::ES_SELECTOR),
+            self.vmread(sel + guest::ES_BASE - guest::ES_SELECTOR),
+        );
+    }
+
+    pub fn dump_dtsel(&self, name: &'static str, limit: u32) {
+        error!(
+            "{name} limit = 0x{:x}, base = 0x{:x}",
+            self.vmread(limit),
+            self.vmread(limit + guest::GDTR_BASE - guest::GDTR_LIMIT)
+        );
+    }
+
+    pub fn dump_msrs(&self, name: &'static str, msr: &VmxMsrs) {
+        error!("MSR {name}:");
+        for (idx, msr) in msr.val.iter().enumerate() {
+            error!("{idx}: msr = 0x{:x}, value = 0x{:x}", msr.index, msr.data);
+        }
+    }
+
+    #[inline]
+    pub fn vmread(&self, field: u32) -> u64 {
+        VmxAsm::vmx_vmread(field)
+    }
+
+    fn setup_uret_msrs(&self, vcpu: &mut VirtCpu) {
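+        // "User-return" MSRs keep their guest values across VM exits and are only restored
+        // to the host values right before the CPU returns to userspace.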
+        // Whether the syscall-related MSRs (STAR/LSTAR/FMASK) need to be loaded
+        let load_syscall_msrs =
+            vcpu.arch.is_long_mode() && vcpu.arch.efer.contains(EferFlags::SYSTEM_CALL_EXTENSIONS);
+
+        self.setup_uret_msr(vcpu, msr::IA32_STAR, load_syscall_msrs);
+        self.setup_uret_msr(vcpu, msr::IA32_LSTAR, load_syscall_msrs);
+        self.setup_uret_msr(vcpu, msr::IA32_FMASK, load_syscall_msrs);
+
+        let load_efer = self.update_transition_efer(vcpu);
+        self.setup_uret_msr(vcpu, msr::IA32_EFER, load_efer);
+
+        // TODO: MSR_TSC_AUX
+
+        self.setup_uret_msr(
+            vcpu,
+            msr::MSR_IA32_TSX_CTRL,
+            CpuId::default()
+                .get_extended_feature_info()
+                .unwrap()
+                .has_rtm(),
+        );
+
+        vcpu.vmx_mut().guest_uret_msrs_loaded = false;
+    }
+
+    fn setup_uret_msr(&self, vcpu: &mut VirtCpu, msr: u32, load_into_hardware: bool) {
+        let uret_msr = vcpu.vmx_mut().find_uret_msr_mut(msr);
+
+        if let Some((_idx, msr)) = uret_msr {
+            msr.load_into_hardware = load_into_hardware;
+        }
+    }
+
+    fn update_transition_efer(&self, vcpu: &mut VirtCpu) -> bool {
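+        // Decide how EFER is switched between host and guest. If the VMCS can load EFER on
+        // entry/exit (or EPT is enabled and the NX bit differs from the host), program the
+        // atomic-switch MSR area and return false; otherwise fall back to the user-return
+        // MSR slot and return true.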
+        let mut guest_efer = vcpu.arch.efer;
+        let mut ignore_efer = EferFlags::empty();
+        if !self.enable_ept {
+            guest_efer.insert(EferFlags::NO_EXECUTE_ENABLE);
+        }
+
+        ignore_efer.insert(EferFlags::SYSTEM_CALL_EXTENSIONS);
+
+        ignore_efer.insert(EferFlags::LONG_MODE_ACTIVE | EferFlags::LONG_MODE_ENABLE);
+
+        if guest_efer.contains(EferFlags::LONG_MODE_ACTIVE) {
+            ignore_efer.remove(EferFlags::SYSTEM_CALL_EXTENSIONS);
+        }
+
+        if self.has_load_ia32_efer()
+            || (self.enable_ept
+                && (vcpu.arch.efer ^ x86_kvm_manager().host_efer)
+                    .contains(EferFlags::NO_EXECUTE_ENABLE))
+        {
+            if !guest_efer.contains(EferFlags::LONG_MODE_ACTIVE) {
+                guest_efer.remove(EferFlags::LONG_MODE_ENABLE);
+            }
+
+            if guest_efer != x86_kvm_manager().host_efer {
+                vcpu.vmx_mut().add_atomic_switch_msr(
+                    msr::IA32_EFER,
+                    guest_efer.bits(),
+                    x86_kvm_manager().host_efer.bits(),
+                    false,
+                );
+            } else {
+                vcpu.vmx_mut().clear_atomic_switch_msr(msr::IA32_EFER);
+            }
+
+            return false;
+        }
+
+        let idx = x86_kvm_manager().find_user_return_msr_idx(msr::IA32_EFER);
+        if let Some(i) = idx {
+            vcpu.vmx_mut().clear_atomic_switch_msr(msr::IA32_EFER);
+
+            guest_efer.remove(ignore_efer);
+            guest_efer.insert(x86_kvm_manager().host_efer & ignore_efer);
+
+            vcpu.vmx_mut().guest_uret_msrs[i].data = guest_efer.bits();
+            vcpu.vmx_mut().guest_uret_msrs[i].mask = (!ignore_efer).bits();
+            return true;
+        } else {
+            return false;
+        }
+    }
+
+    fn set_cr4_guest_host_mask(&self, arch: &mut VirtCpuArch) {
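+        // Guest-owned CR4 bits can be toggled by the guest without a VM exit. Without EPT,
+        // CR4 bits that flush the TLB or affect the PDPTRs must stay host-owned so the
+        // shadow paging code can react to them.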
+        arch.cr4_guest_owned_bits =
+            x86_kvm_manager().possible_cr4_guest & (!arch.cr4_guest_rsvd_bits);
+
+        if !self.enable_ept {
+            arch.cr4_guest_owned_bits
+                .remove(x86_kvm_manager().cr4_tlbflush_bits);
+            arch.cr4_guest_owned_bits
+                .remove(x86_kvm_manager().cr4_pdptr_bits);
+        }
+
+        if arch.is_guest_mode() {
+            // nested virtualization: TODO
+            todo!()
+        }
+
+        VmxAsm::vmx_vmwrite(
+            control::CR4_GUEST_HOST_MASK,
+            (!arch.cr4_guest_owned_bits).bits() as u64,
+        );
+    }
+
+    fn l1_guest_owned_cr0_bits(&self) -> Cr0 {
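+        // Without EPT, CR0.WP must remain host-owned so shadow paging can build PTEs with
+        // the correct write protection.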
+        let mut cr0 = x86_kvm_manager().possible_cr0_guest;
+
+        if !self.enable_ept {
+            cr0.remove(Cr0::CR0_WRITE_PROTECT)
+        }
+
+        return cr0;
+    }
+
+    /// Set up the host-state fields that stay constant for the guest's lifetime.
+    fn set_constant_host_state(&self, vcpu: &mut VirtCpu) {
+        let loaded_vmcs_host_state = &mut vcpu.vmx().loaded_vmcs.lock().host_state;
+
+        VmxAsm::vmx_vmwrite(host::CR0, unsafe { cr0() }.bits() as u64);
+
+        let cr3: (PhysFrame, Cr3Flags) = Cr3::read();
+        let cr3_combined: u64 =
+            (cr3.0.start_address().as_u64() & 0xFFFF_FFFF_FFFF_F000) | (cr3.1.bits() & 0xFFF);
+        VmxAsm::vmx_vmwrite(host::CR3, cr3_combined);
+        loaded_vmcs_host_state.cr3 = cr3;
+
+        let cr4 = unsafe { cr4() };
+        VmxAsm::vmx_vmwrite(host::CR4, cr4.bits() as u64);
+        loaded_vmcs_host_state.cr4 = cr4;
+
+        VmxAsm::vmx_vmwrite(
+            host::CS_SELECTOR,
+            (segmentation::cs().bits() & (!0x07)).into(),
+        );
+
+        VmxAsm::vmx_vmwrite(host::DS_SELECTOR, 0);
+        VmxAsm::vmx_vmwrite(host::ES_SELECTOR, 0);
+
+        VmxAsm::vmx_vmwrite(
+            host::SS_SELECTOR,
+            (segmentation::ds().bits() & (!0x07)).into(),
+        );
+        VmxAsm::vmx_vmwrite(
+            host::TR_SELECTOR,
+            (unsafe { x86::task::tr().bits() } & (!0x07)).into(),
+        );
+
+        VmxAsm::vmx_vmwrite(host::IDTR_BASE, self.host_idt_base);
+        VmxAsm::vmx_vmwrite(host::RIP, vmx_vmexit as usize as u64);
+
+        let val = unsafe { rdmsr(msr::IA32_SYSENTER_CS) };
+
+        // keep only the low 32 bits; HOST_IA32_SYSENTER_CS is a 32-bit field
+        VmxAsm::vmx_vmwrite(host::IA32_SYSENTER_CS, (val << 32) >> 32);
+
+        // VmxAsm::vmx_vmwrite(host::IA32_SYSENTER_ESP, 0);
+
+        let tmp = unsafe { rdmsr(msr::IA32_SYSENTER_EIP) };
+        VmxAsm::vmx_vmwrite(host::IA32_SYSENTER_EIP, (tmp << 32) >> 32);
+
+        if self
+            .vmcs_config
+            .vmexit_ctrl
+            .contains(ExitControls::LOAD_IA32_PAT)
+        {
+            VmxAsm::vmx_vmwrite(host::IA32_PAT_FULL, unsafe { rdmsr(msr::IA32_PAT) });
+        }
+
+        if self.has_load_ia32_efer() {
+            VmxAsm::vmx_vmwrite(
+                host::IA32_EFER_FULL,
+                x86_kvm_manager().host_efer.bits() as u64,
+            );
+        }
+    }
+
+    fn get_pin_based_exec_controls(&self, vcpu: &VirtCpu) -> PinbasedControls {
+        let mut ctrls = self.vmcs_config.pin_based_exec_ctrl;
+
+        if !vcpu.arch.vcpu_apicv_active() {
+            ctrls.remove(PinbasedControls::POSTED_INTERRUPTS);
+        }
+
+        if !self.enable_vnmi {
+            ctrls.remove(PinbasedControls::VIRTUAL_NMIS);
+        }
+
+        if !self.enable_preemption_timer {
+            ctrls.remove(PinbasedControls::VMX_PREEMPTION_TIMER);
+        }
+
+        return ctrls;
+    }
+
+    fn get_exec_controls(&self, vcpu: &VirtCpu, vmarch: &KvmArch) -> PrimaryControls {
+        let mut ctrls = self.vmcs_config.cpu_based_exec_ctrl;
+
+        ctrls.remove(
+            PrimaryControls::RDTSC_EXITING
+                | PrimaryControls::USE_IO_BITMAPS
+                | PrimaryControls::MONITOR_TRAP_FLAG
+                | PrimaryControls::PAUSE_EXITING,
+        );
+
+        ctrls.remove(
+            PrimaryControls::NMI_WINDOW_EXITING | PrimaryControls::INTERRUPT_WINDOW_EXITING,
+        );
+
+        ctrls.remove(PrimaryControls::MOV_DR_EXITING);
+
+        // TPR shadowing is only usable with an in-kernel local APIC; drop it otherwise.
+        if !(vcpu.arch.lapic_in_kernel() && self.has_tpr_shadow()) {
+            ctrls.remove(PrimaryControls::USE_TPR_SHADOW);
+        }
+
+        if ctrls.contains(PrimaryControls::USE_TPR_SHADOW) {
+            ctrls.remove(PrimaryControls::CR8_LOAD_EXITING | PrimaryControls::CR8_STORE_EXITING);
+        } else {
+            ctrls.insert(PrimaryControls::CR8_LOAD_EXITING | PrimaryControls::CR8_STORE_EXITING);
+        }
+
+        if self.enable_ept {
+            ctrls.remove(
+                PrimaryControls::CR3_LOAD_EXITING
+                    | PrimaryControls::CR3_STORE_EXITING
+                    | PrimaryControls::INVLPG_EXITING,
+            );
+        }
+
+        if vmarch.mwait_in_guest {
+            ctrls.remove(PrimaryControls::MWAIT_EXITING | PrimaryControls::MONITOR_EXITING);
+        }
+
+        if vmarch.hlt_in_guest {
+            ctrls.remove(PrimaryControls::HLT_EXITING);
+        }
+
+        return ctrls;
+    }
+
+    fn get_secondary_exec_controls(&mut self, vcpu: &VirtCpu, vm: &Vm) -> SecondaryControls {
+        let mut ctrls = self.vmcs_config.cpu_based_2nd_exec_ctrl;
+
+        if self.pt_mode == ProcessorTraceMode::System {
+            ctrls.remove(
+                SecondaryControls::INTEL_PT_GUEST_PHYSICAL | SecondaryControls::CONCEAL_VMX_FROM_PT,
+            );
+        }
+
+        if !(self.enable_flexpriority && vcpu.arch.lapic_in_kernel()) {
+            ctrls.remove(SecondaryControls::VIRTUALIZE_APIC)
+        }
+
+        if vcpu.vmx().vpid == 0 {
+            ctrls.remove(SecondaryControls::ENABLE_VPID);
+        }
+
+        if !self.enable_ept {
+            ctrls.remove(SecondaryControls::ENABLE_EPT);
+            self.enable_unrestricted_guest = false;
+        }
+
+        if !self.enable_unrestricted_guest {
+            ctrls.remove(SecondaryControls::UNRESTRICTED_GUEST);
+        }
+
+        if vm.arch.pause_in_guest {
+            ctrls.remove(SecondaryControls::PAUSE_LOOP_EXITING);
+        }
+        if !vcpu.arch.vcpu_apicv_active() {
+            ctrls.remove(
+                SecondaryControls::VIRTUALIZE_APIC_REGISTER
+                    | SecondaryControls::VIRTUAL_INTERRUPT_DELIVERY,
+            );
+        }
+
+        ctrls.remove(SecondaryControls::VIRTUALIZE_X2APIC);
+
+        ctrls.remove(SecondaryControls::ENABLE_VM_FUNCTIONS);
+
+        ctrls.remove(SecondaryControls::DTABLE_EXITING);
+
+        ctrls.remove(SecondaryControls::VMCS_SHADOWING);
+
+        if !self.enable_pml || vm.nr_memslots_dirty_logging == 0 {
+            ctrls.remove(SecondaryControls::ENABLE_PML);
+        }
+
+        // TODO: vmx_adjust_sec_exec_feature
+
+        if self.has_rdtscp() {
+            warn!("adjust RDTSCP todo!");
+            // todo!()
+        }
+
+        return ctrls;
+    }
+
+    fn get_vmexit_controls(&self) -> ExitControls {
+        let mut ctrls = self.vmcs_config.vmexit_ctrl;
+
+        ctrls.remove(
+            ExitControls::SAVE_IA32_PAT
+                | ExitControls::SAVE_IA32_EFER
+                | ExitControls::SAVE_VMX_PREEMPTION_TIMER,
+        );
+
+        if self.pt_mode == ProcessorTraceMode::System {
+            ctrls.remove(ExitControls::CONCEAL_VMX_FROM_PT | ExitControls::CLEAR_IA32_RTIT_CTL);
+        }
+
+        // todo: cpu_has_perf_global_ctrl_bug
+
+        ctrls.remove(ExitControls::LOAD_IA32_PERF_GLOBAL_CTRL | ExitControls::LOAD_IA32_EFER);
+
+        ctrls
+    }
+
+    fn get_vmentry_controls(&self) -> EntryControls {
+        let mut ctrls = self.vmcs_config.vmentry_ctrl;
+
+        if self.pt_mode == ProcessorTraceMode::System {
+            ctrls.remove(EntryControls::CONCEAL_VMX_FROM_PT | EntryControls::LOAD_IA32_RTIT_CTL);
+        }
+
+        ctrls.remove(
+            EntryControls::LOAD_IA32_PERF_GLOBAL_CTRL
+                | EntryControls::LOAD_IA32_EFER
+                | EntryControls::IA32E_MODE_GUEST,
+        );
+
+        // todo: cpu_has_perf_global_ctrl_bug
+
+        ctrls
+    }
+
+    pub fn emulation_required(&self, vcpu: &mut VirtCpu) -> bool {
+        return self.emulate_invalid_guest_state && !self.guest_state_valid(vcpu);
+    }
+
+    pub fn guest_state_valid(&self, vcpu: &mut VirtCpu) -> bool {
+        return vcpu.is_unrestricted_guest() || self.__guest_state_valid(vcpu);
+    }
+
+    pub fn __guest_state_valid(&self, vcpu: &mut VirtCpu) -> bool {
+        if vcpu.arch.is_portected_mode()
+            || x86_kvm_ops().get_rflags(vcpu).contains(RFlags::FLAGS_VM)
+        {
+            if !self.rmode_segment_valid(vcpu, VcpuSegment::CS) {
+                return false;
+            }
+            if !self.rmode_segment_valid(vcpu, VcpuSegment::SS) {
+                return false;
+            }
+            if !self.rmode_segment_valid(vcpu, VcpuSegment::DS) {
+                return false;
+            }
+            if !self.rmode_segment_valid(vcpu, VcpuSegment::ES) {
+                return false;
+            }
+            if !self.rmode_segment_valid(vcpu, VcpuSegment::FS) {
+                return false;
+            }
+            if !self.rmode_segment_valid(vcpu, VcpuSegment::GS) {
+                return false;
+            }
+        } else {
+            todo!("protected mode guest state checks todo");
+        }
+
+        return true;
+    }
+
+    pub fn vmx_get_segment(
+        &self,
+        vcpu: &mut VirtCpu,
+        mut var: UapiKvmSegment,
+        seg: VcpuSegment,
+    ) -> UapiKvmSegment {
+        if vcpu.vmx().rmode.vm86_active && seg != VcpuSegment::LDTR {
+            var = vcpu.vmx().rmode.segs[seg as usize];
+            if seg == VcpuSegment::TR || var.selector == Vmx::vmx_read_guest_seg_selector(vcpu, seg)
+            {
+                return var;
+            }
+
+            var.base = Vmx::vmx_read_guest_seg_base(vcpu, seg);
+            var.selector = Vmx::vmx_read_guest_seg_selector(vcpu, seg);
+            return var;
+        }
+
+        var.base = Vmx::vmx_read_guest_seg_base(vcpu, seg);
+        var.limit = Vmx::vmx_read_guest_seg_limit(vcpu, seg);
+        var.selector = Vmx::vmx_read_guest_seg_selector(vcpu, seg);
+
+        let ar = Vmx::vmx_read_guest_seg_ar(vcpu, seg);
+
+        var.unusable = ((ar >> 16) & 1) as u8;
+        var.type_ = (ar & 15) as u8;
+        var.s = ((ar >> 4) & 1) as u8;
+        var.dpl = ((ar >> 5) & 3) as u8;
+
+        var.present = (var.unusable == 0) as u8; // logical, not bitwise, negation of `unusable`
+        var.avl = ((ar >> 12) & 1) as u8;
+        var.l = ((ar >> 13) & 1) as u8;
+        var.db = ((ar >> 14) & 1) as u8;
+        var.g = ((ar >> 15) & 1) as u8;
+
+        return var;
+    }
+
+    pub fn _vmx_set_segment(
+        &self,
+        vcpu: &mut VirtCpu,
+        mut var: UapiKvmSegment,
+        seg: VcpuSegment,
+    ) -> UapiKvmSegment {
+        let sf = &KVM_VMX_SEGMENT_FIELDS[seg as usize];
+
+        vcpu.vmx_mut().segment_cache_clear();
+
+        if vcpu.vmx().rmode.vm86_active && seg != VcpuSegment::LDTR {
+            vcpu.vmx_mut().rmode.segs[seg as usize] = var;
+            if seg == VcpuSegment::TR {
+                VmxAsm::vmx_vmwrite(sf.selector, var.selector as u64);
+            } else if var.s != 0 {
+                Vmx::fix_rmode_seg(seg, &vcpu.vmx().rmode.segs[seg as usize]);
+            }
+            return var;
+        }
+
+        VmxAsm::vmx_vmwrite(sf.base, var.base);
+        VmxAsm::vmx_vmwrite(sf.limit, var.limit as u64);
+        VmxAsm::vmx_vmwrite(sf.selector, var.selector as u64);
+
+        if vcpu.is_unrestricted_guest() && seg != VcpuSegment::LDTR {
+            var.type_ |= 0x1;
+        }
+
+        VmxAsm::vmx_vmwrite(sf.ar_bytes, var.vmx_segment_access_rights() as u64);
+        return var;
+    }
+
+    pub fn rmode_segment_valid(&self, vcpu: &mut VirtCpu, seg: VcpuSegment) -> bool {
+        let mut var = UapiKvmSegment::default();
+        var = self.vmx_get_segment(vcpu, var, seg);
+
+        var.dpl = 0x3;
+
+        if seg == VcpuSegment::CS {
+            var.type_ = 0x3;
+        }
+
+        let ar = var.vmx_segment_access_rights();
+
+        if var.base != ((var.selector as u64) << 4) {
+            return false;
+        }
+
+        if var.limit != 0xffff {
+            return false;
+        }
+
+        if ar != 0xf3 {
+            return false;
+        }
+
+        true
+    }
+
+    pub fn fix_rmode_seg(seg: VcpuSegment, save: &UapiKvmSegment) {
+        let sf = &KVM_VMX_SEGMENT_FIELDS[seg as usize];
+
+        let mut var = *save;
+        var.dpl = 0x3;
+        if seg == VcpuSegment::CS {
+            var.type_ = 0x3;
+        }
+
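+        // Without invalid-guest-state emulation, force a vm86-compatible descriptor:
+        // base = selector << 4, a 64 KiB limit and a ring-3 read/write data segment.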
+        if !vmx_info().emulate_invalid_guest_state {
+            var.selector = (var.base >> 4) as u16;
+            var.base &= 0xffff0;
+            var.limit = 0xffff;
+            var.g = 0;
+            var.db = 0;
+            var.present = 1;
+            var.s = 1;
+            var.l = 0;
+            var.unusable = 0;
+            var.type_ = 0x3;
+            var.avl = 0;
+            if save.base & 0xf != 0 {
+                warn!("segment base is not paragraph aligned when entering protected mode (seg={seg:?})");
+            }
+        }
+
+        VmxAsm::vmx_vmwrite(sf.selector, var.selector as u64);
+        VmxAsm::vmx_vmwrite(sf.base, var.base);
+        VmxAsm::vmx_vmwrite(sf.limit, var.limit as u64);
+        VmxAsm::vmx_vmwrite(sf.ar_bytes, var.vmx_segment_access_rights() as u64);
+    }
+
+    pub fn fix_pmode_seg(
+        &self,
+        vcpu: &mut VirtCpu,
+        seg: VcpuSegment,
+        mut save: UapiKvmSegment,
+    ) -> UapiKvmSegment {
+        if self.emulate_invalid_guest_state {
+            if seg == VcpuSegment::CS || seg == VcpuSegment::SS {
+                save.selector &= !0x3;
+            }
+
+            save.dpl = (save.selector & 0x3) as u8;
+            save.s = 1;
+        }
+
+        self._vmx_set_segment(vcpu, save, seg);
+
+        return save;
+    }
+
+    pub fn enter_pmode(&self, vcpu: &mut VirtCpu) {
+        self.get_segment_with_rmode(vcpu, VcpuSegment::ES);
+        self.get_segment_with_rmode(vcpu, VcpuSegment::DS);
+        self.get_segment_with_rmode(vcpu, VcpuSegment::FS);
+        self.get_segment_with_rmode(vcpu, VcpuSegment::GS);
+        self.get_segment_with_rmode(vcpu, VcpuSegment::SS);
+        self.get_segment_with_rmode(vcpu, VcpuSegment::CS);
+
+        vcpu.vmx_mut().rmode.vm86_active = false;
+
+        self.set_segment_with_rmode(vcpu, VcpuSegment::TR);
+
+        let mut flags = RFlags::from_bits_truncate(VmxAsm::vmx_vmread(guest::RFLAGS));
+
+        flags.remove(RFlags::FLAGS_IOPL3 | RFlags::FLAGS_VM);
+
+        flags.insert(vcpu.vmx().rmode.save_rflags & (RFlags::FLAGS_IOPL3 | RFlags::FLAGS_VM));
+
+        VmxAsm::vmx_vmwrite(guest::RFLAGS, flags.bits());
+
+        let cr4 = (Cr4::from_bits_truncate(VmxAsm::vmx_vmread(guest::CR4) as usize)
+            & (!Cr4::CR4_ENABLE_VME))
+            | (Cr4::from_bits_truncate(VmxAsm::vmx_vmread(control::CR4_READ_SHADOW) as usize)
+                & Cr4::CR4_ENABLE_VME);
+        VmxAsm::vmx_vmwrite(guest::CR4, cr4.bits() as u64);
+
+        VmxKvmFunc.update_exception_bitmap(vcpu);
+
+        self.fix_pmode_seg_with_rmode(vcpu, VcpuSegment::CS);
+        self.fix_pmode_seg_with_rmode(vcpu, VcpuSegment::SS);
+        self.fix_pmode_seg_with_rmode(vcpu, VcpuSegment::ES);
+        self.fix_pmode_seg_with_rmode(vcpu, VcpuSegment::DS);
+        self.fix_pmode_seg_with_rmode(vcpu, VcpuSegment::FS);
+        self.fix_pmode_seg_with_rmode(vcpu, VcpuSegment::GS);
+    }
+
+    fn fix_pmode_seg_with_rmode(&self, vcpu: &mut VirtCpu, seg: VcpuSegment) {
+        let segment = vcpu.vmx().rmode.segs[seg as usize];
+        vcpu.vmx_mut().rmode.segs[seg as usize] = self.fix_pmode_seg(vcpu, seg, segment);
+    }
+
+    fn get_segment_with_rmode(&self, vcpu: &mut VirtCpu, seg: VcpuSegment) {
+        let segment = vcpu.vmx().rmode.segs[seg as usize];
+        vcpu.vmx_mut().rmode.segs[seg as usize] = self.vmx_get_segment(vcpu, segment, seg);
+    }
+
+    fn set_segment_with_rmode(&self, vcpu: &mut VirtCpu, seg: VcpuSegment) {
+        let segment = vcpu.vmx().rmode.segs[seg as usize];
+        vcpu.vmx_mut().rmode.segs[seg as usize] = self._vmx_set_segment(vcpu, segment, seg);
+    }
+
+    pub fn enter_rmode(&self, vcpu: &mut VirtCpu, vm: &Vm) {
+        let kvm_vmx = vm.kvm_vmx();
+
+        self.get_segment_with_rmode(vcpu, VcpuSegment::TR);
+        self.get_segment_with_rmode(vcpu, VcpuSegment::ES);
+        self.get_segment_with_rmode(vcpu, VcpuSegment::DS);
+        self.get_segment_with_rmode(vcpu, VcpuSegment::FS);
+        self.get_segment_with_rmode(vcpu, VcpuSegment::GS);
+        self.get_segment_with_rmode(vcpu, VcpuSegment::SS);
+        self.get_segment_with_rmode(vcpu, VcpuSegment::CS);
+
+        vcpu.vmx_mut().rmode.vm86_active = true;
+
+        vcpu.vmx_mut().segment_cache_clear();
+
+        VmxAsm::vmx_vmwrite(guest::TR_BASE, kvm_vmx.tss_addr as u64);
+        VmxAsm::vmx_vmwrite(guest::TR_LIMIT, RMODE_TSS_SIZE as u64 - 1);
+        VmxAsm::vmx_vmwrite(guest::TR_ACCESS_RIGHTS, 0x008b);
+
+        let mut flags = RFlags::from_bits_truncate(VmxAsm::vmx_vmread(guest::RFLAGS));
+        vcpu.vmx_mut().rmode.save_rflags = flags;
+
+        flags.insert(RFlags::FLAGS_IOPL3 | RFlags::FLAGS_VM);
+
+        VmxAsm::vmx_vmwrite(guest::RFLAGS, flags.bits());
+        VmxAsm::vmx_vmwrite(
+            guest::CR4,
+            VmxAsm::vmx_vmread(guest::CR4) | Cr4::CR4_ENABLE_VME.bits() as u64,
+        );
+
+        VmxKvmFunc.update_exception_bitmap(vcpu);
+
+        self.fix_rmode_seg_with_rmode(vcpu, VcpuSegment::SS);
+        self.fix_rmode_seg_with_rmode(vcpu, VcpuSegment::CS);
+        self.fix_rmode_seg_with_rmode(vcpu, VcpuSegment::ES);
+        self.fix_rmode_seg_with_rmode(vcpu, VcpuSegment::DS);
+        self.fix_rmode_seg_with_rmode(vcpu, VcpuSegment::GS);
+        self.fix_rmode_seg_with_rmode(vcpu, VcpuSegment::FS);
+    }
+
+    fn fix_rmode_seg_with_rmode(&self, vcpu: &VirtCpu, seg: VcpuSegment) {
+        Vmx::fix_rmode_seg(seg, &vcpu.vmx().rmode.segs[seg as usize]);
+    }
+
+    pub fn vmx_read_guest_seg_ar(vcpu: &mut VirtCpu, seg: VcpuSegment) -> u32 {
+        if !Vmx::vmx_segment_cache_test_set(vcpu, seg, SegmentCacheField::AR) {
+            vcpu.vmx_mut().segment_cache.seg[seg as usize].ar =
+                VmxAsm::vmx_vmread(KVM_VMX_SEGMENT_FIELDS[seg as usize].ar_bytes) as u32;
+        }
+
+        return vcpu.vmx().segment_cache.seg[seg as usize].ar;
+    }
+
+    pub fn vmx_read_guest_seg_selector(vcpu: &mut VirtCpu, seg: VcpuSegment) -> u16 {
+        if !Vmx::vmx_segment_cache_test_set(vcpu, seg, SegmentCacheField::SEL) {
+            vcpu.vmx_mut().segment_cache.seg[seg as usize].selector =
+                VmxAsm::vmx_vmread(KVM_VMX_SEGMENT_FIELDS[seg as usize].selector) as u16;
+        }
+
+        return vcpu.vmx().segment_cache.seg[seg as usize].selector;
+    }
+
+    pub fn vmx_read_guest_seg_base(vcpu: &mut VirtCpu, seg: VcpuSegment) -> u64 {
+        if !Vmx::vmx_segment_cache_test_set(vcpu, seg, SegmentCacheField::BASE) {
+            vcpu.vmx_mut().segment_cache.seg[seg as usize].base =
+                VmxAsm::vmx_vmread(KVM_VMX_SEGMENT_FIELDS[seg as usize].base);
+        }
+
+        return vcpu.vmx().segment_cache.seg[seg as usize].base;
+    }
+
+    pub fn vmx_read_guest_seg_limit(vcpu: &mut VirtCpu, seg: VcpuSegment) -> u32 {
+        if !Vmx::vmx_segment_cache_test_set(vcpu, seg, SegmentCacheField::LIMIT) {
+            vcpu.vmx_mut().segment_cache.seg[seg as usize].limit =
+                VmxAsm::vmx_vmread(KVM_VMX_SEGMENT_FIELDS[seg as usize].limit) as u32;
+        }
+
+        return vcpu.vmx().segment_cache.seg[seg as usize].limit;
+    }
+
+    fn vmx_segment_cache_test_set(
+        vcpu: &mut VirtCpu,
+        seg: VcpuSegment,
+        field: SegmentCacheField,
+    ) -> bool {
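+        // One bit per (segment, field) pair: return whether the field was already cached,
+        // and mark it cached so the caller refills it on a miss.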
+        let mask = 1u32 << (seg as usize * SegmentCacheField::NR as usize + field as usize);
+
+        if !vcpu.arch.is_register_available(KvmReg::VcpuExregSegments) {
+            vcpu.arch.mark_register_available(KvmReg::VcpuExregSegments);
+            vcpu.vmx_mut().segment_cache_clear();
+        }
+
+        let ret = vcpu.vmx().segment_cache.bitmask & mask;
+
+        vcpu.vmx_mut().segment_cache.bitmask |= mask;
+
+        return ret != 0;
+    }
+
+    pub fn vmx_vcpu_enter_exit(vcpu: &mut VirtCpu, flags: VmxRunFlag) {
+        // TODO: vmx_l1d_should_flush and mmio_stale_data_clear
+
+        // TODO: vmx_disable_fb_clear
+
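+        // Hardware does not context-switch CR2, so it is saved and restored manually
+        // around the guest run.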
+        if vcpu.arch.cr2 != unsafe { cr2() } as u64 {
+            unsafe { cr2_write(vcpu.arch.cr2) };
+        }
+
+        let fail =
+            unsafe { __vmx_vcpu_run(vcpu.vmx(), vcpu.arch.regs.as_ptr(), flags.bits as u32) };
+
+        vcpu.vmx_mut().fail = fail as u8;
+
+        vcpu.arch.cr2 = unsafe { cr2() } as u64;
+        vcpu.arch.regs_avail.set_all(true);
+
+        // These registers must be re-read from the VMCS on demand; mark their cached copies as stale.
+        for reg_idx in Vmx::VMX_REGS_LAZY_LOAD_SET {
+            vcpu.arch.regs_avail.set(*reg_idx, false);
+        }
+
+        vcpu.vmx_mut().idt_vectoring_info = IntrInfo::empty();
+
+        // TODO: enable_fb_clear
+
+        if unlikely(vcpu.vmx().fail != 0) {
+            vcpu.vmx_mut().exit_reason = VmxExitReason::from(0xdead);
+            return;
+        }
+
+        vcpu.vmx_mut().exit_reason =
+            VmxExitReason::from(VmxAsm::vmx_vmread(ro::EXIT_REASON) as u32);
+
+        if likely(!vcpu.vmx().exit_reason.failed_vmentry()) {
+            vcpu.vmx_mut().idt_vectoring_info =
+                IntrInfo::from_bits_truncate(VmxAsm::vmx_vmread(ro::IDT_VECTORING_INFO) as u32);
+        }
+
+        if VmxExitReasonBasic::from(vcpu.vmx().exit_reason.basic())
+            == VmxExitReasonBasic::EXCEPTION_OR_NMI
+            && VmcsIntrHelper::is_nmi(&Vmx::vmx_get_intr_info(vcpu))
+        {
+            todo!()
+        }
+    }
+
+    fn vmx_get_intr_info(vcpu: &mut VirtCpu) -> IntrInfo {
+        if !vcpu
+            .arch
+            .test_and_mark_available(KvmReg::VcpuExregExitInfo2)
+        {
+            vcpu.vmx_mut().exit_intr_info = IntrInfo::from_bits_truncate(VmxAsm::vmx_vmread(
+                ro::VMEXIT_INTERRUPTION_INFO,
+            ) as u32);
+        }
+
+        return vcpu.vmx_mut().exit_intr_info;
+    }
+
+    pub fn vmx_exit_handlers_fastpath(vcpu: &mut VirtCpu) -> ExitFastpathCompletion {
+        match VmxExitReasonBasic::from(vcpu.vmx().exit_reason.basic()) {
+            VmxExitReasonBasic::WRMSR => {
+                todo!()
+            }
+            VmxExitReasonBasic::VMX_PREEMPTION_TIMER_EXPIRED => {
+                todo!()
+            }
+            _ => ExitFastpathCompletion::None,
+        }
+    }
+
+    pub fn vmx_handle_exit(
+        &self,
+        vcpu: &mut VirtCpu,
+        vm: &Vm,
+        exit_fastpath: ExitFastpathCompletion,
+    ) -> Result<i32, SystemError> {
+        let exit_reason = vcpu.vmx().exit_reason;
+        // self.dump_vmcs(vcpu);
+        {
+            let reason = self.vmread(ro::EXIT_REASON);
+            debug!("vm_exit reason 0x{:x}\n", reason);
+        }
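+        // Exits we cannot handle are reported to userspace as KVM_EXIT_INTERNAL_ERROR,
+        // together with the raw exit reason and the CPU of the last VM entry.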
+        let unexpected_vmexit = |vcpu: &mut VirtCpu| -> Result<i32, SystemError> {
+            error!("vmx: unexpected exit reason {:?}\n", exit_reason);
+
+            self.dump_vmcs(vcpu);
+
+            let cpu = vcpu.arch.last_vmentry_cpu.into() as u64;
+            let run = vcpu.kvm_run_mut();
+            run.exit_reason = kvm_exit::KVM_EXIT_INTERNAL_ERROR;
+
+            unsafe {
+                run.__bindgen_anon_1.internal.ndata = 2;
+                run.__bindgen_anon_1.internal.data[0] = Into::<u32>::into(exit_reason) as u64;
+                run.__bindgen_anon_1.internal.data[1] = cpu;
+            }
+
+            return Ok(0);
+        };
+
+        let vectoring_info = vcpu.vmx().idt_vectoring_info;
+
+        if self.enable_pml && !vcpu.arch.is_guest_mode() {
+            todo!()
+        }
+
+        if vcpu.arch.is_guest_mode() {
+            if exit_reason.basic() == VmxExitReasonBasic::PML_FULL as u16 {
+                return unexpected_vmexit(vcpu);
+            }
+
+            todo!()
+        }
+
+        if vcpu.vmx().emulation_required {
+            todo!()
+        }
+
+        if exit_reason.failed_vmentry() {
+            self.dump_vmcs(vcpu);
+            todo!()
+        }
+
+        if unlikely(vcpu.vmx().fail != 0) {
+            self.dump_vmcs(vcpu);
+            todo!()
+        }
+
+        let basic = VmxExitReasonBasic::from(exit_reason.basic());
+        if vectoring_info.contains(IntrInfo::INTR_INFO_VALID_MASK)
+            && basic != VmxExitReasonBasic::EXCEPTION_OR_NMI
+            && basic != VmxExitReasonBasic::EPT_VIOLATION
+            && basic != VmxExitReasonBasic::PML_FULL
+            && basic != VmxExitReasonBasic::APIC_ACCESS
+            && basic != VmxExitReasonBasic::TASK_SWITCH
+            && basic != VmxExitReasonBasic::NOTIFY
+        {
+            todo!()
+        }
+
+        if unlikely(!self.enable_pml && vcpu.vmx().loaded_vmcs().soft_vnmi_blocked) {
+            todo!()
+        }
+
+        if exit_fastpath != ExitFastpathCompletion::None {
+            return Err(SystemError::EINVAL);
+        }
+
+        match VmxExitHandlers::try_handle_exit(
+            vcpu,
+            vm,
+            VmxExitReasonBasic::from(exit_reason.basic()),
+        ) {
+            Some(Ok(r)) => {
+                debug!("vmx: handled exit return {:?}\n", r);
+                return Ok(r);
+            }
+            Some(Err(_)) | None => unexpected_vmexit(vcpu),
+        }
+    }
+
+    #[allow(unreachable_code)]
+    pub fn handle_external_interrupt_irqoff(vcpu: &mut VirtCpu) {
+        let intr_info = Vmx::vmx_get_intr_info(vcpu);
+        let _vector = intr_info & IntrInfo::INTR_INFO_VECTOR_MASK;
+        // let desc = vmx_info().host_idt_base + vector.bits() as u64;
+        if !VmcsIntrHelper::is_external_intr(&intr_info) {
+            error!("unexpected VM-Exit interrupt info: {:?}", intr_info);
+            return;
+        }
+
+        vcpu.arch.kvm_before_interrupt(KvmIntrType::Irq);
+        // TODO
+        warn!("handle_external_interrupt_irqoff TODO");
+        vcpu.arch.kvm_after_interrupt();
+
+        vcpu.arch.at_instruction_boundary = true;
+    }
+
+    /// Registers whose cached values are reloaded lazily from the VMCS. Registers not listed here are synced into the cache immediately after a VM exit.
+    pub const VMX_REGS_LAZY_LOAD_SET: &'static [usize] = &[
+        KvmReg::VcpuRegsRip as usize,
+        KvmReg::VcpuRegsRsp as usize,
+        KvmReg::VcpuExregRflags as usize,
+        KvmReg::NrVcpuRegs as usize,
+        KvmReg::VcpuExregSegments as usize,
+        KvmReg::VcpuExregCr0 as usize,
+        KvmReg::VcpuExregCr3 as usize,
+        KvmReg::VcpuExregCr4 as usize,
+        KvmReg::VcpuExregExitInfo1 as usize,
+        KvmReg::VcpuExregExitInfo2 as usize,
+    ];
+}
+
+extern "C" {
+    /// #[allow(improper_ctypes)]: this stub is only called from Rust and never crosses a real C ABI boundary.
+    #[allow(improper_ctypes)]
+    fn __vmx_vcpu_run(vmx: &VmxVCpuPriv, regs: *const u64, flags: u32) -> i32;
+}
+
+struct VmcsEntryExitPair {
+    entry: EntryControls,
+    exit: ExitControls,
+}
+
+impl VmcsEntryExitPair {
+    pub const fn new(entry: EntryControls, exit: ExitControls) -> Self {
+        Self { entry, exit }
+    }
+}
+
+#[derive(Debug, Default)]
+#[repr(C, align(64))]
+pub struct PostedIntrDesc {
+    pir: [u32; 8],
+    control: PostedIntrDescControl,
+    // reserved bits
+    rsvd: [u32; 6],
+}
+
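+/// Control word of the posted-interrupt descriptor: the ON (outstanding-notification) and
+/// SN (suppress-notification) bits, the notification vector (NV) and the notification
+/// destination (NDST), matching the layout given in the Intel SDM.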
+#[bitfield(u64)]
+pub struct PostedIntrDescControl {
+    #[bits(1)]
+    on: bool,
+    #[bits(1)]
+    sn: bool,
+    #[bits(14)]
+    rsvd_1: u16,
+    nv: u8,
+    rsvd_2: u8,
+    ndst: u32,
+}
+
+#[derive(Debug, Default, Clone, Copy)]
+pub struct VmxUretMsr {
+    load_into_hardware: bool,
+    data: u64,
+    mask: u64,
+}
+
+#[derive(Debug, Default)]
+pub struct VmxMsrs {
+    nr: usize,
+    val: [VmxMsrEntry; Self::MAX_NR_LOADSTORE_MSRS],
+}
+
+impl VmxMsrs {
+    pub const MAX_NR_LOADSTORE_MSRS: usize = 8;
+
+    pub fn find_loadstore_msr_slot(&self, msr: u32) -> Option<usize> {
+        return (0..self.nr).find(|&i| self.val[i].index == msr);
+    }
+}
+
+#[derive(Debug, Default)]
+pub struct VmxMsrAutoLoad {
+    guest: VmxMsrs,
+    host: VmxMsrs,
+}
+
+#[derive(Debug)]
+pub struct VmxRMode {
+    pub vm86_active: bool,
+    pub save_rflags: RFlags,
+    pub segs: [UapiKvmSegment; 8],
+}
+
+impl Default for VmxRMode {
+    fn default() -> Self {
+        Self {
+            vm86_active: false,
+            save_rflags: RFlags::empty(),
+            segs: [UapiKvmSegment::default(); 8],
+        }
+    }
+}
+
+#[derive(Debug, Clone, Copy, Default)]
+pub struct VmxSaveSegment {
+    selector: u16,
+    base: u64,
+    limit: u32,
+    ar: u32,
+}
+
+#[derive(Debug, Default)]
+pub struct VmxSegmentCache {
+    pub bitmask: u32,
+    pub seg: [VmxSaveSegment; 8],
+}
+
+#[derive(Debug)]
+#[allow(dead_code)]
+pub struct VmxVCpuPriv {
+    vpid: u16,
+
+    fail: u8,
+
+    exit_reason: VmxExitReason,
+
+    exit_intr_info: IntrInfo,
+
+    idt_vectoring_info: IntrInfo,
+
+    vmcs01: Arc<LockedLoadedVmcs>,
+    loaded_vmcs: Arc<LockedLoadedVmcs>,
+    guest_uret_msrs: [VmxUretMsr; KvmArchManager::KVM_MAX_NR_USER_RETURN_MSRS],
+    guest_uret_msrs_loaded: bool,
+
+    post_intr_desc: PostedIntrDesc,
+
+    shadow_msr_intercept_read: AllocBitmap,
+    shadow_msr_intercept_write: AllocBitmap,
+
+    msr_ia32_feature_control: u64,
+    msr_ia32_feature_control_valid_bits: u64,
+
+    msr_host_kernel_gs_base: u64,
+    msr_guest_kernel_gs_base: u64,
+
+    emulation_required: bool,
+
+    rflags: RFlags,
+
+    ple_window: u32,
+    ple_window_dirty: bool,
+
+    msr_autoload: VmxMsrAutoLoad,
+    msr_autostore: VmxMsrs,
+
+    pml_pg: Box<[u8; MMArch::PAGE_SIZE]>,
+
+    rmode: VmxRMode,
+
+    spec_ctrl: u64,
+    msr_ia32_umwait_control: u32,
+    hv_deadline_tsc: u64,
+
+    segment_cache: VmxSegmentCache,
+
+    req_immediate_exit: bool,
+    guest_state_loaded: bool,
+
+    exit_qualification: u64, // purpose not yet clear (fztodo)
+}
+
+#[derive(Debug, Default)]
+#[allow(dead_code)]
+pub struct KvmVmx {
+    tss_addr: usize,
+    ept_identity_pagetable_done: bool,
+    ept_identity_map_addr: u64,
+    pid_table: Option<Box<[u64; MMArch::PAGE_SIZE]>>,
+}
+
+impl KvmVmx {
+    pub fn pid_table(&self) -> &[u64; MMArch::PAGE_SIZE] {
+        self.pid_table.as_ref().unwrap().as_ref()
+    }
+}
+
+impl VmxVCpuPriv {
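+    /// Number of guest-physical-address entries in the PML buffer: one 4 KiB page holds
+    /// 512 8-byte entries, and the PML index starts at `PML_ENTITY_NUM - 1` and counts down.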
+    pub const PML_ENTITY_NUM: usize = 512;
+
+    pub fn loaded_vmcs(&self) -> SpinLockGuard<LoadedVmcs> {
+        self.loaded_vmcs.lock()
+    }
+
+    /// Reference: https://code.dragonos.org.cn/xref/linux-6.6.21/arch/x86/kvm/vmx/vmx.c#7452
+    pub fn init(vcpu: &mut VirtCpu, vm: &Vm) {
+        let vmcs = LockedLoadedVmcs::new();
+
+        // TODO: move this allocation to the heap
+        let mut vmx = Self {
+            vpid: 0,
+            fail: 0,
+            vmcs01: vmcs.clone(),
+            loaded_vmcs: vmcs,
+            guest_uret_msrs: [VmxUretMsr::default(); KvmArchManager::KVM_MAX_NR_USER_RETURN_MSRS],
+            shadow_msr_intercept_read: AllocBitmap::new(16),
+            shadow_msr_intercept_write: AllocBitmap::new(16),
+            post_intr_desc: PostedIntrDesc::default(),
+            ple_window: 0,
+            ple_window_dirty: false,
+            msr_autoload: VmxMsrAutoLoad::default(),
+            pml_pg: unsafe { Box::new_zeroed().assume_init() },
+            guest_uret_msrs_loaded: false,
+            msr_ia32_feature_control: 0,
+            msr_ia32_feature_control_valid_bits: 0,
+            rmode: VmxRMode::default(),
+            spec_ctrl: 0,
+            msr_ia32_umwait_control: 0,
+            hv_deadline_tsc: u64::MAX,
+            segment_cache: VmxSegmentCache::default(),
+            emulation_required: false,
+            rflags: RFlags::empty(),
+            req_immediate_exit: false,
+            guest_state_loaded: false,
+            msr_host_kernel_gs_base: 0,
+            msr_guest_kernel_gs_base: 0,
+            idt_vectoring_info: IntrInfo::empty(),
+            exit_reason: VmxExitReason::new(),
+            exit_intr_info: IntrInfo::empty(),
+            msr_autostore: VmxMsrs::default(),
+            exit_qualification: 0, //fztodo
+        };
+
+        vmx.vpid = vmx_info().alloc_vpid().unwrap_or_default() as u16;
+
+        for i in 0..x86_kvm_manager().kvm_uret_msrs_list.len() {
+            vmx.guest_uret_msrs[i].mask = u64::MAX;
+        }
+
+        if CpuId::new().get_extended_feature_info().unwrap().has_rtm() {
+            let tsx_ctrl = vmx.find_uret_msr_mut(msr::MSR_IA32_TSX_CTRL);
+            if let Some((_idx, tsx_ctrl)) = tsx_ctrl {
+                // Disable TSX enumeration
+                tsx_ctrl.mask = !(1 << 1);
+            }
+        }
+
+        vmx.shadow_msr_intercept_read.set_all(true);
+        vmx.shadow_msr_intercept_write.set_all(true);
+
+        let arch = &vm.arch;
+
+        vmx.disable_intercept_for_msr(arch, msr::IA32_TIME_STAMP_COUNTER, MsrType::READ);
+        vmx.disable_intercept_for_msr(arch, msr::IA32_FS_BASE, MsrType::RW);
+        vmx.disable_intercept_for_msr(arch, msr::IA32_GS_BASE, MsrType::RW);
+        vmx.disable_intercept_for_msr(arch, msr::IA32_KERNEL_GSBASE, MsrType::RW);
+
+        vmx.disable_intercept_for_msr(arch, msr::IA32_SYSENTER_CS, MsrType::RW);
+        vmx.disable_intercept_for_msr(arch, msr::IA32_SYSENTER_ESP, MsrType::RW);
+        vmx.disable_intercept_for_msr(arch, msr::IA32_SYSENTER_EIP, MsrType::RW);
+
+        if arch.pause_in_guest {
+            vmx.disable_intercept_for_msr(arch, msr::MSR_CORE_C1_RESIDENCY, MsrType::READ);
+            vmx.disable_intercept_for_msr(arch, msr::MSR_CORE_C3_RESIDENCY, MsrType::READ);
+            vmx.disable_intercept_for_msr(arch, msr::MSR_CORE_C6_RESIDENCY, MsrType::READ);
+            vmx.disable_intercept_for_msr(arch, msr::MSR_CORE_C7_RESIDENCY, MsrType::READ);
+        }
+
+        if vmx_info().enable_flexpriority && vcpu.arch.lapic_in_kernel() {
+            todo!()
+        }
+
+        if vmx_info().enable_ept && !vmx_info().enable_unrestricted_guest {
+            todo!()
+        }
+
+        if vcpu.arch.lapic_in_kernel() && vmx_info().enable_ipiv {
+            todo!()
+        }
+
+        // Install the VMX private data into the vCPU
+        vcpu.private = Some(vmx);
+    }
+
+    pub fn find_uret_msr(&self, msr: u32) -> Option<(usize, &VmxUretMsr)> {
+        let idx = x86_kvm_manager().find_user_return_msr_idx(msr);
+        if let Some(index) = idx {
+            return Some((index, &self.guest_uret_msrs[index]));
+        } else {
+            return None;
+        }
+    }
+
+    fn set_uret_msr(&mut self, msr: u32, data: u64) {
+        if let Some((_idx, msr)) = self.find_uret_msr_mut(msr) {
+            msr.data = data;
+        }
+    }
+
+    pub fn find_uret_msr_mut(&mut self, msr: u32) -> Option<(usize, &mut VmxUretMsr)> {
+        let idx = x86_kvm_manager().find_user_return_msr_idx(msr);
+        if let Some(index) = idx {
+            return Some((index, &mut self.guest_uret_msrs[index]));
+        } else {
+            return None;
+        }
+    }
+
+    fn set_guest_uret_msr(&mut self, slot: usize, data: u64) -> Result<(), SystemError> {
+        let msr = &mut self.guest_uret_msrs[slot];
+        if msr.load_into_hardware {
+            x86_kvm_manager().kvm_set_user_return_msr(slot, data, msr.mask);
+        }
+
+        msr.data = data;
+
+        Ok(())
+    }
+
+    /// ## Disable interception of the given MSR
+    fn disable_intercept_for_msr(&mut self, arch: &KvmArch, msr: u32, mut msr_type: MsrType) {
+        if !vmx_info().has_msr_bitmap() {
+            return;
+        }
+
+        let msr_bitmap = &mut self.vmcs01.lock().msr_bitmap;
+
+        // TODO: https://code.dragonos.org.cn/xref/linux-6.6.21/arch/x86/kvm/vmx/vmx.c#3974
+        // nested VMX handling
+
+        if Vmx::is_valid_passthrough_msr(msr) {
+            if let Some(idx) = Vmx::possible_passthrough_msr_slot(msr) {
+                if msr_type.contains(MsrType::READ) {
+                    self.shadow_msr_intercept_read.set(idx, false);
+                }
+                if msr_type.contains(MsrType::WRITE) {
+                    self.shadow_msr_intercept_write.set(idx, false);
+                }
+            }
+        }
+
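+        // A userspace MSR filter overrides the requested pass-through: if the filter denies
+        // the access, keep the corresponding intercept bit set instead of clearing it.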
+        if msr_type.contains(MsrType::READ)
+            && !arch.msr_allowed(msr, MsrFilterType::KVM_MSR_FILTER_READ)
+        {
+            msr_bitmap.ctl(msr, VmxMsrBitmapAction::Set, VmxMsrBitmapAccess::Read);
+            msr_type.remove(MsrType::READ);
+        }
+
+        if msr_type.contains(MsrType::WRITE)
+            && !arch.msr_allowed(msr, MsrFilterType::KVM_MSR_FILTER_WRITE)
+        {
+            msr_bitmap.ctl(msr, VmxMsrBitmapAction::Set, VmxMsrBitmapAccess::Write);
+            msr_type.remove(MsrType::WRITE);
+        }
+
+        if msr_type.contains(MsrType::READ) {
+            msr_bitmap.ctl(msr, VmxMsrBitmapAction::Clear, VmxMsrBitmapAccess::Read);
+        }
+
+        if msr_type.contains(MsrType::WRITE) {
+            msr_bitmap.ctl(msr, VmxMsrBitmapAction::Clear, VmxMsrBitmapAccess::Write);
+        }
+    }
+
+    #[inline]
+    pub fn segment_cache_clear(&mut self) {
+        self.segment_cache.bitmask = 0;
+    }
+
+    pub fn clear_atomic_switch_msr(&mut self, msr: u32) {
+        match msr {
+            msr::IA32_EFER => {
+                if vmx_info().has_load_ia32_efer() {
+                    self.clear_stomic_switch_msr_special(
+                        EntryControls::LOAD_IA32_EFER.bits().into(),
+                        ExitControls::LOAD_IA32_EFER.bits().into(),
+                    );
+                    return;
+                }
+            }
+
+            msr::MSR_PERF_GLOBAL_CTRL => {
+                if vmx_info().has_load_perf_global_ctrl() {
+                    self.clear_stomic_switch_msr_special(
+                        EntryControls::LOAD_IA32_PERF_GLOBAL_CTRL.bits().into(),
+                        ExitControls::LOAD_IA32_PERF_GLOBAL_CTRL.bits().into(),
+                    );
+                    return;
+                }
+            }
+            _ => {}
+        }
+
+        let m = &mut self.msr_autoload;
+        let i = m.guest.find_loadstore_msr_slot(msr);
+
+        if let Some(i) = i {
+            m.guest.nr -= 1;
+            m.guest.val[i] = m.guest.val[m.guest.nr];
+            VmxAsm::vmx_vmwrite(control::VMENTRY_MSR_LOAD_COUNT, m.guest.nr as u64);
+        }
+
+        let i = m.host.find_loadstore_msr_slot(msr);
+        if let Some(i) = i {
+            m.host.nr -= 1;
+            m.host.val[i] = m.host.val[m.host.nr];
+            VmxAsm::vmx_vmwrite(control::VMEXIT_MSR_LOAD_COUNT, m.host.nr as u64);
+        }
+    }
+
+    fn clear_stomic_switch_msr_special(&self, entry: u64, exit: u64) {
+        let mut guard = self.loaded_vmcs.lock();
+        guard.controls_clearbit(ControlsType::VmEntry, entry);
+        guard.controls_clearbit(ControlsType::VmExit, exit);
+    }
+
+    pub fn add_atomic_switch_msr(
+        &mut self,
+        msr: u32,
+        guest_val: u64,
+        host_val: u64,
+        entry_only: bool,
+    ) {
+        match msr {
+            msr::IA32_EFER => {
+                if vmx_info().has_load_ia32_efer() {
+                    self.add_atomic_switch_msr_special(
+                        EntryControls::LOAD_IA32_EFER.bits() as u64,
+                        ExitControls::LOAD_IA32_EFER.bits() as u64,
+                        guest::IA32_EFER_FULL,
+                        host::IA32_EFER_FULL,
+                        guest_val,
+                        host_val,
+                    );
+                    return;
+                }
+            }
+            msr::MSR_PERF_GLOBAL_CTRL => {
+                if vmx_info().has_load_perf_global_ctrl() {
+                    self.add_atomic_switch_msr_special(
+                        EntryControls::LOAD_IA32_PERF_GLOBAL_CTRL.bits().into(),
+                        ExitControls::LOAD_IA32_PERF_GLOBAL_CTRL.bits().into(),
+                        guest::IA32_PERF_GLOBAL_CTRL_FULL,
+                        host::IA32_PERF_GLOBAL_CTRL_FULL,
+                        guest_val,
+                        host_val,
+                    );
+                    return;
+                }
+            }
+            msr::MSR_PEBS_ENABLE => {
+                unsafe { wrmsr(msr::MSR_PEBS_ENABLE, 0) };
+            }
+
+            _ => {}
+        }
+
+        let m = &mut self.msr_autoload;
+        let i = m.guest.find_loadstore_msr_slot(msr);
+        let j = if !entry_only {
+            m.host.find_loadstore_msr_slot(msr)
+        } else {
+            Some(0)
+        };
+
+        if (i.is_none() && m.guest.nr == VmxMsrs::MAX_NR_LOADSTORE_MSRS)
+            || (j.is_none() && m.host.nr == VmxMsrs::MAX_NR_LOADSTORE_MSRS)
+        {
+            warn!("Not enough msr switch entries. Can't add msr 0x{:x}", msr);
+            return;
+        }
+
+        let i = if let Some(i) = i {
+            i
+        } else {
+            // Append to the guest autoload list: the new slot is the old count.
+            let slot = m.guest.nr;
+            m.guest.nr += 1;
+            VmxAsm::vmx_vmwrite(control::VMENTRY_MSR_LOAD_COUNT, m.guest.nr as u64);
+            slot
+        };
+
+        m.guest.val[i].index = msr;
+        m.guest.val[i].data = guest_val;
+
+        if entry_only {
+            return;
+        }
+
+        let j = if let Some(j) = j {
+            j
+        } else {
+            // Append to the host autoload list in the same way.
+            let slot = m.host.nr;
+            m.host.nr += 1;
+            VmxAsm::vmx_vmwrite(control::VMEXIT_MSR_LOAD_COUNT, m.host.nr as u64);
+            slot
+        };
+
+        m.host.val[j].index = msr;
+        m.host.val[j].data = host_val;
+    }
+
+    fn add_atomic_switch_msr_special(
+        &self,
+        entry: u64,
+        exit: u64,
+        guest_val_vmcs: u32,
+        host_val_vmcs: u32,
+        guest_val: u64,
+        host_val: u64,
+    ) {
+        VmxAsm::vmx_vmwrite(guest_val_vmcs, guest_val);
+        if host_val_vmcs != host::IA32_EFER_FULL {
+            VmxAsm::vmx_vmwrite(host_val_vmcs, host_val);
+        }
+
+        let mut guard = self.loaded_vmcs.lock();
+        guard.controls_setbit(ControlsType::VmEntry, entry);
+        guard.controls_setbit(ControlsType::VmExit, exit);
+    }
+
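+    /// Compute the flags passed to `__vmx_vcpu_run`: VMRESUME when this VMCS
+    /// has already been launched, and SAVE_SPEC_CTRL when writes to
+    /// IA32_SPEC_CTRL (MSR 0x48) are not intercepted.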
+    pub fn vmx_vcpu_run_flags(&self) -> VmxRunFlag {
+        let mut flags = VmxRunFlag::empty();
+
+        if self.loaded_vmcs().launched {
+            flags.insert(VmxRunFlag::VMRESUME);
+        }
+
+        // MSR_IA32_SPEC_CTRL
+        if !self.loaded_vmcs().msr_write_intercepted(0x48) {
+            flags.insert(VmxRunFlag::SAVE_SPEC_CTRL);
+        }
+
+        flags
+    }
+    pub fn get_exit_qual(&self) -> u64 {
+        self.exit_qualification
+    }
+    pub fn vmread_exit_qual(&mut self) {
+        self.exit_qualification = VmxAsm::vmx_vmread(ro::EXIT_QUALIFICATION);
+    }
+}
+
+bitflags! {
+    pub struct MsrType: u8 {
+        const READ = 1;
+        const WRITE = 2;
+        const RW = 3;
+    }
+
+    //https://code.dragonos.org.cn/xref/linux-6.6.21/arch/x86/include/asm/kvm_host.h#249
+    pub struct PageFaultErr: u64 {
+        const PFERR_PRESENT = 1 << 0;
+        const PFERR_WRITE = 1 << 1;
+        const PFERR_USER = 1 << 2;
+        const PFERR_RSVD = 1 << 3;
+        const PFERR_FETCH = 1 << 4;
+        const PFERR_PK = 1 << 5;
+        const PFERR_SGX = 1 << 15;
+        const PFERR_GUEST_FINAL = 1 << 32;
+        const PFERR_GUEST_PAGE = 1 << 33;
+        const PFERR_IMPLICIT_ACCESS = 1 << 48;
+    }
+
+    pub struct VmxRunFlag: u8 {
+        const VMRESUME = 1 << 0;
+        const SAVE_SPEC_CTRL = 1 << 1;
+    }
+}
+
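+/// State of the L1D cache flush mitigation for L1TF (the states correspond to
+/// Linux's `vmentry_l1d_flush` handling).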
+#[derive(Debug, PartialEq)]
+#[allow(dead_code)]
+pub enum VmxL1dFlushState {
+    Auto,
+    Never,
+    Cond,
+    Always,
+    EptDisabled,
+    NotRequired,
+}
+
+#[derive(Debug, PartialEq)]
+pub struct VmxSegmentField {
+    selector: u32,
+    base: u32,
+    limit: u32,
+    ar_bytes: u32,
+}
+/// VMCS field encodings (selector, base, limit, access rights) for each guest
+/// segment register, in the order ES, CS, SS, DS, FS, GS, TR, LDTR.
+pub const KVM_VMX_SEGMENT_FIELDS: &[VmxSegmentField] = &[
+    // ES
+    VmxSegmentField {
+        selector: guest::ES_SELECTOR,
+        base: guest::ES_BASE,
+        limit: guest::ES_LIMIT,
+        ar_bytes: guest::ES_ACCESS_RIGHTS,
+    },
+    // CS
+    VmxSegmentField {
+        selector: guest::CS_SELECTOR,
+        base: guest::CS_BASE,
+        limit: guest::CS_LIMIT,
+        ar_bytes: guest::CS_ACCESS_RIGHTS,
+    },
+    // SS
+    VmxSegmentField {
+        selector: guest::SS_SELECTOR,
+        base: guest::SS_BASE,
+        limit: guest::SS_LIMIT,
+        ar_bytes: guest::SS_ACCESS_RIGHTS,
+    },
+    // DS
+    VmxSegmentField {
+        selector: guest::DS_SELECTOR,
+        base: guest::DS_BASE,
+        limit: guest::DS_LIMIT,
+        ar_bytes: guest::DS_ACCESS_RIGHTS,
+    },
+    // FS
+    VmxSegmentField {
+        selector: guest::FS_SELECTOR,
+        base: guest::FS_BASE,
+        limit: guest::FS_LIMIT,
+        ar_bytes: guest::FS_ACCESS_RIGHTS,
+    },
+    // GS
+    VmxSegmentField {
+        selector: guest::GS_SELECTOR,
+        base: guest::GS_BASE,
+        limit: guest::GS_LIMIT,
+        ar_bytes: guest::GS_ACCESS_RIGHTS,
+    },
+    // TR
+    VmxSegmentField {
+        selector: guest::TR_SELECTOR,
+        base: guest::TR_BASE,
+        limit: guest::TR_LIMIT,
+        ar_bytes: guest::TR_ACCESS_RIGHTS,
+    },
+    // LDTR
+    VmxSegmentField {
+        selector: guest::LDTR_SELECTOR,
+        base: guest::LDTR_BASE,
+        limit: guest::LDTR_LIMIT,
+        ar_bytes: guest::LDTR_ACCESS_RIGHTS,
+    },
+];
+
+pub static L1TF_VMX_MITIGATION: RwLock<VmxL1dFlushState> = RwLock::new(VmxL1dFlushState::Auto);
+
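+/// Initialize the VMX backend: verify that the CPU supports VMX, set up the
+/// arch-wide KVM state, register the VMX vendor callbacks, configure the L1D
+/// flush mitigation and finally call `kvm_init()`.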
+pub fn vmx_init() -> Result<(), SystemError> {
+    let cpuid = CpuId::new();
+    let cpu_feat = cpuid.get_feature_info().ok_or(SystemError::ENOSYS)?;
+    if !cpu_feat.has_vmx() {
+        return Err(SystemError::ENOSYS);
+    }
+
+    init_kvm_arch();
+
+    x86_kvm_manager_mut().vendor_init(&VmxKvmInitFunc)?;
+
+    vmx_info().setup_l1d_flush();
+
+    kvm_init()?;
+    Ok(())
+}
+
+#[no_mangle]
+unsafe extern "C" fn vmx_update_host_rsp(vcpu_vmx: &VmxVCpuPriv, host_rsp: usize) {
+    warn!("vmx_update_host_rsp");
+    let mut guard = vcpu_vmx.loaded_vmcs.lock();
+    if unlikely(host_rsp != guard.host_state.rsp) {
+        guard.host_state.rsp = host_rsp;
+        VmxAsm::vmx_vmwrite(host::RSP, host_rsp as u64);
+    }
+}
+
+#[no_mangle]
+unsafe extern "C" fn vmx_spec_ctrl_restore_host(_vcpu_vmx: &VmxVCpuPriv, _flags: u32) {
+    // TODO
+    warn!("vmx_spec_ctrl_restore_host todo!");
+}

+ 160 - 0
kernel/src/arch/x86_64/vm/vmx/vmcs/feat.rs

@@ -0,0 +1,160 @@
+use system_error::SystemError;
+use x86::{
+    msr::{
+        IA32_VMX_ENTRY_CTLS, IA32_VMX_EXIT_CTLS, IA32_VMX_PINBASED_CTLS, IA32_VMX_PROCBASED_CTLS,
+        IA32_VMX_PROCBASED_CTLS2,
+    },
+    vmx::vmcs::control::{
+        EntryControls, ExitControls, PinbasedControls, PrimaryControls, SecondaryControls,
+    },
+};
+
+use crate::arch::vm::vmx::Vmx;
+
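+/// Required and optional VMX control bits. The `KVM_REQUIRED_*` sets must be
+/// supported by the CPU, while the `KVM_OPTIONAL_*` sets are enabled only when
+/// the capability MSRs allow it (see the `adjust_*_controls` helpers below).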
+pub struct VmxFeat;
+#[allow(dead_code)]
+impl VmxFeat {
+    pub const KVM_REQUIRED_VMX_CPU_BASED_VM_EXEC_CONTROL: u32 = PrimaryControls::HLT_EXITING.bits()
+        | PrimaryControls::CR3_LOAD_EXITING.bits()
+        | PrimaryControls::CR3_STORE_EXITING.bits()
+        | PrimaryControls::UNCOND_IO_EXITING.bits()
+        | PrimaryControls::MOV_DR_EXITING.bits()
+        | PrimaryControls::USE_TSC_OFFSETTING.bits()
+        | PrimaryControls::MWAIT_EXITING.bits()
+        | PrimaryControls::MONITOR_EXITING.bits()
+        | PrimaryControls::INVLPG_EXITING.bits()
+        | PrimaryControls::RDPMC_EXITING.bits()
+        | PrimaryControls::INTERRUPT_WINDOW_EXITING.bits()
+        | PrimaryControls::CR8_LOAD_EXITING.bits()
+        | PrimaryControls::CR8_STORE_EXITING.bits();
+
+    pub const KVM_OPTIONAL_VMX_CPU_BASED_VM_EXEC_CONTROL: u32 = PrimaryControls::RDTSC_EXITING
+        .bits()
+        | PrimaryControls::USE_TPR_SHADOW.bits()
+        | PrimaryControls::USE_IO_BITMAPS.bits()
+        | PrimaryControls::MONITOR_TRAP_FLAG.bits()
+        | PrimaryControls::USE_MSR_BITMAPS.bits()
+        | PrimaryControls::NMI_WINDOW_EXITING.bits()
+        | PrimaryControls::PAUSE_EXITING.bits()
+        | PrimaryControls::SECONDARY_CONTROLS.bits();
+
+    pub const KVM_REQUIRED_VMX_SECONDARY_VM_EXEC_CONTROL: u32 = 0;
+
+    pub const KVM_OPTIONAL_VMX_SECONDARY_VM_EXEC_CONTROL: u32 = SecondaryControls::VIRTUALIZE_APIC
+        .bits()
+        | SecondaryControls::VIRTUALIZE_X2APIC.bits()
+        | SecondaryControls::WBINVD_EXITING.bits()
+        | SecondaryControls::ENABLE_VPID.bits()
+        | SecondaryControls::ENABLE_EPT.bits()
+        | SecondaryControls::UNRESTRICTED_GUEST.bits()
+        | SecondaryControls::PAUSE_LOOP_EXITING.bits()
+        | SecondaryControls::DTABLE_EXITING.bits()
+        | SecondaryControls::ENABLE_RDTSCP.bits()
+        | SecondaryControls::ENABLE_INVPCID.bits()
+        | SecondaryControls::VIRTUALIZE_APIC_REGISTER.bits()
+        | SecondaryControls::VIRTUAL_INTERRUPT_DELIVERY.bits()
+        | SecondaryControls::VMCS_SHADOWING.bits()
+        | SecondaryControls::ENABLE_XSAVES_XRSTORS.bits()
+        | SecondaryControls::RDSEED_EXITING.bits()
+        | SecondaryControls::RDRAND_EXITING.bits()
+        | SecondaryControls::ENABLE_PML.bits()
+        | SecondaryControls::USE_TSC_SCALING.bits()
+        | SecondaryControls::ENABLE_USER_WAIT_PAUSE.bits()
+        | SecondaryControls::INTEL_PT_GUEST_PHYSICAL.bits()
+        | SecondaryControls::CONCEAL_VMX_FROM_PT.bits()
+        | SecondaryControls::ENABLE_VM_FUNCTIONS.bits()
+        | SecondaryControls::ENCLS_EXITING.bits();
+    // | SecondaryControls::BUS_LOCK_DETECTION.bits()
+    // | SecondaryControls::NOTIFY_VM_EXITING.bits()
+
+    pub const KVM_REQUIRED_VMX_VM_EXIT_CONTROLS: u32 = ExitControls::SAVE_DEBUG_CONTROLS.bits()
+        | ExitControls::ACK_INTERRUPT_ON_EXIT.bits()
+        | ExitControls::HOST_ADDRESS_SPACE_SIZE.bits();
+
+    pub const KVM_OPTIONAL_VMX_VM_EXIT_CONTROLS: u32 = ExitControls::LOAD_IA32_PERF_GLOBAL_CTRL
+        .bits()
+        | ExitControls::SAVE_IA32_PAT.bits()
+        | ExitControls::LOAD_IA32_PAT.bits()
+        | ExitControls::SAVE_IA32_EFER.bits()
+        | ExitControls::SAVE_VMX_PREEMPTION_TIMER.bits()
+        | ExitControls::LOAD_IA32_EFER.bits()
+        | ExitControls::CLEAR_IA32_BNDCFGS.bits()
+        | ExitControls::CONCEAL_VMX_FROM_PT.bits()
+        | ExitControls::CLEAR_IA32_RTIT_CTL.bits();
+
+    pub const KVM_REQUIRED_VMX_PIN_BASED_VM_EXEC_CONTROL: u32 =
+        PinbasedControls::EXTERNAL_INTERRUPT_EXITING.bits() | PinbasedControls::NMI_EXITING.bits();
+
+    pub const KVM_OPTIONAL_VMX_PIN_BASED_VM_EXEC_CONTROL: u32 =
+        PinbasedControls::VIRTUAL_NMIS.bits() | PinbasedControls::POSTED_INTERRUPTS.bits();
+
+    pub const KVM_REQUIRED_VMX_VM_ENTRY_CONTROLS: u32 =
+        EntryControls::LOAD_DEBUG_CONTROLS.bits() | EntryControls::IA32E_MODE_GUEST.bits();
+
+    pub const KVM_OPTIONAL_VMX_VM_ENTRY_CONTROLS: u32 = EntryControls::LOAD_IA32_PERF_GLOBAL_CTRL
+        .bits()
+        | EntryControls::LOAD_IA32_PAT.bits()
+        | EntryControls::LOAD_IA32_EFER.bits()
+        | EntryControls::LOAD_IA32_BNDCFGS.bits()
+        | EntryControls::CONCEAL_VMX_FROM_PT.bits()
+        | EntryControls::LOAD_IA32_RTIT_CTL.bits();
+
+    /* VMX_BASIC bits and bitmasks */
+    pub const VMX_BASIC_VMCS_SIZE_SHIFT: u64 = 32;
+    pub const VMX_BASIC_TRUE_CTLS: u64 = 1 << 55;
+    pub const VMX_BASIC_64: u64 = 0x0001000000000000;
+    pub const VMX_BASIC_MEM_TYPE_SHIFT: u64 = 50;
+    pub const VMX_BASIC_MEM_TYPE_MASK: u64 = 0x003c000000000000;
+    pub const VMX_BASIC_MEM_TYPE_WB: u64 = 6;
+    pub const VMX_BASIC_INOUT: u64 = 0x0040000000000000;
+
+    pub fn adjust_primary_controls() -> Result<PrimaryControls, SystemError> {
+        Ok(unsafe {
+            PrimaryControls::from_bits_unchecked(Vmx::adjust_vmx_controls(
+                Self::KVM_REQUIRED_VMX_CPU_BASED_VM_EXEC_CONTROL,
+                Self::KVM_OPTIONAL_VMX_CPU_BASED_VM_EXEC_CONTROL,
+                IA32_VMX_PROCBASED_CTLS,
+            )?)
+        })
+    }
+
+    pub fn adjust_secondary_controls() -> Result<SecondaryControls, SystemError> {
+        Ok(unsafe {
+            SecondaryControls::from_bits_unchecked(Vmx::adjust_vmx_controls(
+                Self::KVM_REQUIRED_VMX_SECONDARY_VM_EXEC_CONTROL,
+                Self::KVM_OPTIONAL_VMX_SECONDARY_VM_EXEC_CONTROL,
+                IA32_VMX_PROCBASED_CTLS2,
+            )?)
+        })
+    }
+
+    pub fn adjust_exit_controls() -> Result<ExitControls, SystemError> {
+        Ok(unsafe {
+            ExitControls::from_bits_unchecked(Vmx::adjust_vmx_controls(
+                Self::KVM_REQUIRED_VMX_VM_EXIT_CONTROLS,
+                Self::KVM_OPTIONAL_VMX_VM_EXIT_CONTROLS,
+                IA32_VMX_EXIT_CTLS,
+            )?)
+        })
+    }
+
+    pub fn adjust_entry_controls() -> Result<EntryControls, SystemError> {
+        Ok(unsafe {
+            EntryControls::from_bits_unchecked(Vmx::adjust_vmx_controls(
+                Self::KVM_REQUIRED_VMX_VM_ENTRY_CONTROLS,
+                Self::KVM_OPTIONAL_VMX_VM_ENTRY_CONTROLS,
+                IA32_VMX_ENTRY_CTLS,
+            )?)
+        })
+    }
+
+    pub fn adjust_pin_based_controls() -> Result<PinbasedControls, SystemError> {
+        Ok(unsafe {
+            PinbasedControls::from_bits_unchecked(Vmx::adjust_vmx_controls(
+                Self::KVM_REQUIRED_VMX_PIN_BASED_VM_EXEC_CONTROL,
+                Self::KVM_OPTIONAL_VMX_PIN_BASED_VM_EXEC_CONTROL,
+                IA32_VMX_PINBASED_CTLS,
+            )?)
+        })
+    }
+}

+ 451 - 0
kernel/src/arch/x86_64/vm/vmx/vmcs/mod.rs

@@ -0,0 +1,451 @@
+use core::intrinsics::unlikely;
+
+use alloc::{boxed::Box, collections::LinkedList, sync::Arc};
+use bitmap::{traits::BitMapOps, AllocBitmap};
+use x86::{
+    controlregs::Cr4,
+    vmx::vmcs::{
+        control::{self, PrimaryControls},
+        host,
+    },
+};
+use x86_64::{registers::control::Cr3Flags, structures::paging::PhysFrame};
+
+use crate::{
+    arch::{
+        vm::asm::{IntrInfo, IntrType, VmxAsm},
+        MMArch,
+    },
+    libs::spinlock::{SpinLock, SpinLockGuard},
+    mm::{percpu::PerCpuVar, MemoryManagementArch, PhysAddr, VirtAddr},
+    smp::cpu::ProcessorId,
+};
+
+use super::vmx_info;
+
+pub mod feat;
+
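+/// Per-CPU VMX state: the VMCS currently loaded on each CPU, the list of all
+/// VMCS loaded on that CPU, and the per-CPU VMX region (`VMXAREA`) used when
+/// enabling VMX operation.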
+pub static mut PERCPU_VMCS: Option<PerCpuVar<Option<Arc<LockedVMControlStructure>>>> = None;
+pub static mut PERCPU_LOADED_VMCS_LIST: Option<PerCpuVar<LinkedList<Arc<LockedLoadedVmcs>>>> = None;
+pub static mut VMXAREA: Option<PerCpuVar<Box<VMControlStructure>>> = None;
+
+pub fn current_vmcs() -> &'static Option<Arc<LockedVMControlStructure>> {
+    unsafe { PERCPU_VMCS.as_ref().unwrap().get() }
+}
+
+pub fn current_vmcs_mut() -> &'static mut Option<Arc<LockedVMControlStructure>> {
+    unsafe { PERCPU_VMCS.as_ref().unwrap().get_mut() }
+}
+
+pub fn current_loaded_vmcs_list_mut() -> &'static mut LinkedList<Arc<LockedLoadedVmcs>> {
+    unsafe { PERCPU_LOADED_VMCS_LIST.as_ref().unwrap().get_mut() }
+}
+
+#[allow(dead_code)]
+pub fn current_loaded_vmcs_list() -> &'static LinkedList<Arc<LockedLoadedVmcs>> {
+    unsafe { PERCPU_LOADED_VMCS_LIST.as_ref().unwrap().get() }
+}
+
+pub fn vmx_area() -> &'static PerCpuVar<Box<VMControlStructure>> {
+    unsafe { VMXAREA.as_ref().unwrap() }
+}
+
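+/// A 4 KiB VMCS region. Bits 30:0 of `header` hold the VMCS revision
+/// identifier and bit 31 is the shadow-VMCS indicator; `abort` holds the
+/// VMX-abort indicator and `data` the implementation-specific area.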
+#[repr(C, align(4096))]
+#[derive(Debug, Clone)]
+pub struct VMControlStructure {
+    pub header: u32,
+    pub abort: u32,
+    pub data: [u8; MMArch::PAGE_SIZE - core::mem::size_of::<u32>() - core::mem::size_of::<u32>()],
+}
+
+impl VMControlStructure {
+    pub fn new() -> Box<Self> {
+        let mut vmcs: Box<VMControlStructure> = unsafe {
+            Box::try_new_zeroed()
+                .expect("alloc vmcs failed")
+                .assume_init()
+        };
+
+        vmcs.set_revision_id(vmx_info().vmcs_config.revision_id);
+        vmcs
+    }
+
+    pub fn revision_id(&self) -> u32 {
+        self.header & 0x7FFF_FFFF
+    }
+
+    #[allow(dead_code)]
+    pub fn is_shadow_vmcs(&self) -> bool {
+        self.header & 0x8000_0000 != 0
+    }
+
+    pub fn set_shadow_vmcs(&mut self, shadow: bool) {
+        self.header |= (shadow as u32) << 31;
+    }
+
+    pub fn set_revision_id(&mut self, id: u32) {
+        self.header = self.header & 0x8000_0000 | (id & 0x7FFF_FFFF);
+    }
+}
+
+#[derive(Debug)]
+pub struct LockedVMControlStructure {
+    /// Physical address of the wrapped VMCS
+    phys_addr: PhysAddr,
+    inner: SpinLock<Box<VMControlStructure>>,
+}
+
+impl LockedVMControlStructure {
+    #[inline(never)]
+    pub fn new(shadow: bool) -> Arc<Self> {
+        let mut vmcs = VMControlStructure::new();
+
+        let phys_addr = unsafe {
+            MMArch::virt_2_phys(VirtAddr::new(vmcs.as_ref() as *const _ as usize)).unwrap()
+        };
+
+        vmcs.set_shadow_vmcs(shadow);
+
+        Arc::new(Self {
+            phys_addr,
+            inner: SpinLock::new(vmcs),
+        })
+    }
+
+    pub fn lock(&self) -> SpinLockGuard<Box<VMControlStructure>> {
+        self.inner.lock()
+    }
+
+    pub fn phys_addr(&self) -> PhysAddr {
+        self.phys_addr
+    }
+}
+
+#[derive(Debug)]
+pub struct VmcsHostState {
+    pub cr3: (PhysFrame, Cr3Flags),
+    pub cr4: Cr4,
+    pub gs_base: usize,
+    pub fs_base: usize,
+    pub rsp: usize,
+    pub fs_sel: u16,
+    pub gs_sel: u16,
+    pub ldt_sel: u16,
+    pub ds_sel: u16,
+    pub es_sel: u16,
+}
+
+impl VmcsHostState {
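+    /// Update the host FS/GS selectors and bases in the VMCS, issuing a
+    /// VMWRITE only for fields whose cached value changed; selectors with the
+    /// TI/RPL bits set are written as 0 and the corresponding base field is
+    /// relied upon instead.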
+    pub fn set_host_fsgs(&mut self, fs_sel: u16, gs_sel: u16, fs_base: usize, gs_base: usize) {
+        if unlikely(self.fs_sel != fs_sel) {
+            if (fs_sel & 7) == 0 {
+                VmxAsm::vmx_vmwrite(host::FS_SELECTOR, fs_sel as u64);
+            } else {
+                VmxAsm::vmx_vmwrite(host::FS_SELECTOR, 0);
+            }
+
+            self.fs_sel = fs_sel;
+        }
+
+        if unlikely(self.gs_sel != gs_sel) {
+            if (gs_sel & 7) == 0 {
+                VmxAsm::vmx_vmwrite(host::GS_SELECTOR, gs_sel as u64);
+            } else {
+                VmxAsm::vmx_vmwrite(host::GS_SELECTOR, 0);
+            }
+
+            self.gs_sel = gs_sel;
+        }
+
+        if unlikely(fs_base != self.fs_base) {
+            VmxAsm::vmx_vmwrite(host::FS_BASE, fs_base as u64);
+            self.fs_base = fs_base;
+        }
+
+        if unlikely(self.gs_base != gs_base) {
+            VmxAsm::vmx_vmwrite(host::GS_BASE, gs_base as u64);
+            self.gs_base = gs_base;
+        }
+    }
+}
+
+impl Default for VmcsHostState {
+    fn default() -> Self {
+        Self {
+            cr3: (
+                PhysFrame::containing_address(x86_64::PhysAddr::new(0)),
+                Cr3Flags::empty(),
+            ),
+            cr4: Cr4::empty(),
+            gs_base: 0,
+            fs_base: 0,
+            rsp: 0,
+            fs_sel: 0,
+            gs_sel: 0,
+            ldt_sel: 0,
+            ds_sel: 0,
+            es_sel: 0,
+        }
+    }
+}
+
+#[derive(Debug, Default)]
+pub struct VmcsControlsShadow {
+    vm_entry: u32,
+    vm_exit: u32,
+    pin: u32,
+    exec: u32,
+    secondary_exec: u32,
+    tertiary_exec: u64,
+}
+
+#[derive(Debug)]
+#[allow(dead_code)]
+pub struct LoadedVmcs {
+    pub vmcs: Arc<LockedVMControlStructure>,
+    pub shadow_vmcs: Option<Arc<LockedVMControlStructure>>,
+    pub cpu: ProcessorId,
+    /// Whether VMLAUNCH has already been executed on this VMCS
+    pub launched: bool,
+    /// Whether NMIs are known to be unmasked
+    nmi_known_unmasked: bool,
+    /// Whether the hypervisor timer is soft-disabled
+    hv_timer_soft_disabled: bool,
+    /// For CPUs without virtual NMIs: whether NMIs are soft-blocked
+    pub soft_vnmi_blocked: bool,
+    /// Timestamp of the VM entry
+    entry_time: u64,
+    /// Time for which virtual NMIs have been blocked
+    vnmi_blocked_time: u64,
+    /// MSR bitmap
+    pub msr_bitmap: VmxMsrBitmap,
+    /// Cached host state mirrored into the VMCS
+    pub host_state: VmcsHostState,
+    /// Shadow copies of the VMCS control fields
+    controls_shadow: VmcsControlsShadow,
+}
+
+impl LoadedVmcs {
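+    /// Write a VM-entry/VM-exit/execution control field, going through a
+    /// shadow copy so the VMWRITE is only issued when the value changes.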
+    pub fn controls_set(&mut self, ctl_type: ControlsType, value: u64) {
+        match ctl_type {
+            ControlsType::VmEntry => {
+                if self.controls_shadow.vm_entry != value as u32 {
+                    VmxAsm::vmx_vmwrite(control::VMENTRY_CONTROLS, value);
+                    self.controls_shadow.vm_entry = value as u32;
+                }
+            }
+            ControlsType::VmExit => {
+                if self.controls_shadow.vm_exit != value as u32 {
+                    VmxAsm::vmx_vmwrite(control::VMEXIT_CONTROLS, value);
+                    self.controls_shadow.vm_exit = value as u32;
+                }
+            }
+            ControlsType::Pin => {
+                if self.controls_shadow.pin != value as u32 {
+                    VmxAsm::vmx_vmwrite(control::PINBASED_EXEC_CONTROLS, value);
+                    self.controls_shadow.pin = value as u32;
+                }
+            }
+            ControlsType::Exec => {
+                if self.controls_shadow.exec != value as u32 {
+                    VmxAsm::vmx_vmwrite(control::PRIMARY_PROCBASED_EXEC_CONTROLS, value);
+                    self.controls_shadow.exec = value as u32;
+                }
+            }
+            ControlsType::SecondaryExec => {
+                if self.controls_shadow.secondary_exec != value as u32 {
+                    VmxAsm::vmx_vmwrite(control::SECONDARY_PROCBASED_EXEC_CONTROLS, value);
+                    self.controls_shadow.secondary_exec = value as u32;
+                }
+            }
+            ControlsType::TertiaryExec => {
+                if self.controls_shadow.tertiary_exec != value {
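+                    // 0x2034 is the VMCS encoding of the tertiary
+                    // processor-based VM-execution controls field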
+                    VmxAsm::vmx_vmwrite(0x2034, value);
+                    self.controls_shadow.tertiary_exec = value;
+                }
+            }
+        }
+    }
+
+    pub fn controls_get(&self, ctl_type: ControlsType) -> u64 {
+        match ctl_type {
+            ControlsType::VmEntry => self.controls_shadow.vm_entry as u64,
+            ControlsType::VmExit => self.controls_shadow.vm_exit as u64,
+            ControlsType::Pin => self.controls_shadow.pin as u64,
+            ControlsType::Exec => self.controls_shadow.exec as u64,
+            ControlsType::SecondaryExec => self.controls_shadow.secondary_exec as u64,
+            ControlsType::TertiaryExec => self.controls_shadow.tertiary_exec,
+        }
+    }
+
+    pub fn controls_setbit(&mut self, ctl_type: ControlsType, value: u64) {
+        let val = self.controls_get(ctl_type) | value;
+        self.controls_set(ctl_type, val)
+    }
+
+    pub fn controls_clearbit(&mut self, ctl_type: ControlsType, value: u64) {
+        let val = self.controls_get(ctl_type) & (!value);
+        self.controls_set(ctl_type, val)
+    }
+
+    pub fn msr_write_intercepted(&mut self, msr: u32) -> bool {
+        // Without the "use MSR bitmaps" control, every MSR access causes a VM exit.
+        if !unsafe {
+            PrimaryControls::from_bits_unchecked(self.controls_get(ControlsType::Exec) as u32)
+                .contains(PrimaryControls::USE_MSR_BITMAPS)
+        } {
+            return true;
+        }
+
+        return self
+            .msr_bitmap
+            .ctl(msr, VmxMsrBitmapAction::Test, VmxMsrBitmapAccess::Write);
+    }
+}
+
+#[derive(Debug)]
+pub struct LockedLoadedVmcs {
+    inner: SpinLock<LoadedVmcs>,
+}
+
+#[derive(Debug, Clone, Copy)]
+#[allow(dead_code)]
+pub enum ControlsType {
+    VmEntry,
+    VmExit,
+    Pin,
+    Exec,
+    SecondaryExec,
+    TertiaryExec,
+}
+
+impl LockedLoadedVmcs {
+    pub fn new() -> Arc<Self> {
+        let bitmap = if vmx_info().has_msr_bitmap() {
+            VmxMsrBitmap::new(true, MMArch::PAGE_SIZE * u8::BITS as usize)
+        } else {
+            VmxMsrBitmap::new(true, 0)
+        };
+        let vmcs = LockedVMControlStructure::new(false);
+
+        VmxAsm::vmclear(vmcs.phys_addr);
+
+        Arc::new(Self {
+            inner: SpinLock::new(LoadedVmcs {
+                vmcs,
+                shadow_vmcs: None,
+                cpu: ProcessorId::INVALID,
+                launched: false,
+                hv_timer_soft_disabled: false,
+                msr_bitmap: bitmap,
+                host_state: VmcsHostState::default(),
+                controls_shadow: VmcsControlsShadow::default(),
+                nmi_known_unmasked: false,
+                soft_vnmi_blocked: false,
+                entry_time: 0,
+                vnmi_blocked_time: 0,
+            }),
+        })
+    }
+
+    pub fn lock(&self) -> SpinLockGuard<LoadedVmcs> {
+        self.inner.lock()
+    }
+}
+
+#[derive(Debug)]
+pub struct VmxMsrBitmap {
+    data: AllocBitmap,
+    phys_addr: usize,
+}
+
+pub enum VmxMsrBitmapAction {
+    Test,
+    Set,
+    Clear,
+}
+
+pub enum VmxMsrBitmapAccess {
+    Write,
+    Read,
+}
+
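+/// The VMX MSR bitmap is a 4 KiB page split into read and write halves, each
+/// further split into a low-MSR and a high-MSR range. `base()` returns the
+/// bit offset of the requested half inside the bitmap.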
+impl VmxMsrBitmapAccess {
+    pub const fn base(&self) -> usize {
+        match self {
+            VmxMsrBitmapAccess::Write => 0x800 * core::mem::size_of::<usize>(),
+            VmxMsrBitmapAccess::Read => 0,
+        }
+    }
+}
+
+impl VmxMsrBitmap {
+    pub fn new(init_val: bool, size: usize) -> Self {
+        let mut data = AllocBitmap::new(size);
+        data.set_all(init_val);
+
+        let addr = data.data() as *const [usize] as *const usize as usize;
+        Self {
+            data,
+            phys_addr: unsafe { MMArch::virt_2_phys(VirtAddr::new(addr)).unwrap().data() },
+        }
+    }
+
+    pub fn phys_addr(&self) -> usize {
+        self.phys_addr
+    }
+
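+    /// Test, set or clear the intercept bit for `msr`. MSRs 0x0..=0x1fff and
+    /// 0xc0000000..=0xc0001fff map into their own parts of the bitmap; any
+    /// other MSR is reported as always intercepted.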
+    pub fn ctl(
+        &mut self,
+        msr: u32,
+        action: VmxMsrBitmapAction,
+        access: VmxMsrBitmapAccess,
+    ) -> bool {
+        if msr <= 0x1fff {
+            return self.bit_op(msr as usize, access.base(), action);
+        } else if (0xc0000000..=0xc0001fff).contains(&msr) {
+            // FIXME: the offset calculation here is questionable and needs a follow-up review,
+            // see https://code.dragonos.org.cn/xref/linux-6.6.21/arch/x86/kvm/vmx/vmx.h#450
+            return self.bit_op(msr as usize & 0x1fff, access.base() + 0x400, action);
+        } else {
+            return true;
+        }
+    }
+
+    fn bit_op(&mut self, msr: usize, base: usize, action: VmxMsrBitmapAction) -> bool {
+        match action {
+            VmxMsrBitmapAction::Test => {
+                let ret = self.data.get(msr + base);
+                ret.unwrap_or(false)
+            }
+            VmxMsrBitmapAction::Set => {
+                self.data.set(msr + base, true);
+                true
+            }
+            VmxMsrBitmapAction::Clear => {
+                self.data.set(msr + base, false);
+                true
+            }
+        }
+    }
+}
+
+/// Helper functions for decoding VM-exit interruption information
+pub struct VmcsIntrHelper;
+
+impl VmcsIntrHelper {
+    pub fn is_nmi(intr_info: &IntrInfo) -> bool {
+        return Self::is_intr_type(intr_info, IntrType::INTR_TYPE_NMI_INTR);
+    }
+
+    pub fn is_intr_type(intr_info: &IntrInfo, intr_type: IntrType) -> bool {
+        return (*intr_info
+            & (IntrInfo::INTR_INFO_VALID_MASK | IntrInfo::INTR_INFO_INTR_TYPE_MASK))
+            .bits()
+            == IntrInfo::INTR_INFO_VALID_MASK.bits() | intr_type.bits();
+    }
+
+    pub fn is_external_intr(intr_info: &IntrInfo) -> bool {
+        return Self::is_intr_type(intr_info, IntrType::INTR_TYPE_EXT_INTR);
+    }
+}

+ 179 - 0
kernel/src/arch/x86_64/vm/vmx/vmenter.S

@@ -0,0 +1,179 @@
+#include "common/asm.h"
+
+#define __VCPU_REGS_RAX  0
+#define __VCPU_REGS_RCX  1
+#define __VCPU_REGS_RDX  2
+#define __VCPU_REGS_RBX  3
+#define __VCPU_REGS_RSP  4
+#define __VCPU_REGS_RBP  5
+#define __VCPU_REGS_RSI  6
+#define __VCPU_REGS_RDI  7
+
+#define __VCPU_REGS_R8   8
+#define __VCPU_REGS_R9   9
+#define __VCPU_REGS_R10 10
+#define __VCPU_REGS_R11 11
+#define __VCPU_REGS_R12 12
+#define __VCPU_REGS_R13 13
+#define __VCPU_REGS_R14 14
+#define __VCPU_REGS_R15 15
+
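+// Byte offsets of each guest register inside the register array that is
+// passed to __vmx_vcpu_run (one 8-byte slot per register, in the order above).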
+#define VCPU_RAX __VCPU_REGS_RAX * 8
+#define VCPU_RCX __VCPU_REGS_RCX * 8
+#define VCPU_RDX __VCPU_REGS_RDX * 8
+#define VCPU_RBX __VCPU_REGS_RBX * 8
+#define VCPU_RBP __VCPU_REGS_RBP * 8
+#define VCPU_RSI __VCPU_REGS_RSI * 8
+#define VCPU_RDI __VCPU_REGS_RDI * 8
+
+#define VCPU_R8  __VCPU_REGS_R8  * 8
+#define VCPU_R9  __VCPU_REGS_R9  * 8
+#define VCPU_R10 __VCPU_REGS_R10 * 8
+#define VCPU_R11 __VCPU_REGS_R11 * 8
+#define VCPU_R12 __VCPU_REGS_R12 * 8
+#define VCPU_R13 __VCPU_REGS_R13 * 8
+#define VCPU_R14 __VCPU_REGS_R14 * 8
+#define VCPU_R15 __VCPU_REGS_R15 * 8
+
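+// Run-flag bits passed to __vmx_vcpu_run, matching the Rust `VmxRunFlag`
+// bitflags.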
+#define VMX_RUN_VMRESUME_SHIFT		0
+#define VMX_RUN_SAVE_SPEC_CTRL_SHIFT	1
+
+#define VMX_RUN_VMRESUME		1 << VMX_RUN_VMRESUME_SHIFT
+#define VMX_RUN_SAVE_SPEC_CTRL		1 << VMX_RUN_SAVE_SPEC_CTRL_SHIFT
+
+// Run the vCPU in guest mode
+ENTRY(__vmx_vcpu_run)
+    pushq %rbp
+    movq %rsp, %rbp
+
+    pushq %r15
+    pushq %r14
+    pushq %r13
+    pushq %r12
+
+    push %rbx
+
+    // first argument
+    push %rdi
+    // third argument
+    push %rdx
+    // second argument
+    push %rsi
+
+    mov %edx, %ebx
+
+    lea (%rsp), %rsi
+
+    call vmx_update_host_rsp
+
+    // TODO: spec_ctrl
+
+.Lspec_ctrl_done:
+    mov %rsp, %rax
+
+    bt $VMX_RUN_VMRESUME_SHIFT, %ebx
+
+    mov VCPU_RCX(%rax), %rcx
+    mov VCPU_RDX(%rax), %rdx
+    mov VCPU_RBX(%rax), %rbx
+    mov VCPU_RBP(%rax), %rbp
+    mov VCPU_RSI(%rax), %rsi
+    mov VCPU_RDI(%rax), %rdi
+
+    mov VCPU_R8(%rax), %R8
+    mov VCPU_R9(%rax), %r9
+    mov VCPU_R10(%rax), %r10
+    mov VCPU_R11(%rax), %r11
+    mov VCPU_R12(%rax), %r12
+    mov VCPU_R13(%rax), %r13
+    mov VCPU_R14(%rax), %r14
+    mov VCPU_R15(%rax), %r15
+
+    mov VCPU_RAX(%rax), %rax
+
+    // TODO: clear cpu buffer
+
+    jnc .Lvmlaunch
+
+.Lvmresume:
+    vmresume
+    jmp .Lvmfail
+
+.Lvmlaunch:
+    call vmx_vmlaunch
+    jmp .Lvmfail
+
+// VM-exit path back from guest mode
+ENTRY(vmx_vmexit)
+    // TODO: unwind hint restore
+    // Temporarily save the guest RAX
+    push %rax
+
+    // Load the pointer to the guest register array into RAX
+    mov 8(%rsp), %rax
+
+    // Save all guest registers
+    pop VCPU_RAX(%rax)
+    mov %rcx, VCPU_RCX(%rax)
+    mov %rdx, VCPU_RDX(%rax)
+    mov %rbx, VCPU_RBX(%rax)
+    mov %rbp, VCPU_RBP(%rax)
+    mov %rsi, VCPU_RSI(%rax)
+    mov %rdi, VCPU_RDI(%rax)
+
+    mov %r8, VCPU_R8(%rax)
+    mov %r9, VCPU_R9(%rax)
+    mov %r10, VCPU_R10(%rax)
+    mov %r11, VCPU_R11(%rax)
+    mov %r12, VCPU_R12(%rax)
+    mov %r13, VCPU_R13(%rax)
+    mov %r14, VCPU_R14(%rax)
+    mov %r15, VCPU_R15(%rax)
+
+    xor %ebx, %ebx
+
+.Lclear_regs:
+    pop %rax
+
+    xor %eax, %eax
+    xor %ecx, %ecx
+    xor %edx, %edx
+    xor %ebp, %ebp
+    xor %esi, %esi
+    xor %edi, %edi
+
+    xor %r8d, %r8d
+    xor %r9d, %r9d
+    xor %r10d, %r10d
+    xor %r11d, %r11d
+    xor %r12d, %r12d
+    xor %r13d, %r13d
+    xor %r14d, %r14d
+    xor %r15d, %r15d
+
+    // todo: https://code.dragonos.org.cn/xref/linux-6.6.21/arch/x86/kvm/vmx/vmenter.S#270
+
+    pop %rsi
+    pop %rdi
+
+    call vmx_spec_ctrl_restore_host
+
+    mov %rbx, %rax
+
+    pop %rbx
+
+    pop %r12
+    pop %r13
+    pop %r14
+    pop %r15
+
+    pop %rbp
+    ret
+
+.Lvmfail:
+    // On failure, set the return value to 1
+    mov $1, %rbx
+    jmp .Lclear_regs
+
+
+

+ 4 - 1
kernel/src/init/init.rs

@@ -92,8 +92,11 @@ fn do_start_kernel() {
     Futex::init();
     crate::bpf::init_bpf_system();
     crate::debug::jump_label::static_keys_init();
+
+    // #[cfg(all(target_arch = "x86_64", feature = "kvm"))]
+    // crate::virt::kvm::kvm_init();
     #[cfg(all(target_arch = "x86_64", feature = "kvm"))]
-    crate::virt::kvm::kvm_init();
+    crate::arch::vm::vmx::vmx_init().unwrap();
 }
 
 /// 在内存管理初始化之前,执行的初始化

+ 1 - 0
kernel/src/lib.rs

@@ -1,5 +1,6 @@
 #![no_main] // <1>
 #![feature(alloc_error_handler)]
+#![feature(new_zeroed_alloc)]
 #![feature(allocator_api)]
 #![feature(arbitrary_self_types)]
 #![feature(concat_idents)]

+ 9 - 0
kernel/src/libs/rbtree.rs

@@ -829,6 +829,15 @@ impl<K: Ord + Debug, V: Debug> IntoIterator for RBTree<K, V> {
     }
 }
 
+impl<K: Ord + Debug, V: Debug> Default for RBTree<K, V> {
+    fn default() -> Self {
+        RBTree {
+            root: NodePtr::null(),
+            len: 0,
+        }
+    }
+}
+
 impl<K: Ord + Debug, V: Debug> RBTree<K, V> {
     /// Creates an empty `RBTree`.
     pub fn new() -> RBTree<K, V> {

+ 2 - 2
kernel/src/mm/mod.rs

@@ -155,7 +155,7 @@ pub enum PageTableKind {
 }
 
 /// 物理内存地址
-#[derive(Clone, Copy, Eq, Ord, PartialEq, PartialOrd, Hash)]
+#[derive(Clone, Copy, Eq, Ord, PartialEq, PartialOrd, Hash, Default)]
 #[repr(transparent)]
 pub struct PhysAddr(usize);
 
@@ -277,7 +277,7 @@ impl core::ops::SubAssign<PhysAddr> for PhysAddr {
 }
 
 /// 虚拟内存地址
-#[derive(Clone, Copy, Eq, Ord, PartialEq, PartialOrd, Hash)]
+#[derive(Clone, Copy, Eq, Ord, PartialEq, PartialOrd, Hash, Default)]
 #[repr(transparent)]
 pub struct VirtAddr(usize);
 

+ 1 - 0
kernel/src/mm/page.rs

@@ -874,6 +874,7 @@ impl<Arch: MemoryManagementArch> PageTable<Arch> {
 }
 
 /// 页表项
+#[repr(C, align(8))]
 #[derive(Copy, Clone)]
 pub struct PageEntry<Arch> {
     data: usize,

+ 1 - 1
kernel/src/namespaces/syscall.rs

@@ -36,7 +36,7 @@ impl Syscall {
 
         Ok(check)
     }
-
+    #[allow(dead_code)]
     pub fn sys_setns(_fd: i32, flags: u64) -> Result<usize, SystemError> {
         let check = check_unshare_flags(flags)?;
 

+ 1 - 0
kernel/src/virt/mod.rs

@@ -1 +1,2 @@
 pub mod kvm;
+pub mod vm;

+ 491 - 0
kernel/src/virt/vm/kvm_dev.rs

@@ -0,0 +1,491 @@
+use core::intrinsics::unlikely;
+
+use alloc::sync::{Arc, Weak};
+use log::{debug, warn};
+use system_error::SystemError;
+
+use crate::{
+    arch::{
+        vm::{kvm_host::KvmCommonRegs, uapi::UapiKvmSegmentRegs},
+        MMArch,
+    },
+    driver::base::device::device_number::DeviceNumber,
+    filesystem::{
+        devfs::{devfs_register, DevFS, DeviceINode},
+        vfs::{
+            core::generate_inode_id,
+            file::{File, FileMode},
+            syscall::ModeType,
+            FileType, IndexNode, Metadata,
+        },
+    },
+    libs::spinlock::SpinLock,
+    mm::MemoryManagementArch,
+    process::ProcessManager,
+    syscall::user_access::{UserBufferReader, UserBufferWriter},
+    time::PosixTimeSpec,
+    virt::vm::user_api::{KvmUserspaceMemoryRegion, PosixKvmUserspaceMemoryRegion},
+};
+
+use super::kvm_host::{vcpu::LockedVirtCpu, LockedVm};
+
+#[derive(Debug)]
+pub struct KvmInode {
+    /// Weak reference to this inode itself
+    self_ref: Weak<LockedKvmInode>,
+    /// The filesystem this inode belongs to
+    fs: Weak<DevFS>,
+    /// Inode metadata
+    metadata: Metadata,
+}
+
+#[derive(Debug)]
+pub struct LockedKvmInode {
+    inner: SpinLock<KvmInode>,
+}
+
+impl LockedKvmInode {
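+    // ioctl numbers of /dev/kvm, matching the Linux KVM UAPI values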
+    const KVM_CREATE_VM: u32 = 0xAE01;
+    const KVM_GET_VCPU_MMAP_SIZE: u32 = 0xAE04;
+
+    pub fn new() -> Arc<Self> {
+        let inode = KvmInode {
+            self_ref: Weak::default(),
+            fs: Weak::default(),
+            metadata: Metadata {
+                dev_id: 1,
+                inode_id: generate_inode_id(),
+                size: 0,
+                blk_size: 0,
+                blocks: 0,
+                atime: PosixTimeSpec::default(),
+                mtime: PosixTimeSpec::default(),
+                ctime: PosixTimeSpec::default(),
+                file_type: FileType::KvmDevice,
+                mode: ModeType::S_IALLUGO,
+                nlinks: 1,
+                uid: 0,
+                gid: 0,
+                raw_dev: DeviceNumber::default(), // used as the device number
+            },
+        };
+
+        let result = Arc::new(LockedKvmInode {
+            inner: SpinLock::new(inode),
+        });
+        result.inner.lock().self_ref = Arc::downgrade(&result);
+
+        return result;
+    }
+
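+    /// Handle KVM_CREATE_VM: create a new VM, wrap it in a `KvmInstance`
+    /// inode and return a file descriptor referring to it.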
+    fn create_vm(&self, vm_type: usize) -> Result<usize, SystemError> {
+        let kvm = LockedVm::create(vm_type)?;
+
+        let instance = KvmInstance::new(kvm);
+
+        let current = ProcessManager::current_pcb();
+
+        let file = File::new(instance, FileMode::O_RDWR)?;
+        let fd = current.fd_table().write().alloc_fd(file, None)?;
+        return Ok(fd as usize);
+    }
+}
+
+impl DeviceINode for LockedKvmInode {
+    fn set_fs(&self, fs: Weak<DevFS>) {
+        self.inner.lock().fs = fs;
+    }
+}
+
+impl IndexNode for LockedKvmInode {
+    fn open(
+        &self,
+        _data: crate::libs::spinlock::SpinLockGuard<crate::filesystem::vfs::FilePrivateData>,
+        _mode: &FileMode,
+    ) -> Result<(), SystemError> {
+        Ok(())
+    }
+    fn read_at(
+        &self,
+        _offset: usize,
+        _len: usize,
+        _buf: &mut [u8],
+        _data: crate::libs::spinlock::SpinLockGuard<crate::filesystem::vfs::FilePrivateData>,
+    ) -> Result<usize, system_error::SystemError> {
+        Err(SystemError::ENOSYS)
+    }
+
+    fn write_at(
+        &self,
+        _offset: usize,
+        _len: usize,
+        _buf: &[u8],
+        _data: crate::libs::spinlock::SpinLockGuard<crate::filesystem::vfs::FilePrivateData>,
+    ) -> Result<usize, system_error::SystemError> {
+        Err(SystemError::ENOSYS)
+    }
+
+    fn fs(&self) -> Arc<dyn crate::filesystem::vfs::FileSystem> {
+        self.inner.lock().fs.upgrade().unwrap()
+    }
+
+    fn as_any_ref(&self) -> &dyn core::any::Any {
+        self
+    }
+
+    fn list(&self) -> Result<alloc::vec::Vec<alloc::string::String>, system_error::SystemError> {
+        Err(SystemError::ENOSYS)
+    }
+
+    fn metadata(&self) -> Result<Metadata, system_error::SystemError> {
+        Ok(self.inner.lock().metadata.clone())
+    }
+
+    fn ioctl(
+        &self,
+        cmd: u32,
+        arg: usize,
+        _private_data: &crate::filesystem::vfs::FilePrivateData,
+    ) -> Result<usize, SystemError> {
+        match cmd {
+            Self::KVM_CREATE_VM => {
+                let ret = self.create_vm(arg);
+                warn!("[KVM]: KVM_CREATE_VM {ret:?}");
+
+                return ret;
+            }
+
+            Self::KVM_GET_VCPU_MMAP_SIZE => {
+                if arg != 0 {
+                    return Err(SystemError::EINVAL);
+                }
+                debug!("[KVM] KVM_GET_VCPU_MMAP_SIZE");
+                return Ok(MMArch::PAGE_SIZE);
+            }
+
+            _ => {
+                // TODO: arch_ioctl
+                warn!("[KVM]: unknown iooctl cmd {cmd:x}");
+            }
+        }
+
+        Ok(0)
+    }
+
+    fn close(
+        &self,
+        _data: crate::libs::spinlock::SpinLockGuard<crate::filesystem::vfs::FilePrivateData>,
+    ) -> Result<(), SystemError> {
+        Ok(())
+    }
+}
+
+#[derive(Debug)]
+pub struct KvmInstance {
+    kvm: Arc<LockedVm>,
+    metadata: Metadata,
+}
+
+impl KvmInstance {
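+    // VM-fd ioctl numbers, matching the Linux KVM UAPI values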
+    const KVM_CREATE_VCPU: u32 = 0xAE41;
+    const KVM_SET_USER_MEMORY_REGION: u32 = 0x4020AE46;
+
+    pub fn new(vm: Arc<LockedVm>) -> Arc<Self> {
+        Arc::new(Self {
+            kvm: vm,
+            metadata: Metadata {
+                dev_id: 1,
+                inode_id: generate_inode_id(),
+                size: 0,
+                blk_size: 0,
+                blocks: 0,
+                atime: PosixTimeSpec::default(),
+                mtime: PosixTimeSpec::default(),
+                ctime: PosixTimeSpec::default(),
+                file_type: FileType::KvmDevice,
+                mode: ModeType::S_IALLUGO,
+                nlinks: 1,
+                uid: 0,
+                gid: 0,
+                raw_dev: DeviceNumber::default(), // used as the device number
+            },
+        })
+    }
+}
+
+impl IndexNode for KvmInstance {
+    fn open(
+        &self,
+        _data: crate::libs::spinlock::SpinLockGuard<crate::filesystem::vfs::FilePrivateData>,
+        _mode: &crate::filesystem::vfs::file::FileMode,
+    ) -> Result<(), SystemError> {
+        Ok(())
+    }
+
+    #[inline(never)]
+    fn ioctl(
+        &self,
+        cmd: u32,
+        arg: usize,
+        _private_data: &crate::filesystem::vfs::FilePrivateData,
+    ) -> Result<usize, SystemError> {
+        debug!("kvm instance ioctl cmd {cmd:x}");
+        match cmd {
+            Self::KVM_CREATE_VCPU => {
+                let ret = self.kvm.lock().create_vcpu(arg);
+                debug!("[KVM] create vcpu fd {ret:?}");
+                return ret;
+            }
+
+            Self::KVM_SET_USER_MEMORY_REGION => {
+                debug!("[KVM-INSTANCE] KVM_SET_USER_MEMORY_REGION");
+                let user_reader = UserBufferReader::new(
+                    arg as *const PosixKvmUserspaceMemoryRegion,
+                    core::mem::size_of::<PosixKvmUserspaceMemoryRegion>(),
+                    true,
+                )?;
+
+                let region = user_reader.read_one_from_user::<PosixKvmUserspaceMemoryRegion>(0)?;
+
+                self.kvm
+                    .lock()
+                    .set_memory_region(KvmUserspaceMemoryRegion::from_posix(region)?)?;
+
+                return Ok(0);
+            }
+
+            _ => {
+                // arch_ioctl
+            }
+        }
+
+        todo!()
+    }
+
+    fn read_at(
+        &self,
+        _offset: usize,
+        _len: usize,
+        _buf: &mut [u8],
+        _data: crate::libs::spinlock::SpinLockGuard<crate::filesystem::vfs::FilePrivateData>,
+    ) -> Result<usize, SystemError> {
+        todo!()
+    }
+
+    fn write_at(
+        &self,
+        _offset: usize,
+        _len: usize,
+        _buf: &[u8],
+        _data: crate::libs::spinlock::SpinLockGuard<crate::filesystem::vfs::FilePrivateData>,
+    ) -> Result<usize, SystemError> {
+        todo!()
+    }
+
+    fn fs(&self) -> Arc<dyn crate::filesystem::vfs::FileSystem> {
+        todo!()
+    }
+
+    fn as_any_ref(&self) -> &dyn core::any::Any {
+        todo!()
+    }
+
+    fn list(&self) -> Result<alloc::vec::Vec<alloc::string::String>, SystemError> {
+        todo!()
+    }
+
+    fn metadata(&self) -> Result<Metadata, SystemError> {
+        Ok(self.metadata.clone())
+    }
+
+    fn close(
+        &self,
+        _data: crate::libs::spinlock::SpinLockGuard<crate::filesystem::vfs::FilePrivateData>,
+    ) -> Result<(), SystemError> {
+        Ok(())
+    }
+}
+
+#[derive(Debug)]
+pub struct KvmVcpuDev {
+    vcpu: Arc<LockedVirtCpu>,
+    /// Inode metadata
+    metadata: Metadata,
+}
+
+impl KvmVcpuDev {
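+    // vCPU-fd ioctl numbers, matching the Linux KVM UAPI values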
+    const KVM_RUN: u32 = 0xAE80;
+    const KVM_GET_REGS: u32 = 0x8090AE81;
+    const KVM_SET_REGS: u32 = 0x4090AE82;
+    const KVM_GET_SREGS: u32 = 0x8138AE83;
+    const KVM_SET_SREGS: u32 = 0x4138AE84;
+
+    pub fn new(vcpu: Arc<LockedVirtCpu>) -> Arc<Self> {
+        Arc::new(Self {
+            vcpu,
+            metadata: Metadata {
+                dev_id: 1,
+                inode_id: generate_inode_id(),
+                size: 0,
+                blk_size: 0,
+                blocks: 0,
+                atime: PosixTimeSpec::default(),
+                mtime: PosixTimeSpec::default(),
+                ctime: PosixTimeSpec::default(),
+                file_type: FileType::KvmDevice,
+                mode: ModeType::S_IALLUGO,
+                nlinks: 1,
+                uid: 0,
+                gid: 0,
+                raw_dev: DeviceNumber::default(), // used as the device number
+            },
+        })
+    }
+}
+
+impl IndexNode for KvmVcpuDev {
+    fn open(
+        &self,
+        _data: crate::libs::spinlock::SpinLockGuard<crate::filesystem::vfs::FilePrivateData>,
+        _mode: &FileMode,
+    ) -> Result<(), SystemError> {
+        Ok(())
+    }
+
+    fn close(
+        &self,
+        _data: crate::libs::spinlock::SpinLockGuard<crate::filesystem::vfs::FilePrivateData>,
+    ) -> Result<(), SystemError> {
+        Ok(())
+    }
+
+    fn ioctl(
+        &self,
+        cmd: u32,
+        arg: usize,
+        _private_data: &crate::filesystem::vfs::FilePrivateData,
+    ) -> Result<usize, SystemError> {
+        match cmd {
+            Self::KVM_RUN => {
+                if arg != 0 {
+                    return Err(SystemError::EINVAL);
+                }
+                let mut vcpu = self.vcpu.lock();
+                let oldpid = vcpu.pid;
+                if unlikely(oldpid != Some(ProcessManager::current_pid())) {
+                    vcpu.pid = Some(ProcessManager::current_pid());
+                }
+
+                return vcpu.run();
+            }
+            Self::KVM_GET_REGS => {
+                let kvm_regs = self.vcpu.lock().get_regs();
+                let mut user_writer = UserBufferWriter::new(
+                    arg as *const KvmCommonRegs as *mut KvmCommonRegs,
+                    core::mem::size_of::<KvmCommonRegs>(),
+                    true,
+                )?;
+
+                user_writer.copy_one_to_user(&kvm_regs, 0)?;
+                return Ok(0);
+            }
+
+            Self::KVM_SET_REGS => {
+                let user_reader = UserBufferReader::new(
+                    arg as *const KvmCommonRegs,
+                    core::mem::size_of::<KvmCommonRegs>(),
+                    true,
+                )?;
+
+                let regs = user_reader.read_one_from_user::<KvmCommonRegs>(0)?;
+
+                self.vcpu.lock().set_regs(regs)?;
+
+                return Ok(0);
+            }
+
+            Self::KVM_GET_SREGS => {
+                let sregs = self.vcpu.lock().get_segment_regs();
+
+                let mut writer = UserBufferWriter::new(
+                    arg as *const UapiKvmSegmentRegs as *mut UapiKvmSegmentRegs,
+                    core::mem::size_of::<UapiKvmSegmentRegs>(),
+                    true,
+                )?;
+
+                writer.copy_one_to_user(&sregs, 0)?;
+
+                return Ok(0);
+            }
+
+            Self::KVM_SET_SREGS => {
+                let user_reader = UserBufferReader::new(
+                    arg as *const UapiKvmSegmentRegs,
+                    core::mem::size_of::<UapiKvmSegmentRegs>(),
+                    true,
+                )?;
+
+                let mut sreg = UapiKvmSegmentRegs::default();
+                user_reader.copy_one_from_user(&mut sreg, 0)?;
+
+                if let Ok(_res) = self.vcpu.lock().set_segment_regs(&mut sreg) {
+                    return Ok(0);
+                } else {
+                    debug!("set segment regs failed");
+                    return Err(SystemError::EINVAL);
+                }
+            }
+
+            _ => {
+                // arch ioctl
+                warn!("[KVM-VCPU] unknown ioctl cmd {cmd:x}");
+            }
+        }
+
+        Ok(0)
+    }
+
+    fn metadata(&self) -> Result<Metadata, SystemError> {
+        Ok(self.metadata.clone())
+    }
+
+    fn read_at(
+        &self,
+        _offset: usize,
+        _len: usize,
+        _buf: &mut [u8],
+        _data: crate::libs::spinlock::SpinLockGuard<crate::filesystem::vfs::FilePrivateData>,
+    ) -> Result<usize, SystemError> {
+        todo!()
+    }
+
+    fn write_at(
+        &self,
+        _offset: usize,
+        _len: usize,
+        _buf: &[u8],
+        _data: crate::libs::spinlock::SpinLockGuard<crate::filesystem::vfs::FilePrivateData>,
+    ) -> Result<usize, SystemError> {
+        todo!()
+    }
+
+    fn fs(&self) -> Arc<dyn crate::filesystem::vfs::FileSystem> {
+        todo!()
+    }
+
+    fn as_any_ref(&self) -> &dyn core::any::Any {
+        todo!()
+    }
+
+    fn list(&self) -> Result<alloc::vec::Vec<alloc::string::String>, SystemError> {
+        todo!()
+    }
+}
+
+pub fn kvm_init() -> Result<(), SystemError> {
+    let kvm_inode = LockedKvmInode::new();
+
+    devfs_register("kvm", kvm_inode)?;
+
+    Ok(())
+}

+ 714 - 0
kernel/src/virt/vm/kvm_host/mem.rs

@@ -0,0 +1,714 @@
+use alloc::{
+    sync::{Arc, Weak},
+    vec::Vec,
+};
+use bitmap::AllocBitmap;
+use hashbrown::HashMap;
+use log::debug;
+use system_error::SystemError;
+
+use crate::{
+    arch::{vm::mmu::kvm_mmu::PAGE_SIZE, MMArch},
+    libs::{
+        rbtree::RBTree,
+        rwlock::{RwLock, RwLockReadGuard, RwLockWriteGuard},
+        spinlock::{SpinLock, SpinLockGuard},
+    },
+    mm::{kernel_mapper::KernelMapper, page::EntryFlags, MemoryManagementArch, VirtAddr},
+    virt::{
+        kvm::host_mem::PAGE_SHIFT,
+        vm::{kvm_host::KVM_ADDRESS_SPACE_NUM, user_api::KvmUserspaceMemoryRegion},
+    },
+};
+
+use super::{LockedVm, Vm};
+
+pub const KVM_USER_MEM_SLOTS: u16 = u16::MAX;
+pub const KVM_INTERNAL_MEM_SLOTS: u16 = 3;
+pub const KVM_MEM_SLOTS_NUM: u16 = KVM_USER_MEM_SLOTS - KVM_INTERNAL_MEM_SLOTS;
+pub const KVM_MEM_MAX_NR_PAGES: usize = (1 << 31) - 1;
+// pub const APIC_ACCESS_PAGE_PRIVATE_MEMSLOT: u16 = KVM_MEM_SLOTS_NUM + 1;
+
+/// For a normal page frame number (PFN) the top 12 bits are zero, so bits
+/// 62..52 can be masked to encode error PFNs, and bit 63 to encode a
+/// "no slot" PFN.
+// const KVM_PFN_ERR_MASK: u64 = 0x7ff << 52; //0x7FF0000000000000
+// const KVM_PFN_ERR_NOSLOT_MASK: u64 = 0xfff << 52; //0xFFF0000000000000
+// const KVM_PFN_NOSLOT: u64 = 1 << 63; //0x8000000000000000
+
+// const KVM_PFN_ERR_FAULT: u64 = KVM_PFN_ERR_MASK;
+// const KVM_PFN_ERR_HWPOISON: u64 = KVM_PFN_ERR_MASK + 1;
+// const KVM_PFN_ERR_RO_FAULT: u64 = KVM_PFN_ERR_MASK + 2;
+// const KVM_PFN_ERR_SIGPENDING: u64 = KVM_PFN_ERR_MASK + 3;
+
+#[derive(Debug, Default)]
+#[allow(dead_code)]
+pub struct KvmMmuMemoryCache {
+    gfp_zero: u32,
+    gfp_custom: u32,
+    capacity: usize,
+    nobjs: usize,
+    objects: Option<Vec<u8>>,
+}
+impl KvmMmuMemoryCache {
+    #[allow(dead_code)]
+    pub fn kvm_mmu_totup_memory_cache(
+        &mut self,
+        _capacity: usize,
+        _min: usize,
+    ) -> Result<(), SystemError> {
+        // let gfp = if self.gfp_custom != 0 {
+        //     self.gfp_custom
+        // } else {
+        //     todo!();
+        // };
+
+        // if self.nobjs >= min {
+        //     return Ok(());
+        // }
+
+        // if unlikely(self.objects.is_none()) {
+        //     if self.capacity == 0 {
+        //         return Err(SystemError::EIO);
+        //     }
+
+        //     // self.objects = Some(Box::new)
+        // }
+
+        Ok(())
+    }
+}
+
+#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Default)]
+pub struct AddrRange {
+    pub start: VirtAddr,
+    pub last: VirtAddr,
+}
+
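+/// One generation of memory slots for a single address space. Slots can be
+/// looked up by host virtual address (`hva_tree`), by guest frame number
+/// (`gfn_tree`) or directly by slot id (`slots`).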
+#[derive(Debug, Default)]
+pub struct KvmMemSlotSet {
+    /// The most recently used memory slot
+    pub last_use: Option<Arc<LockedKvmMemSlot>>,
+    /// Maps host virtual address (hva) ranges to memory slots
+    hva_tree: RBTree<AddrRange, Arc<LockedKvmMemSlot>>,
+    /// Maps guest frame numbers (gfn) to memory slots
+    pub gfn_tree: RBTree<u64, Arc<LockedKvmMemSlot>>,
+    /// Maps memory slot ids to the corresponding slots
+    slots: HashMap<u16, Arc<LockedKvmMemSlot>>,
+
+    pub node_idx: usize,
+    pub generation: u64,
+}
+
+impl KvmMemSlotSet {
+    pub fn get_slot(&self, id: u16) -> Option<Arc<LockedKvmMemSlot>> {
+        self.slots.get(&id).cloned()
+    }
+}
+
+#[derive(Debug)]
+pub struct LockedKvmMemSlot {
+    inner: RwLock<KvmMemSlot>,
+}
+
+impl LockedKvmMemSlot {
+    pub fn new() -> Arc<Self> {
+        Arc::new(Self {
+            inner: RwLock::new(KvmMemSlot::default()),
+        })
+    }
+
+    #[inline]
+    pub fn read(&self) -> RwLockReadGuard<KvmMemSlot> {
+        self.inner.read()
+    }
+
+    #[inline]
+    pub fn write(&self) -> RwLockWriteGuard<KvmMemSlot> {
+        self.inner.write()
+    }
+
+    #[inline]
+    pub fn copy_from(&self, other: &Arc<LockedKvmMemSlot>) {
+        let mut guard = self.write();
+        let other = other.read();
+
+        guard.base_gfn = other.base_gfn;
+        guard.npages = other.npages;
+
+        guard.dirty_bitmap = other.dirty_bitmap.clone();
+        guard.arch = other.arch;
+        guard.userspace_addr = other.userspace_addr;
+        guard.flags = other.flags;
+        guard.id = other.id;
+        guard.as_id = other.as_id;
+    }
+}
+
+#[derive(Debug, Default)]
+pub struct KvmMemSlot {
+    /// First guest frame number covered by this slot
+    pub base_gfn: u64,
+    /// Number of pages
+    pub npages: usize,
+    /// Dirty-page bitmap
+    dirty_bitmap: Option<AllocBitmap>,
+    /// Architecture-specific data
+    arch: (),
+    userspace_addr: VirtAddr,
+    flags: UserMemRegionFlag,
+    id: u16,
+    as_id: u16,
+
+    hva_node_key: [AddrRange; 2],
+}
+#[allow(dead_code)]
+impl KvmMemSlot {
+    pub fn check_aligned_addr(&self, align: usize) -> bool {
+        self.userspace_addr.data() % align == 0
+    }
+    pub fn get_flags(&self) -> UserMemRegionFlag {
+        self.flags
+    }
+    pub fn get_id(&self) -> u16 {
+        self.id
+    }
+    // Check whether the memory slot is visible to userspace
+    pub fn is_visible(&self) -> bool {
+        self.id < KVM_USER_MEM_SLOTS
+            && (self.flags.bits() & UserMemRegionFlag::KVM_MEMSLOT_INVALID.bits()) == 0
+    }
+}
+
+#[derive(Debug)]
+pub struct LockedVmMemSlotSet {
+    inner: SpinLock<KvmMemSlotSet>,
+}
+
+impl LockedVmMemSlotSet {
+    pub fn new(slots: KvmMemSlotSet) -> Arc<Self> {
+        Arc::new(Self {
+            inner: SpinLock::new(slots),
+        })
+    }
+
+    pub fn lock(&self) -> SpinLockGuard<KvmMemSlotSet> {
+        self.inner.lock()
+    }
+}
+
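+/// Cached translation from a guest physical address to host virtual addresses,
+/// avoiding a full memslot lookup on every access.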
+#[derive(Debug, Default)]
+#[allow(dead_code)]
+pub struct GfnToHvaCache {
+    generation: u64,
+    /// Guest physical address (GPA)
+    gpa: u64,
+    /// User-space host virtual address (UHVA)
+    uhva: Option<u64>,
+    /// Kernel-space host virtual address (KHVA)
+    khva: u64,
+    /// Memory slot backing this cache entry
+    memslot: Option<Arc<LockedKvmMemSlot>>,
+    /// Corresponding page frame number (PFN)
+    pfn: Option<u64>,
+    /// How this cache entry is used
+    usage: PfnCacheUsage,
+    /// Whether the entry is active
+    active: bool,
+    /// Whether the entry is valid
+    valid: bool,
+    vm: Option<Weak<LockedVm>>,
+}
+
+impl GfnToHvaCache {
+    pub fn init(vm: Weak<LockedVm>, usage: PfnCacheUsage) -> Self {
+        // check_stack_usage();
+        // let mut ret: Box<GfnToHvaCache> = unsafe { Box::new_zeroed().assume_init() };
+        // ret.usage = usage;
+        // ret.vm = Some(vm);
+        // *ret
+        Self {
+            usage,
+            vm: Some(vm),
+            ..Default::default()
+        }
+    }
+}
+
+bitflags! {
+    #[derive(Default)]
+    pub struct PfnCacheUsage: u8 {
+        const GUEST_USES_PFN = 1 << 0;
+        const HOST_USES_PFN = 1 << 1;
+        const GUEST_AND_HOST_USES_PFN = Self::GUEST_USES_PFN.bits | Self::HOST_USES_PFN.bits;
+    }
+
+    pub struct UserMemRegionFlag: u32 {
+        /// Enable dirty-page logging for the region
+        const LOG_DIRTY_PAGES = 1 << 0;
+        /// Map the region read-only
+        const READONLY = 1 << 1;
+        /// Mark the slot as invalid
+        const KVM_MEMSLOT_INVALID = 1 << 16;
+    }
+}
+
+impl Default for UserMemRegionFlag {
+    fn default() -> Self {
+        Self::empty()
+    }
+}
+
+#[derive(PartialEq, Eq, Debug, Clone, Copy)]
+pub enum KvmMemoryChangeMode {
+    Create,
+    Delete,
+    Move,
+    FlagsOnly,
+}
+
+impl Vm {
+    #[inline(never)]
+    pub fn set_memory_region(&mut self, mem: KvmUserspaceMemoryRegion) -> Result<(), SystemError> {
+        if mem.slot >= u16::MAX as u32 {
+            return Err(SystemError::EINVAL);
+        }
+
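+        // `slot` encodes the address-space id in its upper 16 bits and the
+        // memslot id in its lower 16 bits.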
+        let as_id = mem.slot >> 16;
+        let id = mem.slot as u16;
+
+        // Check that the size is page-aligned (the 32-bit truncation check
+        // below is a no-op on 64-bit for now)
+        if (mem.memory_size as usize & (MMArch::PAGE_SIZE - 1) != 0)
+            || mem.memory_size != mem.memory_size as usize as u64
+        {
+            return Err(SystemError::EINVAL);
+        }
+
+        if !mem.guest_phys_addr.check_aligned(MMArch::PAGE_SIZE) {
+            return Err(SystemError::EINVAL);
+        }
+
+        if !mem.userspace_addr.check_aligned(MMArch::PAGE_SIZE) {
+            // TODO: the whole range userspace_addr..userspace_addr+memory_size should also be validated
+            return Err(SystemError::EINVAL);
+        }
+
+        if as_id >= KVM_ADDRESS_SPACE_NUM as u32 || id >= KVM_MEM_SLOTS_NUM {
+            return Err(SystemError::EINVAL);
+        }
+
+        if (mem.memory_size >> MMArch::PAGE_SHIFT) > KVM_MEM_MAX_NR_PAGES as u64 {
+            return Err(SystemError::EINVAL);
+        }
+
+        let slots = self.memslot_set(as_id as usize).clone();
+
+        let slots_guard = slots.lock();
+        let old = slots_guard.get_slot(id);
+        if mem.memory_size == 0 {
+            if let Some(old) = &old {
+                let old_npages = old.read().npages;
+                if old_npages == 0 {
+                    return Err(SystemError::EINVAL);
+                }
+
+                if self.nr_memslot_pages < old_npages {
+                    return Err(SystemError::EIO);
+                }
+                drop(slots_guard);
+                return self.set_memslot(Some(old), None, KvmMemoryChangeMode::Delete);
+            } else {
+                return Err(SystemError::EINVAL);
+            }
+        }
+
+        let base_gfn = (mem.guest_phys_addr.data() >> MMArch::PAGE_SHIFT) as u64;
+        let npages = mem.memory_size >> MMArch::PAGE_SHIFT;
+
+        let change;
+        if let Some(old) = &old {
+            let old_guard = old.read();
+            if old_guard.npages == 0 {
+                change = KvmMemoryChangeMode::Create;
+                // Guard against overflow
+                if let Some(new_pages) = self.nr_memslot_pages.checked_add(npages as usize) {
+                    if new_pages < self.nr_memslot_pages {
+                        return Err(SystemError::EINVAL);
+                    }
+                } else {
+                    return Err(SystemError::EINVAL);
+                }
+            } else {
+                if mem.userspace_addr != old_guard.userspace_addr
+                    || npages != old_guard.npages as u64
+                    || (mem.flags ^ old_guard.flags).contains(UserMemRegionFlag::READONLY)
+                {
+                    return Err(SystemError::EINVAL);
+                }
+
+                if base_gfn != old_guard.base_gfn {
+                    change = KvmMemoryChangeMode::Move;
+                } else if mem.flags != old_guard.flags {
+                    change = KvmMemoryChangeMode::FlagsOnly;
+                } else {
+                    return Ok(());
+                }
+            }
+        } else {
+            change = KvmMemoryChangeMode::Create;
+            // Guard against overflow
+            if let Some(new_pages) = self.nr_memslot_pages.checked_add(npages as usize) {
+                if new_pages < self.nr_memslot_pages {
+                    return Err(SystemError::EINVAL);
+                }
+            } else {
+                return Err(SystemError::EINVAL);
+            }
+        };
+
+        if (change == KvmMemoryChangeMode::Create || change == KvmMemoryChangeMode::Move)
+            && slots_guard.gfn_tree.contains_key(&base_gfn)
+        {
+            return Err(SystemError::EEXIST);
+        }
+
+        let new = LockedKvmMemSlot::new();
+        let mut new_guard = new.write();
+
+        new_guard.as_id = as_id as u16;
+        new_guard.id = id;
+        new_guard.base_gfn = base_gfn;
+        new_guard.npages = npages as usize;
+        new_guard.flags = mem.flags;
+        new_guard.userspace_addr = mem.userspace_addr;
+
+        drop(new_guard);
+        drop(slots_guard);
+        return self.set_memslot(old.as_ref(), Some(&new), change);
+    }
+
+    #[allow(clippy::modulo_one)]
+    #[inline]
+    /// Return the active memory slot set for the given address space
+    fn memslot_set(&self, id: usize) -> &Arc<LockedVmMemSlotSet> {
+        // Avoid going out of range
+        let id = id % KVM_ADDRESS_SPACE_NUM;
+        &self.memslots[id]
+    }
+
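+    /// Apply a memslot change: DELETE and MOVE first publish an invalidated
+    /// copy of the old slot, then the change is prepared and the matching
+    /// helper swaps the new layout into the active slot set.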
+    #[inline(never)]
+    fn set_memslot(
+        &mut self,
+        old: Option<&Arc<LockedKvmMemSlot>>,
+        new: Option<&Arc<LockedKvmMemSlot>>,
+        change: KvmMemoryChangeMode,
+    ) -> Result<(), SystemError> {
+        let invalid_slot = LockedKvmMemSlot::new();
+        if change == KvmMemoryChangeMode::Delete || change == KvmMemoryChangeMode::Move {
+            self.invalidate_memslot(old.unwrap(), &invalid_slot)
+        }
+
+        match self.prepare_memory_region(old, new, change) {
+            Ok(_) => {}
+            Err(e) => {
+                if change == KvmMemoryChangeMode::Delete || change == KvmMemoryChangeMode::Move {
+                    self.active_memslot(Some(&invalid_slot), old)
+                }
+                return Err(e);
+            }
+        }
+
+        match change {
+            KvmMemoryChangeMode::Create => self.create_memslot(new),
+            KvmMemoryChangeMode::Delete => self.delete_memslot(old, &invalid_slot),
+            KvmMemoryChangeMode::Move => self.move_memslot(old, new, &invalid_slot),
+            KvmMemoryChangeMode::FlagsOnly => self.update_flags_memslot(old, new),
+        }
+
+        // TODO:kvm_commit_memory_region(kvm, old, new, change);
+        Ok(())
+    }
+
+    fn create_memslot(&mut self, new: Option<&Arc<LockedKvmMemSlot>>) {
+        self.replace_memslot(None, new);
+        self.active_memslot(None, new);
+    }
+
+    fn delete_memslot(
+        &mut self,
+        old: Option<&Arc<LockedKvmMemSlot>>,
+        invalid_slot: &Arc<LockedKvmMemSlot>,
+    ) {
+        self.replace_memslot(old, None);
+        self.active_memslot(Some(invalid_slot), None);
+    }
+
+    fn move_memslot(
+        &mut self,
+        old: Option<&Arc<LockedKvmMemSlot>>,
+        new: Option<&Arc<LockedKvmMemSlot>>,
+        invalid_slot: &Arc<LockedKvmMemSlot>,
+    ) {
+        self.replace_memslot(old, new);
+        self.active_memslot(Some(invalid_slot), new);
+    }
+
+    fn update_flags_memslot(
+        &mut self,
+        old: Option<&Arc<LockedKvmMemSlot>>,
+        new: Option<&Arc<LockedKvmMemSlot>>,
+    ) {
+        self.replace_memslot(old, new);
+        self.active_memslot(old, new);
+    }
+
+    fn prepare_memory_region(
+        &self,
+        old: Option<&Arc<LockedKvmMemSlot>>,
+        new: Option<&Arc<LockedKvmMemSlot>>,
+        change: KvmMemoryChangeMode,
+    ) -> Result<(), SystemError> {
+        if change != KvmMemoryChangeMode::Delete {
+            let new = new.unwrap();
+            let mut new_guard = new.write();
+            if !new_guard.flags.contains(UserMemRegionFlag::LOG_DIRTY_PAGES) {
+                new_guard.dirty_bitmap = None;
+            } else if old.is_some() {
+                let old_guard = old.unwrap().read();
+                if old_guard.dirty_bitmap.is_some() {
+                    new_guard.dirty_bitmap = old_guard.dirty_bitmap.clone();
+                } else {
+                    new_guard.dirty_bitmap = Some(AllocBitmap::new(new_guard.npages * 2));
+                }
+            }
+        }
+
+        return self.arch_prepare_memory_region(old, new, change);
+    }
+
+    fn invalidate_memslot(
+        &mut self,
+        old: &Arc<LockedKvmMemSlot>,
+        invalid_slot: &Arc<LockedKvmMemSlot>,
+    ) {
+        invalid_slot.copy_from(old);
+
+        let mut old_guard = old.write();
+        let mut invalid_slot_guard = invalid_slot.write();
+        invalid_slot_guard
+            .flags
+            .insert(UserMemRegionFlag::KVM_MEMSLOT_INVALID);
+
+        self.swap_active_memslots(old_guard.as_id as usize);
+
+        old_guard.arch = invalid_slot_guard.arch;
+    }
+
+    #[inline(never)]
+    fn active_memslot(
+        &mut self,
+        old: Option<&Arc<LockedKvmMemSlot>>,
+        new: Option<&Arc<LockedKvmMemSlot>>,
+    ) {
+        let as_id = if let Some(slot) = old.or(new) {
+            slot.read().as_id
+        } else {
+            0
+        };
+
+        self.swap_active_memslots(as_id as usize);
+
+        self.replace_memslot(old, new);
+    }
+
+    #[inline(never)]
+    fn replace_memslot(
+        &self,
+        old: Option<&Arc<LockedKvmMemSlot>>,
+        new: Option<&Arc<LockedKvmMemSlot>>,
+    ) {
+        let as_id = if let Some(slot) = old.or(new) {
+            slot.read().as_id
+        } else {
+            0
+        };
+
+        let slot_set = self.get_inactive_memslot_set(as_id as usize);
+
+        let mut slots_guard = slot_set.lock();
+        let idx = slots_guard.node_idx;
+
+        if let Some(old) = old {
+            slots_guard.hva_tree.remove(&old.read().hva_node_key[idx]);
+
+            if let Some(last) = &slots_guard.last_use {
+                if Arc::ptr_eq(last, old) {
+                    slots_guard.last_use = new.cloned();
+                }
+            }
+
+            if new.is_none() {
+                slots_guard.gfn_tree.remove(&old.read().base_gfn);
+                return;
+            }
+        }
+
+        let new = new.unwrap();
+        let mut new_guard = new.write();
+        new_guard.hva_node_key[idx].start = new_guard.userspace_addr;
+        new_guard.hva_node_key[idx].last =
+            new_guard.userspace_addr + VirtAddr::new((new_guard.npages << MMArch::PAGE_SHIFT) - 1);
+
+        slots_guard
+            .hva_tree
+            .insert(new_guard.hva_node_key[idx], new.clone());
+
+        if let Some(old) = old {
+            slots_guard.gfn_tree.remove(&old.read().base_gfn);
+        }
+
+        slots_guard.gfn_tree.insert(new_guard.base_gfn, new.clone());
+    }
+
+    fn get_inactive_memslot_set(&self, as_id: usize) -> Arc<LockedVmMemSlotSet> {
+        let active = self.memslot_set(as_id);
+
+        let inactive_idx = active.lock().node_idx ^ 1;
+        return self.memslots_set[as_id][inactive_idx].clone();
+    }
+
+    fn swap_active_memslots(&mut self, as_id: usize) {
+        self.memslots[as_id] = self.get_inactive_memslot_set(as_id);
+    }
+}
+/// Translates the given guest frame number (GFN) into a userspace virtual address (HVA),
+/// performing the appropriate checks against the memslot's state and flags.
+///
+/// # Parameters
+/// - `slot`: optional `KvmMemSlot` describing the memory slot.
+/// - `gfn`: guest frame number (GFN) to translate.
+/// - `nr_pages`: optional mutable reference used to return the number of pages remaining in the slot.
+/// - `write`: whether the access is a write.
+///
+/// # Returns
+/// On success, the translated userspace virtual address (HVA); otherwise an error.
+///
+/// # Errors
+/// Returns `SystemError::KVM_HVA_ERR_BAD` if the slot is missing or invalid, or if a write
+/// is attempted on a read-only slot.
+pub fn __gfn_to_hva_many(
+    slot: &Option<&KvmMemSlot>,
+    gfn: u64,
+    nr_pages: Option<&mut u64>,
+    write: bool,
+) -> Result<u64, SystemError> {
+    debug!("__gfn_to_hva_many");
+
+    // Make sure a memslot was actually supplied
+    if slot.is_none() {
+        return Err(SystemError::KVM_HVA_ERR_BAD);
+    }
+    let slot = slot.as_ref().unwrap();
+
+    // Reject invalid slots and write accesses to read-only slots
+    if slot.flags.bits() & UserMemRegionFlag::KVM_MEMSLOT_INVALID.bits() != 0
+        || (slot.flags.bits() & UserMemRegionFlag::READONLY.bits() != 0) && write
+    {
+        return Err(SystemError::KVM_HVA_ERR_BAD);
+    }
+
+    // If `nr_pages` was supplied, compute and report the number of pages remaining in the slot
+    if let Some(nr_pages) = nr_pages {
+        *nr_pages = slot.npages as u64 - (gfn - slot.base_gfn);
+    }
+
+    // Let the helper do the actual GFN -> HVA arithmetic
+    return Ok(__gfn_to_hva_memslot(slot, gfn));
+}
+
+/// Translates the given guest frame number (GFN) into a userspace virtual address (HVA).
+///
+/// # Parameters
+/// - `slot`: the `KvmMemSlot` describing the memory slot.
+/// - `gfn`: guest frame number (GFN) to translate.
+///
+/// # Returns
+/// The translated userspace virtual address (HVA).
+fn __gfn_to_hva_memslot(slot: &KvmMemSlot, gfn: u64) -> u64 {
+    return slot.userspace_addr.data() as u64 + (gfn - slot.base_gfn) * PAGE_SIZE;
+}
+/// Translates the given guest frame number (GFN) into a host page frame number (PFN),
+/// performing the appropriate checks against the memslot's state and flags.
+///
+/// # Parameters
+/// - `slot`: optional reference to the memory slot.
+/// - `gfn`: guest frame number (GFN) to translate.
+/// - `atomic_or_async`: `(atomic, is_async)` pair: whether the operation must be atomic, plus a flag reporting whether it went asynchronous.
+/// - `interruptible`: whether the operation may be interrupted.
+/// - `write`: whether the access is a write.
+/// - `writable`: mutable reference reporting whether the mapping is writable.
+/// - `hva`: mutable reference receiving the userspace virtual address (HVA).
+///
+/// # Returns
+/// On success, the translated page frame number (PFN); otherwise an error.
+pub fn __gfn_to_pfn_memslot(
+    slot: Option<&KvmMemSlot>,
+    gfn: u64,
+    atomic_or_async: (bool, &mut bool),
+    interruptible: bool,
+    write: bool,
+    writable: &mut bool,
+    hva: &mut u64,
+) -> Result<u64, SystemError> {
+    let addr = __gfn_to_hva_many(&slot, gfn, None, write)?;
+    *hva = addr;
+
+    // TODO: check whether the address is an error value
+
+    // If the slot is read-only, report that the mapping is not writable
+    if slot.unwrap().flags.bits() & UserMemRegionFlag::READONLY.bits() != 0 {
+        *writable = false;
+    }
+
+    let pfn = hva_to_pfn(addr, atomic_or_async, interruptible, write, writable)?;
+    return Ok(pfn);
+}
+/// Translates a userspace virtual address (HVA) into a page frame number (PFN).
+///
+/// # Parameters
+/// - `addr`: userspace virtual address (HVA).
+/// - `atomic_or_async`: `(atomic, is_async)` pair: whether the operation must be atomic, plus a flag reporting whether it went asynchronous.
+/// - `interruptible`: whether the operation may be interrupted.
+/// - `write_fault`: whether the fault is a write.
+/// - `writable`: mutable reference reporting whether the mapping is writable.
+///
+/// # Returns
+/// On success, the translated page frame number (PFN); otherwise an error.
+// correctness still to be verified
+pub fn hva_to_pfn(
+    addr: u64,
+    atomic_or_async: (bool, &mut bool),
+    _interruptible: bool,
+    _write_fault: bool,
+    _writable: &mut bool,
+) -> Result<u64, SystemError> {
+    // We can run either atomically or asynchronously, but not both at once
+    assert!(
+        !(atomic_or_async.0 && *atomic_or_async.1),
+        "Cannot be both atomic and async"
+    );
+
+    debug!("hva_to_pfn");
+    // let hpa = MMArch::virt_2_phys(VirtAddr::new(addr)).unwrap().data() as u64;
+    let hva = VirtAddr::new(addr as usize);
+    let mut mapper = KernelMapper::lock();
+    let mapper = mapper.as_mut().unwrap();
+    if let Some((hpa, _)) = mapper.translate(hva) {
+        return Ok(hpa.data() as u64 >> PAGE_SHIFT);
+    }
+    debug!("hva_to_pfn NOT FOUND,try map a new pfn");
+    unsafe {
+        mapper.map(hva, EntryFlags::mmio_flags());
+    }
+    let (hpa, _) = mapper.translate(hva).unwrap();
+    return Ok(hpa.data() as u64 >> PAGE_SHIFT);
+}
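The address arithmetic used by __gfn_to_hva_memslot above boils down to hva = userspace_addr + (gfn - base_gfn) * PAGE_SIZE, guarded by a range check against the slot. Below is a minimal, self-contained sketch of that translation; it assumes 4 KiB pages and uses illustrative types rather than the kernel's KvmMemSlot.

// Illustrative sketch only; mirrors the gfn -> hva arithmetic, not kernel API.
const PAGE_SIZE: u64 = 4096; // assumption: 4 KiB pages

struct Slot {
    base_gfn: u64,       // first guest frame number covered by the slot
    npages: u64,         // number of pages in the slot
    userspace_addr: u64, // host virtual address backing the slot
}

fn gfn_to_hva(slot: &Slot, gfn: u64) -> Option<u64> {
    if gfn < slot.base_gfn || gfn >= slot.base_gfn + slot.npages {
        return None; // gfn is not covered by this slot
    }
    Some(slot.userspace_addr + (gfn - slot.base_gfn) * PAGE_SIZE)
}

fn main() {
    // A 1 MiB slot starting at guest physical address 0, backed at host VA 0x7f00_0000_0000.
    let slot = Slot { base_gfn: 0, npages: 256, userspace_addr: 0x7f00_0000_0000 };
    assert_eq!(gfn_to_hva(&slot, 3), Some(0x7f00_0000_3000));
    assert_eq!(gfn_to_hva(&slot, 256), None);
}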

+ 268 - 0
kernel/src/virt/vm/kvm_host/mod.rs

@@ -0,0 +1,268 @@
+use core::{
+    fmt::Debug,
+    sync::atomic::{AtomicUsize, Ordering},
+};
+
+use alloc::{
+    boxed::Box,
+    sync::{Arc, Weak},
+    vec::Vec,
+};
+use hashbrown::HashMap;
+use log::debug;
+use mem::LockedKvmMemSlot;
+use system_error::SystemError;
+
+use crate::{
+    arch::{
+        vm::{kvm_host::vcpu::VirtCpuRequest, vmx::KvmVmx, x86_kvm_manager},
+        CurrentKvmManager, KvmArch, VirtCpuArch,
+    },
+    filesystem::vfs::file::{File, FileMode},
+    libs::spinlock::{SpinLock, SpinLockGuard},
+    mm::ucontext::AddressSpace,
+    process::ProcessManager,
+    smp::cpu::ProcessorId,
+    virt::vm::{
+        kvm_dev::KvmVcpuDev,
+        kvm_host::vcpu::{LockedVirtCpu, VirtCpu},
+    },
+};
+
+use self::{
+    mem::{GfnToHvaCache, KvmMemSlotSet, LockedVmMemSlotSet, PfnCacheUsage},
+    vcpu::{GuestDebug, VcpuMode},
+};
+
+pub mod mem;
+pub mod vcpu;
+
+const KVM_ADDRESS_SPACE_NUM: usize = 1;
+pub const KVM_USERSAPCE_IRQ_SOURCE_ID: usize = 0;
+pub const KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID: usize = 1;
+
+#[derive(Debug)]
+pub struct LockedVm {
+    inner: SpinLock<Vm>,
+}
+
+static KVM_USAGE_COUNT: AtomicUsize = AtomicUsize::new(0);
+
+impl LockedVm {
+    pub fn lock(&self) -> SpinLockGuard<Vm> {
+        self.inner.lock()
+    }
+
+    pub fn create(vm_type: usize) -> Result<Arc<Self>, SystemError> {
+        let mut memslots_set = vec![];
+        let mut memslots = vec![];
+        for i in 0..KVM_ADDRESS_SPACE_NUM {
+            let mut tmp = vec![];
+            for j in 0..2 {
+                let mut slots = KvmMemSlotSet::default();
+                slots.last_use = None;
+                slots.node_idx = j;
+                slots.generation = i as u64;
+                tmp.push(LockedVmMemSlotSet::new(slots));
+            }
+            memslots_set.push(tmp);
+            memslots.push(memslots_set[i][0].clone());
+        }
+
+        let kvm = Vm {
+            mm: ProcessManager::current_pcb()
+                .basic()
+                .user_vm()
+                .unwrap()
+                .write()
+                .try_clone()?,
+            max_vcpus: CurrentKvmManager::KVM_MAX_VCPUS,
+            memslots_set,
+            memslots,
+            arch: KvmArch::init(vm_type)?,
+            created_vcpus: 0,
+            lock_vm_ref: Weak::new(),
+            nr_memslot_pages: 0,
+            online_vcpus: 0,
+            dirty_ring_size: 0,
+            dirty_ring_with_bitmap: false,
+            vcpus: HashMap::new(),
+            #[cfg(target_arch = "x86_64")]
+            kvm_vmx: KvmVmx::default(),
+            nr_memslots_dirty_logging: 0,
+            mmu_invalidate_seq: 0,
+        };
+
+        let ret = Arc::new(Self {
+            inner: SpinLock::new(kvm),
+        });
+
+        Self::hardware_enable_all()?;
+
+        ret.lock().lock_vm_ref = Arc::downgrade(&ret);
+        return Ok(ret);
+    }
+
+    fn hardware_enable_all() -> Result<(), SystemError> {
+        KVM_USAGE_COUNT.fetch_add(1, Ordering::SeqCst);
+
+        // The first user has to initialise the hardware on every CPU
+        if KVM_USAGE_COUNT.load(Ordering::SeqCst) == 1 {
+            // FIXME!!!!
+            // Every CPU should be initialised here; at the moment the init path only runs on the current CPU
+            x86_kvm_manager().arch_hardware_enable()?;
+        }
+
+        Ok(())
+    }
+}
+
+#[derive(Debug)]
+#[allow(dead_code)]
+pub struct Vm {
+    lock_vm_ref: Weak<LockedVm>,
+    mm: Arc<AddressSpace>,
+    max_vcpus: usize,
+    created_vcpus: usize,
+    online_vcpus: usize,
+    /// The set of vCPUs
+    vcpus: HashMap<usize, Arc<LockedVirtCpu>>,
+    // name: String,
+    /// Active and inactive memory slot sets; conceptually [[Arc<LockedVmMemSlots>; 2]; KVM_ADDRESS_SPACE_NUM], kept as a Vec for now
+    memslots_set: Vec<Vec<Arc<LockedVmMemSlotSet>>>,
+    /// Currently active memory slot sets; conceptually [Arc<LockedVmMemSlots>; KVM_ADDRESS_SPACE_NUM], kept as a Vec for now
+    pub memslots: Vec<Arc<LockedVmMemSlotSet>>,
+    /// Number of pages covered by the memory slots
+    nr_memslot_pages: usize,
+
+    pub arch: KvmArch,
+
+    pub dirty_ring_size: u32,
+    pub nr_memslots_dirty_logging: u32,
+    dirty_ring_with_bitmap: bool,
+
+    #[cfg(target_arch = "x86_64")]
+    pub kvm_vmx: KvmVmx,
+
+    pub mmu_invalidate_seq: u64, // MMU invalidation sequence number
+}
+
+impl Vm {
+    #[inline(never)]
+    pub fn create_vcpu(&mut self, id: usize) -> Result<usize, SystemError> {
+        if id >= self.max_vcpus {
+            return Err(SystemError::EINVAL);
+        }
+
+        if self.created_vcpus >= self.max_vcpus {
+            return Err(SystemError::EINVAL);
+        }
+
+        self.created_vcpus += 1;
+
+        let vcpu = self._create_vcpu(id)?;
+        if self.dirty_ring_size != 0 {
+            todo!()
+        }
+
+        vcpu.lock().vcpu_id = self.online_vcpus;
+
+        self.vcpus.insert(self.online_vcpus, vcpu.clone());
+
+        self.online_vcpus += 1;
+
+        let vcpu_inode = KvmVcpuDev::new(vcpu);
+
+        let file = File::new(vcpu_inode, FileMode::from_bits_truncate(0x777))?;
+
+        let fd = ProcessManager::current_pcb()
+            .fd_table()
+            .write()
+            .alloc_fd(file, None)?;
+
+        Ok(fd as usize)
+    }
+
+    /// ### Create a vCPU and initialise part of its state
+    #[inline(never)]
+    pub fn _create_vcpu(&mut self, id: usize) -> Result<Arc<LockedVirtCpu>, SystemError> {
+        let mut vcpu = self.new_vcpu(id);
+
+        vcpu.init_arch(self, id)?;
+
+        Ok(Arc::new(LockedVirtCpu::new(vcpu)))
+    }
+
+    #[inline(never)]
+    pub fn new_vcpu(&self, id: usize) -> VirtCpu {
+        return VirtCpu {
+            cpu: ProcessorId::INVALID,
+            kvm: Some(self.lock_vm_ref.clone()),
+            vcpu_id: id,
+            pid: None,
+            preempted: false,
+            ready: false,
+            last_used_slot: None,
+            stats_id: format!("kvm-{}/vcpu-{}", ProcessManager::current_pid().data(), id),
+            pv_time: GfnToHvaCache::init(self.lock_vm_ref.clone(), PfnCacheUsage::HOST_USES_PFN),
+            arch: VirtCpuArch::new(),
+            private: None,
+            request: VirtCpuRequest::empty(),
+            guest_debug: GuestDebug::empty(),
+            run: unsafe { Some(Box::new_zeroed().assume_init()) },
+            vcpu_idx: 0,
+            mode: VcpuMode::OutsideGuestMode,
+            stat: Default::default(),
+        };
+    }
+
+    #[cfg(target_arch = "x86_64")]
+    pub fn kvm_vmx_mut(&mut self) -> &mut KvmVmx {
+        &mut self.kvm_vmx
+    }
+
+    #[cfg(target_arch = "x86_64")]
+    pub fn kvm_vmx(&self) -> &KvmVmx {
+        &self.kvm_vmx
+    }
+}
+
+/// ## Multiprocessor states (some of these are not valid on every architecture)
+#[derive(Debug, Clone, Copy, PartialEq)]
+#[allow(dead_code)]
+pub enum MutilProcessorState {
+    Runnable,
+    Uninitialized,
+    InitReceived,
+    Halted,
+    SipiReceived,
+    Stopped,
+    CheckStop,
+    Operating,
+    Load,
+    ApResetHold,
+    Suspended,
+}
+/// Returns the memslot containing `gfn`, or `None` if none is found.
+/// When "approx" is set to true, a memslot is returned even if the address falls into a hole;
+/// in that case one of the memslots bordering the hole is returned.
+/// Simplistic first pass: this should be a binary search, for now it just iterates linearly.
+pub fn search_memslots(
+    slot_set: Arc<LockedVmMemSlotSet>,
+    gfn: u64, /*_approx:bool*/
+) -> Option<Arc<LockedKvmMemSlot>> {
+    let slots = slot_set.lock();
+    let node = &slots.gfn_tree;
+    //let(start,end)=(0,node.len()-1);
+    for (_gfn_num, slot) in node.iter() {
+        let slot_guard = slot.read();
+        debug!(
+            "gfn:{gfn},slot base_gfn: {},slot npages: {}",
+            slot_guard.base_gfn, slot_guard.npages
+        );
+        if gfn >= slot_guard.base_gfn && gfn < slot_guard.base_gfn + slot_guard.npages as u64 {
+            return Some(slot.clone());
+        }
+    }
+    return None;
+}
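The doc comment on search_memslots notes that the lookup should eventually be a binary search rather than a linear walk. Since gfn_tree is already an ordered map keyed by base_gfn, one possible O(log n) variant takes the greatest key not larger than gfn and then checks the slot's extent. A hedged sketch with simplified, illustrative types (not the kernel's LockedKvmMemSlot):

use std::collections::BTreeMap;

struct Slot {
    base_gfn: u64,
    npages: u64,
}

// Find the slot whose base_gfn is the largest key <= gfn, then verify that
// gfn actually falls inside that slot's range.
fn find_slot(gfn_tree: &BTreeMap<u64, Slot>, gfn: u64) -> Option<&Slot> {
    let (_, slot) = gfn_tree.range(..=gfn).next_back()?;
    if gfn < slot.base_gfn + slot.npages {
        Some(slot)
    } else {
        None // gfn lies in a hole between slots
    }
}

fn main() {
    let mut tree = BTreeMap::new();
    tree.insert(0u64, Slot { base_gfn: 0, npages: 16 });
    tree.insert(256u64, Slot { base_gfn: 256, npages: 16 });
    assert!(find_slot(&tree, 10).is_some());
    assert!(find_slot(&tree, 100).is_none()); // falls in the hole between the slots
}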

+ 117 - 0
kernel/src/virt/vm/kvm_host/vcpu.rs

@@ -0,0 +1,117 @@
+use alloc::{
+    boxed::Box,
+    string::String,
+    sync::{Arc, Weak},
+};
+
+use crate::{
+    arch::{
+        vm::{
+            kvm_host::{vcpu::VirtCpuRequest, KvmReg},
+            vmx::VmxVCpuPriv,
+        },
+        VirtCpuArch, VirtCpuStat,
+    },
+    libs::spinlock::{SpinLock, SpinLockGuard},
+    process::Pid,
+    smp::cpu::ProcessorId,
+    virt::vm::user_api::UapiKvmRun,
+};
+
+use super::{
+    mem::{GfnToHvaCache, KvmMemSlot},
+    LockedVm,
+};
+
+#[derive(Debug)]
+pub struct LockedVirtCpu {
+    inner: SpinLock<VirtCpu>,
+}
+
+impl LockedVirtCpu {
+    pub fn new(vcpu: VirtCpu) -> Self {
+        Self {
+            inner: SpinLock::new(vcpu),
+        }
+    }
+
+    pub fn lock(&self) -> SpinLockGuard<VirtCpu> {
+        self.inner.lock()
+    }
+}
+
+#[derive(Debug, PartialEq)]
+#[allow(dead_code)]
+pub enum VcpuMode {
+    OutsideGuestMode,
+    InGuestMode,
+    ExitingGuestMode,
+    ReadingShadowPageTables,
+}
+
+#[derive(Debug)]
+pub struct VirtCpu {
+    pub cpu: ProcessorId,
+    pub kvm: Option<Weak<LockedVm>>,
+    /// Supplied by user space
+    pub vcpu_id: usize,
+    /// Assigned by the id allocator
+    pub vcpu_idx: usize,
+    pub pid: Option<Pid>,
+    pub preempted: bool,
+    pub ready: bool,
+    pub last_used_slot: Option<Arc<KvmMemSlot>>,
+    pub stats_id: String,
+    pub pv_time: GfnToHvaCache,
+    pub arch: VirtCpuArch,
+    pub stat: VirtCpuStat,
+
+    pub mode: VcpuMode,
+
+    pub guest_debug: GuestDebug,
+
+    #[cfg(target_arch = "x86_64")]
+    pub private: Option<VmxVCpuPriv>,
+
+    /// Pending requests
+    pub request: VirtCpuRequest,
+    pub run: Option<Box<UapiKvmRun>>,
+}
+
+impl VirtCpu {
+    #[inline]
+    pub fn kvm(&self) -> Arc<LockedVm> {
+        self.kvm.as_ref().unwrap().upgrade().unwrap()
+    }
+
+    #[cfg(target_arch = "x86_64")]
+    pub fn vmx(&self) -> &VmxVCpuPriv {
+        self.private.as_ref().unwrap()
+    }
+
+    #[cfg(target_arch = "x86_64")]
+    pub fn vmx_mut(&mut self) -> &mut VmxVCpuPriv {
+        self.private.as_mut().unwrap()
+    }
+    //https://code.dragonos.org.cn/xref/linux-6.6.21/arch/x86/kvm/vmx/vmx.h?fi=vmx_get_exit_qual#677
+    #[inline]
+    pub fn get_exit_qual(&mut self) -> u64 {
+        if !self
+            .arch
+            .test_and_mark_available(KvmReg::VcpuExregExitInfo1)
+        {
+            self.vmx_mut().vmread_exit_qual();
+        }
+        let vmx = self.vmx();
+        vmx.get_exit_qual()
+        //vmx.
+    }
+}
+
+bitflags! {
+    pub struct GuestDebug: usize {
+        const ENABLE = 0x00000001;
+        const SINGLESTEP = 0x00000002;
+        const USE_SW_BP = 0x00010000;
+    }
+}

+ 3 - 0
kernel/src/virt/vm/mod.rs

@@ -0,0 +1,3 @@
+pub mod kvm_dev;
+pub mod kvm_host;
+pub mod user_api;

+ 466 - 0
kernel/src/virt/vm/user_api.rs

@@ -0,0 +1,466 @@
+//!
+//! This file defines the structures exposed to user space
+//!
+use core::fmt::Debug;
+
+use system_error::SystemError;
+
+use crate::mm::{PhysAddr, VirtAddr};
+
+use super::kvm_host::mem::UserMemRegionFlag;
+
+#[repr(C)]
+#[derive(Debug, Default, Copy, Clone, PartialEq)]
+pub struct UapiKvmSegment {
+    pub base: u64,
+    pub limit: u32,
+    pub selector: u16,
+    pub type_: u8,
+    pub present: u8,
+    pub dpl: u8,
+    pub db: u8,
+    pub s: u8,
+    pub l: u8,
+    pub g: u8,
+    pub avl: u8,
+    pub unusable: u8,
+    pub padding: u8,
+}
+
+impl UapiKvmSegment {
+    pub fn vmx_segment_access_rights(&self) -> u32 {
+        let mut ar = self.type_ as u32 & 15;
+        ar |= (self.s as u32 & 1) << 4;
+        ar |= (self.dpl as u32 & 3) << 5;
+        ar |= (self.present as u32 & 1) << 7;
+        ar |= (self.avl as u32 & 1) << 12;
+        ar |= (self.l as u32 & 1) << 13;
+        ar |= (self.db as u32 & 1) << 14;
+        ar |= (self.g as u32 & 1) << 15;
+
+        let b = self.unusable != 0 || self.present == 0;
+        ar |= (b as u32) << 16;
+
+        return ar;
+    }
+}
+
+/// Maps guest physical addresses of the virtual machine onto virtual addresses of the user process.
+/// Represents one region of the guest's physical memory.
+#[repr(C)]
+#[derive(Default)]
+pub struct PosixKvmUserspaceMemoryRegion {
+    /// The slot on which this memory region is registered
+    pub slot: u32,
+    /// flags takes two values, KVM_MEM_LOG_DIRTY_PAGES and KVM_MEM_READONLY, telling kvm how to treat this memory.
+    /// KVM_MEM_LOG_DIRTY_PAGES enables dirty-page tracking; KVM_MEM_READONLY makes the memory read-only.
+    pub flags: u32,
+    /// Starting guest physical address of the region
+    pub guest_phys_addr: u64,
+    /// Size of the region
+    pub memory_size: u64,
+    /// Host virtual address backing the region
+    pub userspace_addr: u64,
+}
+
+/// In-kernel counterpart of PosixKvmUserspaceMemoryRegion
+pub struct KvmUserspaceMemoryRegion {
+    /// The slot on which this memory region is registered
+    pub slot: u32,
+    /// Tells kvm how to treat this memory.
+    /// KVM_MEM_LOG_DIRTY_PAGES enables dirty-page tracking; KVM_MEM_READONLY makes the memory read-only.
+    pub flags: UserMemRegionFlag,
+    /// Starting guest physical address of the region
+    pub guest_phys_addr: PhysAddr,
+    /// Size of the region
+    pub memory_size: u64,
+    /// Host virtual address backing the region
+    pub userspace_addr: VirtAddr,
+}
+
+impl KvmUserspaceMemoryRegion {
+    pub fn from_posix(posix: &PosixKvmUserspaceMemoryRegion) -> Result<Self, SystemError> {
+        let flags = UserMemRegionFlag::from_bits(posix.flags).ok_or(SystemError::EINVAL)?;
+        Ok(Self {
+            slot: posix.slot,
+            flags,
+            guest_phys_addr: PhysAddr::new(posix.guest_phys_addr as usize),
+            memory_size: posix.memory_size,
+            userspace_addr: VirtAddr::new(posix.userspace_addr as usize),
+        })
+    }
+}
+
+#[repr(C)]
+#[derive(Copy, Clone, Debug)]
+pub struct UapiKvmRun {
+    pub request_interrupt_window: u8,
+    pub immediate_exit: u8,
+    pub padding1: [u8; 6usize],
+    pub exit_reason: u32,
+    pub ready_for_interrupt_injection: u8,
+    pub if_flag: u8,
+    pub flags: u16,
+    pub cr8: u64,
+    pub apic_base: u64,
+    pub __bindgen_anon_1: uapi_kvm_run__bindgen_ty_1,
+    pub kvm_valid_regs: u64,
+    pub kvm_dirty_regs: u64,
+    pub s: uapi_kvm_run__bindgen_ty_2,
+}
+
+#[repr(C)]
+#[derive(Copy, Clone)]
+pub union uapi_kvm_run__bindgen_ty_2 {
+    pub regs: UapiKvmSyncRegs,
+    pub padding: [u8; 2048usize],
+}
+
+impl Debug for uapi_kvm_run__bindgen_ty_2 {
+    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+        f.debug_struct("uapi_kvm_run__bindgen_ty_2").finish()
+    }
+}
+
+#[repr(C)]
+#[derive(Debug, Default, Copy, Clone, PartialEq)]
+pub struct UapiKvmSyncRegs {
+    pub device_irq_level: u64,
+}
+
+#[repr(C)]
+#[derive(Debug, Default, Copy, Clone, PartialEq)]
+pub struct UapiKvmRunBindgenTy1BindgenTy1 {
+    pub hardware_exit_reason: u64,
+}
+
+#[repr(C)]
+#[derive(Debug, Default, Copy, Clone, PartialEq)]
+pub struct UapiKvmRunBindgenTy1BindgenTy2 {
+    pub hardware_entry_failure_reason: u64,
+    pub cpu: u32,
+}
+
+#[repr(C)]
+#[derive(Debug, Default, Copy, Clone, PartialEq)]
+pub struct UapiKvmRunBindgenTy1BindgenTy3 {
+    pub exception: u32,
+    pub error_code: u32,
+}
+
+#[repr(C)]
+#[derive(Debug, Default, Copy, Clone, PartialEq)]
+pub struct UapiKvmRunBindgenTy1BindgenTy4 {
+    pub direction: u8,
+    pub size: u8,
+    pub port: u16,
+    pub count: u32,
+    pub data_offset: u64,
+}
+
+#[repr(C)]
+#[derive(Debug, Default, Copy, Clone, PartialEq)]
+pub struct UapiKvmDebugExitArch {
+    pub hsr: u32,
+    pub hsr_high: u32,
+    pub far: u64,
+}
+
+#[repr(C)]
+#[derive(Debug, Default, Copy, Clone, PartialEq)]
+pub struct UapiKvmRunBindgenTy1BindgenTy5 {
+    pub arch: UapiKvmDebugExitArch,
+}
+
+#[repr(C)]
+#[derive(Debug, Default, Copy, Clone, PartialEq)]
+pub struct UapiKvmRunBindgenTy1BindgenTy6 {
+    pub phys_addr: u64,
+    pub data: [u8; 8usize],
+    pub len: u32,
+    pub is_write: u8,
+}
+
+#[repr(C)]
+#[derive(Debug, Default, Copy, Clone, PartialEq)]
+pub struct UapiKvmRunBindgenTy1BindgenTy7 {
+    pub nr: u64,
+    pub args: [u64; 6usize],
+    pub ret: u64,
+    pub longmode: u32,
+    pub pad: u32,
+}
+
+#[repr(C)]
+#[derive(Debug, Default, Copy, Clone, PartialEq)]
+pub struct UapiKvmRunBindgenTy1BindgenTy8 {
+    pub rip: u64,
+    pub is_write: u32,
+    pub pad: u32,
+}
+
+#[repr(C)]
+#[derive(Debug, Default, Copy, Clone, PartialEq)]
+pub struct UapiKvmRunBindgenTy1BindgenTy9 {
+    pub icptcode: u8,
+    pub ipa: u16,
+    pub ipb: u32,
+}
+
+#[repr(C)]
+#[derive(Debug, Default, Copy, Clone, PartialEq)]
+pub struct UapiKvmRunBindgenTy1BindgenTy10 {
+    pub trans_exc_code: u64,
+    pub pgm_code: u32,
+}
+
+#[repr(C)]
+#[derive(Debug, Default, Copy, Clone, PartialEq)]
+pub struct UapiKvmRunBindgenTy1BindgenTy11 {
+    pub dcrn: u32,
+    pub data: u32,
+    pub is_write: u8,
+}
+
+#[repr(C)]
+#[derive(Debug, Default, Copy, Clone, PartialEq)]
+pub struct UapiKvmRunBindgenTy1BindgenTy12 {
+    pub suberror: u32,
+    pub ndata: u32,
+    pub data: [u64; 16usize],
+}
+
+#[repr(C)]
+#[derive(Copy, Clone)]
+pub struct UapiKvmRunBindgenTy1BindgenTy13 {
+    pub suberror: u32,
+    pub ndata: u32,
+    pub flags: u64,
+    pub __bindgen_anon_1: uapi_kvm_run__bindgen_ty_1__bindgen_ty_13__bindgen_ty_1,
+}
+
+#[repr(C)]
+#[derive(Copy, Clone)]
+pub union uapi_kvm_run__bindgen_ty_1__bindgen_ty_13__bindgen_ty_1 {
+    pub __bindgen_anon_1: UapiKvmRunBindgenTy1BindgenTy13BindgenTy1BindgenTy1,
+}
+
+#[repr(C)]
+#[derive(Debug, Default, Copy, Clone, PartialEq)]
+pub struct UapiKvmRunBindgenTy1BindgenTy13BindgenTy1BindgenTy1 {
+    pub insn_size: u8,
+    pub insn_bytes: [u8; 15usize],
+}
+
+#[repr(C)]
+#[derive(Debug, Default, Copy, Clone, PartialEq)]
+pub struct UapiKvmRunBindgenTy1BindgenTy14 {
+    pub gprs: [u64; 32usize],
+}
+
+#[repr(C)]
+#[derive(Debug, Default, Copy, Clone, PartialEq)]
+pub struct UapiKvmRunBindgenTy1BindgenTy15 {
+    pub nr: u64,
+    pub ret: u64,
+    pub args: [u64; 9usize],
+}
+
+#[repr(C)]
+#[derive(Debug, Default, Copy, Clone, PartialEq)]
+pub struct UapiKvmRunBindgenTy1BindgenTy16 {
+    pub subchannel_id: u16,
+    pub subchannel_nr: u16,
+    pub io_int_parm: u32,
+    pub io_int_word: u32,
+    pub ipb: u32,
+    pub dequeued: u8,
+}
+
+#[repr(C)]
+#[derive(Debug, Default, Copy, Clone, PartialEq)]
+pub struct UapiKvmRunBindgenTy1BindgenTy17 {
+    pub epr: u32,
+}
+
+#[repr(C)]
+#[derive(Copy, Clone)]
+pub struct UapiKvmRunBindgenTy1BindgenTy18 {
+    pub type_: u32,
+    pub ndata: u32,
+    pub __bindgen_anon_1: uapi_kvm_run__bindgen_ty_1__bindgen_ty_18__bindgen_ty_1,
+}
+
+#[repr(C)]
+#[derive(Copy, Clone)]
+pub union uapi_kvm_run__bindgen_ty_1__bindgen_ty_18__bindgen_ty_1 {
+    pub flags: u64,
+    pub data: [u64; 16usize],
+}
+
+#[repr(C)]
+#[derive(Debug, Default, Copy, Clone, PartialEq)]
+pub struct UapiKvmRunBindgenTy1BindgenTy19 {
+    pub addr: u64,
+    pub ar: u8,
+    pub reserved: u8,
+    pub fc: u8,
+    pub sel1: u8,
+    pub sel2: u16,
+}
+
+#[repr(C)]
+#[derive(Debug, Default, Copy, Clone, PartialEq)]
+pub struct UapiKvmRunBindgenTy1BindgenTy20 {
+    pub vector: u8,
+}
+
+#[repr(C)]
+#[derive(Debug, Default, Copy, Clone, PartialEq)]
+pub struct UapiKvmRunBindgenTy1BindgenTy21 {
+    pub esr_iss: u64,
+    pub fault_ipa: u64,
+}
+
+#[repr(C)]
+#[derive(Debug, Default, Copy, Clone, PartialEq)]
+pub struct UapiKvmRunBindgenTy1BindgenTy22 {
+    pub error: u8,
+    pub pad: [u8; 7usize],
+    pub reason: u32,
+    pub index: u32,
+    pub data: u64,
+}
+
+#[repr(C)]
+#[derive(Debug, Default, Copy, Clone, PartialEq)]
+pub struct UapiKvmRunBindgenTy1BindgenTy23 {
+    pub extension_id: usize,
+    pub function_id: usize,
+    pub args: [usize; 6usize],
+    pub ret: [usize; 2usize],
+}
+
+#[repr(C)]
+#[derive(Debug, Default, Copy, Clone, PartialEq)]
+pub struct UapiKvmRunBindgenTy1BindgenTy24 {
+    pub csr_num: usize,
+    pub new_value: usize,
+    pub write_mask: usize,
+    pub ret_value: usize,
+}
+
+#[repr(C)]
+#[derive(Debug, Default, Copy, Clone, PartialEq)]
+pub struct UapiKvmRunBindgenTy1BindgenTy25 {
+    pub flags: u32,
+}
+
+#[repr(C)]
+#[derive(Copy, Clone)]
+pub union uapi_kvm_run__bindgen_ty_1 {
+    pub hw: UapiKvmRunBindgenTy1BindgenTy1,
+    pub fail_entry: UapiKvmRunBindgenTy1BindgenTy2,
+    pub ex: UapiKvmRunBindgenTy1BindgenTy3,
+    pub io: UapiKvmRunBindgenTy1BindgenTy4,
+    pub debug: UapiKvmRunBindgenTy1BindgenTy5,
+    pub mmio: UapiKvmRunBindgenTy1BindgenTy6,
+    pub hypercall: UapiKvmRunBindgenTy1BindgenTy7,
+    pub tpr_access: UapiKvmRunBindgenTy1BindgenTy8,
+    pub s390_sieic: UapiKvmRunBindgenTy1BindgenTy9,
+    pub s390_reset_flags: u64,
+    pub s390_ucontrol: UapiKvmRunBindgenTy1BindgenTy10,
+    pub dcr: UapiKvmRunBindgenTy1BindgenTy11,
+    pub internal: UapiKvmRunBindgenTy1BindgenTy12,
+    pub emulation_failure: UapiKvmRunBindgenTy1BindgenTy13,
+    pub osi: UapiKvmRunBindgenTy1BindgenTy14,
+    pub papr_hcall: UapiKvmRunBindgenTy1BindgenTy15,
+    pub s390_tsch: UapiKvmRunBindgenTy1BindgenTy16,
+    pub epr: UapiKvmRunBindgenTy1BindgenTy17,
+    pub system_event: UapiKvmRunBindgenTy1BindgenTy18,
+    pub s390_stsi: UapiKvmRunBindgenTy1BindgenTy19,
+    pub eoi: UapiKvmRunBindgenTy1BindgenTy20,
+    pub hyperv: UapiKvmHypervExit,
+    pub arm_nisv: UapiKvmRunBindgenTy1BindgenTy21,
+    pub msr: UapiKvmRunBindgenTy1BindgenTy22,
+    pub xen: UapiKvmXenExit,
+    pub riscv_sbi: UapiKvmRunBindgenTy1BindgenTy23,
+    pub riscv_csr: UapiKvmRunBindgenTy1BindgenTy24,
+    pub notify: UapiKvmRunBindgenTy1BindgenTy25,
+    pub padding: [u8; 256usize],
+}
+
+impl Debug for uapi_kvm_run__bindgen_ty_1 {
+    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+        f.debug_struct("uapi_kvm_run__bindgen_ty_1").finish()
+    }
+}
+
+#[repr(C)]
+#[derive(Copy, Clone)]
+pub struct UapiKvmHypervExit {
+    pub type_: u32,
+    pub pad1: u32,
+    pub u: uapi_kvm_hyperv_exit__bindgen_ty_1,
+}
+
+#[repr(C)]
+#[derive(Copy, Clone)]
+pub union uapi_kvm_hyperv_exit__bindgen_ty_1 {
+    pub synic: UapiKvmHypervExitBindgenTy1BindgenTy1,
+    pub hcall: UapiKvmHypervExitBindgenTy1BindgenTy2,
+    pub syndbg: UapiKvmHypervExitBindgenTy1BindgenTy3,
+}
+
+#[repr(C)]
+#[derive(Debug, Default, Copy, Clone, PartialEq)]
+pub struct UapiKvmHypervExitBindgenTy1BindgenTy1 {
+    pub msr: u32,
+    pub pad2: u32,
+    pub control: u64,
+    pub evt_page: u64,
+    pub msg_page: u64,
+}
+
+#[repr(C)]
+#[derive(Debug, Default, Copy, Clone, PartialEq)]
+pub struct UapiKvmHypervExitBindgenTy1BindgenTy2 {
+    pub input: u64,
+    pub result: u64,
+    pub params: [u64; 2usize],
+}
+
+#[repr(C)]
+#[derive(Debug, Default, Copy, Clone, PartialEq)]
+pub struct UapiKvmHypervExitBindgenTy1BindgenTy3 {
+    pub msr: u32,
+    pub pad2: u32,
+    pub control: u64,
+    pub status: u64,
+    pub send_page: u64,
+    pub recv_page: u64,
+    pub pending_page: u64,
+}
+
+#[repr(C)]
+#[derive(Copy, Clone)]
+pub struct UapiKvmXenExit {
+    pub type_: u32,
+    pub u: uapi_kvm_xen_exit__bindgen_ty_1,
+}
+
+#[repr(C)]
+#[derive(Copy, Clone)]
+pub union uapi_kvm_xen_exit__bindgen_ty_1 {
+    pub hcall: UapiKvmXenExitBindgenTy1BindgenTy1,
+}
+
+#[repr(C)]
+#[derive(Debug, Default, Copy, Clone, PartialEq)]
+pub struct UapiKvmXenExitBindgenTy1BindgenTy1 {
+    pub longmode: u32,
+    pub cpl: u32,
+    pub input: u64,
+    pub result: u64,
+    pub params: [u64; 6usize],
+}
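vmx_segment_access_rights above packs the segment attributes into the VMX access-rights layout: type in bits 0-3, then S, DPL, P, AVL, L, D/B, G, and the unusable bit at bit 16. As a worked example, a flat 64-bit code segment with the field values chosen below packs to 0xA09B. The standalone sketch re-derives that constant using the same bit layout; the field values are illustrative, not taken from this patch:

// Re-implements the same packing as vmx_segment_access_rights for one example.
fn access_rights(type_: u32, s: u32, dpl: u32, present: u32,
                 avl: u32, l: u32, db: u32, g: u32, unusable: bool) -> u32 {
    let mut ar = type_ & 15;
    ar |= (s & 1) << 4;
    ar |= (dpl & 3) << 5;
    ar |= (present & 1) << 7;
    ar |= (avl & 1) << 12;
    ar |= (l & 1) << 13;
    ar |= (db & 1) << 14;
    ar |= (g & 1) << 15;
    ar |= ((unusable || present == 0) as u32) << 16;
    ar
}

fn main() {
    // 64-bit code segment: type=0xB (execute/read, accessed), S=1, DPL=0,
    // P=1, AVL=0, L=1, D/B=0, G=1, usable.
    assert_eq!(access_rights(0xB, 1, 0, 1, 0, 1, 0, 1, false), 0xA09B);
}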

+ 17 - 0
package-lock.json

@@ -0,0 +1,17 @@
+{
+  "name": "DragonOS",
+  "lockfileVersion": 3,
+  "requires": true,
+  "packages": {
+    "": {
+      "dependencies": {
+        "crypto-js": "^4.2.0"
+      }
+    },
+    "node_modules/crypto-js": {
+      "version": "4.2.0",
+      "resolved": "https://mirrors.huaweicloud.com/repository/npm/crypto-js/-/crypto-js-4.2.0.tgz",
+      "integrity": "sha512-KALDyEYgpY+Rlob/iriUtjV6d5Eq+Y191A5g4UqLAi8CyGP9N1+FdVbkc1SxKc2r4YAYqG8JzO2KGL+AizD70Q=="
+    }
+  }
+}

+ 0 - 3
tools/.gdbinit

@@ -1,3 +0,0 @@
-target remote localhost:1234
-file bin/kernel/kernel.elf
-set follow-fork-mode child

+ 521 - 96
user/apps/test_kvm/main.c

@@ -1,115 +1,540 @@
-/**
- * @file main.c
- * @author xiaoyez ([email protected])
- * @brief 测试kvm的程序
- * @version 0.1
- * @date 2023-07-13
- *
- * @copyright Copyright (c) 2023
- *
- */
-
-/**
- * 测试kvm命令的方法:
- * 1.在DragonOS的控制台输入 exec bin/test_kvm.elf
- *
- */
-#include <fcntl.h>
+
 #include <stdint.h>
 #include <stdio.h>
+#include <string.h>
 #include <sys/ioctl.h>
-#include <unistd.h>
+#include <sys/mman.h>
+#include <fcntl.h>
+//#include <linux/kvm.h>
+
+typedef __signed__ char __s8;
+typedef unsigned char __u8;
+
+typedef __signed__ short __s16;
+typedef unsigned short __u16;
+
+typedef __signed__ int __s32;
+typedef unsigned int __u32;
 
-#define KVM_CREATE_VCPU 0x00
-#define KVM_SET_USER_MEMORY_REGION 0x01
+#ifdef __GNUC__
+__extension__ typedef __signed__ long long __s64;
+__extension__ typedef unsigned long long __u64;
+#else
+typedef __signed__ long long __s64;
+typedef unsigned long long __u64;
+#endif
 
-#define KVM_RUN 0x00
-#define KVM_GET_REGS 0x01
-#define KVM_SET_REGS 0x02
+//from linux/kvm.h
+#define KVM_CREATE_VM             _IO(KVMIO,   0x01) /* returns a VM fd */
+#define KVM_CREATE_VCPU           _IO(KVMIO,   0x41)
+#define KVM_GET_VCPU_MMAP_SIZE    _IO(KVMIO,   0x04) /* in bytes */
 
+#define KVM_RUN                   _IO(KVMIO,   0x80)
+#define KVM_GET_REGS              _IOR(KVMIO,  0x81, struct kvm_regs)
+#define KVM_SET_REGS              _IOW(KVMIO,  0x82, struct kvm_regs)
+#define KVM_GET_SREGS             _IOR(KVMIO,  0x83, struct kvm_sregs)
+#define KVM_SET_SREGS             _IOW(KVMIO,  0x84, struct kvm_sregs)
+
+#define KVMIO 0xAE
+#define KVM_SET_USER_MEMORY_REGION _IOW(KVMIO, 0x46, \
+					struct kvm_userspace_memory_region)
+/* Architectural interrupt line count. */
+#define KVM_NR_INTERRUPTS 256
+struct kvm_hyperv_exit {
+#define KVM_EXIT_HYPERV_SYNIC          1
+#define KVM_EXIT_HYPERV_HCALL          2
+#define KVM_EXIT_HYPERV_SYNDBG         3
+	__u32 type;
+	__u32 pad1;
+	union {
+		struct {
+			__u32 msr;
+			__u32 pad2;
+			__u64 control;
+			__u64 evt_page;
+			__u64 msg_page;
+		} synic;
+		struct {
+			__u64 input;
+			__u64 result;
+			__u64 params[2];
+		} hcall;
+		struct {
+			__u32 msr;
+			__u32 pad2;
+			__u64 control;
+			__u64 status;
+			__u64 send_page;
+			__u64 recv_page;
+			__u64 pending_page;
+		} syndbg;
+	} u;
+};
+struct kvm_debug_exit_arch {
+	__u32 exception;
+	__u32 pad;
+	__u64 pc;
+	__u64 dr6;
+	__u64 dr7;
+};
+/* for KVM_SET_USER_MEMORY_REGION */
 struct kvm_userspace_memory_region {
-    uint32_t slot; // 要在哪个slot上注册内存区间
-    // flags有两个取值,KVM_MEM_LOG_DIRTY_PAGES和KVM_MEM_READONLY,用来指示kvm针对这段内存应该做的事情。
-    // KVM_MEM_LOG_DIRTY_PAGES用来开启内存脏页,KVM_MEM_READONLY用来开启内存只读。
-    uint32_t flags;
-    uint64_t guest_phys_addr; // 虚机内存区间起始物理地址
-    uint64_t memory_size;     // 虚机内存区间大小
-    uint64_t userspace_addr;  // 虚机内存区间对应的主机虚拟地址
+	__u32 slot;
+	__u32 flags;
+	__u64 guest_phys_addr;
+	__u64 memory_size; /* bytes */
+	__u64 userspace_addr; /* start of the userspace allocated memory */
 };
-
+struct kvm_xen_exit {
+#define KVM_EXIT_XEN_HCALL          1
+	__u32 type;
+	union {
+		struct {
+			__u32 longmode;
+			__u32 cpl;
+			__u64 input;
+			__u64 result;
+			__u64 params[6];
+		} hcall;
+	} u;
+};
+/* for KVM_GET_REGS and KVM_SET_REGS */
 struct kvm_regs {
 	/* out (KVM_GET_REGS) / in (KVM_SET_REGS) */
-	uint64_t rax, rbx, rcx, rdx;
-	uint64_t rsi, rdi, rsp, rbp;
-	uint64_t r8,  r9,  r10, r11;
-	uint64_t r12, r13, r14, r15;
-	uint64_t rip, rflags;
+	__u64 rax, rbx, rcx, rdx;
+	__u64 rsi, rdi, rsp, rbp;
+	__u64 r8,  r9,  r10, r11;
+	__u64 r12, r13, r14, r15;
+	__u64 rip, rflags;
+};
+struct my_kvm_segment {
+	__u64 base;
+	__u32 limit;
+	__u16 selector;
+	__u8  type;
+	__u8  present, dpl, db, s, l, g, avl;
+	__u8  unusable;
+	__u8  padding;
+};
+struct kvm_dtable {
+	__u64 base;
+	__u16 limit;
+	__u16 padding[3];
+};
+/* for KVM_GET_SREGS and KVM_SET_SREGS */
+struct kvm_sregs {
+	/* out (KVM_GET_SREGS) / in (KVM_SET_SREGS) */
+	struct my_kvm_segment cs, ds, es, fs, gs, ss;
+	struct my_kvm_segment tr, ldt;
+	struct kvm_dtable gdt, idt;
+	__u64 cr0, cr2, cr3, cr4, cr8;
+	__u64 efer;
+	__u64 apic_base;
+	__u64 interrupt_bitmap[(KVM_NR_INTERRUPTS + 63) / 64];
+};
+
+/* for KVM_GET/SET_VCPU_EVENTS */
+struct kvm_vcpu_events {
+	struct {
+		__u8 injected;
+		__u8 nr;
+		__u8 has_error_code;
+		__u8 pending;
+		__u32 error_code;
+	} exception;
+	struct {
+		__u8 injected;
+		__u8 nr;
+		__u8 soft;
+		__u8 shadow;
+	} interrupt;
+	struct {
+		__u8 injected;
+		__u8 pending;
+		__u8 masked;
+		__u8 pad;
+	} nmi;
+	__u32 sipi_vector;
+	__u32 flags;
+	struct {
+		__u8 smm;
+		__u8 pending;
+		__u8 smm_inside_nmi;
+		__u8 latched_init;
+	} smi;
+	__u8 reserved[27];
+	__u8 exception_has_payload;
+	__u64 exception_payload;
+};
+/* kvm_sync_regs struct included by kvm_run struct */
+struct kvm_sync_regs {
+	/* Members of this structure are potentially malicious.
+	 * Care must be taken by code reading, esp. interpreting,
+	 * data fields from them inside KVM to prevent TOCTOU and
+	 * double-fetch types of vulnerabilities.
+	 */
+	struct kvm_regs regs;
+	struct kvm_sregs sregs;
+	struct kvm_vcpu_events events;
+};
+
+/* for KVM_RUN, returned by mmap(vcpu_fd, offset=0) */
+struct kvm_run {
+	/* in */
+	__u8 request_interrupt_window;
+	__u8 immediate_exit;
+	__u8 padding1[6];
+
+	/* out */
+	__u32 exit_reason;
+	__u8 ready_for_interrupt_injection;
+	__u8 if_flag;
+	__u16 flags;
+
+	/* in (pre_kvm_run), out (post_kvm_run) */
+	__u64 cr8;
+	__u64 apic_base;
+
+#ifdef __KVM_S390
+	/* the processor status word for s390 */
+	__u64 psw_mask; /* psw upper half */
+	__u64 psw_addr; /* psw lower half */
+#endif
+	union {
+		/* KVM_EXIT_UNKNOWN */
+		struct {
+			__u64 hardware_exit_reason;
+		} hw;
+		/* KVM_EXIT_FAIL_ENTRY */
+		struct {
+			__u64 hardware_entry_failure_reason;
+			__u32 cpu;
+		} fail_entry;
+		/* KVM_EXIT_EXCEPTION */
+		struct {
+			__u32 exception;
+			__u32 error_code;
+		} ex;
+		/* KVM_EXIT_IO */
+		struct {
+#define KVM_EXIT_IO_IN  0
+#define KVM_EXIT_IO_OUT 1
+			__u8 direction;
+			__u8 size; /* bytes */
+			__u16 port;
+			__u32 count;
+			__u64 data_offset; /* relative to kvm_run start */
+		} io;
+		/* KVM_EXIT_DEBUG */
+		struct {
+			struct kvm_debug_exit_arch arch;
+		} debug;
+		/* KVM_EXIT_MMIO */
+		struct {
+			__u64 phys_addr;
+			__u8  data[8];
+			__u32 len;
+			__u8  is_write;
+		} mmio;
+		/* KVM_EXIT_HYPERCALL */
+		struct {
+			__u64 nr;
+			__u64 args[6];
+			__u64 ret;
+			__u32 longmode;
+			__u32 pad;
+		} hypercall;
+		/* KVM_EXIT_TPR_ACCESS */
+		struct {
+			__u64 rip;
+			__u32 is_write;
+			__u32 pad;
+		} tpr_access;
+		/* KVM_EXIT_S390_SIEIC */
+		struct {
+			__u8 icptcode;
+			__u16 ipa;
+			__u32 ipb;
+		} s390_sieic;
+		/* KVM_EXIT_S390_RESET */
+#define KVM_S390_RESET_POR       1
+#define KVM_S390_RESET_CLEAR     2
+#define KVM_S390_RESET_SUBSYSTEM 4
+#define KVM_S390_RESET_CPU_INIT  8
+#define KVM_S390_RESET_IPL       16
+		__u64 s390_reset_flags;
+		/* KVM_EXIT_S390_UCONTROL */
+		struct {
+			__u64 trans_exc_code;
+			__u32 pgm_code;
+		} s390_ucontrol;
+		/* KVM_EXIT_DCR (deprecated) */
+		struct {
+			__u32 dcrn;
+			__u32 data;
+			__u8  is_write;
+		} dcr;
+		/* KVM_EXIT_INTERNAL_ERROR */
+		struct {
+			__u32 suberror;
+			/* Available with KVM_CAP_INTERNAL_ERROR_DATA: */
+			__u32 ndata;
+			__u64 data[16];
+		} internal;
+		/*
+		 * KVM_INTERNAL_ERROR_EMULATION
+		 *
+		 * "struct emulation_failure" is an overlay of "struct internal"
+		 * that is used for the KVM_INTERNAL_ERROR_EMULATION sub-type of
+		 * KVM_EXIT_INTERNAL_ERROR.  Note, unlike other internal error
+		 * sub-types, this struct is ABI!  It also needs to be backwards
+		 * compatible with "struct internal".  Take special care that
+		 * "ndata" is correct, that new fields are enumerated in "flags",
+		 * and that each flag enumerates fields that are 64-bit aligned
+		 * and sized (so that ndata+internal.data[] is valid/accurate).
+		 */
+		struct {
+			__u32 suberror;
+			__u32 ndata;
+			__u64 flags;
+			__u8  insn_size;
+			__u8  insn_bytes[15];
+		} emulation_failure;
+		/* KVM_EXIT_OSI */
+		struct {
+			__u64 gprs[32];
+		} osi;
+		/* KVM_EXIT_PAPR_HCALL */
+		struct {
+			__u64 nr;
+			__u64 ret;
+			__u64 args[9];
+		} papr_hcall;
+		/* KVM_EXIT_S390_TSCH */
+		struct {
+			__u16 subchannel_id;
+			__u16 subchannel_nr;
+			__u32 io_int_parm;
+			__u32 io_int_word;
+			__u32 ipb;
+			__u8 dequeued;
+		} s390_tsch;
+		/* KVM_EXIT_EPR */
+		struct {
+			__u32 epr;
+		} epr;
+		/* KVM_EXIT_SYSTEM_EVENT */
+		struct {
+#define KVM_SYSTEM_EVENT_SHUTDOWN       1
+#define KVM_SYSTEM_EVENT_RESET          2
+#define KVM_SYSTEM_EVENT_CRASH          3
+			__u32 type;
+			__u64 flags;
+		} system_event;
+		/* KVM_EXIT_S390_STSI */
+		struct {
+			__u64 addr;
+			__u8 ar;
+			__u8 reserved;
+			__u8 fc;
+			__u8 sel1;
+			__u16 sel2;
+		} s390_stsi;
+		/* KVM_EXIT_IOAPIC_EOI */
+		struct {
+			__u8 vector;
+		} eoi;
+		/* KVM_EXIT_HYPERV */
+		struct kvm_hyperv_exit hyperv;
+		/* KVM_EXIT_ARM_NISV */
+		struct {
+			__u64 esr_iss;
+			__u64 fault_ipa;
+		} arm_nisv;
+		/* KVM_EXIT_X86_RDMSR / KVM_EXIT_X86_WRMSR */
+		struct {
+			__u8 error; /* user -> kernel */
+			__u8 pad[7];
+#define KVM_MSR_EXIT_REASON_INVAL	(1 << 0)
+#define KVM_MSR_EXIT_REASON_UNKNOWN	(1 << 1)
+#define KVM_MSR_EXIT_REASON_FILTER	(1 << 2)
+			__u32 reason; /* kernel -> user */
+			__u32 index; /* kernel -> user */
+			__u64 data; /* kernel <-> user */
+		} msr;
+		/* KVM_EXIT_XEN */
+		struct kvm_xen_exit xen;
+		/* Fix the size of the union. */
+		char padding[256];
+	};
+
+	/* 2048 is the size of the char array used to bound/pad the size
+	 * of the union that holds sync regs.
+	 */
+	#define SYNC_REGS_SIZE_BYTES 2048
+	/*
+	 * shared registers between kvm and userspace.
+	 * kvm_valid_regs specifies the register classes set by the host
+	 * kvm_dirty_regs specified the register classes dirtied by userspace
+	 * struct kvm_sync_regs is architecture specific, as well as the
+	 * bits for kvm_valid_regs and kvm_dirty_regs
+	 */
+	__u64 kvm_valid_regs;
+	__u64 kvm_dirty_regs;
+	union {
+		struct kvm_sync_regs regs;
+		char padding[SYNC_REGS_SIZE_BYTES];
+	} s;
 };
 
-int guest_code(){
-    while (1)
+
+int kvm(uint8_t code[], size_t code_len)
+{
+  // step 1, open /dev/kvm
+  int kvmfd = open("/dev/kvm", O_RDWR | O_CLOEXEC);
+  if (kvmfd == -1)
+  {
+    printf("failed to open /dev/kvm\n");
+    return 0;
+  }
+
+  // step 2, create VM
+  int vmfd = ioctl(kvmfd, KVM_CREATE_VM, 0);
+  printf("vmfd %d\n", vmfd);
+  // step 3, set up user memory region
+  size_t mem_size = 0x100000; // size of user memory you want to assign
+  void *mem = mmap(0, mem_size, PROT_READ | PROT_WRITE,
+                   MAP_SHARED | MAP_ANONYMOUS, -1, 0);
+
+  printf("map mem %p\n", mem);
+  int user_entry = 0x0;
+  memcpy((void *)((size_t)mem + user_entry), code, code_len);
+  struct kvm_userspace_memory_region region = {
+      .slot = 0,
+      .flags = 0,
+      .guest_phys_addr = 0,
+      .memory_size = mem_size,
+      .userspace_addr = (size_t)mem};
+  ioctl(vmfd, KVM_SET_USER_MEMORY_REGION, &region);
+  /* end of step 3 */
+
+  // step 4, create vCPU
+  int vcpufd = ioctl(vmfd, KVM_CREATE_VCPU, 0);
+  printf("create vcpu,fd: %p\n", vcpufd);
+  // step 5, set up memory for vCPU
+  size_t vcpu_mmap_size = ioctl(kvmfd, KVM_GET_VCPU_MMAP_SIZE, NULL);
+  struct kvm_run *run = (struct kvm_run *)mmap(0, vcpu_mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED, vcpufd, 0);
+
+  // step 6, set up vCPU's registers
+  /* standard registers include general-purpose registers and flags */
+  struct kvm_regs regs;
+  ioctl(vcpufd, KVM_GET_REGS, &regs);
+  regs.rip = user_entry;
+  regs.rsp = 0x200000; // stack address
+  regs.rflags = 0x2; // in x86 the 0x2 bit should always be set
+  ioctl(vcpufd, KVM_SET_REGS, &regs); // set registers
+
+  /* special registers include segment registers */
+  struct kvm_sregs sregs;
+  ioctl(vcpufd, KVM_GET_SREGS, &sregs);
+  sregs.cs.base = sregs.cs.selector = 0; // let base of code segment equal to zero
+  ioctl(vcpufd, KVM_SET_SREGS, &sregs);
+  ioctl(vcpufd, KVM_GET_SREGS, &sregs);
+  // step 7, execute vm and handle exit reason
+  #define KVM_EXIT_UNKNOWN          0
+#define KVM_EXIT_EXCEPTION        1
+#define KVM_EXIT_IO               2
+#define KVM_EXIT_HYPERCALL        3
+#define KVM_EXIT_DEBUG            4
+#define KVM_EXIT_HLT              5
+#define KVM_EXIT_MMIO             6
+#define KVM_EXIT_IRQ_WINDOW_OPEN  7
+#define KVM_EXIT_SHUTDOWN         8
+#define KVM_EXIT_FAIL_ENTRY       9
+#define KVM_EXIT_INTR             10
+#define KVM_EXIT_SET_TPR          11
+#define KVM_EXIT_TPR_ACCESS       12
+#define KVM_EXIT_S390_SIEIC       13
+#define KVM_EXIT_S390_RESET       14
+#define KVM_EXIT_DCR              15 /* deprecated */
+#define KVM_EXIT_NMI              16
+#define KVM_EXIT_INTERNAL_ERROR   17
+#define KVM_EXIT_OSI              18
+#define KVM_EXIT_PAPR_HCALL	  19
+#define KVM_EXIT_S390_UCONTROL	  20
+#define KVM_EXIT_WATCHDOG         21
+#define KVM_EXIT_S390_TSCH        22
+#define KVM_EXIT_EPR              23
+#define KVM_EXIT_SYSTEM_EVENT     24
+#define KVM_EXIT_S390_STSI        25
+#define KVM_EXIT_IOAPIC_EOI       26
+#define KVM_EXIT_HYPERV           27
+#define KVM_EXIT_ARM_NISV         28
+#define KVM_EXIT_X86_RDMSR        29
+#define KVM_EXIT_X86_WRMSR        30
+#define KVM_EXIT_DIRTY_RING_FULL  31
+#define KVM_EXIT_AP_RESET_HOLD    32
+#define KVM_EXIT_X86_BUS_LOCK     33
+#define KVM_EXIT_XEN              34
+  while (1)
+  {
+    ioctl(vcpufd, KVM_RUN, NULL);
+    ioctl(vcpufd, KVM_GET_SREGS, &sregs);
+    printf("Guest CR3: 0x%llx\n", sregs.cr3);
+    switch (run->exit_reason)
     {
-        // printf("guest code\n");
-        __asm__ __volatile__ (
-            "mov %rax, 0\n\t"
-            "mov %rcx, 0\n\t"
-            "cpuid\n\t"
-        );
+    case KVM_EXIT_HLT:
+      fputs("KVM_EXIT_HLT \n", stderr);
+      return 0;
+    case KVM_EXIT_IO:
+      /* TODO: check port and direction here */
+      putchar(*(((char *)run) + run->io.data_offset));
+      printf("KVM_EXIT_IO: run->io.port = %lx \n",
+             run->io.port);
+      break;
+    case KVM_EXIT_FAIL_ENTRY:
+      printf("KVM_EXIT_FAIL_ENTRY: hardware_entry_failure_reason = 0x%lx",
+             run->fail_entry.hardware_entry_failure_reason);
+      return 0;
+    case KVM_EXIT_INTERNAL_ERROR:
+      printf("KVM_EXIT_INTERNAL_ERROR: suberror = 0x%x",
+             run->internal.suberror);
+      return 0;
+    case KVM_EXIT_SHUTDOWN:
+      printf("KVM_EXIT_SHUTDOWN");
+      return 0;
+    default:
+      printf("Unhandled reason: %d", run->exit_reason);
+      return 0;
     }
-    return 0;
+  }
 }
 
-int main()
-{
-    printf("Test kvm running...\n");
-    printf("Open /dev/kvm\n");
-    int kvm_fd = open("/dev/kvm", O_RDWR|O_CLOEXEC);
-    int vmfd = ioctl(kvm_fd, 0x01, 0);
-    printf("vmfd=%d\n", vmfd);
-
-    /*
-         __asm__ __volatile__ (
-            "mov %rax, 0\n\t"
-            "mov %rcx, 0\n\t"
-            "cpuid\n\t"
-        ); 
-    */
-    const uint8_t code[] = {
-        0xba, 0xf8, 0x03, /* mov $0x3f8, %dx */
-        0x00, 0xd8,       /* add %bl, %al */
-        0x04, '0',        /* add $'0', %al */
-        0xee,             /* out %al, (%dx) */
-        0xb0, '\n',       /* mov $'\n', %al */
-        0xee,             /* out %al, (%dx) */
-        0xf4,             /* hlt */
-    };
-
-    size_t mem_size = 0x4000; // size of user memory you want to assign
-    printf("code=%p\n", code);
-    // void *mem = mmap(0, mem_size, 0x7, -1, 0);
-    // memcpy(mem, code, sizeof(code));
-    struct kvm_userspace_memory_region region = {
-        .slot = 0,
-        .flags = 0,
-        .guest_phys_addr = 0,
-        .memory_size = mem_size,
-        .userspace_addr = (size_t)code
-    };
-    ioctl(vmfd, KVM_SET_USER_MEMORY_REGION, &region);
-
-    int vcpufd = ioctl(vmfd, KVM_CREATE_VCPU, 0);
-    printf("vcpufd=%d\n", vcpufd);
-    int user_entry = 0x0;
-
-    struct kvm_regs regs = {0};
-    regs.rip = user_entry;
-    regs.rsp = 0x3000; // stack address
-    regs.rflags = 0x2; // in x86 the 0x2 bit should always be set
-    ioctl(vcpufd, KVM_SET_REGS, &regs); // set registers
-
-    ioctl(vcpufd, KVM_RUN, 0);
+  /* Explanation of the guest machine code:
+0xB0 0x61 (mov al, 0x61)
+Loads the immediate 0x61 (ASCII 'a') into the AL register.
 
-    return 0;
-}
+0xBA 0x17 0x02 (mov dx, 0x0217)
+Linux: ilen = 3, external interrupt and EPT_VIOLATION
+Loads the immediate 0x0217 into the DX register.
+
+0xEE (out dx, al)
+Writes the value in AL to the I/O port selected by DX.
 
+0xB0 0x0A (mov al, 0x0A)
+Loads the immediate 0x0A (newline) into the AL register.
 
+0xEE (out dx, al)
+Writes the value in AL to the I/O port selected by DX.
+
+0xF4 (hlt)
+Executes hlt, putting the processor to sleep until the next external interrupt arrives. */
+
+int main()
+{
+  //uint8_t code[] = "\xB0\x61\xBA\x17\x02\xEE\xB0\n\xEE\xF4";
+  //uint8_t code[] = "\xB0\x61\xBA\x17\x02\xEE\xF4";
+  uint8_t code[] = "\xB0\x61\xF4";
+  kvm(code, sizeof(code));
+  return 0;
+}