mod.rs 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463
  1. use core::{fmt::Debug, sync::atomic::AtomicU32};
  2. use alloc::{boxed::Box, vec::Vec};
  3. use bit_field::BitField;
  4. use bitmap::{traits::BitMapOps, AllocBitmap};
  5. use system_error::SystemError;
  6. use x86::{
  7. bits64::rflags::RFlags,
  8. controlregs::{Cr0, Cr4},
  9. dtables::DescriptorTablePointer,
  10. };
  11. use x86_64::registers::control::EferFlags;
  12. use crate::{
  13. smp::cpu::ProcessorId,
  14. virt::vm::{
  15. kvm_host::{
  16. vcpu::VirtCpu, Vm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID, KVM_USERSAPCE_IRQ_SOURCE_ID,
  17. },
  18. user_api::UapiKvmSegment,
  19. },
  20. };
  21. use crate::arch::VirtCpuArch;
  22. use super::{
  23. asm::{MsrData, VcpuSegment, VmxMsrEntry},
  24. vmx::{exit::ExitFastpathCompletion, vmx_info},
  25. x86_kvm_manager, x86_kvm_ops,
  26. };
  27. pub mod lapic;
  28. pub mod page;
  29. pub mod vcpu;
  30. #[allow(dead_code)]
  31. pub const TSS_IOPB_BASE_OFFSET: usize = 0x66;
  32. pub const TSS_BASE_SIZE: usize = 0x68;
  33. pub const TSS_IOPB_SIZE: usize = 65536 / 8;
  34. pub const TSS_REDIRECTION_SIZE: usize = 256 / 8;
  35. pub const RMODE_TSS_SIZE: usize = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE + TSS_IOPB_SIZE + 1;
  36. pub const KVM_PFN_NOSLOT: u64 = 0x1 << 63;
/// x86-specific, per-VM KVM state.
///
/// The initial value comes from `KvmFunc::vm_init`; `X86KvmArch::init` then
/// fills in the IRQ source bitmap, the default TSC frequency and the APICv
/// inhibit reasons.
#[allow(dead_code)]
#[derive(Debug, Default)]
pub struct X86KvmArch {
    /// Interrupt chip (irqchip) mode.
    pub irqchip_mode: KvmIrqChipMode,
    /// `vcpu_id` of the vCPU responsible for bootstrapping the VM (the BSP).
    bsp_vcpu_id: usize,
    pub pause_in_guest: bool,
    pub cstate_in_guest: bool,
    pub mwait_in_guest: bool,
    pub hlt_in_guest: bool,
    pub bus_lock_detection_enabled: bool,
    /// Bitmap of IRQ source ids; `init` sets the userspace and
    /// irqfd-resample source bits.
    irq_sources_bitmap: u64,
    /// Initialized in `init` from `x86_kvm_manager().max_tsc_khz`.
    default_tsc_khz: u64,
    /// Set to `true` in `init`.
    guest_can_read_msr_platform_info: bool,
    /// Bit set whose bit positions are the `KvmApicvInhibit` constants.
    apicv_inhibit_reasons: usize,
    /// Exclusive upper bound for vCPU ids. A value of 0 means "not yet
    /// configured"; `Vm::vcpu_precreate` then replaces it with 4096.
    pub max_vcpu_ids: usize,
    pub notify_vmexit_flags: NotifyVmExitFlags,
    pub notify_window: u32,
    /// Optional MSR filter consulted by `msr_allowed`; with `None` every MSR
    /// is allowed.
    /// NOTE(review): the name is a misspelling of `msr_filter`; it is left
    /// unchanged because other code refers to the field by this name.
    msr_fliter: Option<Box<KvmX86MsrFilter>>,
    pub noncoherent_dma_count: AtomicU32,
    pub active_mmu_pages: Vec<u64>,
    pub n_max_mmu_pages: usize,
    pub n_used_mmu_pages: usize,
}
  62. impl X86KvmArch {
  63. pub fn init(kvm_type: usize) -> Result<Self, SystemError> {
  64. if kvm_type != 0 {
  65. return Err(SystemError::EINVAL);
  66. }
  67. let mut arch = x86_kvm_ops().vm_init();
  68. // 设置中断源位图
  69. arch.irq_sources_bitmap
  70. .set_bit(KVM_USERSAPCE_IRQ_SOURCE_ID, true)
  71. .set_bit(KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID, true);
  72. arch.default_tsc_khz = x86_kvm_manager().max_tsc_khz;
  73. arch.guest_can_read_msr_platform_info = true;
  74. arch.apicv_init();
  75. Ok(arch)
  76. }
  77. fn apicv_init(&mut self) {
  78. self.apicv_inhibit_reasons
  79. .set_bit(KvmApicvInhibit::ABSENT, true);
  80. if !vmx_info().enable_apicv {
  81. self.apicv_inhibit_reasons
  82. .set_bit(KvmApicvInhibit::DISABLE, true);
  83. }
  84. }
  85. pub fn msr_allowed(&self, msr: u32, ftype: MsrFilterType) -> bool {
  86. // x2APIC MSRs
  87. if (0x800..=0x8ff).contains(&msr) {
  88. return true;
  89. }
  90. if let Some(msr_filter) = &self.msr_fliter {
  91. let mut allowed = msr_filter.default_allow;
  92. for i in 0..msr_filter.count as usize {
  93. let range = &msr_filter.ranges[i];
  94. let start = range.base;
  95. let end = start + range.nmsrs;
  96. let flags = range.flags;
  97. let bitmap = &range.bitmap;
  98. if msr >= start && msr < end && flags.contains(ftype) {
  99. allowed = bitmap.get((msr - start) as usize).unwrap_or(false);
  100. break;
  101. }
  102. }
  103. return allowed;
  104. } else {
  105. return true;
  106. }
  107. }
  108. }
  109. #[derive(Debug, Clone, Copy, PartialEq)]
  110. #[allow(dead_code)]
  111. pub enum KvmIrqChipMode {
  112. None,
  113. Kernel,
  114. Split,
  115. }
  116. impl Default for KvmIrqChipMode {
  117. fn default() -> Self {
  118. Self::None
  119. }
  120. }
/// Initialization-time entry points of a hardware virtualization backend.
#[allow(dead_code)]
pub trait KvmInitFunc {
    /// Performs the backend's one-time hardware setup.
    fn hardware_setup(&self) -> Result<(), SystemError>;
    /// NOTE(review): by its name, handles an Intel PT interrupt; the meaning
    /// of the returned `u32` is not visible here — confirm with implementors.
    fn handle_intel_pt_intr(&self) -> u32;
    /// Returns the backend's runtime operation table.
    fn runtime_funcs(&self) -> &'static dyn KvmFunc;
}
/// Runtime operations provided by a hardware virtualization backend.
///
/// The generic x86 KVM code reaches the active implementation through
/// `x86_kvm_ops()`.
pub trait KvmFunc: Send + Sync + Debug {
    /// Returns the name of this hardware backend, e.g. "Vmx".
    fn name(&self) -> &'static str;
    /// Enables hardware virtualization support.
    fn hardware_enable(&self) -> Result<(), SystemError>;
    /// Produces the initial `X86KvmArch` of a new VM (used by `X86KvmArch::init`).
    fn vm_init(&self) -> X86KvmArch;
    /// Backend hook invoked by `Vm::vcpu_precreate` once the vCPU id has
    /// passed the range check.
    fn vcpu_precreate(&self, vm: &mut Vm) -> Result<(), SystemError>;
    fn vcpu_create(&self, vcpu: &mut VirtCpu, vm: &Vm);
    fn vcpu_load(&self, vcpu: &mut VirtCpu, cpu: ProcessorId);
    fn load_mmu_pgd(&self, vcpu: &mut VirtCpu, vm: &Vm, root_hpa: u64, root_level: u32);
    fn cache_reg(&self, vcpu: &mut VirtCpuArch, reg: KvmReg);
    fn apicv_pre_state_restore(&self, vcpu: &mut VirtCpu);
    fn set_msr(&self, vcpu: &mut VirtCpu, msr: MsrData) -> Result<(), SystemError>;
    fn set_rflags(&self, vcpu: &mut VirtCpu, rflags: RFlags);
    fn get_rflags(&self, vcpu: &mut VirtCpu) -> RFlags;
    fn set_cr0(&self, vm: &Vm, vcpu: &mut VirtCpu, cr0: Cr0);
    // NOTE(review): "vaild" is a misspelling of "valid"; the name is part of
    // the trait interface and is therefore kept as is.
    fn is_vaild_cr0(&self, vcpu: &VirtCpu, cr0: Cr0) -> bool;
    fn set_cr4(&self, vcpu: &mut VirtCpu, cr4: Cr4);
    fn post_set_cr3(&self, vcpu: &VirtCpu, cr3: u64);
    // NOTE(review): same misspelling as `is_vaild_cr0`.
    fn is_vaild_cr4(&self, vcpu: &VirtCpu, cr4: Cr4) -> bool;
    fn set_efer(&self, vcpu: &mut VirtCpu, efer: EferFlags);
    fn set_segment(&self, vcpu: &mut VirtCpu, var: &mut UapiKvmSegment, seg: VcpuSegment);
    fn get_segment(
        &self,
        vcpu: &mut VirtCpu,
        var: UapiKvmSegment,
        seg: VcpuSegment,
    ) -> UapiKvmSegment;
    /// This function does not use the vCPU; it is taken only to make sure the
    /// caller holds the vCPU lock.
    fn get_idt(&self, _vcpu: &mut VirtCpu, dt: &mut DescriptorTablePointer<u8>);
    fn set_idt(&self, _vcpu: &mut VirtCpu, dt: &DescriptorTablePointer<u8>);
    fn get_gdt(&self, _vcpu: &mut VirtCpu, dt: &mut DescriptorTablePointer<u8>);
    fn set_gdt(&self, _vcpu: &mut VirtCpu, dt: &DescriptorTablePointer<u8>);
    fn update_exception_bitmap(&self, vcpu: &mut VirtCpu);
    fn vcpu_reset(&self, vcpu: &mut VirtCpu, vm: &Vm, init_event: bool);
    fn has_emulated_msr(&self, msr: u32) -> bool;
    fn get_msr_feature(&self, msr: &mut VmxMsrEntry) -> bool;
    fn prepare_switch_to_guest(&self, vcpu: &mut VirtCpu);
    fn flush_tlb_all(&self, vcpu: &mut VirtCpu);
    fn vcpu_run(&self, vcpu: &mut VirtCpu) -> ExitFastpathCompletion;
    fn handle_exit_irqoff(&self, vcpu: &mut VirtCpu);
    fn handle_exit(
        &self,
        vcpu: &mut VirtCpu,
        vm: &Vm,
        fastpath: ExitFastpathCompletion,
    ) -> Result<i32, SystemError>;
}
/// ## Bit positions of the APICv inhibit reasons
///
/// Each constant is a bit index into `X86KvmArch::apicv_inhibit_reasons`.
#[derive(Debug)]
pub struct KvmApicvInhibit;
#[allow(dead_code)]
impl KvmApicvInhibit {
    // Shared by Intel and AMD
    /// APIC acceleration is disabled by a module parameter, or the hardware
    /// does not support it.
    pub const DISABLE: usize = 0;
    /// A Hyper-V guest is using the AutoEOI feature, which disables APIC
    /// acceleration.
    pub const HYPERV: usize = 1;
    /// APIC acceleration is disabled because userspace has not yet enabled an
    /// in-kernel or split interrupt controller.
    pub const ABSENT: usize = 2;
    /// KVM_GUESTDBG_BLOCKIRQ (a debugging measure that blocks all interrupts
    /// on this vCPU) is enabled; acceleration is inhibited so AVIC/APICv
    /// cannot bypass it.
    pub const BLOCKIRQ: usize = 3;
    /// APIC acceleration is disabled when the 1:1 mapping between APIC IDs and
    /// vCPUs has been changed and KVM has not applied its x2APIC hotplug
    /// fix-up.
    pub const PHYSICAL_ID_ALIASED: usize = 4;
    /// APIC acceleration is disabled the first time a vCPU's APIC ID or APIC
    /// base is changed from its reset value.
    pub const APIC_ID_MODIFIED: usize = 5;
    /// APIC acceleration is disabled the first time a vCPU's APIC ID or APIC
    /// base is changed from its reset value.
    pub const APIC_BASE_MODIFIED: usize = 6;
    // AMD only
    /// AVIC is disabled while a vCPU runs a nested guest: unlike APICv, the
    /// vCPU's siblings cannot use the doorbell mechanism to signal interrupts
    /// through AVIC while it is running nested.
    pub const NESTED: usize = 7;
    /// On SVM, waiting for the IRQ window is implemented with a pending
    /// virtual interrupt, which cannot be injected while KVM waits for the
    /// IRQ window, so AVIC is disabled during that wait.
    pub const IRQWIN: usize = 8;
    /// The PIT (i8254) "re-inject" mode relies on EOI interception, which AVIC
    /// does not support for edge-triggered interrupts.
    pub const PIT_REINJ: usize = 9;
    /// AVIC is disabled because SEV does not support it.
    pub const SEV: usize = 10;
    /// AVIC is disabled when the 1:1 mapping between logical IDs and vCPUs has
    /// been changed for the vCPUs that have a valid LDR.
    pub const LOGICAL_ID_ALIASED: usize = 11;
}
/// MSR access filter consulted by `X86KvmArch::msr_allowed`.
#[derive(Debug)]
pub struct KvmX86MsrFilter {
    /// Number of entries of `ranges` that `msr_allowed` examines.
    count: u8,
    /// Verdict used when no range matches the MSR and access type.
    default_allow: bool,
    /// Filter ranges, checked in order; the first match decides.
    ranges: Vec<KernelMsrRange>,
}
/// One contiguous MSR range of a `KvmX86MsrFilter`.
#[derive(Debug)]
pub struct KernelMsrRange {
    /// Access kinds (read and/or write) this range applies to.
    pub flags: MsrFilterType,
    /// Number of MSRs covered, starting at `base`.
    pub nmsrs: u32,
    /// Index of the first MSR in the range.
    pub base: u32,
    /// One bit per MSR, indexed by `msr - base`; a set bit allows the access.
    pub bitmap: AllocBitmap,
}
/// C-layout (`#[repr(C)]`) description of an MSR filter range.
/// NOTE(review): presumably the user-space counterpart of `KernelMsrRange`,
/// with `bitmap` pointing at caller-owned memory — confirm against the ioctl
/// code that consumes it.
#[repr(C)]
#[allow(dead_code)]
pub struct PosixMsrFilterRange {
    pub flags: u32,
    pub nmsrs: u32,
    pub base: u32,
    // Raw pointer: nothing in this type tracks its validity or length.
    pub bitmap: *const u8,
}
bitflags! {
    /// Access kinds an MSR filter range can apply to; `msr_allowed` checks the
    /// requested kind against a range's flags.
    pub struct MsrFilterType: u8 {
        const KVM_MSR_FILTER_READ = 1 << 0;
        const KVM_MSR_FILTER_WRITE = 1 << 1;
    }
    /// Flags stored in `X86KvmArch::notify_vmexit_flags`; the default is the
    /// empty set.
    pub struct NotifyVmExitFlags: u8 {
        const KVM_X86_NOTIFY_VMEXIT_ENABLED = 1 << 0;
        const KVM_X86_NOTIFY_VMEXIT_USER = 1 << 1;
    }
}
  238. impl Default for NotifyVmExitFlags {
  239. fn default() -> Self {
  240. NotifyVmExitFlags::empty()
  241. }
  242. }
/// Identifiers of the guest registers the vCPU code can cache.
///
/// Discriminants `0..=16` name the general-purpose registers and RIP,
/// `NrVcpuRegs` (17) is their count, and the variants after it receive the
/// implicit consecutive discriminants starting at 18.
#[derive(Debug, Clone, Copy)]
pub enum KvmReg {
    VcpuRegsRax = 0,
    VcpuRegsRcx = 1,
    VcpuRegsRdx = 2,
    VcpuRegsRbx = 3,
    VcpuRegsRsp = 4,
    VcpuRegsRbp = 5,
    VcpuRegsRsi = 6,
    VcpuRegsRdi = 7,
    VcpuRegsR8 = 8,
    VcpuRegsR9 = 9,
    VcpuRegsR10 = 10,
    VcpuRegsR11 = 11,
    VcpuRegsR12 = 12,
    VcpuRegsR13 = 13,
    VcpuRegsR14 = 14,
    VcpuRegsR15 = 15,
    VcpuRegsRip = 16,
    /// Number of general-purpose registers plus RIP.
    NrVcpuRegs = 17,
    //VcpuExregPdptr = NrVcpuRegs,
    VcpuExregCr0,
    VcpuExregCr3,
    VcpuExregCr4,
    VcpuExregRflags,
    VcpuExregSegments,
    VcpuExregExitInfo1, //EXITINFO1 provides the linear address of the memory operand.
    VcpuExregExitInfo2, //EXITINFO2 provides the contents of the register operand.
}
bitflags! {
    /// Hidden per-vCPU state flags.
    /// NOTE(review): the SMM bits presumably track System Management Mode
    /// state — confirm against their users.
    pub struct HFlags: u8 {
        const HF_GUEST_MASK = 1 << 0; /* VCPU is in guest-mode */
        const HF_SMM_MASK = 1 << 1;
        const HF_SMM_INSIDE_NMI_MASK = 1 << 2;
    }
}
/// ### General-purpose registers of the virtual machine
///
/// The struct is `#[repr(C)]`, so the field order below is its memory layout.
/// NOTE(review): the order presumably matches the register block exchanged
/// with user space — confirm before reordering fields.
#[derive(Debug, Default, Clone, Copy)]
#[repr(C)]
pub struct KvmCommonRegs {
    rax: u64,
    rbx: u64,
    rcx: u64,
    rdx: u64,
    rsi: u64,
    rdi: u64,
    rsp: u64,
    rbp: u64,
    r8: u64,
    r9: u64,
    r10: u64,
    r11: u64,
    r12: u64,
    r13: u64,
    r14: u64,
    r15: u64,
    rip: u64,
    rflags: u64,
}
  302. impl Vm {
  303. pub fn vcpu_precreate(&mut self, id: usize) -> Result<(), SystemError> {
  304. if self.arch.max_vcpu_ids == 0 {
  305. self.arch.max_vcpu_ids = 1024 * 4;
  306. }
  307. if id >= self.arch.max_vcpu_ids {
  308. return Err(SystemError::EINVAL);
  309. }
  310. return x86_kvm_ops().vcpu_precreate(self);
  311. }
  312. }
bitflags! {
    /// Flags describing how an instruction emulation request is handled.
    /// NOTE(review): the meaning of each flag is not visible in this file;
    /// the names presumably mirror Linux KVM's `EMULTYPE_*` — confirm.
    pub struct EmulType: u32 {
        const NO_DECODE = 1 << 0;
        const TRAP_UD = 1 << 1;
        const SKIP = 1 << 2;
        const ALLOW_RETRY_PF = 1 << 3;
        const TRAP_UD_FORCED = 1 << 4;
        const VMWARE_GP = 1 << 5;
        const PF = 1 << 6;
        const COMPLETE_USER_EXIT = 1 << 7;
        const WRITE_PF_TO_SP = 1 << 8;
    }
}
#[allow(dead_code)]
#[derive(Default, Debug)]
/// Tracks and records various statistics of a vCPU.
///
/// Every counter is a plain `u64` and starts at 0 through `Default`.
pub struct KvmVcpuStat {
    //pub generic: KvmVcpuStatGeneric,
    pub pf_taken: u64,
    pub pf_fixed: u64,
    pub pf_emulate: u64,
    pub pf_spurious: u64,
    pub pf_fast: u64,
    pub pf_mmio_spte_created: u64,
    pub pf_guest: u64,
    pub tlb_flush: u64,
    pub invlpg: u64,
    pub exits: u64,
    pub io_exits: u64,
    pub mmio_exits: u64,
    pub signal_exits: u64,
    pub irq_window_exits: u64,
    pub nmi_window_exits: u64,
    pub l1d_flush: u64,
    pub halt_exits: u64,
    pub request_irq_exits: u64,
    pub irq_exits: u64,
    pub host_state_reload: u64,
    pub fpu_reload: u64,
    pub insn_emulation: u64,
    pub insn_emulation_fail: u64,
    pub hypercalls: u64,
    pub irq_injections: u64,
    pub nmi_injections: u64,
    pub req_event: u64,
    pub nested_run: u64,
    pub directed_yield_attempted: u64,
    pub directed_yield_successful: u64,
    pub preemption_reported: u64,
    pub preemption_other: u64,
    pub guest_mode: u64,
    pub notify_window_exits: u64,
}
  366. #[inline]
  367. /// 将 GFN 转换为 GPA
  368. pub fn gfn_to_gpa(gfn: u64) -> u64 {
  369. gfn << 12
  370. }
  371. #[allow(dead_code)]
  372. #[inline]
  373. /// 将 GPA 转换为 GFN
  374. pub fn gpa_to_gfn(gfn: u64) -> u64 {
  375. gfn >> 12
  376. }