
Rewrite the scheduler module (#679)

## PR: Rewrite the scheduler module
--- 
### Completed
- Implemented the CFS scheduling policy (a sketch of the new entry points follows this list)
- Built the framework so that follow-up features can be developed iteratively
- It currently runs; performance has not been tested

### Parts that need follow-up work
- Implement group scheduling (task_group)
- Implement cross-core load balancing (PELT algorithm)
- Hook into sysfs for dynamic parameter tuning (sched_stat, etc.)
- Setting and tuning of parameters such as nice values and priority
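For reviewers, a minimal sketch of the two scheduler entry points that the hunks below migrate call sites to. The real definitions land in kernel/src/sched/mod.rs; the wrapper functions here are illustrative labels only:

```rust
use crate::sched::{schedule, SchedMode, __schedule};

/// Illustrative wrapper: voluntary switch from process context. Call sites
/// that previously invoked `sched()` (mutex, futex, wait queues, pipes, ...)
/// now mark themselves sleeping and then call this.
fn yield_voluntarily() {
    schedule(SchedMode::SM_NONE);
}

/// Illustrative wrapper: preemptive switch on an interrupt return path,
/// taken when ProcessFlags::NEED_SCHEDULE is set (timer IRQ, kick IPI).
fn preempt_current() {
    __schedule(SchedMode::SM_PREEMPT);
}
```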
GnoCiYeH 1 year ago
Parent
Commit
f0c87a897f
44 changed files with 3743 additions and 1076 deletions
  1. kernel/src/arch/x86_64/driver/apic/apic_timer.rs (+8 -5)
  2. kernel/src/arch/x86_64/driver/apic/x2apic.rs (+5 -0)
  3. kernel/src/arch/x86_64/interrupt/handle.rs (+3 -5)
  4. kernel/src/arch/x86_64/interrupt/ipi.rs (+2 -2)
  5. kernel/src/arch/x86_64/ipc/signal.rs (+2 -2)
  6. kernel/src/arch/x86_64/process/idle.rs (+11 -1)
  7. kernel/src/arch/x86_64/process/mod.rs (+35 -0)
  8. kernel/src/arch/x86_64/sched.rs (+11 -12)
  9. kernel/src/driver/tty/kthread.rs (+8 -11)
  10. kernel/src/exception/ipi.rs (+10 -2)
  11. kernel/src/exception/softirq.rs (+7 -1)
  12. kernel/src/filesystem/procfs/mod.rs (+3 -7)
  13. kernel/src/include/bindings/wrapper.h (+0 -1)
  14. kernel/src/init/init.rs (+3 -2)
  15. kernel/src/ipc/pipe.rs (+4 -3)
  16. kernel/src/libs/futex/futex.rs (+3 -2)
  17. kernel/src/libs/mutex.rs (+3 -2)
  18. kernel/src/libs/rbtree.rs (+75 -58)
  19. kernel/src/libs/spinlock.rs (+4 -0)
  20. kernel/src/libs/wait_queue.rs (+11 -10)
  21. kernel/src/net/event_poll/mod.rs (+2 -2)
  22. kernel/src/net/socket/mod.rs (+7 -4)
  23. kernel/src/process/exit.rs (+2 -2)
  24. kernel/src/process/fork.rs (+13 -0)
  25. kernel/src/process/idle.rs (+16 -0)
  26. kernel/src/process/kthread.rs (+8 -4)
  27. kernel/src/process/mod.rs (+185 -96)
  28. kernel/src/sched/cfs.rs (+0 -283)
  29. kernel/src/sched/clock.rs (+38 -0)
  30. kernel/src/sched/core.rs (+0 -225)
  31. kernel/src/sched/cputime.rs (+107 -0)
  32. kernel/src/sched/fair.rs (+1801 -0)
  33. kernel/src/sched/idle.rs (+67 -0)
  34. kernel/src/sched/mod.rs (+969 -25)
  35. kernel/src/sched/pelt.rs (+260 -0)
  36. kernel/src/sched/prio.rs (+33 -0)
  37. kernel/src/sched/rt.rs (+0 -235)
  38. kernel/src/sched/sched.h (+0 -23)
  39. kernel/src/sched/syscall.rs (+0 -39)
  40. kernel/src/syscall/mod.rs (+16 -5)
  41. kernel/src/time/clocksource.rs (+3 -2)
  42. kernel/src/time/jiffies.rs (+1 -0)
  43. kernel/src/time/sleep.rs (+3 -2)
  44. kernel/src/time/timer.rs (+4 -3)

+ 8 - 5
kernel/src/arch/x86_64/driver/apic/apic_timer.rs

@@ -1,4 +1,5 @@
 use core::cell::RefCell;
+use core::sync::atomic::{fence, Ordering};
 
 use crate::arch::driver::tsc::TSCManager;
 use crate::arch::interrupt::TrapFrame;
@@ -12,7 +13,7 @@ use crate::exception::IrqNumber;
 
 use crate::kdebug;
 use crate::mm::percpu::PerCpu;
-use crate::sched::core::sched_update_jiffies;
+use crate::process::ProcessManager;
 use crate::smp::core::smp_get_processor_id;
 use crate::smp::cpu::ProcessorId;
 use crate::time::clocksource::HZ;
@@ -66,9 +67,10 @@ impl IrqHandler for LocalApicTimerHandler {
 struct LocalApicTimerIrqFlowHandler;
 
 impl IrqFlowHandler for LocalApicTimerIrqFlowHandler {
-    fn handle(&self, _irq_desc: &Arc<IrqDesc>, _trap_frame: &mut TrapFrame) {
-        LocalApicTimer::handle_irq().ok();
+    fn handle(&self, _irq_desc: &Arc<IrqDesc>, trap_frame: &mut TrapFrame) {
+        LocalApicTimer::handle_irq(trap_frame).ok();
         CurrentApic.send_eoi();
+        fence(Ordering::SeqCst)
     }
 }
 
@@ -274,8 +276,9 @@ impl LocalApicTimer {
         return (res.ecx & (1 << 24)) != 0;
     }
 
-    pub(super) fn handle_irq() -> Result<IrqReturn, SystemError> {
-        sched_update_jiffies();
+    pub(super) fn handle_irq(trap_frame: &TrapFrame) -> Result<IrqReturn, SystemError> {
+        // sched_update_jiffies();
+        ProcessManager::update_process_times(trap_frame.is_from_user());
         return Ok(IrqReturn::Handled);
     }
 }
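The hunk above reroutes the timer tick from the old `sched_update_jiffies()` into per-task time accounting. The body of `update_process_times` is not shown in this file; a self-contained toy of the assumed user/kernel split (not the PR's actual code):

```rust
// Toy model of tick accounting: the interrupted context decides which
// bucket the tick is billed to, mirroring `trap_frame.is_from_user()`.
#[derive(Default, Debug)]
struct CpuTime {
    user_ticks: u64,
    system_ticks: u64,
}

fn update_process_times(t: &mut CpuTime, user_tick: bool) {
    if user_tick {
        t.user_ticks += 1;
    } else {
        t.system_ticks += 1;
    }
    // The real kernel then runs the scheduler tick, which updates the
    // current entity's vruntime and may set ProcessFlags::NEED_SCHEDULE.
}

fn main() {
    let mut t = CpuTime::default();
    update_process_times(&mut t, true);
    update_process_times(&mut t, false);
    println!("{:?}", t); // CpuTime { user_ticks: 1, system_ticks: 1 }
}
```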

+ 5 - 0
kernel/src/arch/x86_64/driver/apic/x2apic.rs

@@ -1,3 +1,5 @@
+use core::sync::atomic::{fence, Ordering};
+
 use x86::msr::{
     rdmsr, wrmsr, IA32_APIC_BASE, IA32_X2APIC_APICID, IA32_X2APIC_EOI, IA32_X2APIC_SIVR,
     IA32_X2APIC_VERSION,
@@ -62,9 +64,12 @@ impl LocalAPIC for X2Apic {
 
     /// Send EOI (End Of Interrupt)
     fn send_eoi(&self) {
+        fence(Ordering::SeqCst);
         unsafe {
             wrmsr(IA32_X2APIC_EOI, 0);
         }
+
+        fence(Ordering::SeqCst);
     }
 
     /// Get the x2APIC version

+ 3 - 5
kernel/src/arch/x86_64/interrupt/handle.rs

@@ -1,15 +1,13 @@
 use core::intrinsics::likely;
 
 use crate::{
-    arch::{
-        driver::apic::{apic_timer::APIC_TIMER_IRQ_NUM, CurrentApic, LocalAPIC},
-        sched::sched,
-    },
+    arch::driver::apic::{apic_timer::APIC_TIMER_IRQ_NUM, CurrentApic, LocalAPIC},
     exception::{irqdesc::irq_desc_manager, softirq::do_softirq, IrqNumber},
     process::{
         utils::{current_pcb_flags, current_pcb_preempt_count},
         ProcessFlags,
     },
+    sched::{SchedMode, __schedule},
 };
 
 use super::TrapFrame;
@@ -47,6 +45,6 @@ unsafe extern "C" fn x86_64_do_irq(trap_frame: &mut TrapFrame, vector: u32) {
     if (current_pcb_flags().contains(ProcessFlags::NEED_SCHEDULE))
         && vector == APIC_TIMER_IRQ_NUM.data()
     {
-        sched();
+        __schedule(SchedMode::SM_PREEMPT);
     }
 }

+ 2 - 2
kernel/src/arch/x86_64/interrupt/ipi.rs

@@ -254,12 +254,12 @@ impl IrqFlowHandler for X86_64IpiIrqFlowHandler {
             }
             IPI_NUM_FLUSH_TLB => {
                 FlushTLBIpiHandler.handle(irq, None, None).ok();
+                CurrentApic.send_eoi();
             }
             _ => {
                 kerror!("Unknown IPI: {}", irq.data());
+                CurrentApic.send_eoi();
             }
         }
-
-        CurrentApic.send_eoi();
     }
 }

+ 2 - 2
kernel/src/arch/x86_64/ipc/signal.rs

@@ -7,7 +7,6 @@ use crate::{
         fpu::FpState,
         interrupt::TrapFrame,
         process::table::{USER_CS, USER_DS},
-        sched::sched,
         CurrentIrqArch, MMArch,
     },
     exception::InterruptArch,
@@ -18,6 +17,7 @@ use crate::{
     kerror,
     mm::MemoryManagementArch,
     process::ProcessManager,
+    sched::{schedule, SchedMode},
     syscall::{user_access::UserBufferWriter, Syscall},
 };
 
@@ -715,7 +715,7 @@ fn sig_stop(sig: Signal) {
         );
     });
     drop(guard);
-    sched();
+    schedule(SchedMode::SM_NONE);
     // TODO: suspend the process
 }
 /// Default signal handler: continue the process

+ 11 - 1
kernel/src/arch/x86_64/process/idle.rs

@@ -1,11 +1,21 @@
 use core::hint::spin_loop;
 
-use crate::{arch::CurrentIrqArch, exception::InterruptArch, kBUG, process::ProcessManager};
+use crate::{
+    arch::CurrentIrqArch,
+    exception::InterruptArch,
+    kBUG,
+    process::{ProcessFlags, ProcessManager},
+    sched::{SchedMode, __schedule},
+};
 
 impl ProcessManager {
     /// The per-core idle process
     pub fn arch_idle_func() -> ! {
         loop {
+            let pcb = ProcessManager::current_pcb();
+            if pcb.flags().contains(ProcessFlags::NEED_SCHEDULE) {
+                __schedule(SchedMode::SM_NONE);
+            }
             if CurrentIrqArch::is_irq_enabled() {
                 unsafe {
                     x86::halt();

+ 35 - 0
kernel/src/arch/x86_64/process/mod.rs

@@ -563,6 +563,8 @@ pub unsafe fn arch_switch_to_user(path: String, argv: Vec<String>, envp: Vec<Str
     current_pcb.flags().remove(ProcessFlags::KTHREAD);
     current_pcb.worker_private().take();
 
+    *current_pcb.sched_info().sched_policy.write_irqsave() = crate::sched::SchedPolicy::CFS;
+
     let mut trap_frame = TrapFrame::new();
 
     compiler_fence(Ordering::SeqCst);
@@ -591,6 +593,7 @@ unsafe extern "sysv64" fn ready_to_switch_to_user(
     new_rip: usize,
 ) -> ! {
     *(trapframe_vaddr as *mut TrapFrame) = trap_frame;
+    compiler_fence(Ordering::SeqCst);
     asm!(
         "swapgs",
         "mov rsp, {trapframe_vaddr}",
@@ -601,3 +604,35 @@ unsafe extern "sysv64" fn ready_to_switch_to_user(
     );
     unreachable!()
 }
+
+// bitflags! {
+//     pub struct ProcessThreadFlags: u32 {
+//     /*
+//     * thread information flags
+//     * - these are process state flags that various assembly files
+//     *   may need to access
+//     */
+//     const TIF_NOTIFY_RESUME	= 1 << 1;	/* callback before returning to user */
+//     const TIF_SIGPENDING	=	1 << 2;	/* signal pending */
+//     const TIF_NEED_RESCHED	= 1 << 3;	/* rescheduling necessary */
+//     const TIF_SINGLESTEP	=	1 << 4;	/* reenable singlestep on user return*/
+//     const TIF_SSBD		= 1 << 5;	/* Speculative store bypass disable */
+//     const TIF_SPEC_IB		= 1 << 9;	/* Indirect branch speculation mitigation */
+//     const TIF_SPEC_L1D_FLUSH	= 1 << 10;	/* Flush L1D on mm switches (processes) */
+//     const TIF_USER_RETURN_NOTIFY	= 1 << 11;	/* notify kernel of userspace return */
+//     const TIF_UPROBE		= 1 << 12;	/* breakpointed or singlestepping */
+//     const TIF_PATCH_PENDING	= 1 << 13;	/* pending live patching update */
+//     const TIF_NEED_FPU_LOAD	= 1 << 14;	/* load FPU on return to userspace */
+//     const TIF_NOCPUID		= 1 << 15;	/* CPUID is not accessible in userland */
+//     const TIF_NOTSC		= 1 << 16;	/* TSC is not accessible in userland */
+//     const TIF_NOTIFY_SIGNAL	= 1 << 17;	/* signal notifications exist */
+//     const TIF_MEMDIE		= 1 << 20;	/* is terminating due to OOM killer */
+//     const TIF_POLLING_NRFLAG	= 1 << 21;	/* idle is polling for TIF_NEED_RESCHED */
+//     const TIF_IO_BITMAP		= 1 << 22;	/* uses I/O bitmap */
+//     const TIF_SPEC_FORCE_UPDATE	= 1 << 23;	/* Force speculation MSR update in context switch */
+//     const TIF_FORCED_TF		= 1 << 24;	/* true if TF in eflags artificially */
+//     const TIF_BLOCKSTEP		= 1 << 25;	/* set when we want DEBUGCTLMSR_BTF */
+//     const TIF_LAZY_MMU_UPDATES	= 1 << 27;	/* task is updating the mmu lazily */
+//     const TIF_ADDR32		= 1 << 29;	/* 32-bit address space on 64 bits */
+//     }
+// }

+ 11 - 12
kernel/src/arch/x86_64/sched.rs

@@ -1,20 +1,19 @@
 use core::hint::spin_loop;
 
-use crate::{
-    exception::InterruptArch, include::bindings::bindings::enter_syscall_int, sched::SchedArch,
-    smp::core::smp_get_processor_id, syscall::SYS_SCHED,
-};
+use crate::{exception::InterruptArch, sched::SchedArch, smp::core::smp_get_processor_id};
 
 use super::{driver::apic::apic_timer::apic_timer_init, CurrentIrqArch};
 
-/// @brief If kernel code is not running in interrupt context, this function can be used to issue a sys_sched system call and then run the scheduler.
-/// Since a process switch can only happen in interrupt context, a SYS_SCHED system call has to be issued.
-#[no_mangle]
-pub extern "C" fn sched() {
-    unsafe {
-        enter_syscall_int(SYS_SCHED as u64, 0, 0, 0, 0, 0, 0);
-    }
-}
+// /// @brief If kernel code is not running in interrupt context, this function can be used to issue a sys_sched system call and then run the scheduler.
+// /// Since a process switch can only happen in interrupt context, a SYS_SCHED system call has to be issued.
+// #[no_mangle]
+// pub extern "C" fn sched() {
+//     let _guard = unsafe { CurrentIrqArch::save_and_disable_irq() };
+//     __schedule(SchedMode::SM_NONE);
+//     // unsafe {
+//     //     enter_syscall_int(SYS_SCHED as u64, 0, 0, 0, 0, 0, 0);
+//     // }
+// }
 
 static mut BSP_INIT_OK: bool = false;
 

+ 8 - 11
kernel/src/driver/tty/kthread.rs

@@ -4,12 +4,14 @@ use alloc::{string::ToString, sync::Arc};
 use kdepends::thingbuf::StaticThingBuf;
 
 use crate::{
-    arch::sched::sched,
+    arch::CurrentIrqArch,
     driver::tty::virtual_terminal::virtual_console::CURRENT_VCNUM,
+    exception::InterruptArch,
     process::{
         kthread::{KernelThreadClosure, KernelThreadMechanism},
-        ProcessControlBlock, ProcessFlags,
+        ProcessControlBlock, ProcessManager,
     },
+    sched::{schedule, SchedMode},
 };
 
 use super::tty_port::current_tty_port;
@@ -35,15 +37,9 @@ fn tty_refresh_thread() -> i32 {
     loop {
         if KEYBUF.is_empty() {
             // If the buffer is empty, go to sleep
-            unsafe {
-                TTY_REFRESH_THREAD
-                    .as_ref()
-                    .unwrap()
-                    .flags()
-                    .insert(ProcessFlags::NEED_SCHEDULE)
-            };
-
-            sched();
+            let _guard = unsafe { CurrentIrqArch::save_and_disable_irq() };
+            ProcessManager::mark_sleep(true).expect("TTY_REFRESH_THREAD can not mark sleep");
+            schedule(SchedMode::SM_NONE);
         }
 
         let to_dequeue = core::cmp::min(KEYBUF.len(), TO_DEQUEUE_MAX);
@@ -69,4 +65,5 @@ pub fn send_to_tty_refresh_thread(data: &[u8]) {
     for item in data {
         KEYBUF.push(*item).ok();
     }
+    let _ = ProcessManager::wakeup(unsafe { TTY_REFRESH_THREAD.as_ref().unwrap() });
 }
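The refresh thread now uses the generic mark_sleep/wakeup handshake instead of setting NEED_SCHEDULE on itself by hand, and the producer explicitly wakes it after publishing data. The ordering that makes this work, restated from the hunk (the function names here are just labels for the two sides):

```rust
// Sleep side (tty_refresh_thread): the state change and the context switch
// happen with IRQs disabled, so a wakeup cannot slip in between them on
// this cpu; schedule() returns once ProcessManager::wakeup() runs.
fn wait_for_input() {
    let _guard = unsafe { CurrentIrqArch::save_and_disable_irq() };
    ProcessManager::mark_sleep(true).expect("can not mark sleep");
    schedule(SchedMode::SM_NONE);
}

// Wake side (send_to_tty_refresh_thread): push the bytes into KEYBUF first,
// then wake the thread, so the consumer always observes the new data.
fn wake_consumer() {
    let _ = ProcessManager::wakeup(unsafe { TTY_REFRESH_THREAD.as_ref().unwrap() });
}
```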

+ 10 - 2
kernel/src/exception/ipi.rs

@@ -1,9 +1,13 @@
 use alloc::sync::Arc;
 use system_error::SystemError;
 
+#[cfg(target_arch = "x86_64")]
+use crate::arch::driver::apic::{CurrentApic, LocalAPIC};
+
 use crate::{
-    arch::{sched::sched, MMArch},
+    arch::MMArch,
     mm::MemoryManagementArch,
+    sched::{SchedMode, __schedule},
     smp::cpu::ProcessorId,
 };
 
@@ -47,7 +51,11 @@ impl IrqHandler for KickCpuIpiHandler {
         _static_data: Option<&dyn IrqHandlerData>,
         _dynamic_data: Option<Arc<dyn IrqHandlerData>>,
     ) -> Result<IrqReturn, SystemError> {
-        sched();
+        #[cfg(target_arch = "x86_64")]
+        CurrentApic.send_eoi();
+
+        // Being kicked by another cpu should be a preemptive reschedule
+        __schedule(SchedMode::SM_PREEMPT);
         Ok(IrqReturn::Handled)
     }
 }

+ 7 - 1
kernel/src/exception/softirq.rs

@@ -3,7 +3,7 @@ use core::{
     intrinsics::unlikely,
     mem::{self, MaybeUninit},
     ptr::null_mut,
-    sync::atomic::{compiler_fence, AtomicI16, Ordering},
+    sync::atomic::{compiler_fence, fence, AtomicI16, Ordering},
 };
 
 use alloc::{boxed::Box, sync::Arc, vec::Vec};
@@ -17,6 +17,7 @@ use crate::{
     libs::rwlock::RwLock,
     mm::percpu::{PerCpu, PerCpuVar},
     process::ProcessManager,
+    sched::cputime::IrqTime,
     smp::{core::smp_get_processor_id, cpu::ProcessorId},
     time::timer::clock,
 };
@@ -286,6 +287,11 @@ impl<'a> Drop for RunningCountGuard<'a> {
     }
 }
 
+#[inline(never)]
 pub fn do_softirq() {
+    fence(Ordering::SeqCst);
+    IrqTime::irqtime_start();
     softirq_vectors().do_softirq();
+    IrqTime::irqtime_account_irq(ProcessManager::current_pcb());
+    fence(Ordering::SeqCst);
 }
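`do_softirq` is now bracketed by IRQ-time accounting from the new kernel/src/sched/cputime.rs. Only the two method names come from this hunk; a guessed sketch of the mechanism behind them (fields, imports, and the clock source are assumptions):

```rust
use alloc::sync::Arc;
use crate::process::ProcessControlBlock;

// Assumed shape of per-cpu IRQ time accounting; illustrative only.
pub struct IrqTime {
    /// Timestamp recorded when softirq processing starts.
    irq_start_time: u64,
    /// Accumulated softirq time on this cpu, in scheduler-clock units.
    total: u64,
}

impl IrqTime {
    pub fn irqtime_start() {
        // Presumably: per-cpu irq_start_time = scheduler clock now.
    }

    pub fn irqtime_account_irq(_pcb: Arc<ProcessControlBlock>) {
        // Presumably: total += now - irq_start_time, so softirq time is
        // billed separately rather than to the interrupted task's vruntime.
    }
}
```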

+ 3 - 7
kernel/src/filesystem/procfs/mod.rs

@@ -168,8 +168,8 @@ impl ProcFSInode {
             .map(|cpu| cpu.data() as i32)
             .unwrap_or(-1);
 
-        let priority = sched_info_guard.priority();
-        let vrtime = sched_info_guard.virtual_runtime();
+        let priority = sched_info_guard.policy();
+        let vrtime = sched_info_guard.sched_entity.vruntime;
 
         pdata.append(&mut format!("\nState:\t{:?}", state).as_bytes().to_owned());
         pdata.append(
@@ -183,11 +183,7 @@ impl ProcFSInode {
                 .to_owned(),
         );
         pdata.append(&mut format!("\ncpu_id:\t{}", cpu_id).as_bytes().to_owned());
-        pdata.append(
-            &mut format!("\npriority:\t{}", priority.data())
-                .as_bytes()
-                .to_owned(),
-        );
+        pdata.append(&mut format!("\npriority:\t{:?}", priority).as_bytes().to_owned());
         pdata.append(
             &mut format!("\npreempt:\t{}", pcb.preempt_count())
                 .as_bytes()

+ 0 - 1
kernel/src/include/bindings/wrapper.h

@@ -24,6 +24,5 @@
 #include <mm/mmio.h>
 #include <mm/slab.h>
 #include <process/process.h>
-#include <sched/sched.h>
 #include <time/sleep.h>
 #include <common/errno.h>

+ 3 - 2
kernel/src/init/init.rs

@@ -17,7 +17,7 @@ use crate::{
     },
     mm::init::mm_init,
     process::{kthread::kthread_init, process_init, ProcessManager},
-    sched::{core::sched_init, SchedArch},
+    sched::SchedArch,
     smp::{early_smp_init, SMPArch},
     syscall::Syscall,
     time::{
@@ -59,13 +59,14 @@ fn do_start_kernel() {
     unsafe {
         acpi_init()
     };
+    crate::sched::sched_init();
     process_init();
     early_smp_init().expect("early smp init failed");
     irq_init().expect("irq init failed");
     setup_arch().expect("setup_arch failed");
     CurrentSMPArch::prepare_cpus().expect("prepare_cpus failed");
 
-    sched_init();
+    // sched_init();
     softirq_init().expect("softirq init failed");
     Syscall::init().expect("syscall init failed");
     timekeeping_init();

+ 4 - 3
kernel/src/ipc/pipe.rs

@@ -1,5 +1,5 @@
 use crate::{
-    arch::{sched::sched, CurrentIrqArch},
+    arch::CurrentIrqArch,
     exception::InterruptArch,
     filesystem::vfs::{
         core::generate_inode_id, file::FileMode, syscall::ModeType, FilePrivateData, FileSystem,
@@ -11,6 +11,7 @@ use crate::{
     },
     net::event_poll::{EPollEventType, EPollItem, EventPoll},
     process::ProcessState,
+    sched::{schedule, SchedMode},
     time::TimeSpec,
 };
 
@@ -197,7 +198,7 @@ impl IndexNode for LockedPipeInode {
                 self.read_wait_queue.sleep_without_schedule();
                 drop(irq_guard);
             }
-            sched();
+            schedule(SchedMode::SM_NONE);
             inode = self.inner.lock();
         }
 
@@ -354,7 +355,7 @@ impl IndexNode for LockedPipeInode {
                 self.write_wait_queue.sleep_without_schedule();
                 drop(irq_guard);
             }
-            sched();
+            schedule(SchedMode::SM_NONE);
             inode = self.inner.lock();
         }
 

+ 3 - 2
kernel/src/libs/futex/futex.rs

@@ -8,11 +8,12 @@ use hashbrown::HashMap;
 use system_error::SystemError;
 
 use crate::{
-    arch::{sched::sched, CurrentIrqArch, MMArch},
+    arch::{CurrentIrqArch, MMArch},
     exception::InterruptArch,
     libs::spinlock::{SpinLock, SpinLockGuard},
     mm::{ucontext::AddressSpace, MemoryManagementArch, VirtAddr},
     process::{ProcessControlBlock, ProcessManager},
+    sched::{schedule, SchedMode},
     syscall::user_access::UserBufferReader,
     time::{
         timer::{next_n_us_timer_jiffies, Timer, WakeUpHelper},
@@ -287,7 +288,7 @@ impl Futex {
         })?;
         drop(futex_map_guard);
         drop(irq_guard);
-        sched();
+        schedule(SchedMode::SM_NONE);
 
         // Checks after being woken up
         let mut futex_map_guard = FutexData::futex_map();

+ 3 - 2
kernel/src/libs/mutex.rs

@@ -7,10 +7,11 @@ use alloc::{collections::LinkedList, sync::Arc};
 use system_error::SystemError;
 
 use crate::{
-    arch::{sched::sched, CurrentIrqArch},
+    arch::CurrentIrqArch,
     exception::InterruptArch,
     libs::spinlock::SpinLockGuard,
     process::{Pid, ProcessControlBlock, ProcessManager},
+    sched::{schedule, SchedMode},
 };
 
 use super::spinlock::SpinLock;
@@ -106,7 +107,7 @@ impl<T> Mutex<T> {
         let irq_guard = unsafe { CurrentIrqArch::save_and_disable_irq() };
         ProcessManager::mark_sleep(true).ok();
         drop(irq_guard);
-        sched();
+        schedule(SchedMode::SM_NONE);
     }
 
     /// @brief Release the lock.

+ 75 - 58
kernel/src/libs/rbtree.rs

@@ -32,7 +32,7 @@ enum Color {
 }
 
 /*****************RBTreeNode***************************/
-struct RBTreeNode<K: Ord, V> {
+struct RBTreeNode<K: Ord + Debug, V: Debug> {
     color: Color,
     left: NodePtr<K, V>,
     right: NodePtr<K, V>,
@@ -41,7 +41,7 @@ struct RBTreeNode<K: Ord, V> {
     value: V,
 }
 
-impl<K: Ord, V> RBTreeNode<K, V> {
+impl<K: Ord + Debug, V: Debug> RBTreeNode<K, V> {
     #[inline]
     fn pair(self) -> (K, V) {
         (self.key, self.value)
@@ -60,37 +60,37 @@ where
 
 /*****************NodePtr***************************/
 #[derive(Debug)]
-struct NodePtr<K: Ord, V>(*mut RBTreeNode<K, V>);
+struct NodePtr<K: Ord + Debug, V: Debug>(*mut RBTreeNode<K, V>);
 
-impl<K: Ord, V> Clone for NodePtr<K, V> {
+impl<K: Ord + Debug, V: Debug> Clone for NodePtr<K, V> {
     fn clone(&self) -> NodePtr<K, V> {
         *self
     }
 }
 
-impl<K: Ord, V> Copy for NodePtr<K, V> {}
+impl<K: Ord + Debug, V: Debug> Copy for NodePtr<K, V> {}
 
-impl<K: Ord, V> Ord for NodePtr<K, V> {
+impl<K: Ord + Debug, V: Debug> Ord for NodePtr<K, V> {
     fn cmp(&self, other: &NodePtr<K, V>) -> Ordering {
         unsafe { (*self.0).key.cmp(&(*other.0).key) }
     }
 }
 
-impl<K: Ord, V> PartialOrd for NodePtr<K, V> {
+impl<K: Ord + Debug, V: Debug> PartialOrd for NodePtr<K, V> {
     fn partial_cmp(&self, other: &NodePtr<K, V>) -> Option<Ordering> {
         Some(self.cmp(other))
     }
 }
 
-impl<K: Ord, V> PartialEq for NodePtr<K, V> {
+impl<K: Ord + Debug, V: Debug> PartialEq for NodePtr<K, V> {
     fn eq(&self, other: &NodePtr<K, V>) -> bool {
         self.0 == other.0
     }
 }
 
-impl<K: Ord, V> Eq for NodePtr<K, V> {}
+impl<K: Ord + Debug, V: Debug> Eq for NodePtr<K, V> {}
 
-impl<K: Ord, V> NodePtr<K, V> {
+impl<K: Ord + Debug, V: Debug> NodePtr<K, V> {
     fn new(k: K, v: V) -> NodePtr<K, V> {
         let node = RBTreeNode {
             color: Color::Black,
@@ -270,7 +270,7 @@ impl<K: Ord, V> NodePtr<K, V> {
     }
 }
 
-impl<K: Ord + Clone, V: Clone> NodePtr<K, V> {
+impl<K: Ord + Clone + Debug, V: Clone + Debug> NodePtr<K, V> {
     unsafe fn deep_clone(&self) -> NodePtr<K, V> {
         let mut node = NodePtr::new((*self.0).key.clone(), (*self.0).value.clone());
         if !self.left().is_null() {
@@ -339,16 +339,16 @@ impl<K: Ord + Clone, V: Clone> NodePtr<K, V> {
 ///   .iter().cloned().collect();
 ///  // use the values stored in rbtree
 ///  ```
-pub struct RBTree<K: Ord, V> {
+pub struct RBTree<K: Ord + Debug, V: Debug> {
     root: NodePtr<K, V>,
     len: usize,
 }
 
-unsafe impl<K: Ord, V> Send for RBTree<K, V> {}
-unsafe impl<K: Ord, V> Sync for RBTree<K, V> {}
+unsafe impl<K: Ord + Debug, V: Debug> Send for RBTree<K, V> {}
+unsafe impl<K: Ord + Debug, V: Debug> Sync for RBTree<K, V> {}
 
 // Drop all owned pointers if the tree is dropped
-impl<K: Ord, V> Drop for RBTree<K, V> {
+impl<K: Ord + Debug, V: Debug> Drop for RBTree<K, V> {
     #[inline]
     fn drop(&mut self) {
         self.clear();
@@ -356,7 +356,7 @@ impl<K: Ord, V> Drop for RBTree<K, V> {
 }
 
 /// If key and value are both impl Clone, we can call clone to get a copy.
-impl<K: Ord + Clone, V: Clone> Clone for RBTree<K, V> {
+impl<K: Ord + Clone + Debug, V: Clone + Debug> Clone for RBTree<K, V> {
     fn clone(&self) -> RBTree<K, V> {
         unsafe {
             let mut new = RBTree::new();
@@ -417,8 +417,8 @@ impl<K: Ord + Debug, V: Debug> RBTree<K, V> {
 /// Keys may compare equal, and the tree can hold duplicate keys; with duplicate keys this comparison may not be correct
 impl<K, V> PartialEq for RBTree<K, V>
 where
-    K: Eq + Ord,
-    V: PartialEq,
+    K: Eq + Ord + Debug,
+    V: PartialEq + Debug,
 {
     fn eq(&self, other: &RBTree<K, V>) -> bool {
         if self.len() != other.len() {
@@ -430,17 +430,9 @@ where
     }
 }
 
-impl<K, V> Eq for RBTree<K, V>
-where
-    K: Eq + Ord,
-    V: Eq,
-{
-}
+impl<K: Eq + Ord + Debug, V: Eq + Debug> Eq for RBTree<K, V> {}
 
-impl<'a, K, V> Index<&'a K> for RBTree<K, V>
-where
-    K: Ord,
-{
+impl<'a, K: Ord + Debug, V: Debug> Index<&'a K> for RBTree<K, V> {
     type Output = V;
 
     #[inline]
@@ -449,7 +441,7 @@ where
     }
 }
 
-impl<K: Ord, V> FromIterator<(K, V)> for RBTree<K, V> {
+impl<K: Ord + Debug, V: Debug> FromIterator<(K, V)> for RBTree<K, V> {
     fn from_iter<T: IntoIterator<Item = (K, V)>>(iter: T) -> RBTree<K, V> {
         let mut tree = RBTree::new();
         tree.extend(iter);
@@ -458,7 +450,7 @@ impl<K: Ord, V> FromIterator<(K, V)> for RBTree<K, V> {
 }
 
 /// RBTree into iter
-impl<K: Ord, V> Extend<(K, V)> for RBTree<K, V> {
+impl<K: Ord + Debug, V: Debug> Extend<(K, V)> for RBTree<K, V> {
     fn extend<T: IntoIterator<Item = (K, V)>>(&mut self, iter: T) {
         let iter = iter.into_iter();
         for (k, v) in iter {
@@ -479,11 +471,11 @@ impl<K: Ord, V> Extend<(K, V)> for RBTree<K, V> {
 /// let key_vec: Vec<_> = m.keys().cloned().collect();
 /// assert_eq!(vec, key_vec);
 /// ```
-pub struct Keys<'a, K: Ord + 'a, V: 'a> {
+pub struct Keys<'a, K: Ord + Debug + 'a, V: Debug + 'a> {
     inner: Iter<'a, K, V>,
 }
 
-impl<'a, K: Ord, V> Clone for Keys<'a, K, V> {
+impl<'a, K: Ord + Debug, V: Debug> Clone for Keys<'a, K, V> {
     fn clone(&self) -> Keys<'a, K, V> {
         Keys {
             inner: self.inner.clone(),
@@ -491,13 +483,13 @@ impl<'a, K: Ord, V> Clone for Keys<'a, K, V> {
     }
 }
 
-impl<'a, K: Ord + Debug, V> fmt::Debug for Keys<'a, K, V> {
+impl<'a, K: Ord + Debug, V: Debug> fmt::Debug for Keys<'a, K, V> {
     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
         f.debug_list().entries(self.clone()).finish()
     }
 }
 
-impl<'a, K: Ord, V> Iterator for Keys<'a, K, V> {
+impl<'a, K: Ord + Debug, V: Debug> Iterator for Keys<'a, K, V> {
     type Item = &'a K;
 
     #[inline]
@@ -524,11 +516,11 @@ impl<'a, K: Ord, V> Iterator for Keys<'a, K, V> {
 /// let key_vec: Vec<_> = m.values().cloned().collect();
 /// assert_eq!(vec, key_vec);
 /// ```
-pub struct Values<'a, K: 'a + Ord, V: 'a> {
+pub struct Values<'a, K: Ord + Debug, V: Debug> {
     inner: Iter<'a, K, V>,
 }
 
-impl<'a, K: Ord, V> Clone for Values<'a, K, V> {
+impl<'a, K: Ord + Debug, V: Debug> Clone for Values<'a, K, V> {
     fn clone(&self) -> Values<'a, K, V> {
         Values {
             inner: self.inner.clone(),
@@ -542,7 +534,7 @@ impl<'a, K: Ord + Debug, V: Debug> fmt::Debug for Values<'a, K, V> {
     }
 }
 
-impl<'a, K: Ord, V> Iterator for Values<'a, K, V> {
+impl<'a, K: Ord + Debug, V: Debug> Iterator for Values<'a, K, V> {
     type Item = &'a V;
 
     #[inline]
@@ -572,11 +564,11 @@ impl<'a, K: Ord, V> Iterator for Values<'a, K, V> {
 ///     assert_eq!(m.get(&i).unwrap(), &(i * 2));
 /// }
 /// ```
-pub struct ValuesMut<'a, K: 'a + Ord, V: 'a> {
+pub struct ValuesMut<'a, K: Ord + Debug + 'a, V: Debug + 'a> {
     inner: IterMut<'a, K, V>,
 }
 
-impl<'a, K: Ord, V> Clone for ValuesMut<'a, K, V> {
+impl<'a, K: Ord + Debug, V: Debug> Clone for ValuesMut<'a, K, V> {
     fn clone(&self) -> ValuesMut<'a, K, V> {
         ValuesMut {
             inner: self.inner.clone(),
@@ -590,7 +582,7 @@ impl<'a, K: Ord + Debug, V: Debug> fmt::Debug for ValuesMut<'a, K, V> {
     }
 }
 
-impl<'a, K: Ord, V> Iterator for ValuesMut<'a, K, V> {
+impl<'a, K: Ord + Debug, V: Debug> Iterator for ValuesMut<'a, K, V> {
     type Item = &'a mut V;
 
     #[inline]
@@ -605,21 +597,21 @@ impl<'a, K: Ord, V> Iterator for ValuesMut<'a, K, V> {
 }
 
 /// Convert RBTree to iter, move out the tree.
-pub struct IntoIter<K: Ord, V> {
+pub struct IntoIter<K: Ord + Debug, V: Debug> {
     head: NodePtr<K, V>,
     tail: NodePtr<K, V>,
     len: usize,
 }
 
 // Drop all owned pointers if the collection is dropped
-impl<K: Ord, V> Drop for IntoIter<K, V> {
+impl<K: Ord + Debug, V: Debug> Drop for IntoIter<K, V> {
     #[inline]
     fn drop(&mut self) {
         for (_, _) in self {}
     }
 }
 
-impl<K: Ord, V> Iterator for IntoIter<K, V> {
+impl<K: Ord + Debug, V: Debug> Iterator for IntoIter<K, V> {
     type Item = (K, V);
 
     fn next(&mut self) -> Option<(K, V)> {
@@ -648,7 +640,7 @@ impl<K: Ord, V> Iterator for IntoIter<K, V> {
     }
 }
 
-impl<K: Ord, V> DoubleEndedIterator for IntoIter<K, V> {
+impl<K: Ord + Debug, V: Debug> DoubleEndedIterator for IntoIter<K, V> {
     #[inline]
     fn next_back(&mut self) -> Option<(K, V)> {
         if self.len == 0 {
@@ -684,14 +676,14 @@ impl<K: Ord, V> DoubleEndedIterator for IntoIter<K, V> {
 /// }
 /// assert_eq!(observed, 0xFFFF_FFFF);
 /// ```
-pub struct Iter<'a, K: Ord + 'a, V: 'a> {
+pub struct Iter<'a, K: Ord + Debug + 'a, V: Debug + 'a> {
     head: NodePtr<K, V>,
     tail: NodePtr<K, V>,
     len: usize,
     _marker: marker::PhantomData<&'a ()>,
 }
 
-impl<'a, K: Ord + 'a, V: 'a> Clone for Iter<'a, K, V> {
+impl<'a, K: Ord + Debug + 'a, V: Debug + 'a> Clone for Iter<'a, K, V> {
     fn clone(&self) -> Iter<'a, K, V> {
         Iter {
             head: self.head,
@@ -702,7 +694,7 @@ impl<'a, K: Ord + 'a, V: 'a> Clone for Iter<'a, K, V> {
     }
 }
 
-impl<'a, K: Ord + 'a, V: 'a> Iterator for Iter<'a, K, V> {
+impl<'a, K: Ord + Debug + 'a, V: Debug + 'a> Iterator for Iter<'a, K, V> {
     type Item = (&'a K, &'a V);
 
     fn next(&mut self) -> Option<(&'a K, &'a V)> {
@@ -725,7 +717,7 @@ impl<'a, K: Ord + 'a, V: 'a> Iterator for Iter<'a, K, V> {
     }
 }
 
-impl<'a, K: Ord + 'a, V: 'a> DoubleEndedIterator for Iter<'a, K, V> {
+impl<'a, K: Ord + Debug + 'a, V: Debug + 'a> DoubleEndedIterator for Iter<'a, K, V> {
     #[inline]
     fn next_back(&mut self) -> Option<(&'a K, &'a V)> {
         // kdebug!("len = {:?}", self.len);
@@ -733,10 +725,6 @@ impl<'a, K: Ord + 'a, V: 'a> DoubleEndedIterator for Iter<'a, K, V> {
             return None;
         }
 
-        if self.tail == self.head {
-            return None;
-        }
-
         let (k, v) = unsafe { (&(*self.tail.0).key, &(*self.tail.0).value) };
         self.tail = self.tail.prev();
         self.len -= 1;
@@ -760,14 +748,14 @@ impl<'a, K: Ord + 'a, V: 'a> DoubleEndedIterator for Iter<'a, K, V> {
 ///     assert_eq!(m.get(&i).unwrap(), &(i * 2));
 /// }
 /// ```
-pub struct IterMut<'a, K: Ord + 'a, V: 'a> {
+pub struct IterMut<'a, K: Ord + Debug + 'a, V: Debug + 'a> {
     head: NodePtr<K, V>,
     tail: NodePtr<K, V>,
     len: usize,
     _marker: marker::PhantomData<&'a ()>,
 }
 
-impl<'a, K: Ord + 'a, V: 'a> Clone for IterMut<'a, K, V> {
+impl<'a, K: Ord + Debug + 'a, V: Debug + 'a> Clone for IterMut<'a, K, V> {
     fn clone(&self) -> IterMut<'a, K, V> {
         IterMut {
             head: self.head,
@@ -778,7 +766,7 @@ impl<'a, K: Ord + 'a, V: 'a> Clone for IterMut<'a, K, V> {
     }
 }
 
-impl<'a, K: Ord + 'a, V: 'a> Iterator for IterMut<'a, K, V> {
+impl<'a, K: Ord + Debug + 'a, V: Debug + 'a> Iterator for IterMut<'a, K, V> {
     type Item = (&'a K, &'a mut V);
 
     fn next(&mut self) -> Option<(&'a K, &'a mut V)> {
@@ -801,7 +789,7 @@ impl<'a, K: Ord + 'a, V: 'a> Iterator for IterMut<'a, K, V> {
     }
 }
 
-impl<'a, K: Ord + 'a, V: 'a> DoubleEndedIterator for IterMut<'a, K, V> {
+impl<'a, K: Ord + Debug + 'a, V: Debug + 'a> DoubleEndedIterator for IterMut<'a, K, V> {
     #[inline]
     fn next_back(&mut self) -> Option<(&'a K, &'a mut V)> {
         if self.len == 0 {
@@ -819,7 +807,7 @@ impl<'a, K: Ord + 'a, V: 'a> DoubleEndedIterator for IterMut<'a, K, V> {
     }
 }
 
-impl<K: Ord, V> IntoIterator for RBTree<K, V> {
+impl<K: Ord + Debug, V: Debug> IntoIterator for RBTree<K, V> {
     type Item = (K, V);
     type IntoIter = IntoIter<K, V>;
 
@@ -843,7 +831,7 @@ impl<K: Ord, V> IntoIterator for RBTree<K, V> {
     }
 }
 
-impl<K: Ord, V> RBTree<K, V> {
+impl<K: Ord + Debug, V: Debug> RBTree<K, V> {
     /// Creates an empty `RBTree`.
     pub fn new() -> RBTree<K, V> {
         RBTree {
@@ -1200,17 +1188,31 @@ impl<K: Ord, V> RBTree<K, V> {
         }
     }
 
+    /// Clears all red-black tree elements.
+    /// # Examples
+    /// ```
+    /// use rbtree::RBTree;
+    /// let mut m = RBTree::new();
+    /// for i in 0..6 {
+    ///     m.insert(i, i);
+    /// }
+    /// assert_eq!(m.len(), 6);
+    /// m.clear();
+    /// assert_eq!(m.len(), 0);
+    /// ```
     #[inline]
     pub fn clear(&mut self) {
         let root = self.root;
         self.root = NodePtr::null();
         self.clear_recurse(root);
+        self.len = 0;
     }
 
     /// Empties the `RBTree` without freeing objects in it.
     #[inline]
     fn fast_clear(&mut self) {
         self.root = NodePtr::null();
+        self.len = 0;
     }
 
     #[inline]
@@ -1814,4 +1816,19 @@ mod tests {
         assert_eq!(a[&2], "two");
         assert_eq!(a[&3], "three");
     }
+
+    #[test]
+    fn test_rev_iter() {
+        let mut a = RBTree::new();
+        a.insert(1, 1);
+        a.insert(2, 2);
+        a.insert(3, 3);
+
+        assert_eq!(a.len(), 3);
+        let mut cache = vec![];
+        for e in a.iter().rev() {
+            cache.push(e.0.clone());
+        }
+        assert_eq!(&cache, &vec![3, 2, 1]);
+    }
 }

+ 4 - 0
kernel/src/libs/spinlock.rs

@@ -167,6 +167,10 @@ impl<T> SpinLock<T> {
         self.lock.store(false, Ordering::SeqCst);
         ProcessManager::preempt_enable();
     }
+
+    pub fn is_locked(&self) -> bool {
+        self.lock.load(Ordering::SeqCst)
+    }
 }
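A small diagnostic helper added alongside the scheduler work. Note that the returned value is a racy snapshot: it can change the instant after it is read, so it is only suitable for assertions and debugging, never for deciding whether it is safe to enter a critical section. Example usage:

```rust
fn is_locked_demo() {
    let lock = SpinLock::new(0u32);
    assert!(!lock.is_locked());
    {
        let _guard = lock.lock();
        assert!(lock.is_locked()); // held (by us) right now
    }
    assert!(!lock.is_locked()); // released when the guard dropped
}
```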
 
 /// Implements the Deref trait, so an immutable reference to the data in the critical section can be obtained through a SpinLockGuard

+ 11 - 10
kernel/src/libs/wait_queue.rs

@@ -4,10 +4,11 @@ use core::intrinsics::unlikely;
 use alloc::{collections::LinkedList, sync::Arc, vec::Vec};
 
 use crate::{
-    arch::{sched::sched, CurrentIrqArch},
+    arch::CurrentIrqArch,
     exception::InterruptArch,
     kerror,
     process::{ProcessControlBlock, ProcessManager, ProcessState},
+    sched::{schedule, SchedMode},
 };
 
 use super::{
@@ -40,7 +41,7 @@ impl WaitQueue {
         });
         guard.wait_list.push_back(ProcessManager::current_pcb());
         drop(guard);
-        sched();
+        schedule(SchedMode::SM_NONE);
     }
 
     /// @brief Make the current process wait on the wait queue, executing the closure f before the waitqueue lock is released
@@ -59,7 +60,7 @@ impl WaitQueue {
         f();
 
         drop(guard);
-        sched();
+        schedule(SchedMode::SM_NONE);
     }
 
     /// @brief Make the current process wait on the wait queue, but do not call the scheduler after the waitqueue lock is released.
@@ -110,7 +111,7 @@ impl WaitQueue {
         drop(irq_guard);
         guard.wait_list.push_back(ProcessManager::current_pcb());
         drop(guard);
-        sched();
+        schedule(SchedMode::SM_NONE);
     }
 
     /// @brief Make the current process wait on the wait queue, allowing it to be interrupted by signals.
@@ -126,7 +127,7 @@ impl WaitQueue {
         guard.wait_list.push_back(ProcessManager::current_pcb());
         drop(to_unlock);
         drop(guard);
-        sched();
+        schedule(SchedMode::SM_NONE);
     }
 
     /// @brief Make the current process wait on the wait queue, allowing it to be interrupted by signals.
@@ -142,7 +143,7 @@ impl WaitQueue {
         guard.wait_list.push_back(ProcessManager::current_pcb());
         drop(to_unlock);
         drop(guard);
-        sched();
+        schedule(SchedMode::SM_NONE);
     }
 
     /// @brief Make the current process wait on the wait queue, without allowing it to be interrupted by signals.
@@ -158,7 +159,7 @@ impl WaitQueue {
         guard.wait_list.push_back(ProcessManager::current_pcb());
         drop(to_unlock);
         drop(guard);
-        sched();
+        schedule(SchedMode::SM_NONE);
     }
 
     /// @brief Make the current process wait on the wait queue, without allowing it to be interrupted by signals.
@@ -176,7 +177,7 @@ impl WaitQueue {
 
         drop(to_unlock);
         drop(guard);
-        sched();
+        schedule(SchedMode::SM_NONE);
     }
 
     /// @brief Wake up the first process waiting in the queue.
@@ -306,7 +307,7 @@ impl EventWaitQueue {
         });
         guard.push((events, ProcessManager::current_pcb()));
         drop(guard);
-        sched();
+        schedule(SchedMode::SM_NONE);
     }
 
     pub unsafe fn sleep_without_schedule(&self, events: u64) {
@@ -330,7 +331,7 @@ impl EventWaitQueue {
         guard.push((events, ProcessManager::current_pcb()));
         drop(to_unlock);
         drop(guard);
-        sched();
+        schedule(SchedMode::SM_NONE);
     }
 
     /// ### Wake up the processes on this queue that are waiting for the given events

+ 2 - 2
kernel/src/net/event_poll/mod.rs

@@ -11,7 +11,6 @@ use alloc::{
 use system_error::SystemError;
 
 use crate::{
-    arch::sched::sched,
     filesystem::vfs::{
         file::{File, FileMode},
         FilePrivateData, IndexNode, Metadata,
@@ -24,6 +23,7 @@ use crate::{
         wait_queue::WaitQueue,
     },
     process::ProcessManager,
+    sched::{schedule, SchedMode},
     time::{
         timer::{next_n_us_timer_jiffies, Timer, WakeUpHelper},
         TimeSpec,
@@ -489,7 +489,7 @@ impl EventPoll {
                 let guard = epoll.0.lock_irqsave();
                 unsafe { guard.epoll_wq.sleep_without_schedule() };
                 drop(guard);
-                sched();
+                schedule(SchedMode::SM_NONE);
                 // After being woken, check whether any events are readable
                 available = epoll.0.lock_irqsave().ep_events_available();
                 if let Some(timer) = timer {

+ 7 - 4
kernel/src/net/socket/mod.rs

@@ -15,7 +15,7 @@ use smoltcp::{
 use system_error::SystemError;
 
 use crate::{
-    arch::{rand::rand, sched::sched},
+    arch::rand::rand,
     filesystem::vfs::{
         file::FileMode, syscall::ModeType, FilePrivateData, FileSystem, FileType, IndexNode,
         Metadata,
@@ -25,6 +25,7 @@ use crate::{
         spinlock::{SpinLock, SpinLockGuard},
         wait_queue::EventWaitQueue,
     },
+    sched::{schedule, SchedMode},
 };
 
 use self::{
@@ -337,8 +338,9 @@ impl IndexNode for SocketInode {
         _offset: usize,
         len: usize,
         buf: &mut [u8],
-        _data: SpinLockGuard<FilePrivateData>,
+        data: SpinLockGuard<FilePrivateData>,
     ) -> Result<usize, SystemError> {
+        drop(data);
         self.0.lock_no_preempt().read(&mut buf[0..len]).0
     }
 
@@ -347,8 +349,9 @@ impl IndexNode for SocketInode {
         _offset: usize,
         len: usize,
         buf: &[u8],
-        _data: SpinLockGuard<FilePrivateData>,
+        data: SpinLockGuard<FilePrivateData>,
     ) -> Result<usize, SystemError> {
+        drop(data);
         self.0.lock_no_preempt().write(&buf[0..len], None)
     }
 
@@ -417,7 +420,7 @@ impl SocketHandleItem {
                 .sleep_without_schedule(events)
         };
         drop(handle_map_guard);
-        sched();
+        schedule(SchedMode::SM_NONE);
     }
 
     pub fn shutdown_type(&self) -> ShutdownType {

+ 2 - 2
kernel/src/process/exit.rs

@@ -6,10 +6,10 @@ use system_error::SystemError;
 use crate::{
     arch::{
         ipc::signal::{SigChildCode, Signal},
-        sched::sched,
         CurrentIrqArch,
     },
     exception::InterruptArch,
+    sched::{schedule, SchedMode},
     syscall::user_access::UserBufferWriter,
 };
 
@@ -164,7 +164,7 @@ fn do_wait(kwo: &mut KernelWaitOption) -> Result<usize, SystemError> {
                 }
             }
             drop(irq_guard);
-            sched();
+            schedule(SchedMode::SM_NONE);
         } else {
             // todo: handle pgid
             kwarn!("kernel_wait4: currently not support {:?}", kwo.pid_type);

+ 13 - 0
kernel/src/process/fork.rs

@@ -10,6 +10,8 @@ use crate::{
     libs::rwlock::RwLock,
     mm::VirtAddr,
     process::ProcessFlags,
+    sched::{sched_cgroup_fork, sched_fork},
+    smp::core::smp_get_processor_id,
     syscall::user_access::UserBufferWriter,
 };
 
@@ -185,6 +187,8 @@ impl ProcessManager {
             )
         });
 
+        pcb.sched_info().set_on_cpu(Some(smp_get_processor_id()));
+
         ProcessManager::wakeup(&pcb).unwrap_or_else(|e| {
             panic!(
                 "fork: Failed to wakeup new process, pid: [{:?}]. Error: {:?}",
@@ -388,6 +392,13 @@ impl ProcessManager {
             writer.copy_one_to_user(&(pcb.pid().0 as i32), 0)?;
         }
 
+        sched_fork(pcb).unwrap_or_else(|e| {
+            panic!(
+                "fork: Failed to set sched info from current process, current pid: [{:?}], new pid: [{:?}]. Error: {:?}",
+                current_pcb.pid(), pcb.pid(), e
+            )
+        });
+
         // Copy the flag bits
         Self::copy_flags(&clone_flags, pcb).unwrap_or_else(|e| {
             panic!(
@@ -474,6 +485,8 @@ impl ProcessManager {
 
         // todo: add thread-group-related logic. See https://code.dragonos.org.cn/xref/linux-6.1.9/kernel/fork.c#2437
 
+        sched_cgroup_fork(pcb);
+
         Ok(())
     }
 }
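The two hooks bracket child-PCB construction: `sched_fork` initializes the child's scheduling state before the child can ever be woken, and `sched_cgroup_fork` runs once everything else has been copied, mirroring the Linux ordering. A sketch of what `sched_fork` is assumed to do (only the function name comes from this hunk; the real body is in kernel/src/sched/mod.rs):

```rust
use alloc::sync::Arc;
use system_error::SystemError;
use crate::process::{ProcessControlBlock, ProcessManager};
use crate::sched::OnRq;

// Illustrative sketch, not the PR's actual implementation.
pub fn sched_fork(pcb: &Arc<ProcessControlBlock>) -> Result<(), SystemError> {
    let info = pcb.sched_info();
    // The child is not on any run queue yet; ProcessManager::wakeup()
    // enqueues it later via activate_task().
    *info.on_rq.lock_irqsave() = OnRq::None;
    // Inherit the parent's scheduling policy; per-entity CFS state
    // (vruntime etc.) should be reset so the child does not start with
    // the parent's accumulated runtime.
    *info.sched_policy.write_irqsave() =
        ProcessManager::current_pcb().sched_info().policy();
    Ok(())
}
```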

+ 16 - 0
kernel/src/process/idle.rs

@@ -8,6 +8,7 @@ use alloc::{sync::Arc, vec::Vec};
 use crate::{
     mm::{percpu::PerCpu, VirtAddr, IDLE_PROCESS_ADDRESS_SPACE},
     process::KernelStack,
+    sched::{cpu_rq, OnRq},
     smp::{core::smp_get_processor_id, cpu::ProcessorId},
 };
 
@@ -58,6 +59,21 @@ impl ProcessManager {
 
             assert!(idle_pcb.sched_info().on_cpu().is_none());
             idle_pcb.sched_info().set_on_cpu(Some(ProcessorId::new(i)));
+            *idle_pcb.sched_info().sched_policy.write_irqsave() = crate::sched::SchedPolicy::IDLE;
+
+            let rq = cpu_rq(i as usize);
+            let (rq, _guard) = rq.self_lock();
+            rq.set_current(Arc::downgrade(&idle_pcb));
+            rq.set_idle(Arc::downgrade(&idle_pcb));
+
+            *idle_pcb.sched_info().on_rq.lock_irqsave() = OnRq::Queued;
+
+            idle_pcb
+                .sched_info()
+                .sched_entity()
+                .force_mut()
+                .set_cfs(Arc::downgrade(&rq.cfs_rq()));
+
             v.push(idle_pcb);
         }
 

+ 8 - 4
kernel/src/process/kthread.rs

@@ -1,6 +1,6 @@
 use core::{
     hint::spin_loop,
-    sync::atomic::{AtomicBool, Ordering},
+    sync::atomic::{compiler_fence, AtomicBool, Ordering},
 };
 
 use alloc::{
@@ -13,12 +13,13 @@ use atomic_enum::atomic_enum;
 use system_error::SystemError;
 
 use crate::{
-    arch::{sched::sched, CurrentIrqArch},
+    arch::CurrentIrqArch,
     exception::{irqdesc::IrqAction, InterruptArch},
     init::initial_kthread::initial_kernel_thread,
     kinfo,
     libs::{once::Once, spinlock::SpinLock},
     process::{ProcessManager, ProcessState},
+    sched::{schedule, SchedMode},
 };
 
 use super::{fork::CloneFlags, Pid, ProcessControlBlock, ProcessFlags};
@@ -302,6 +303,8 @@ impl KernelThreadMechanism {
             // Initialize kthreadd
             let closure = KernelThreadClosure::EmptyClosure((Box::new(Self::kthread_daemon), ()));
             let info = KernelThreadCreateInfo::new(closure, "kthreadd".to_string());
+            info.set_to_mark_sleep(false)
+                .expect("kthreadadd should be run first");
             let kthreadd_pid: Pid = Self::__inner_create(
                 &info,
                 CloneFlags::CLONE_VM | CloneFlags::CLONE_FS | CloneFlags::CLONE_SIGNAL,
@@ -334,6 +337,7 @@ impl KernelThreadMechanism {
             spin_loop()
         }
         KTHREAD_CREATE_LIST.lock().push_back(info.clone());
+        compiler_fence(Ordering::SeqCst);
         ProcessManager::wakeup(unsafe { KTHREAD_DAEMON_PCB.as_ref().unwrap() })
             .expect("Failed to wakeup kthread daemon");
         return info.poll_result();
@@ -470,7 +474,7 @@ impl KernelThreadMechanism {
             let irq_guard = unsafe { CurrentIrqArch::save_and_disable_irq() };
             ProcessManager::mark_sleep(true).ok();
             drop(irq_guard);
-            sched();
+            schedule(SchedMode::SM_NONE);
         }
     }
 }
@@ -495,7 +499,7 @@ pub unsafe extern "C" fn kernel_thread_bootstrap_stage2(ptr: *const KernelThread
         let irq_guard = CurrentIrqArch::save_and_disable_irq();
         ProcessManager::mark_sleep(true).expect("Failed to mark sleep");
         drop(irq_guard);
-        sched();
+        schedule(SchedMode::SM_NONE);
     }
 
     let mut retval = SystemError::EINTR.to_posix_errno();

+ 185 - 96
kernel/src/process/mod.rs

@@ -3,7 +3,7 @@ use core::{
     hint::spin_loop,
     intrinsics::{likely, unlikely},
     mem::ManuallyDrop,
-    sync::atomic::{compiler_fence, AtomicBool, AtomicIsize, AtomicUsize, Ordering},
+    sync::atomic::{compiler_fence, fence, AtomicBool, AtomicUsize, Ordering},
 };
 
 use alloc::{
@@ -18,7 +18,6 @@ use crate::{
     arch::{
         ipc::signal::{AtomicSignal, SigSet, Signal},
         process::ArchPCBInfo,
-        sched::sched,
         CurrentIrqArch,
     },
     driver::tty::tty_core::TtyCore,
@@ -37,7 +36,7 @@ use crate::{
             futex::Futex,
         },
         lock_free_flags::LockFreeFlags,
-        rwlock::{RwLock, RwLockReadGuard, RwLockUpgradableGuard, RwLockWriteGuard},
+        rwlock::{RwLock, RwLockReadGuard, RwLockWriteGuard},
         spinlock::{SpinLock, SpinLockGuard},
         wait_queue::WaitQueue,
     },
@@ -48,12 +47,13 @@ use crate::{
         VirtAddr,
     },
     net::socket::SocketInode,
+    sched::completion::Completion,
     sched::{
-        completion::Completion,
-        core::{sched_enqueue, CPU_EXECUTING},
-        SchedPolicy, SchedPriority,
+        cpu_rq, fair::FairSchedEntity, prio::MAX_PRIO, DequeueFlag, EnqueueFlag, OnRq, SchedMode,
+        WakeupFlags, __schedule,
     },
     smp::{
+        core::smp_get_processor_id,
         cpu::{AtomicProcessorId, ProcessorId},
         kick_cpu,
     },
@@ -225,10 +225,23 @@ impl ProcessManager {
             let state = writer.state();
             if state.is_blocked() {
                 writer.set_state(ProcessState::Runnable);
+                writer.set_wakeup();
+
                 // avoid deadlock
                 drop(writer);
 
-                sched_enqueue(pcb.clone(), true);
+                let rq = cpu_rq(pcb.sched_info().on_cpu().unwrap().data() as usize);
+
+                let (rq, _guard) = rq.self_lock();
+                rq.update_rq_clock();
+                rq.activate_task(
+                    pcb,
+                    EnqueueFlag::ENQUEUE_WAKEUP | EnqueueFlag::ENQUEUE_NOCLOCK,
+                );
+
+                rq.check_preempt_currnet(pcb, WakeupFlags::empty());
+
+                // sched_enqueue(pcb.clone(), true);
                 return Ok(());
             } else if state.is_exited() {
                 return Err(SystemError::EINVAL);
@@ -254,7 +267,18 @@ impl ProcessManager {
                 // avoid deadlock
                 drop(writer);
 
-                sched_enqueue(pcb.clone(), true);
+                let rq = cpu_rq(pcb.sched_info().on_cpu().unwrap().data() as usize);
+
+                let (rq, _guard) = rq.self_lock();
+                rq.update_rq_clock();
+                rq.activate_task(
+                    pcb,
+                    EnqueueFlag::ENQUEUE_WAKEUP | EnqueueFlag::ENQUEUE_NOCLOCK,
+                );
+
+                rq.check_preempt_currnet(pcb, WakeupFlags::empty());
+
+                // sched_enqueue(pcb.clone(), true);
                 return Ok(());
             } else if state.is_runnable() {
                 return Ok(());
@@ -280,14 +304,14 @@ impl ProcessManager {
             !CurrentIrqArch::is_irq_enabled(),
             "interrupt must be disabled before enter ProcessManager::mark_sleep()"
         );
-
         let pcb = ProcessManager::current_pcb();
         let mut writer = pcb.sched_info().inner_lock_write_irqsave();
         if !matches!(writer.state(), ProcessState::Exited(_)) {
             writer.set_state(ProcessState::Blocked(interruptable));
+            writer.set_sleep();
             pcb.flags().insert(ProcessFlags::NEED_SCHEDULE);
+            fence(Ordering::SeqCst);
             drop(writer);
-
             return Ok(());
         }
         return Err(SystemError::EINTR);
@@ -351,7 +375,7 @@ impl ProcessManager {
     /// - `exit_code`: the process's exit code
     pub fn exit(exit_code: usize) -> ! {
         // Disable interrupts
-        unsafe { CurrentIrqArch::interrupt_disable() };
+        let _guard = unsafe { CurrentIrqArch::save_and_disable_irq() };
         let pcb = ProcessManager::current_pcb();
         let pid = pcb.pid();
         pcb.sched_info
@@ -359,6 +383,14 @@ impl ProcessManager {
             .set_state(ProcessState::Exited(exit_code));
         pcb.wait_queue.wakeup(Some(ProcessState::Blocked(true)));
 
+        let rq = cpu_rq(smp_get_processor_id().data() as usize);
+        let (rq, guard) = rq.self_lock();
+        rq.deactivate_task(
+            pcb.clone(),
+            DequeueFlag::DEQUEUE_SLEEP | DequeueFlag::DEQUEUE_NOCLOCK,
+        );
+        drop(guard);
+
         // Do the post-exit cleanup work for the process
         let thread = pcb.thread.write_irqsave();
         if let Some(addr) = thread.set_child_tid {
@@ -381,9 +413,8 @@ impl ProcessManager {
         unsafe { pcb.basic_mut().set_user_vm(None) };
         drop(pcb);
         ProcessManager::exit_notify();
-        unsafe { CurrentIrqArch::interrupt_enable() };
-
-        sched();
+        // unsafe { CurrentIrqArch::interrupt_enable() };
+        __schedule(SchedMode::SM_NONE);
         kerror!("pid {pid:?} exited but sched again!");
         #[allow(clippy::empty_loop)]
         loop {
@@ -446,7 +477,7 @@ impl ProcessManager {
         let cpu_id = pcb.sched_info().on_cpu();
 
         if let Some(cpu_id) = cpu_id {
-            if pcb.pid() == CPU_EXECUTING.get(cpu_id) {
+            if pcb.pid() == cpu_rq(cpu_id.data() as usize).current().pid() {
                 kick_cpu(cpu_id).expect("ProcessManager::kick(): Failed to kick cpu");
             }
         }
@@ -672,6 +703,10 @@ impl ProcessControlBlock {
 
         let pcb = Arc::new(pcb);
 
+        pcb.sched_info()
+            .sched_entity()
+            .force_mut()
+            .set_pcb(Arc::downgrade(&pcb));
         // Store the process's Arc pointer at the lowest address of the kernel stack and the syscall stack
         unsafe {
             pcb.kernel_stack
@@ -1038,14 +1073,51 @@ pub struct ProcessSchedulerInfo {
     on_cpu: AtomicProcessorId,
     /// If the current process is waiting to be migrated to another cpu core (i.e. PF_NEED_MIGRATE is set in flags),
     /// this field stores the id of the target processor core
-    migrate_to: AtomicProcessorId,
+    // migrate_to: AtomicProcessorId,
     inner_locked: RwLock<InnerSchedInfo>,
     /// The process's scheduling priority
-    priority: SchedPriority,
+    // priority: SchedPriority,
     /// The process's current virtual runtime
-    virtual_runtime: AtomicIsize,
+    // virtual_runtime: AtomicIsize,
     /// Time slice managed by the realtime scheduler
-    rt_time_slice: AtomicIsize,
+    // rt_time_slice: AtomicIsize,
+    pub sched_stat: RwLock<SchedInfo>,
+    /// Scheduling policy
+    pub sched_policy: RwLock<crate::sched::SchedPolicy>,
+    /// CFS scheduling entity
+    pub sched_entity: Arc<FairSchedEntity>,
+    pub on_rq: SpinLock<OnRq>,
+
+    pub prio_data: RwLock<PrioData>,
+}
+
+#[derive(Debug, Default)]
+pub struct SchedInfo {
+    /// Number of times the task has run on a particular CPU
+    pub pcount: usize,
+    /// Time the task has spent waiting on the run queue
+    pub run_delay: usize,
+    /// Timestamp of the task's last run on a CPU
+    pub last_arrival: u64,
+    /// Timestamp when the task was last enqueued on the run queue
+    pub last_queued: u64,
+}
+
+#[derive(Debug)]
+pub struct PrioData {
+    pub prio: i32,
+    pub static_prio: i32,
+    pub normal_prio: i32,
+}
+
+impl Default for PrioData {
+    fn default() -> Self {
+        Self {
+            prio: MAX_PRIO - 20,
+            static_prio: MAX_PRIO - 20,
+            normal_prio: MAX_PRIO - 20,
+        }
+    }
 }
 
 #[derive(Debug)]
@@ -1053,7 +1125,7 @@ pub struct InnerSchedInfo {
     /// The process's current state
     state: ProcessState,
     /// The process's scheduling policy
-    sched_policy: SchedPolicy,
+    sleep: bool,
 }
 
 impl InnerSchedInfo {
@@ -1065,8 +1137,16 @@ impl InnerSchedInfo {
         self.state = state;
     }
 
-    pub fn policy(&self) -> SchedPolicy {
-        return self.sched_policy;
+    pub fn set_sleep(&mut self) {
+        self.sleep = true;
+    }
+
+    pub fn set_wakeup(&mut self) {
+        self.sleep = false;
+    }
+
+    pub fn is_mark_sleep(&self) -> bool {
+        self.sleep
     }
 }
 
@@ -1076,17 +1156,26 @@ impl ProcessSchedulerInfo {
         let cpu_id = on_cpu.unwrap_or(ProcessorId::INVALID);
         return Self {
             on_cpu: AtomicProcessorId::new(cpu_id),
-            migrate_to: AtomicProcessorId::new(ProcessorId::INVALID),
+            // migrate_to: AtomicProcessorId::new(ProcessorId::INVALID),
             inner_locked: RwLock::new(InnerSchedInfo {
                 state: ProcessState::Blocked(false),
-                sched_policy: SchedPolicy::CFS,
+                sleep: false,
             }),
-            virtual_runtime: AtomicIsize::new(0),
-            rt_time_slice: AtomicIsize::new(0),
-            priority: SchedPriority::new(100).unwrap(),
+            // virtual_runtime: AtomicIsize::new(0),
+            // rt_time_slice: AtomicIsize::new(0),
+            // priority: SchedPriority::new(100).unwrap(),
+            sched_stat: RwLock::new(SchedInfo::default()),
+            sched_policy: RwLock::new(crate::sched::SchedPolicy::CFS),
+            sched_entity: FairSchedEntity::new(),
+            on_rq: SpinLock::new(OnRq::None),
+            prio_data: RwLock::new(PrioData::default()),
         };
     }
 
+    pub fn sched_entity(&self) -> Arc<FairSchedEntity> {
+        return self.sched_entity.clone();
+    }
+
     pub fn on_cpu(&self) -> Option<ProcessorId> {
         let on_cpu = self.on_cpu.load(Ordering::SeqCst);
         if on_cpu == ProcessorId::INVALID {
@@ -1104,23 +1193,23 @@ impl ProcessSchedulerInfo {
         }
     }
 
-    pub fn migrate_to(&self) -> Option<ProcessorId> {
-        let migrate_to = self.migrate_to.load(Ordering::SeqCst);
-        if migrate_to == ProcessorId::INVALID {
-            return None;
-        } else {
-            return Some(migrate_to);
-        }
-    }
-
-    pub fn set_migrate_to(&self, migrate_to: Option<ProcessorId>) {
-        if let Some(data) = migrate_to {
-            self.migrate_to.store(data, Ordering::SeqCst);
-        } else {
-            self.migrate_to
-                .store(ProcessorId::INVALID, Ordering::SeqCst)
-        }
-    }
+    // pub fn migrate_to(&self) -> Option<ProcessorId> {
+    //     let migrate_to = self.migrate_to.load(Ordering::SeqCst);
+    //     if migrate_to == ProcessorId::INVALID {
+    //         return None;
+    //     } else {
+    //         return Some(migrate_to);
+    //     }
+    // }
+
+    // pub fn set_migrate_to(&self, migrate_to: Option<ProcessorId>) {
+    //     if let Some(data) = migrate_to {
+    //         self.migrate_to.store(data, Ordering::SeqCst);
+    //     } else {
+    //         self.migrate_to
+    //             .store(ProcessorId::INVALID, Ordering::SeqCst)
+    //     }
+    // }
 
     pub fn inner_lock_write_irqsave(&self) -> RwLockWriteGuard<InnerSchedInfo> {
         return self.inner_locked.write_irqsave();
@@ -1130,58 +1219,58 @@ impl ProcessSchedulerInfo {
         return self.inner_locked.read_irqsave();
     }
 
-    pub fn inner_lock_try_read_irqsave(
-        &self,
-        times: u8,
-    ) -> Option<RwLockReadGuard<InnerSchedInfo>> {
-        for _ in 0..times {
-            if let Some(r) = self.inner_locked.try_read_irqsave() {
-                return Some(r);
-            }
-        }
-
-        return None;
-    }
-
-    pub fn inner_lock_try_upgradable_read_irqsave(
-        &self,
-        times: u8,
-    ) -> Option<RwLockUpgradableGuard<InnerSchedInfo>> {
-        for _ in 0..times {
-            if let Some(r) = self.inner_locked.try_upgradeable_read_irqsave() {
-                return Some(r);
-            }
-        }
-
-        return None;
-    }
-
-    pub fn virtual_runtime(&self) -> isize {
-        return self.virtual_runtime.load(Ordering::SeqCst);
-    }
-
-    pub fn set_virtual_runtime(&self, virtual_runtime: isize) {
-        self.virtual_runtime
-            .store(virtual_runtime, Ordering::SeqCst);
-    }
-    pub fn increase_virtual_runtime(&self, delta: isize) {
-        self.virtual_runtime.fetch_add(delta, Ordering::SeqCst);
-    }
-
-    pub fn rt_time_slice(&self) -> isize {
-        return self.rt_time_slice.load(Ordering::SeqCst);
-    }
-
-    pub fn set_rt_time_slice(&self, rt_time_slice: isize) {
-        self.rt_time_slice.store(rt_time_slice, Ordering::SeqCst);
-    }
-
-    pub fn increase_rt_time_slice(&self, delta: isize) {
-        self.rt_time_slice.fetch_add(delta, Ordering::SeqCst);
-    }
-
-    pub fn priority(&self) -> SchedPriority {
-        return self.priority;
+    // pub fn inner_lock_try_read_irqsave(
+    //     &self,
+    //     times: u8,
+    // ) -> Option<RwLockReadGuard<InnerSchedInfo>> {
+    //     for _ in 0..times {
+    //         if let Some(r) = self.inner_locked.try_read_irqsave() {
+    //             return Some(r);
+    //         }
+    //     }
+
+    //     return None;
+    // }
+
+    // pub fn inner_lock_try_upgradable_read_irqsave(
+    //     &self,
+    //     times: u8,
+    // ) -> Option<RwLockUpgradableGuard<InnerSchedInfo>> {
+    //     for _ in 0..times {
+    //         if let Some(r) = self.inner_locked.try_upgradeable_read_irqsave() {
+    //             return Some(r);
+    //         }
+    //     }
+
+    //     return None;
+    // }
+
+    // pub fn virtual_runtime(&self) -> isize {
+    //     return self.virtual_runtime.load(Ordering::SeqCst);
+    // }
+
+    // pub fn set_virtual_runtime(&self, virtual_runtime: isize) {
+    //     self.virtual_runtime
+    //         .store(virtual_runtime, Ordering::SeqCst);
+    // }
+    // pub fn increase_virtual_runtime(&self, delta: isize) {
+    //     self.virtual_runtime.fetch_add(delta, Ordering::SeqCst);
+    // }
+
+    // pub fn rt_time_slice(&self) -> isize {
+    //     return self.rt_time_slice.load(Ordering::SeqCst);
+    // }
+
+    // pub fn set_rt_time_slice(&self, rt_time_slice: isize) {
+    //     self.rt_time_slice.store(rt_time_slice, Ordering::SeqCst);
+    // }
+
+    // pub fn increase_rt_time_slice(&self, delta: isize) {
+    //     self.rt_time_slice.fetch_add(delta, Ordering::SeqCst);
+    // }
+
+    pub fn policy(&self) -> crate::sched::SchedPolicy {
+        return *self.sched_policy.read_irqsave();
     }
 }
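`PrioData::default()` above sets all three priorities to `MAX_PRIO - 20`. Assuming kernel/src/sched/prio.rs mirrors the Linux priority layout (an assumption; only `MAX_PRIO` appears in this diff), that is exactly the nice-0 default:

```rust
// Assumed Linux-style constants; only MAX_PRIO is confirmed by this diff.
pub const MAX_RT_PRIO: i32 = 100;           // prio 0..=99: realtime
pub const MAX_PRIO: i32 = MAX_RT_PRIO + 40; // prio 100..=139: nice -20..=19

/// nice -20..=19 maps onto prio 100..=139, so nice 0 gives
/// prio 120 == MAX_PRIO - 20, matching PrioData::default().
pub fn nice_to_prio(nice: i32) -> i32 {
    MAX_RT_PRIO + nice + 20
}
```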
 

+ 0 - 283
kernel/src/sched/cfs.rs

@@ -1,283 +0,0 @@
-use core::sync::atomic::compiler_fence;
-
-use alloc::{boxed::Box, sync::Arc, vec::Vec};
-
-use crate::{
-    arch::CurrentIrqArch,
-    exception::InterruptArch,
-    kBUG,
-    libs::{
-        rbtree::RBTree,
-        spinlock::{SpinLock, SpinLockGuard},
-    },
-    mm::percpu::PerCpu,
-    process::{
-        ProcessControlBlock, ProcessFlags, ProcessManager, ProcessSchedulerInfo, ProcessState,
-    },
-    smp::{core::smp_get_processor_id, cpu::ProcessorId},
-};
-
-use super::{
-    core::{sched_enqueue, Scheduler},
-    SchedPriority,
-};
-
-/// Global CFS scheduler instance
-pub static mut CFS_SCHEDULER_PTR: Option<Box<SchedulerCFS>> = None;
-
-/// @brief Get a mutable reference to the CFS scheduler instance
-#[inline]
-pub fn __get_cfs_scheduler() -> &'static mut SchedulerCFS {
-    return unsafe { CFS_SCHEDULER_PTR.as_mut().unwrap() };
-}
-
-/// @brief Initialize the CFS scheduler
-pub unsafe fn sched_cfs_init() {
-    if CFS_SCHEDULER_PTR.is_none() {
-        CFS_SCHEDULER_PTR = Some(Box::new(SchedulerCFS::new()));
-    } else {
-        kBUG!("Try to init CFS Scheduler twice.");
-        panic!("Try to init CFS Scheduler twice.");
-    }
-}
-
-/// @brief The CFS run queue (per-CPU)
-#[derive(Debug)]
-struct CFSQueue {
-    /// Remaining time slice of the process currently running on this cpu
-    cpu_exec_proc_jiffies: i64,
-    /// The queue itself, protected by a spinlock
-    locked_queue: SpinLock<RBTree<i64, Arc<ProcessControlBlock>>>,
-    /// PCB of the IDLE process dedicated to this core's queue
-    idle_pcb: Arc<ProcessControlBlock>,
-}
-
-impl CFSQueue {
-    pub fn new(idle_pcb: Arc<ProcessControlBlock>) -> CFSQueue {
-        CFSQueue {
-            cpu_exec_proc_jiffies: 0,
-            locked_queue: SpinLock::new(RBTree::new()),
-            idle_pcb,
-        }
-    }
-
-    /// @brief Add a pcb to the queue
-    pub fn enqueue(&mut self, pcb: Arc<ProcessControlBlock>) {
-        let mut queue = self.locked_queue.lock_irqsave();
-
-        // The IDLE process is never put on the queue
-        if pcb.pid().into() == 0 {
-            return;
-        }
-
-        queue.insert(pcb.sched_info().virtual_runtime() as i64, pcb.clone());
-    }
-
-    /// @brief Pop a pcb from the run queue; if the queue is empty, return the IDLE process's pcb
-    pub fn dequeue(&mut self) -> Arc<ProcessControlBlock> {
-        let res: Arc<ProcessControlBlock>;
-        let mut queue = self.locked_queue.lock_irqsave();
-        if !queue.is_empty() {
-            // Queue not empty: return the next pcb to run
-            res = queue.pop_first().unwrap().1;
-        } else {
-            // Queue empty: return the IDLE process's pcb
-            res = self.idle_pcb.clone();
-        }
-        return res;
-    }
-
-    /// @brief Get the minimum virtual runtime of the CFS queue
-    ///
-    /// @return Option<i64> If the queue is not empty, the smallest virtual runtime in it; otherwise None
-    pub fn min_vruntime(
-        queue: &SpinLockGuard<RBTree<i64, Arc<ProcessControlBlock>>>,
-    ) -> Option<i64> {
-        if !queue.is_empty() {
-            return Some(queue.get_first().unwrap().1.sched_info().virtual_runtime() as i64);
-        } else {
-            return None;
-        }
-    }
-    /// Get the length of the run queue
-    #[allow(dead_code)]
-    pub fn get_cfs_queue_size(
-        queue: &SpinLockGuard<RBTree<i64, Arc<ProcessControlBlock>>>,
-    ) -> usize {
-        return queue.len();
-    }
-}
-
-/// @brief The CFS scheduler class
-pub struct SchedulerCFS {
-    cpu_queue: Vec<&'static mut CFSQueue>,
-}
-
-impl SchedulerCFS {
-    pub fn new() -> SchedulerCFS {
-        // Temporarily hard-code the core count
-        // todo: get the core count from the cpu module
-        let mut result = SchedulerCFS {
-            cpu_queue: Default::default(),
-        };
-
-        // Create a queue for each cpu core; after the process rework, could idle_pcb be initialized directly here?
-        for i in 0..PerCpu::MAX_CPU_NUM {
-            let idle_pcb = ProcessManager::idle_pcb()[i as usize].clone();
-            result
-                .cpu_queue
-                .push(Box::leak(Box::new(CFSQueue::new(idle_pcb))));
-        }
-
-        return result;
-    }
-
-    /// @brief Update this process's remaining executable time on this cpu.
-    #[inline]
-    fn update_cpu_exec_proc_jiffies(
-        _priority: SchedPriority,
-        cfs_queue: &mut CFSQueue,
-        is_idle: bool,
-    ) -> &mut CFSQueue {
-        // todo: factor in the scheduling period and the weights of all processes, then set the executable time allotted to the process
-        if !is_idle {
-            cfs_queue.cpu_exec_proc_jiffies = 10;
-        } else {
-            cfs_queue.cpu_exec_proc_jiffies = 0;
-        }
-
-        return cfs_queue;
-    }
-
-    /// @brief When a timer interrupt arrives, sched's core module calls this function to update the executable time of the CFS process
-    pub fn timer_update_jiffies(&mut self, sched_info: &ProcessSchedulerInfo) {
-        let current_cpu_queue: &mut CFSQueue =
-            self.cpu_queue[smp_get_processor_id().data() as usize];
-        // todo: factor in the scheduling period and the weights of all processes, then set the process's executable time
-
-        let mut queue = None;
-        for _ in 0..10 {
-            if let Ok(q) = current_cpu_queue.locked_queue.try_lock_irqsave() {
-                queue = Some(q);
-                break;
-            }
-        }
-        if queue.is_none() {
-            return;
-        }
-        let queue = queue.unwrap();
-        // Update the process's remaining executable time
-        current_cpu_queue.cpu_exec_proc_jiffies -= 1;
-        // Time slice exhausted: mark the process as needing reschedule
-        if current_cpu_queue.cpu_exec_proc_jiffies <= 0 {
-            ProcessManager::current_pcb()
-                .flags()
-                .insert(ProcessFlags::NEED_SCHEDULE);
-        }
-        drop(queue);
-
-        // Update the current process's virtual runtime
-        sched_info.increase_virtual_runtime(1);
-    }
-
-    /// @brief Add the process to the cpu's CFS run queue and reset its virtual runtime to the queue's current minimum
-    pub fn enqueue_reset_vruntime(&mut self, pcb: Arc<ProcessControlBlock>) {
-        let cpu_queue = &mut self.cpu_queue[pcb.sched_info().on_cpu().unwrap().data() as usize];
-        let queue = cpu_queue.locked_queue.lock_irqsave();
-        if queue.len() > 0 {
-            pcb.sched_info()
-                .set_virtual_runtime(CFSQueue::min_vruntime(&queue).unwrap_or(0) as isize)
-        }
-        drop(queue);
-        cpu_queue.enqueue(pcb);
-    }
-
-    /// @brief Set the PCB of the IDLE process for this cpu's queue
-    #[allow(dead_code)]
-    pub fn set_cpu_idle(&mut self, cpu_id: usize, pcb: Arc<ProcessControlBlock>) {
-        // kdebug!("set cpu idle: id={}", cpu_id);
-        self.cpu_queue[cpu_id].idle_pcb = pcb;
-    }
-    /// Get the number of processes in a given cpu's run queue
-    pub fn get_cfs_queue_len(&mut self, cpu_id: ProcessorId) -> usize {
-        let queue = self.cpu_queue[cpu_id.data() as usize]
-            .locked_queue
-            .lock_irqsave();
-        return CFSQueue::get_cfs_queue_size(&queue);
-    }
-}
-
-impl Scheduler for SchedulerCFS {
-    /// @brief Perform scheduling on the current cpu.
-    /// Note: interrupts must be disabled before entering this function
-    fn sched(&mut self) -> Option<Arc<ProcessControlBlock>> {
-        assert!(!CurrentIrqArch::is_irq_enabled());
-
-        ProcessManager::current_pcb()
-            .flags()
-            .remove(ProcessFlags::NEED_SCHEDULE);
-
-        let current_cpu_id = smp_get_processor_id().data() as usize;
-
-        let current_cpu_queue: &mut CFSQueue = self.cpu_queue[current_cpu_id];
-
-        let proc: Arc<ProcessControlBlock> = current_cpu_queue.dequeue();
-
-        compiler_fence(core::sync::atomic::Ordering::SeqCst);
-        // A switch is needed if the current process is not Runnable, or if its virtual runtime is greater than or equal to the next process's.
-        let state = ProcessManager::current_pcb()
-            .sched_info()
-            .inner_lock_read_irqsave()
-            .state();
-        if (state != ProcessState::Runnable)
-            || (ProcessManager::current_pcb().sched_info().virtual_runtime()
-                >= proc.sched_info().virtual_runtime())
-        {
-            compiler_fence(core::sync::atomic::Ordering::SeqCst);
-            // If this switch was triggered by time-slice expiry, re-enqueue the process; otherwise leave it to other subsystems to manage
-            if state == ProcessState::Runnable {
-                sched_enqueue(ProcessManager::current_pcb(), false);
-                compiler_fence(core::sync::atomic::Ordering::SeqCst);
-            }
-            compiler_fence(core::sync::atomic::Ordering::SeqCst);
-            // Set the time the process may run for
-            if current_cpu_queue.cpu_exec_proc_jiffies <= 0 {
-                SchedulerCFS::update_cpu_exec_proc_jiffies(
-                    proc.sched_info().priority(),
-                    current_cpu_queue,
-                    Arc::ptr_eq(&proc, &current_cpu_queue.idle_pcb),
-                );
-            }
-
-            compiler_fence(core::sync::atomic::Ordering::SeqCst);
-
-            return Some(proc);
-        } else {
-            // No switch is performed
-
-            // Set the time the process may run for
-            compiler_fence(core::sync::atomic::Ordering::SeqCst);
-            if current_cpu_queue.cpu_exec_proc_jiffies <= 0 {
-                SchedulerCFS::update_cpu_exec_proc_jiffies(
-                    ProcessManager::current_pcb().sched_info().priority(),
-                    current_cpu_queue,
-                    Arc::ptr_eq(&proc, &current_cpu_queue.idle_pcb),
-                );
-                // kdebug!("cpu:{:?}",current_cpu_id);
-            }
-
-            compiler_fence(core::sync::atomic::Ordering::SeqCst);
-            sched_enqueue(proc, false);
-            compiler_fence(core::sync::atomic::Ordering::SeqCst);
-        }
-        compiler_fence(core::sync::atomic::Ordering::SeqCst);
-
-        return None;
-    }
-
-    fn enqueue(&mut self, pcb: Arc<ProcessControlBlock>) {
-        let cpu_queue = &mut self.cpu_queue[pcb.sched_info().on_cpu().unwrap().data() as usize];
-
-        cpu_queue.enqueue(pcb);
-    }
-}
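
For reference, the essence of the removed implementation above is an ordered map keyed by vruntime: always run the entry with the smallest key, and fall back to the idle task when the queue drains. A minimal userspace sketch of that idea, with `MiniCfs`, `BTreeMap`, and plain `u64` task ids as illustrative stand-ins for the kernel's `RBTree` and `Arc<ProcessControlBlock>`:

```rust
use std::collections::BTreeMap;

// Toy model of the removed per-CPU CFSQueue.
struct MiniCfs {
    queue: BTreeMap<i64, u64>, // vruntime -> task id
    idle: u64,
}

impl MiniCfs {
    fn enqueue(&mut self, vruntime: i64, task: u64) {
        // Note: like the RBTree insert above, equal keys collide.
        self.queue.insert(vruntime, task);
    }

    fn dequeue(&mut self) -> u64 {
        // Leftmost entry = smallest vruntime; an empty queue yields idle.
        match self.queue.keys().next().copied() {
            Some(k) => self.queue.remove(&k).unwrap(),
            None => self.idle,
        }
    }
}

fn main() {
    let mut rq = MiniCfs { queue: BTreeMap::new(), idle: 0 };
    rq.enqueue(150, 42);
    rq.enqueue(90, 7);
    assert_eq!(rq.dequeue(), 7); // lowest vruntime runs first
    assert_eq!(rq.dequeue(), 42);
    assert_eq!(rq.dequeue(), 0); // falls back to the idle pcb
}
```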

+ 38 - 0
kernel/src/sched/clock.rs

@@ -0,0 +1,38 @@
+/*
+    This file implements the clocks involved in scheduling
+*/
+#[cfg(target_arch = "x86_64")]
+use crate::{
+    arch::{driver::tsc::TSCManager, CurrentTimeArch},
+    time::TimeArch,
+};
+
+pub struct SchedClock;
+
+impl SchedClock {
+    #[inline]
+    pub fn sched_clock_cpu(_cpu: usize) -> u64 {
+        #[cfg(target_arch = "x86_64")]
+        {
+            if TSCManager::cpu_khz() == 0 {
+                // TSC not calibrated yet
+                return 0;
+            }
+            return CurrentTimeArch::cycles2ns(CurrentTimeArch::get_cycles()) as u64;
+        }
+
+        #[cfg(target_arch = "riscv64")]
+        todo!()
+    }
+}
+
+bitflags! {
+    pub struct ClockUpdataFlag: u8 {
+        /// Request that the clock update be skipped on the next call to __schedule()
+        const RQCF_REQ_SKIP = 0x01;
+        /// Indicates clock-update skipping is in effect; calls to update_rq_clock() will be ignored.
+        const RQCF_ACT_SKIP = 0x02;
+        /// Debug flag indicating whether update_rq_clock() has already been called since rq::lock was last pinned
+        const RQCF_UPDATE = 0x04;
+    }
+}
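
A back-of-the-envelope version of the cycles-to-nanoseconds step hidden behind `CurrentTimeArch::cycles2ns`, assuming only a calibrated TSC frequency in kHz as input (a sketch, not the kernel's actual conversion, which may use shift/multiply tricks instead of a wide division):

```rust
/// Convert raw TSC cycles to nanoseconds, given the TSC frequency in kHz.
fn cycles_to_ns(cycles: u64, tsc_khz: u64) -> u64 {
    // khz cycles per millisecond => cycles * 1_000_000 / khz nanoseconds.
    ((cycles as u128) * 1_000_000 / tsc_khz as u128) as u64
}

fn main() {
    // A 3 GHz TSC ticks 3 times per nanosecond.
    assert_eq!(cycles_to_ns(3_000_000_000, 3_000_000), 1_000_000_000);
}
```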

+ 0 - 225
kernel/src/sched/core.rs

@@ -1,225 +0,0 @@
-use core::{
-    intrinsics::unlikely,
-    sync::atomic::{compiler_fence, Ordering},
-};
-
-use alloc::{sync::Arc, vec::Vec};
-
-use crate::{
-    kinfo,
-    mm::percpu::PerCpu,
-    process::{AtomicPid, Pid, ProcessControlBlock, ProcessFlags, ProcessManager, ProcessState},
-    smp::{core::smp_get_processor_id, cpu::ProcessorId},
-};
-
-use super::rt::{sched_rt_init, SchedulerRT, __get_rt_scheduler};
-use super::{
-    cfs::{sched_cfs_init, SchedulerCFS, __get_cfs_scheduler},
-    SchedPolicy,
-};
-
-lazy_static! {
-    /// Records the pid of the process currently executing on each cpu
-    pub static ref CPU_EXECUTING: CpuExecuting = CpuExecuting::new();
-}
-
-#[derive(Debug)]
-pub struct CpuExecuting {
-    data: Vec<AtomicPid>,
-}
-
-impl CpuExecuting {
-    pub fn new() -> Self {
-        let mut data = Vec::new();
-        for _ in 0..PerCpu::MAX_CPU_NUM {
-            data.push(AtomicPid::new(Pid::new(0)));
-        }
-        Self { data }
-    }
-
-    #[inline(always)]
-    pub fn set(&self, cpu_id: ProcessorId, pid: Pid) {
-        self.data[cpu_id.data() as usize].store(pid, Ordering::SeqCst);
-    }
-
-    #[inline(always)]
-    pub fn get(&self, cpu_id: ProcessorId) -> Pid {
-        self.data[cpu_id.data() as usize].load(Ordering::SeqCst)
-    }
-}
-
-// Get the load of a given cpu and return it; cpu_id identifies the cpu whose load is queried
-// TODO: change the load metric to the number of processes run over a recent time window
-#[allow(dead_code)]
-pub fn get_cpu_loads(cpu_id: ProcessorId) -> u32 {
-    let cfs_scheduler = __get_cfs_scheduler();
-    let rt_scheduler = __get_rt_scheduler();
-    let len_cfs = cfs_scheduler.get_cfs_queue_len(cpu_id);
-    let len_rt = rt_scheduler.rt_queue_len(cpu_id);
-    // let load_rt = rt_scheduler.get_load_list_len(cpu_id);
-    // kdebug!("this cpu_id {} is load rt {}", cpu_id, load_rt);
-
-    return (len_rt + len_cfs) as u32;
-}
-// Load balancing
-pub fn loads_balance(pcb: Arc<ProcessControlBlock>) {
-    // FIXME: because load balancing currently pushes the task straight onto the target CPU's queue,
-    // a timing race can leave the process present on two CPUs at once.
-    // Until the scheduling subsystem is rewritten/improved, pin processes to CPU 0 for now.
-    // Load balancing is disabled for now because of this scheduler issue; see https://github.com/DragonOS-Community/DragonOS/issues/571
-    let min_loads_cpu_id = ProcessorId::new(0);
-
-    // Get the total number of CPUs
-    // let cpu_num = unsafe { smp_get_total_cpu() };
-    // Find the id of the CPU with the lowest current load
-    // let mut min_loads = get_cpu_loads(smp_get_processor_id());
-    // for cpu_id in 0..cpu_num {
-    //     let cpu_id = ProcessorId::new(cpu_id);
-    //     let tmp_cpu_loads = get_cpu_loads(cpu_id);
-    //     if min_loads - tmp_cpu_loads > 0 {
-    //         min_loads_cpu_id = cpu_id;
-    //         min_loads = tmp_cpu_loads;
-    //     }
-    // }
-
-    let pcb_cpu = pcb.sched_info().on_cpu();
-    // Migrate the current pcb to the least-loaded CPU
-    // If the pcb's PF_NEED_MIGRATE is already set, skip the migration
-    if pcb_cpu.is_none()
-        || (min_loads_cpu_id != pcb_cpu.unwrap()
-            && !pcb.flags().contains(ProcessFlags::NEED_MIGRATE))
-    {
-        pcb.flags().insert(ProcessFlags::NEED_MIGRATE);
-        pcb.sched_info().set_migrate_to(Some(min_loads_cpu_id));
-        // kdebug!("set migrating, pcb:{:?}", pcb);
-    }
-}
-/// @brief Trait that every concrete scheduler must implement
-pub trait Scheduler {
-    /// @brief Function invoked when this scheduler is asked to schedule
-    fn sched(&mut self) -> Option<Arc<ProcessControlBlock>>;
-
-    /// @brief Add a pcb to this scheduler's run queue
-    fn enqueue(&mut self, pcb: Arc<ProcessControlBlock>);
-}
-
-pub fn do_sched() -> Option<Arc<ProcessControlBlock>> {
-    // The current process holds a lock; do not switch, to avoid deadlock
-    if ProcessManager::current_pcb().preempt_count() != 0 {
-        let binding = ProcessManager::current_pcb();
-        let guard = binding
-            .sched_info()
-            .inner_lock_try_upgradable_read_irqsave(5);
-        if unlikely(guard.is_none()) {
-            return None;
-        }
-
-        let mut guard = guard.unwrap();
-
-        let state = guard.state();
-        if state.is_blocked() {
-            // try to upgrade
-            for _ in 0..50 {
-                match guard.try_upgrade() {
-                    Ok(mut writer) => {
-                        // A process that was mark_sleep'ed but is still inside a critical section is set back to Runnable
-                        writer.set_state(ProcessState::Runnable);
-                        break;
-                    }
-                    Err(s) => {
-                        guard = s;
-                    }
-                }
-            }
-        }
-        return None;
-    }
-
-    compiler_fence(core::sync::atomic::Ordering::SeqCst);
-    let cfs_scheduler: &mut SchedulerCFS = __get_cfs_scheduler();
-    let rt_scheduler: &mut SchedulerRT = __get_rt_scheduler();
-    compiler_fence(core::sync::atomic::Ordering::SeqCst);
-
-    let next: Arc<ProcessControlBlock>;
-    match rt_scheduler.pick_next_task_rt(smp_get_processor_id()) {
-        Some(p) => {
-            next = p;
-            // Put the picked process back where it was
-            rt_scheduler.enqueue_front(next);
-
-            return rt_scheduler.sched();
-        }
-        None => {
-            return cfs_scheduler.sched();
-        }
-    }
-}
-
-/// @brief Add a process to the run queue
-///
-/// @param pcb the pcb to enqueue
-/// @param reset_time whether to reset its virtual runtime
-pub fn sched_enqueue(pcb: Arc<ProcessControlBlock>, mut reset_time: bool) {
-    compiler_fence(core::sync::atomic::Ordering::SeqCst);
-    if pcb.sched_info().inner_lock_read_irqsave().state() != ProcessState::Runnable {
-        return;
-    }
-    let cfs_scheduler = __get_cfs_scheduler();
-    let rt_scheduler = __get_rt_scheduler();
-    // Every process except IDLE takes part in load balancing
-    if pcb.pid().into() > 0 {
-        loads_balance(pcb.clone());
-    }
-
-    if pcb.flags().contains(ProcessFlags::NEED_MIGRATE) {
-        // kdebug!("migrating pcb:{:?}", pcb);
-        pcb.flags().remove(ProcessFlags::NEED_MIGRATE);
-        pcb.sched_info().set_on_cpu(pcb.sched_info().migrate_to());
-        reset_time = true;
-    }
-
-    assert!(pcb.sched_info().on_cpu().is_some());
-
-    match pcb.sched_info().inner_lock_read_irqsave().policy() {
-        SchedPolicy::CFS => {
-            if reset_time {
-                cfs_scheduler.enqueue_reset_vruntime(pcb.clone());
-            } else {
-                cfs_scheduler.enqueue(pcb.clone());
-            }
-        }
-        SchedPolicy::FIFO | SchedPolicy::RR => rt_scheduler.enqueue(pcb.clone()),
-    }
-}
-
-/// Initialize the process scheduler module
-#[inline(never)]
-pub fn sched_init() {
-    kinfo!("Initializing schedulers...");
-    unsafe {
-        sched_cfs_init();
-        sched_rt_init();
-    }
-    kinfo!("Schedulers initialized");
-}
-
-/// @brief Update the time slice when a timer interrupt arrives
-/// Note: this function may only be called from the timer interrupt handler
-#[inline(never)]
-pub fn sched_update_jiffies() {
-    let binding = ProcessManager::current_pcb();
-    let guard = binding.sched_info().inner_lock_try_read_irqsave(10);
-    if unlikely(guard.is_none()) {
-        return;
-    }
-    let guard = guard.unwrap();
-    let policy = guard.policy();
-    drop(guard);
-    match policy {
-        SchedPolicy::CFS => {
-            __get_cfs_scheduler().timer_update_jiffies(binding.sched_info());
-        }
-        SchedPolicy::FIFO | SchedPolicy::RR => {
-            __get_rt_scheduler().timer_update_jiffies();
-        }
-    }
-}
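
The dispatch order the removed `do_sched()` implemented above, consulting the RT scheduler before CFS, amounts to strict class priority. A minimal sketch with illustrative names (`pick_next`, plain `Vec` queues) rather than the kernel's types:

```rust
enum Picked {
    Rt(u64),
    Cfs(u64),
}

// FIFO/RR tasks always win over fair-class tasks; CFS only gets to
// pick when the RT queue is empty.
fn pick_next(rt: &mut Vec<u64>, cfs: &mut Vec<u64>) -> Option<Picked> {
    if let Some(task) = rt.pop() {
        return Some(Picked::Rt(task));
    }
    cfs.pop().map(Picked::Cfs)
}

fn main() {
    let (mut rt, mut cfs) = (vec![1], vec![2]);
    assert!(matches!(pick_next(&mut rt, &mut cfs), Some(Picked::Rt(1))));
    assert!(matches!(pick_next(&mut rt, &mut cfs), Some(Picked::Cfs(2))));
}
```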

+ 107 - 0
kernel/src/sched/cputime.rs

@@ -0,0 +1,107 @@
+use core::sync::atomic::{compiler_fence, AtomicUsize, Ordering};
+
+use crate::{
+    arch::CurrentIrqArch, exception::InterruptArch, process::ProcessControlBlock,
+    smp::core::smp_get_processor_id, time::jiffies::TICK_NESC,
+};
+use alloc::sync::Arc;
+
+use super::{clock::SchedClock, cpu_irq_time};
+
+pub fn irq_time_read(cpu: usize) -> u64 {
+    compiler_fence(Ordering::SeqCst);
+    let irqtime = cpu_irq_time(cpu);
+
+    let mut total;
+
+    loop {
+        let seq = irqtime.sync.load(Ordering::SeqCst);
+        // An odd sequence count means a writer is mid-update: retry.
+        if seq & 1 != 0 {
+            continue;
+        }
+        total = irqtime.total;
+
+        // Accept the snapshot only if no writer ran while it was taken.
+        if seq == irqtime.sync.load(Ordering::SeqCst) {
+            break;
+        }
+    }
+    compiler_fence(Ordering::SeqCst);
+    total
+}
+
+#[derive(Debug, Default)]
+pub struct IrqTime {
+    pub total: u64,
+    pub tick_delta: u64,
+    pub irq_start_time: u64,
+    pub sync: AtomicUsize,
+}
+
+impl IrqTime {
+    pub fn account_delta(&mut self, delta: u64) {
+        // Bump the sequence counter as the update begins...
+        self.sync.fetch_add(1, Ordering::SeqCst);
+        self.total += delta;
+        self.tick_delta += delta;
+        // ...and again once it ends, so irq_time_read() can detect and
+        // retry a read that raced with this writer.
+        self.sync.fetch_add(1, Ordering::SeqCst);
+    }
+
+    pub fn irqtime_tick_accounted(&mut self, max: u64) -> u64 {
+        let delta = self.tick_delta.min(max);
+        self.tick_delta -= delta;
+        return delta;
+    }
+
+    pub fn irqtime_start() {
+        let cpu = smp_get_processor_id().data() as usize;
+        let irq_time = cpu_irq_time(cpu);
+        compiler_fence(Ordering::SeqCst);
+        irq_time.irq_start_time = SchedClock::sched_clock_cpu(cpu) as u64;
+        compiler_fence(Ordering::SeqCst);
+    }
+
+    pub fn irqtime_account_irq(_pcb: Arc<ProcessControlBlock>) {
+        compiler_fence(Ordering::SeqCst);
+        let cpu = smp_get_processor_id().data() as usize;
+        let irq_time = cpu_irq_time(cpu);
+        compiler_fence(Ordering::SeqCst);
+        let delta = SchedClock::sched_clock_cpu(cpu) as u64 - irq_time.irq_start_time;
+        compiler_fence(Ordering::SeqCst);
+
+        irq_time.account_delta(delta);
+        compiler_fence(Ordering::SeqCst);
+    }
+}
+
+pub struct CpuTimeFunc;
+impl CpuTimeFunc {
+    pub fn irqtime_account_process_tick(
+        _pcb: &Arc<ProcessControlBlock>,
+        _user_tick: bool,
+        ticks: u64,
+    ) {
+        let cputime = TICK_NESC as u64 * ticks;
+
+        let other = Self::account_other_time(u64::MAX);
+
+        if other >= cputime {
+            return;
+        }
+
+        // TODO: update process time
+    }
+
+    pub fn account_other_time(max: u64) -> u64 {
+        assert!(!CurrentIrqArch::is_irq_enabled());
+
+        let mut accounted = Self::steal_account_process_time(max);
+
+        if accounted < max {
+            let irqtime = cpu_irq_time(smp_get_processor_id().data() as usize);
+            accounted += irqtime.irqtime_tick_accounted(max - accounted);
+        }
+
+        accounted
+    }
+
+    pub fn steal_account_process_time(_max: u64) -> u64 {
+        // Time stolen by a hypervisor is not accounted for here
+        0
+    }
+}
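
`IrqTime` pairs a plain `total` with a sequence counter instead of a lock, the same idea as Linux's `u64_stats_sync`. A self-contained model of the protocol (single-threaded here just to exercise the API; the names are illustrative):

```rust
use std::sync::atomic::{AtomicUsize, Ordering};

struct SeqCounted {
    seq: AtomicUsize,
    value: u64,
}

impl SeqCounted {
    fn write(&mut self, delta: u64) {
        self.seq.fetch_add(1, Ordering::SeqCst); // begin: count becomes odd
        self.value += delta;
        self.seq.fetch_add(1, Ordering::SeqCst); // end: count even again
    }

    fn read(&self) -> u64 {
        loop {
            let seq = self.seq.load(Ordering::SeqCst);
            if seq & 1 != 0 {
                continue; // a writer is mid-update
            }
            let snapshot = self.value;
            if seq == self.seq.load(Ordering::SeqCst) {
                return snapshot; // no writer interleaved
            }
        }
    }
}

fn main() {
    let mut irq_time = SeqCounted { seq: AtomicUsize::new(0), value: 0 };
    irq_time.write(1500); // e.g. 1.5 µs spent in hardirq context
    assert_eq!(irq_time.read(), 1500);
}
```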

+ 1801 - 0
kernel/src/sched/fair.rs

@@ -0,0 +1,1801 @@
+use core::intrinsics::likely;
+use core::intrinsics::unlikely;
+use core::mem::swap;
+use core::sync::atomic::fence;
+use core::sync::atomic::{AtomicU64, Ordering};
+
+use crate::libs::rbtree::RBTree;
+use crate::libs::spinlock::SpinLock;
+use crate::process::ProcessControlBlock;
+use crate::process::ProcessFlags;
+use crate::sched::clock::ClockUpdataFlag;
+use crate::sched::{cpu_rq, SchedFeature, SCHED_FEATURES};
+use crate::smp::core::smp_get_processor_id;
+use crate::time::jiffies::TICK_NESC;
+use crate::time::timer::clock;
+use crate::time::NSEC_PER_MSEC;
+use alloc::sync::{Arc, Weak};
+
+use super::idle::IdleScheduler;
+use super::pelt::{add_positive, sub_positive, SchedulerAvg, UpdateAvgFlags, PELT_MIN_DIVIDER};
+use super::{
+    CpuRunQueue, DequeueFlag, EnqueueFlag, LoadWeight, OnRq, SchedPolicy, Scheduler, TaskGroup,
+    WakeupFlags, SCHED_CAPACITY_SHIFT,
+};
+
+/// Parameter setting the minimum preemption granularity for CPU-bound tasks.
+/// The default is 0.75 ms multiplied by (1 + log2(number of CPUs)), in nanoseconds.
+/// This value shapes preemption behaviour for CPU-bound tasks.
+static SYSCTL_SHCED_MIN_GRANULARITY: AtomicU64 = AtomicU64::new(750000);
+/// Normalized minimum preemption granularity parameter
+#[allow(dead_code)]
+static NORMALIZED_SYSCTL_SCHED_MIN_GRANULARITY: AtomicU64 = AtomicU64::new(750000);
+
+static SYSCTL_SHCED_BASE_SLICE: AtomicU64 = AtomicU64::new(750000);
+#[allow(dead_code)]
+static NORMALIZED_SYSCTL_SHCED_BASE_SLICE: AtomicU64 = AtomicU64::new(750000);
+
+/// Preset number of tasks per scheduling-latency period
+static SCHED_NR_LATENCY: AtomicU64 = AtomicU64::new(8);
+
+/// A scheduling entity: it may represent a process, a process group, a user, or any similar grouping
+#[derive(Debug)]
+pub struct FairSchedEntity {
+    /// Load-tracking state
+    pub load: LoadWeight,
+    pub deadline: u64,
+    pub min_deadline: u64,
+
+    /// Whether this entity is on a run queue
+    pub on_rq: OnRq,
+    /// Time at which the entity's current run on the CPU began
+    pub exec_start: u64,
+    /// Total runtime
+    pub sum_exec_runtime: u64,
+    /// Virtual runtime
+    pub vruntime: u64,
+    /// The entity's scheduling lag: its weight multiplied by (V - v_i), where V is the current virtual time and v_i is the entity's virtual runtime
+    pub vlag: i64,
+    // Time slice
+    pub slice: u64,
+    /// Total runtime as of the previous time this entity was picked
+    pub prev_sum_exec_runtime: u64,
+
+    pub avg: SchedulerAvg,
+
+    /// Parent node
+    parent: Weak<FairSchedEntity>,
+
+    pub depth: u32,
+
+    /// Weak pointer back to self
+    self_ref: Weak<FairSchedEntity>,
+
+    /// The CFS run queue this entity is queued on
+    cfs_rq: Weak<CfsRunQueue>,
+
+    /// Private cfs queue owned by this group
+    my_cfs_rq: Option<Arc<CfsRunQueue>>,
+
+    runnable_weight: u64,
+
+    pcb: Weak<ProcessControlBlock>,
+}
+
+impl FairSchedEntity {
+    pub fn new() -> Arc<Self> {
+        let ret = Arc::new(Self {
+            parent: Weak::new(),
+            self_ref: Weak::new(),
+            pcb: Weak::new(),
+            cfs_rq: Weak::new(),
+            my_cfs_rq: None,
+            on_rq: OnRq::None,
+            slice: SYSCTL_SHCED_BASE_SLICE.load(Ordering::SeqCst),
+            load: Default::default(),
+            deadline: Default::default(),
+            min_deadline: Default::default(),
+            exec_start: Default::default(),
+            sum_exec_runtime: Default::default(),
+            vruntime: Default::default(),
+            vlag: Default::default(),
+            prev_sum_exec_runtime: Default::default(),
+            avg: Default::default(),
+            depth: Default::default(),
+            runnable_weight: Default::default(),
+        });
+
+        ret.force_mut().self_ref = Arc::downgrade(&ret);
+
+        ret
+    }
+}
+
+impl FairSchedEntity {
+    pub fn self_arc(&self) -> Arc<FairSchedEntity> {
+        self.self_ref.upgrade().unwrap()
+    }
+
+    #[inline]
+    pub fn on_rq(&self) -> bool {
+        self.on_rq != OnRq::None
+    }
+
+    pub fn pcb(&self) -> Arc<ProcessControlBlock> {
+        self.pcb.upgrade().unwrap()
+    }
+
+    pub fn set_pcb(&mut self, pcb: Weak<ProcessControlBlock>) {
+        self.pcb = pcb
+    }
+
+    #[inline]
+    pub fn cfs_rq(&self) -> Arc<CfsRunQueue> {
+        self.cfs_rq.upgrade().unwrap()
+    }
+
+    pub fn set_cfs(&mut self, cfs: Weak<CfsRunQueue>) {
+        self.cfs_rq = cfs;
+    }
+
+    pub fn parent(&self) -> Option<Arc<FairSchedEntity>> {
+        self.parent.upgrade()
+    }
+
+    #[allow(clippy::mut_from_ref)]
+    pub fn force_mut(&self) -> &mut Self {
+        unsafe { &mut *(self as *const Self as usize as *mut Self) }
+    }
+
+    /// Whether this scheduling entity belongs to a task (as opposed to a group)
+    #[inline]
+    pub fn is_task(&self) -> bool {
+        // TODO: scheduling groups
+        true
+    }
+
+    #[inline]
+    pub fn is_idle(&self) -> bool {
+        if self.is_task() {
+            return self.pcb().sched_info().policy() == SchedPolicy::IDLE;
+        }
+
+        return self.cfs_rq().is_idle();
+    }
+
+    pub fn clear_buddies(&self) {
+        let mut se = self.self_arc();
+
+        Self::for_each_in_group(&mut se, |se| {
+            let binding = se.cfs_rq();
+            let cfs_rq = binding.force_mut();
+
+            if let Some(next) = cfs_rq.next.upgrade() {
+                if !Arc::ptr_eq(&next, &se) {
+                    return (false, true);
+                }
+            }
+            cfs_rq.next = Weak::new();
+            return (true, true);
+        });
+    }
+
+    pub fn calculate_delta_fair(&self, delta: u64) -> u64 {
+        if unlikely(self.load.weight != LoadWeight::NICE_0_LOAD_SHIFT as u64) {
+            return self
+                .force_mut()
+                .load
+                .calculate_delta(delta, LoadWeight::NICE_0_LOAD_SHIFT as u64);
+        };
+
+        delta
+    }
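+
+    // Worked example (illustrative numbers): with a NICE_0 load of 1024 and
+    // an entity weight of 512, 6 ms of wall-clock runtime becomes
+    // 6 ms * 1024 / 512 = 12 ms of vruntime, i.e. lighter entities age faster.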
+
+    /// Update the weight information within the group
+    pub fn update_cfs_group(&self) {
+        if self.my_cfs_rq.is_none() {
+            return;
+        }
+
+        let group_cfs = self.my_cfs_rq.clone().unwrap();
+
+        let shares = group_cfs.task_group().shares;
+
+        if unlikely(self.load.weight != shares) {
+            // TODO: reweight
+            self.cfs_rq()
+                .force_mut()
+                .reweight_entity(self.self_arc(), shares);
+        }
+    }
+
+    /// Walk up the entity's group hierarchy; if the closure asks for it, the caller should return early.
+    /// On exit, `se` points at the topmost parent reached;
+    /// note that this function mutates what `se` points to.
+    /// Parameters:
+    /// - se: the scheduling entity to start from
+    /// - f: closure applied to each entity; its return value is (no_break, should_continue). When no_break is false the loop exits; when should_continue is false the caller itself should return.
+    ///
+    /// Returns:
+    /// - bool: whether the caller should return
+    /// - Option<Arc<FairSchedEntity>>: what `se` finally points to
+    pub fn for_each_in_group(
+        se: &mut Arc<FairSchedEntity>,
+        mut f: impl FnMut(Arc<FairSchedEntity>) -> (bool, bool),
+    ) -> (bool, Option<Arc<FairSchedEntity>>) {
+        let mut should_continue;
+        let ret;
+        // This loop iterates up to the root node.
+        // E.g. with a task group A containing process B, B's time allocation depends on A's weight as well as B's own weight
+        loop {
+            let (no_break, flag) = f(se.clone());
+            should_continue = flag;
+            if !no_break || !should_continue {
+                ret = Some(se.clone());
+                break;
+            }
+
+            let parent = se.parent();
+            if parent.is_none() {
+                ret = None;
+                break;
+            }
+
+            *se = parent.unwrap();
+        }
+
+        (should_continue, ret)
+    }
+
+    pub fn runnable(&self) -> u64 {
+        if self.is_task() {
+            return self.on_rq as u64;
+        } else {
+            self.runnable_weight
+        }
+    }
+
+    /// Update the load averages of the task and its cfs_rq
+    pub fn propagate_entity_load_avg(&mut self) -> bool {
+        if self.is_task() {
+            return false;
+        }
+
+        let binding = self.my_cfs_rq.clone().unwrap();
+        let gcfs_rq = binding.force_mut();
+
+        if gcfs_rq.propagate == 0 {
+            return false;
+        }
+
+        gcfs_rq.propagate = 0;
+
+        let binding = self.cfs_rq();
+        let cfs_rq = binding.force_mut();
+
+        cfs_rq.add_task_group_propagate(gcfs_rq.prop_runnable_sum);
+
+        cfs_rq.update_task_group_util(self.self_arc(), gcfs_rq);
+        cfs_rq.update_task_group_runnable(self.self_arc(), gcfs_rq);
+        cfs_rq.update_task_group_load(self.self_arc(), gcfs_rq);
+
+        return true;
+    }
+
+    /// Update runnable_weight
+    pub fn update_runnable(&mut self) {
+        if !self.is_task() {
+            self.runnable_weight = self.my_cfs_rq.clone().unwrap().h_nr_running;
+        }
+    }
+
+    /// Initialize the entity's running averages
+    pub fn init_entity_runnable_average(&mut self) {
+        self.avg = SchedulerAvg::default();
+
+        if self.is_task() {
+            self.avg.load_avg = LoadWeight::scale_load_down(self.load.weight) as usize;
+        }
+    }
+}
+
+/// The CFS run queue; this queue must be per-CPU
+#[allow(dead_code)]
+#[derive(Debug)]
+pub struct CfsRunQueue {
+    load: LoadWeight,
+
+    /// Count of all runnable scheduling entities, used for load balancing
+    nr_running: u64,
+    /// Hierarchical task counter for this specific CPU core
+    pub h_nr_running: u64,
+    /// Run time
+    exec_clock: u64,
+    /// Minimum virtual runtime
+    min_vruntime: u64,
+    /// Remaining runtime
+    runtime_remaining: u64,
+
+    /// Red-black tree holding the scheduling entities
+    pub(super) entities: RBTree<u64, Arc<FairSchedEntity>>,
+
+    /// IDLE
+    idle: usize,
+
+    idle_nr_running: u64,
+
+    pub idle_h_nr_running: u64,
+
+    /// Currently running scheduling entity
+    current: Weak<FairSchedEntity>,
+    /// Next entity to schedule
+    next: Weak<FairSchedEntity>,
+    /// Last scheduled entity
+    last: Weak<FairSchedEntity>,
+    /// Entity to skip when picking
+    skip: Weak<FairSchedEntity>,
+
+    avg_load: i64,
+    avg_vruntime: i64,
+
+    last_update_time_copy: u64,
+
+    pub avg: SchedulerAvg,
+
+    rq: Weak<CpuRunQueue>,
+    /// The task group that owns this queue
+    task_group: Weak<TaskGroup>,
+
+    pub throttled_clock: u64,
+    pub throttled_clock_pelt: u64,
+    pub throttled_clock_pelt_time: u64,
+    pub throttled_pelt_idle: u64,
+
+    pub throttled: bool,
+    pub throttled_count: u64,
+
+    pub removed: SpinLock<CfsRemoved>,
+
+    pub propagate: isize,
+    pub prop_runnable_sum: isize,
+}
+
+#[derive(Debug, Default)]
+pub struct CfsRemoved {
+    pub nr: u32,
+    pub load_avg: usize,
+    pub util_avg: usize,
+    pub runnable_avg: usize,
+}
+
+impl CfsRunQueue {
+    pub fn new() -> Self {
+        Self {
+            load: LoadWeight::default(),
+            nr_running: 0,
+            h_nr_running: 0,
+            exec_clock: 0,
+            min_vruntime: 1 << 20,
+            entities: RBTree::new(),
+            idle: 0,
+            idle_nr_running: 0,
+            idle_h_nr_running: 0,
+            current: Weak::new(),
+            next: Weak::new(),
+            last: Weak::new(),
+            skip: Weak::new(),
+            avg_load: 0,
+            avg_vruntime: 0,
+            last_update_time_copy: 0,
+            avg: SchedulerAvg::default(),
+            rq: Weak::new(),
+            task_group: Weak::new(),
+            throttled_clock: 0,
+            throttled_clock_pelt: 0,
+            throttled_clock_pelt_time: 0,
+            throttled_pelt_idle: 0,
+            throttled: false,
+            throttled_count: 0,
+            removed: SpinLock::new(CfsRemoved::default()),
+            propagate: 0,
+            prop_runnable_sum: 0,
+            runtime_remaining: 0,
+        }
+    }
+
+    #[inline]
+    pub fn rq(&self) -> Arc<CpuRunQueue> {
+        self.rq.upgrade().unwrap()
+    }
+
+    #[inline]
+    pub fn set_rq(&mut self, rq: Weak<CpuRunQueue>) {
+        self.rq = rq;
+    }
+
+    #[inline]
+    #[allow(clippy::mut_from_ref)]
+    pub fn force_mut(&self) -> &mut Self {
+        unsafe { &mut *(self as *const Self as usize as *mut Self) }
+    }
+
+    #[inline]
+    pub fn is_idle(&self) -> bool {
+        self.idle > 0
+    }
+
+    #[inline]
+    pub fn current(&self) -> Option<Arc<FairSchedEntity>> {
+        self.current.upgrade()
+    }
+
+    #[inline]
+    pub fn set_current(&mut self, curr: Weak<FairSchedEntity>) {
+        self.current = curr
+    }
+
+    #[inline]
+    pub fn next(&self) -> Option<Arc<FairSchedEntity>> {
+        self.next.upgrade()
+    }
+
+    pub fn task_group(&self) -> Arc<TaskGroup> {
+        self.task_group.upgrade().unwrap()
+    }
+
+    #[allow(dead_code)]
+    #[inline]
+    pub const fn bandwidth_used() -> bool {
+        false
+    }
+
+    /// ## Compute the scheduling period. The basic idea is that every task runs at least once per period,
+    /// which guarantees all tasks make progress and avoids blocking any of them for long stretches.
+    pub fn sched_period(nr_running: u64) -> u64 {
+        if unlikely(nr_running > SCHED_NR_LATENCY.load(Ordering::SeqCst)) {
+            // If the number of active tasks exceeds the preset latency task count,
+            // the period is simply the task count times the minimum preemption granularity
+            return nr_running * SYSCTL_SHCED_MIN_GRANULARITY.load(Ordering::SeqCst);
+        } else {
+            // Otherwise the period length is set to SCHED_NR_LATENCY
+            return SCHED_NR_LATENCY.load(Ordering::SeqCst);
+        }
+    }
+
+    /// ## Compute the virtual-runtime slice of a task
+    ///
+    /// vruntime = runtime / weight
+    #[allow(dead_code)]
+    pub fn sched_vslice(&self, entity: Arc<FairSchedEntity>) -> u64 {
+        let slice = self.sched_slice(entity.clone());
+        return entity.calculate_delta_fair(slice);
+    }
+
+    /// ## Compute the wall-clock time slice of a task
+    #[allow(dead_code)]
+    pub fn sched_slice(&self, mut entity: Arc<FairSchedEntity>) -> u64 {
+        let mut nr_running = self.nr_running;
+        if SCHED_FEATURES.contains(SchedFeature::ALT_PERIOD) {
+            nr_running = self.h_nr_running;
+        }
+
+        // Compute the full slice for one scheduling period
+        let mut slice = Self::sched_period(nr_running + (!entity.on_rq()) as u64);
+
+        // This loop iterates up to the root node.
+        // E.g. with a task group A containing process B, B's time allocation depends on A's weight as well as B's own weight
+        FairSchedEntity::for_each_in_group(&mut entity, |se| {
+            if unlikely(!se.on_rq()) {
+                se.cfs_rq().force_mut().load.update_load_add(se.load.weight);
+            }
+            slice = se
+                .cfs_rq()
+                .force_mut()
+                .load
+                .calculate_delta(slice, se.load.weight);
+
+            (true, true)
+        });
+
+        if SCHED_FEATURES.contains(SchedFeature::BASE_SLICE) {
+            // TODO: IDLE?
+            let min_gran = SYSCTL_SHCED_MIN_GRANULARITY.load(Ordering::SeqCst);
+
+            slice = min_gran.max(slice)
+        }
+
+        slice
+    }
+
+    /// ## On time-slice expiry, check whether the current task should be preempted;
+    /// if so, preempt it and make sure "buddy favours" toward other tasks cannot re-elect it as the next task to run.
+    #[allow(dead_code)]
+    pub fn check_preempt_tick(&mut self, curr: Arc<FairSchedEntity>) {
+        // Compute this entity's ideal runtime
+        let ideal_runtime = self.sched_slice(curr.clone());
+
+        let delta_exec = curr.sum_exec_runtime - curr.prev_sum_exec_runtime;
+
+        if delta_exec > ideal_runtime {
+            // The actual runtime exceeded the ideal runtime
+            self.rq().resched_current();
+
+            self.clear_buddies(&curr);
+            return;
+        }
+
+        if delta_exec < SYSCTL_SHCED_MIN_GRANULARITY.load(Ordering::SeqCst) {
+            return;
+        }
+
+        todo!()
+    }
+
+    pub fn clear_buddies(&mut self, se: &Arc<FairSchedEntity>) {
+        if let Some(next) = self.next.upgrade() {
+            if Arc::ptr_eq(&next, se) {
+                se.clear_buddies();
+            }
+        }
+    }
+
+    /// Handle the scheduling entity's time-slice expiry event
+    pub fn entity_tick(&mut self, curr: Arc<FairSchedEntity>, queued: bool) {
+        // Update the current entity's runtime statistics
+        self.update_current();
+
+        self.update_load_avg(&curr, UpdateAvgFlags::UPDATE_TG);
+
+        // Update group-scheduling state
+        curr.update_cfs_group();
+
+        if queued {
+            self.rq().resched_current();
+            return;
+        }
+    }
+
+    /// Update the current entity's runtime statistics
+    pub fn update_current(&mut self) {
+        let curr = self.current();
+        if unlikely(curr.is_none()) {
+            return;
+        }
+
+        let now = self.rq().clock_task();
+        let curr = curr.unwrap();
+
+        fence(Ordering::SeqCst);
+        if unlikely(now <= curr.exec_start) {
+            // kwarn!(
+            //     "update_current return now <= curr.exec_start now {now} execstart {}",
+            //     curr.exec_start
+            // );
+            return;
+        }
+
+        fence(Ordering::SeqCst);
+        let delta_exec = now - curr.exec_start;
+
+        let curr = curr.force_mut();
+
+        curr.exec_start = now;
+
+        curr.sum_exec_runtime += delta_exec;
+
+        // Grow the virtual runtime by the weighted actual runtime
+        curr.vruntime += curr.calculate_delta_fair(delta_exec);
+        fence(Ordering::SeqCst);
+        self.update_deadline(&curr.self_arc());
+        self.update_min_vruntime();
+
+        self.account_cfs_rq_runtime(delta_exec);
+    }
+
+    /// Check whether this cfs queue's allotted runtime has expired
+    fn account_cfs_rq_runtime(&mut self, delta_exec: u64) {
+        if likely(self.runtime_remaining > delta_exec) {
+            self.runtime_remaining -= delta_exec;
+            // kerror!("runtime_remaining {}", self.runtime_remaining);
+            return;
+        }
+
+        // kwarn!(
+        //     "runtime_remaining {} delta exec {delta_exec} nr_running {}",
+        //     self.runtime_remaining,
+        //     self.nr_running
+        // );
+        // fixme: for now simply hand out another time slice
+        self.runtime_remaining = 5000 * NSEC_PER_MSEC as u64;
+
+        if likely(self.current().is_some()) && self.nr_running > 1 {
+            // kerror!("account_cfs_rq_runtime");
+            self.rq().resched_current();
+        }
+    }
+
+    /// Recompute the deadline; if the vruntime has passed it, trigger a reschedule
+    pub fn update_deadline(&mut self, se: &Arc<FairSchedEntity>) {
+        // kerror!("vruntime {} deadline {}", se.vruntime, se.deadline);
+        if se.vruntime < se.deadline {
+            return;
+        }
+
+        se.force_mut().slice = SYSCTL_SHCED_BASE_SLICE.load(Ordering::SeqCst);
+
+        se.force_mut().deadline = se.vruntime + se.calculate_delta_fair(se.slice);
+
+        if self.nr_running > 1 {
+            self.rq().resched_current();
+            self.clear_buddies(se);
+        }
+    }
+
+    /// ## Update the minimum virtual runtime
+    pub fn update_min_vruntime(&mut self) {
+        let curr = self.current();
+
+        let mut vruntime = self.min_vruntime;
+
+        if curr.is_some() {
+            let curr = curr.as_ref().unwrap();
+            if curr.on_rq() {
+                vruntime = curr.vruntime;
+            } else {
+                self.set_current(Weak::default());
+            }
+        }
+
+        // Find the entity with the smallest virtual runtime
+        let leftmost = self.entities.get_first();
+        if let Some(leftmost) = leftmost {
+            let se = leftmost.1;
+
+            if curr.is_none() {
+                vruntime = se.vruntime;
+            } else {
+                vruntime = vruntime.min(se.vruntime);
+            }
+        }
+
+        self.min_vruntime = self.__update_min_vruntime(vruntime);
+    }
+
+    fn __update_min_vruntime(&mut self, vruntime: u64) -> u64 {
+        let mut min_vruntime = self.min_vruntime;
+
+        let delta = vruntime as i64 - min_vruntime as i64;
+        if delta > 0 {
+            self.avg_vruntime -= self.avg_load * delta;
+            min_vruntime = vruntime;
+        }
+
+        return min_vruntime;
+    }
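+
+    // Note: min_vruntime only ratchets forward, and avg_vruntime stores
+    // entity keys relative to it; advancing the baseline by `delta`
+    // therefore subtracts avg_load * delta to keep the weighted sum intact.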
+
+    // Check whether this is the current task
+    pub fn is_curr(&self, se: &Arc<FairSchedEntity>) -> bool {
+        if self.current().is_none() {
+            false
+        } else {
+            // Compare the given se with the current entity
+            Arc::ptr_eq(se, self.current().as_ref().unwrap())
+        }
+    }
+
+    pub fn reweight_entity(&mut self, se: Arc<FairSchedEntity>, weight: u64) {
+        // Check whether se is the current task
+        let is_curr = self.is_curr(&se);
+
+        // If se is on the queue
+        if se.on_rq() {
+            // If it is the current task
+            if is_curr {
+                self.update_current();
+            } else {
+                // Otherwise, dequeue it
+                self.inner_dequeue_entity(&se);
+            }
+
+            // Subtract its weight
+            self.load.update_load_sub(se.load.weight);
+        }
+
+        self.dequeue_load_avg(&se);
+
+        if !se.on_rq() {
+            se.force_mut().vlag = se.vlag * se.load.weight as i64 / weight as i64;
+        } else {
+            self.reweight_eevdf(&se, weight);
+        }
+        se.force_mut().load.update_load_set(weight);
+
+        // SMP
+        let divider = se.avg.get_pelt_divider();
+        se.force_mut().avg.load_avg = LoadWeight::scale_load_down(se.load.weight) as usize
+            * se.avg.load_sum as usize
+            / divider;
+
+        self.enqueue_load_avg(se.clone());
+
+        if se.on_rq() {
+            self.load.update_load_add(se.load.weight);
+            if !is_curr {
+                self.inner_enqueue_entity(&se);
+            }
+
+            self.update_min_vruntime();
+        }
+    }
+
+    /// Recompute a sched_entity's weight and virtual runtime (vruntime)
+    fn reweight_eevdf(&mut self, se: &Arc<FairSchedEntity>, weight: u64) {
+        let old_weight = se.load.weight;
+        let avg_vruntime = self.avg_vruntime();
+        let mut vlag;
+        if avg_vruntime != se.vruntime {
+            vlag = avg_vruntime as i64 - se.vruntime as i64;
+            vlag = vlag * old_weight as i64 / weight as i64;
+            se.force_mut().vruntime = (avg_vruntime as i64 - vlag) as u64;
+        }
+
+        let mut vslice = se.deadline as i64 - avg_vruntime as i64;
+        vslice = vslice * old_weight as i64 / weight as i64;
+        se.force_mut().deadline = avg_vruntime + vslice as u64;
+    }
+
+    fn avg_vruntime(&self) -> u64 {
+        let curr = self.current();
+        let mut avg = self.avg_vruntime;
+        let mut load = self.avg_load;
+
+        if let Some(curr) = curr {
+            if curr.on_rq() {
+                let weight = LoadWeight::scale_load_down(curr.load.weight);
+                avg += self.entity_key(&curr) * weight as i64;
+                load += weight as i64;
+            }
+        }
+
+        if load > 0 {
+            if avg < 0 {
+                avg -= load - 1;
+            }
+
+            avg /= load;
+        }
+
+        return self.min_vruntime + avg as u64;
+    }
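+
+    // avg_vruntime() approximates EEVDF's virtual time V as a load-weighted
+    // average of entity vruntimes; the sums are kept relative to
+    // min_vruntime, and the division rounds down for negative averages.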
+
+    #[inline]
+    pub fn entity_key(&self, se: &Arc<FairSchedEntity>) -> i64 {
+        return se.vruntime as i64 - self.min_vruntime as i64;
+    }
+
+    pub fn avg_vruntime_add(&mut self, se: &Arc<FairSchedEntity>) {
+        let weight = LoadWeight::scale_load_down(se.load.weight);
+
+        let key = self.entity_key(se);
+
+        let avg_vruntime = self.avg_vruntime + key * weight as i64;
+
+        self.avg_vruntime = avg_vruntime;
+        self.avg_load += weight as i64;
+    }
+
+    pub fn avg_vruntime_sub(&mut self, se: &Arc<FairSchedEntity>) {
+        let weight = LoadWeight::scale_load_down(se.load.weight);
+
+        let key = self.entity_key(se);
+
+        let avg_vruntime = self.avg_vruntime - key * weight as i64;
+
+        self.avg_vruntime = avg_vruntime;
+        self.avg_load -= weight as i64;
+    }
+
+    /// Compute the initial vruntime and related fields for an entity being placed
+    fn place_entity(&mut self, se: Arc<FairSchedEntity>, flags: EnqueueFlag) {
+        let vruntime = self.avg_vruntime();
+        let mut lag = 0;
+
+        let se = se.force_mut();
+        se.slice = SYSCTL_SHCED_BASE_SLICE.load(Ordering::SeqCst);
+
+        let mut vslice = se.calculate_delta_fair(se.slice);
+
+        if self.nr_running > 0 {
+            let curr = self.current();
+
+            lag = se.vlag;
+
+            let mut load = self.avg_load;
+
+            if let Some(curr) = curr {
+                if curr.on_rq() {
+                    load += LoadWeight::scale_load_down(curr.load.weight) as i64;
+                }
+            }
+
+            lag *= load + LoadWeight::scale_load_down(se.load.weight) as i64;
+
+            if load == 0 {
+                load = 1;
+            }
+
+            lag /= load;
+        }
+
+        se.vruntime = vruntime - lag as u64;
+
+        if flags.contains(EnqueueFlag::ENQUEUE_INITIAL) {
+            vslice /= 2;
+        }
+
+        se.deadline = se.vruntime + vslice;
+    }
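+
+    // Placement credits the entity with its stored (load-scaled) lag around
+    // the current average virtual time, so a waking sleeper resumes with the
+    // fairness debt or credit it left with; ENQUEUE_INITIAL halves the first
+    // virtual slice so a brand-new task reaches its first deadline sooner.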
+
+    /// Update the load averages
+    fn update_load_avg(&mut self, se: &Arc<FairSchedEntity>, flags: UpdateAvgFlags) {
+        let now = self.cfs_rq_clock_pelt();
+
+        if se.avg.last_update_time > 0 && !flags.contains(UpdateAvgFlags::SKIP_AGE_LOAD) {
+            se.force_mut().update_load_avg(self, now);
+        }
+
+        let mut decayed = self.update_self_load_avg(now);
+        decayed |= se.force_mut().propagate_entity_load_avg() as u32;
+
+        if se.avg.last_update_time > 0 && flags.contains(UpdateAvgFlags::DO_ATTACH) {
+            todo!()
+        } else if flags.contains(UpdateAvgFlags::DO_ATTACH) {
+            self.detach_entity_load_avg(se);
+        } else if decayed > 0 {
+            // cfs_rq_util_change
+
+            todo!()
+        }
+    }
+
+    /// Detach the entity's load averages from its cfs_rq
+    fn detach_entity_load_avg(&mut self, se: &Arc<FairSchedEntity>) {
+        self.dequeue_load_avg(se);
+
+        sub_positive(&mut self.avg.util_avg, se.avg.util_avg);
+        sub_positive(&mut (self.avg.util_sum as usize), se.avg.util_sum as usize);
+        self.avg.util_sum = self
+            .avg
+            .util_sum
+            .max((self.avg.util_avg * PELT_MIN_DIVIDER) as u64);
+
+        sub_positive(&mut self.avg.runnable_avg, se.avg.runnable_avg);
+        sub_positive(
+            &mut (self.avg.runnable_sum as usize),
+            se.avg.runnable_sum as usize,
+        );
+        self.avg.runnable_sum = self
+            .avg
+            .runnable_sum
+            .max((self.avg.runnable_avg * PELT_MIN_DIVIDER) as u64);
+
+        self.propagate = 1;
+        self.prop_runnable_sum += se.avg.load_sum as isize;
+    }
+
+    fn update_self_load_avg(&mut self, now: u64) -> u32 {
+        let mut removed_load = 0;
+        let mut removed_util = 0;
+        let mut removed_runnable = 0;
+
+        let mut decayed = 0;
+
+        if self.removed.lock().nr > 0 {
+            let mut removed_guard = self.removed.lock();
+            let divider = self.avg.get_pelt_divider();
+
+            swap::<usize>(&mut removed_guard.util_avg, &mut removed_util);
+            swap::<usize>(&mut removed_guard.load_avg, &mut removed_load);
+            swap::<usize>(&mut removed_guard.runnable_avg, &mut removed_runnable);
+
+            removed_guard.nr = 0;
+
+            let mut r = removed_load;
+
+            sub_positive(&mut self.avg.load_avg, r);
+            sub_positive(&mut (self.avg.load_sum as usize), r * divider);
+
+            self.avg.load_sum = self
+                .avg
+                .load_sum
+                .max((self.avg.load_avg * PELT_MIN_DIVIDER) as u64);
+
+            r = removed_util;
+            sub_positive(&mut self.avg.util_avg, r);
+            sub_positive(&mut (self.avg.util_sum as usize), r * divider);
+            self.avg.util_sum = self
+                .avg
+                .util_sum
+                .max((self.avg.util_avg * PELT_MIN_DIVIDER) as u64);
+
+            r = removed_runnable;
+            sub_positive(&mut self.avg.runnable_avg, r);
+            sub_positive(&mut (self.avg.runnable_sum as usize), r * divider);
+            self.avg.runnable_sum = self
+                .avg
+                .runnable_sum
+                .max((self.avg.runnable_avg * PELT_MIN_DIVIDER) as u64);
+
+            drop(removed_guard);
+            self.add_task_group_propagate(
+                -(removed_runnable as isize * divider as isize) >> SCHED_CAPACITY_SHIFT,
+            );
+
+            decayed = 1;
+        }
+
+        decayed |= self.__update_load_avg(now) as u32;
+
+        self.last_update_time_copy = self.avg.last_update_time;
+
+        return decayed;
+    }
+
+    fn __update_load_avg(&mut self, now: u64) -> bool {
+        if self.avg.update_load_sum(
+            now,
+            LoadWeight::scale_load_down(self.load.weight) as u32,
+            self.h_nr_running as u32,
+            self.current().is_some() as u32,
+        ) {
+            self.avg.update_load_avg(1);
+            return true;
+        }
+
+        return false;
+    }
+
+    fn add_task_group_propagate(&mut self, runnable_sum: isize) {
+        self.propagate = 1;
+        self.prop_runnable_sum += runnable_sum;
+    }
+
+    /// Enqueue the entity
+    pub fn enqueue_entity(&mut self, se: &Arc<FairSchedEntity>, flags: EnqueueFlag) {
+        let is_curr = self.is_curr(se);
+
+        if is_curr {
+            self.place_entity(se.clone(), flags);
+        }
+
+        self.update_current();
+
+        self.update_load_avg(se, UpdateAvgFlags::UPDATE_TG | UpdateAvgFlags::DO_ATTACH);
+
+        se.force_mut().update_runnable();
+
+        se.update_cfs_group();
+
+        if !is_curr {
+            self.place_entity(se.clone(), flags);
+        }
+
+        self.account_entity_enqueue(se);
+
+        if flags.contains(EnqueueFlag::ENQUEUE_MIGRATED) {
+            se.force_mut().exec_start = 0;
+        }
+
+        if !is_curr {
+            self.inner_enqueue_entity(se);
+        }
+
+        se.force_mut().on_rq = OnRq::Queued;
+
+        if self.nr_running == 1 {
+            // Only the entity we just enqueued is present
+            // TODO: throttle
+        }
+    }
+
+    pub fn dequeue_entity(&mut self, se: &Arc<FairSchedEntity>, flags: DequeueFlag) {
+        let mut action = UpdateAvgFlags::UPDATE_TG;
+
+        if se.is_task() && se.on_rq == OnRq::Migrating {
+            action |= UpdateAvgFlags::DO_DETACH;
+        }
+
+        self.update_current();
+
+        self.update_load_avg(se, action);
+
+        se.force_mut().update_runnable();
+
+        self.clear_buddies(se);
+
+        self.update_entity_lag(se);
+
+        if let Some(curr) = self.current() {
+            if !Arc::ptr_eq(&curr, se) {
+                self.inner_dequeue_entity(se);
+            }
+        } else {
+            self.inner_dequeue_entity(se);
+        }
+
+        se.force_mut().on_rq = OnRq::None;
+
+        self.account_entity_dequeue(se);
+
+        // return_cfs_rq_runtime
+
+        se.update_cfs_group();
+
+        if flags & (DequeueFlag::DEQUEUE_SAVE | DequeueFlag::DEQUEUE_MOVE)
+            != DequeueFlag::DEQUEUE_SAVE
+        {
+            self.update_min_vruntime();
+        }
+
+        if self.nr_running == 0 {
+            self.update_idle_clock_pelt()
+        }
+    }
+
+    /// Put the previously running task back on the queue
+    pub fn put_prev_entity(&mut self, prev: Arc<FairSchedEntity>) {
+        if prev.on_rq() {
+            self.update_current();
+        }
+
+        if prev.on_rq() {
+            self.inner_enqueue_entity(&prev);
+        }
+
+        self.set_current(Weak::default());
+    }
+
+    /// Set the next task to run as current
+    pub fn set_next_entity(&mut self, se: &Arc<FairSchedEntity>) {
+        self.clear_buddies(se);
+
+        if se.on_rq() {
+            self.inner_dequeue_entity(se);
+            self.update_load_avg(se, UpdateAvgFlags::UPDATE_TG);
+            se.force_mut().vlag = se.deadline as i64;
+        }
+
+        self.set_current(Arc::downgrade(se));
+
+        se.force_mut().prev_sum_exec_runtime = se.sum_exec_runtime;
+    }
+
+    fn update_idle_clock_pelt(&mut self) {
+        let throttled = if unlikely(self.throttled_count > 0) {
+            u64::MAX
+        } else {
+            self.throttled_clock_pelt_time
+        };
+
+        self.throttled_clock_pelt = throttled;
+    }
+
+    fn update_entity_lag(&mut self, se: &Arc<FairSchedEntity>) {
+        let lag = self.avg_vruntime() as i64 - se.vruntime as i64;
+
+        let limit = se.calculate_delta_fair((TICK_NESC as u64).max(2 * se.slice)) as i64;
+
+        se.force_mut().vlag = if lag < -limit {
+            -limit
+        } else if lag > limit {
+            limit
+        } else {
+            lag
+        }
+    }
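+
+    // The saved lag is clamped to the larger of one tick and twice the slice
+    // (in virtual time), so a long sleep cannot bank unbounded credit.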
+
+    fn account_entity_enqueue(&mut self, se: &Arc<FairSchedEntity>) {
+        self.load.update_load_add(se.load.weight);
+
+        if se.is_task() {
+            let rq = self.rq();
+            let (rq, _guard) = rq.self_lock();
+            // TODO:numa
+            rq.cfs_tasks.push_back(se.clone());
+        }
+        self.nr_running += 1;
+        if se.is_idle() {
+            self.idle_nr_running += 1;
+        }
+    }
+
+    fn account_entity_dequeue(&mut self, se: &Arc<FairSchedEntity>) {
+        self.load.update_load_sub(se.load.weight);
+
+        if se.is_task() {
+            let rq = self.rq();
+            let (rq, _guard) = rq.self_lock();
+
+            // TODO:numa
+            let _ = rq.cfs_tasks.extract_if(|x| Arc::ptr_eq(x, se));
+        }
+
+        self.nr_running -= 1;
+        if se.is_idle() {
+            self.idle_nr_running -= 1;
+        }
+    }
+
+    pub fn inner_enqueue_entity(&mut self, se: &Arc<FairSchedEntity>) {
+        self.avg_vruntime_add(se);
+        se.force_mut().min_deadline = se.deadline;
+        self.entities.insert(se.vruntime, se.clone());
+    }
+
+    fn inner_dequeue_entity(&mut self, se: &Arc<FairSchedEntity>) {
+        let mut i = 1;
+        while let Some(rm) = self.entities.remove(&se.vruntime) {
+            if Arc::ptr_eq(&rm, se) {
+                break;
+            }
+            rm.force_mut().vruntime += i;
+            self.entities.insert(rm.vruntime, rm);
+
+            i += 1;
+        }
+        self.avg_vruntime_sub(se);
+    }
+
+    pub fn enqueue_load_avg(&mut self, se: Arc<FairSchedEntity>) {
+        self.avg.load_avg += se.avg.load_avg;
+        self.avg.load_sum += LoadWeight::scale_load_down(se.load.weight) * se.avg.load_sum;
+    }
+
+    pub fn dequeue_load_avg(&mut self, se: &Arc<FairSchedEntity>) {
+        if self.avg.load_avg > se.avg.load_avg {
+            self.avg.load_avg -= se.avg.load_avg;
+        } else {
+            self.avg.load_avg = 0;
+        };
+
+        let se_load = LoadWeight::scale_load_down(se.load.weight) * se.avg.load_sum;
+
+        if self.avg.load_sum > se_load {
+            self.avg.load_sum -= se_load;
+        } else {
+            self.avg.load_sum = 0;
+        }
+
+        self.avg.load_sum = self
+            .avg
+            .load_sum
+            .max((self.avg.load_avg * PELT_MIN_DIVIDER) as u64)
+    }
+
+    pub fn update_task_group_util(&mut self, se: Arc<FairSchedEntity>, gcfs_rq: &CfsRunQueue) {
+        let mut delta_sum = gcfs_rq.avg.load_avg as isize - se.avg.load_avg as isize;
+        let delta_avg = delta_sum;
+
+        if delta_avg == 0 {
+            return;
+        }
+
+        let divider = self.avg.get_pelt_divider();
+
+        let se = se.force_mut();
+        se.avg.util_avg = gcfs_rq.avg.util_avg;
+        let new_sum = se.avg.util_avg * divider;
+        delta_sum = new_sum as isize - se.avg.util_sum as isize;
+
+        se.avg.util_sum = new_sum as u64;
+
+        add_positive(&mut (self.avg.util_avg as isize), delta_avg);
+        add_positive(&mut (self.avg.util_sum as isize), delta_sum);
+
+        self.avg.util_sum = self
+            .avg
+            .util_sum
+            .max((self.avg.util_avg * PELT_MIN_DIVIDER) as u64);
+    }
+
+    pub fn update_task_group_runnable(&mut self, se: Arc<FairSchedEntity>, gcfs_rq: &CfsRunQueue) {
+        let mut delta_sum = gcfs_rq.avg.runnable_avg as isize - se.avg.runnable_avg as isize;
+        let delta_avg = delta_sum;
+
+        if delta_avg == 0 {
+            return;
+        }
+
+        let divider = self.avg.get_pelt_divider();
+
+        let se = se.force_mut();
+        se.avg.runnable_avg = gcfs_rq.avg.runnable_avg;
+        let new_sum = se.avg.runnable_sum * divider as u64;
+        delta_sum = new_sum as isize - se.avg.runnable_sum as isize;
+
+        se.avg.runnable_sum = new_sum;
+
+        // as in update_task_group_util: update the fields directly rather
+        // than a temporary produced by the cast, clamping at zero
+        self.avg.runnable_avg = (self.avg.runnable_avg as isize + delta_avg).max(0) as usize;
+        self.avg.runnable_sum = (self.avg.runnable_sum as isize + delta_sum).max(0) as u64;
+
+        self.avg.runnable_sum = self
+            .avg
+            .runnable_sum
+            .max((self.avg.runnable_avg * PELT_MIN_DIVIDER) as u64);
+    }
+
+    pub fn update_task_group_load(&mut self, se: Arc<FairSchedEntity>, gcfs_rq: &mut CfsRunQueue) {
+        let mut runnable_sum = gcfs_rq.prop_runnable_sum;
+
+        let mut load_sum = 0;
+
+        if runnable_sum == 0 {
+            return;
+        }
+
+        gcfs_rq.prop_runnable_sum = 0;
+
+        let divider = self.avg.get_pelt_divider();
+
+        if runnable_sum >= 0 {
+            runnable_sum += se.avg.load_sum as isize;
+            runnable_sum = runnable_sum.min(divider as isize);
+        } else {
+            if LoadWeight::scale_load_down(gcfs_rq.load.weight) > 0 {
+                load_sum = gcfs_rq.avg.load_sum / LoadWeight::scale_load_down(gcfs_rq.load.weight);
+            }
+
+            runnable_sum = se.avg.load_sum.min(load_sum) as isize;
+        }
+
+        let running_sum = se.avg.util_sum as isize >> SCHED_CAPACITY_SHIFT;
+        runnable_sum = runnable_sum.max(running_sum);
+
+        load_sum = LoadWeight::scale_load_down(se.load.weight) * runnable_sum as u64;
+        let load_avg = load_sum / divider as u64;
+
+        let delta_avg = load_avg as isize - se.avg.load_avg as isize;
+        if delta_avg == 0 {
+            return;
+        }
+
+        let delta_sum = load_sum as isize
+            - LoadWeight::scale_load_down(se.load.weight) as isize * se.avg.load_sum as isize;
+
+        let se = se.force_mut();
+        se.avg.load_sum = runnable_sum as u64;
+        se.avg.load_avg = load_avg as usize;
+
+        // fold the deltas into the group's load_avg/load_sum (delta_sum
+        // belongs to load_sum, as in Linux's update_tg_cfs_load), clamping
+        // at zero
+        self.avg.load_avg = (self.avg.load_avg as isize + delta_avg).max(0) as usize;
+        self.avg.load_sum = (self.avg.load_sum as isize + delta_sum).max(0) as u64;
+
+        self.avg.load_sum = self
+            .avg
+            .load_sum
+            .max((self.avg.load_avg * PELT_MIN_DIVIDER) as u64);
+    }
+
+    /// Pick the next entity to run
+    pub fn pick_next_entity(&self) -> Option<Arc<FairSchedEntity>> {
+        if SCHED_FEATURES.contains(SchedFeature::NEXT_BUDDY)
+            && self.next().is_some()
+            && self.entity_eligible(&self.next().unwrap())
+        {
+            return self.next();
+        }
+        self.entities.get_first().map(|val| val.1.clone())
+    }
+
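+    /// Eligibility in the EEVDF sense: an entity is eligible when its key
+    /// (vruntime - min_vruntime) does not exceed the load-weighted average
+    /// key, i.e. `avg >= key(se) * load`. The currently running entity is
+    /// folded into `avg`/`load` here because it is kept out of the tree.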
+    pub fn entity_eligible(&self, se: &Arc<FairSchedEntity>) -> bool {
+        let curr = self.current();
+        let mut avg = self.avg_vruntime;
+        let mut load = self.avg_load;
+
+        if let Some(curr) = curr {
+            if curr.on_rq() {
+                let weight = LoadWeight::scale_load_down(curr.load.weight);
+
+                avg += self.entity_key(&curr) * weight as i64;
+                load += weight as i64;
+            }
+        }
+
+        return avg >= self.entity_key(se) * load;
+    }
+}
+
+pub struct CompletelyFairScheduler;
+
+impl CompletelyFairScheduler {
+    /// Walk the two entities up to the nearest level where they share a cfs_rq (their lowest common ancestor in the group hierarchy)
+    fn find_matching_se(se: &mut Arc<FairSchedEntity>, pse: &mut Arc<FairSchedEntity>) {
+        let mut se_depth = se.depth;
+        let mut pse_depth = pse.depth;
+
+        while se_depth > pse_depth {
+            se_depth -= 1;
+            *se = se.parent().unwrap();
+        }
+
+        while pse_depth > se_depth {
+            pse_depth -= 1;
+            *pse = pse.parent().unwrap();
+        }
+
+        while !Arc::ptr_eq(&se.cfs_rq(), &pse.cfs_rq()) {
+            *se = se.parent().unwrap();
+            *pse = pse.parent().unwrap();
+        }
+    }
+}
+
+impl Scheduler for CompletelyFairScheduler {
+    fn enqueue(
+        rq: &mut CpuRunQueue,
+        pcb: Arc<crate::process::ProcessControlBlock>,
+        mut flags: EnqueueFlag,
+    ) {
+        let mut se = pcb.sched_info().sched_entity();
+        let mut idle_h_nr_running = pcb.sched_info().policy() == SchedPolicy::IDLE;
+        let (should_continue, se) = FairSchedEntity::for_each_in_group(&mut se, |se| {
+            if se.on_rq() {
+                return (false, true);
+            }
+
+            let binding = se.cfs_rq();
+            let cfs_rq = binding.force_mut();
+            cfs_rq.enqueue_entity(&se, flags);
+
+            cfs_rq.h_nr_running += 1;
+            cfs_rq.idle_h_nr_running += idle_h_nr_running as u64;
+
+            if cfs_rq.is_idle() {
+                idle_h_nr_running = true;
+            }
+
+            // TODO: cfs_rq_throttled
+
+            flags = EnqueueFlag::ENQUEUE_WAKEUP;
+
+            return (true, true);
+        });
+
+        if !should_continue {
+            return;
+        }
+
+        if let Some(mut se) = se {
+            FairSchedEntity::for_each_in_group(&mut se, |se| {
+                let binding = se.cfs_rq();
+                let cfs_rq = binding.force_mut();
+
+                cfs_rq.update_load_avg(&se, UpdateAvgFlags::UPDATE_TG);
+
+                let se = se.force_mut();
+                se.update_runnable();
+
+                se.update_cfs_group();
+
+                cfs_rq.h_nr_running += 1;
+                cfs_rq.idle_h_nr_running += idle_h_nr_running as u64;
+
+                if cfs_rq.is_idle() {
+                    idle_h_nr_running = true;
+                }
+
+                // TODO: cfs_rq_throttled
+
+                return (true, true);
+            });
+        }
+
+        rq.add_nr_running(1);
+    }
+
+    fn dequeue(
+        rq: &mut CpuRunQueue,
+        pcb: Arc<crate::process::ProcessControlBlock>,
+        mut flags: DequeueFlag,
+    ) {
+        let mut se = pcb.sched_info().sched_entity();
+        let mut idle_h_nr_running = pcb.sched_info().policy() == SchedPolicy::IDLE;
+        let task_sleep = flags.contains(DequeueFlag::DEQUEUE_SLEEP);
+        let was_sched_idle = rq.sched_idle_rq();
+
+        let (should_continue, se) = FairSchedEntity::for_each_in_group(&mut se, |se| {
+            let binding = se.cfs_rq();
+            let cfs_rq = binding.force_mut();
+            cfs_rq.dequeue_entity(&se, flags);
+
+            cfs_rq.h_nr_running -= 1;
+            cfs_rq.idle_h_nr_running -= idle_h_nr_running as u64;
+
+            if cfs_rq.is_idle() {
+                idle_h_nr_running = true;
+            }
+
+            // TODO: cfs_rq_throttled
+
+            if cfs_rq.load.weight > 0 {
+                let sep = se.parent();
+
+                if task_sleep && sep.is_some() {
+                    todo!()
+                }
+            }
+
+            flags |= DequeueFlag::DEQUEUE_SLEEP;
+
+            return (true, true);
+        });
+
+        if !should_continue {
+            return;
+        }
+
+        if let Some(mut se) = se {
+            FairSchedEntity::for_each_in_group(&mut se, |se| {
+                let binding = se.cfs_rq();
+                let cfs_rq = binding.force_mut();
+
+                cfs_rq.update_load_avg(&se, UpdateAvgFlags::UPDATE_TG);
+
+                let se = se.force_mut();
+                se.update_runnable();
+
+                se.update_cfs_group();
+
+                cfs_rq.h_nr_running -= 1;
+                cfs_rq.idle_h_nr_running -= idle_h_nr_running as u64;
+
+                if cfs_rq.is_idle() {
+                    idle_h_nr_running = true;
+                }
+
+                // TODO: cfs_rq_throttled
+
+                return (true, true);
+            });
+        }
+
+        rq.sub_nr_running(1);
+
+        if unlikely(!was_sched_idle && rq.sched_idle_rq()) {
+            rq.next_balance = clock();
+        }
+    }
+
+    fn yield_task(rq: &mut CpuRunQueue) {
+        let curr = rq.current();
+        let se = curr.sched_info().sched_entity();
+        let binding = se.cfs_rq();
+        let cfs_rq = binding.force_mut();
+
+        if unlikely(rq.nr_running == 1) {
+            return;
+        }
+
+        cfs_rq.clear_buddies(&se);
+
+        rq.update_rq_clock();
+
+        cfs_rq.update_current();
+
+        rq.clock_updata_flags |= ClockUpdataFlag::RQCF_REQ_SKIP;
+
+        se.force_mut().deadline += se.calculate_delta_fair(se.slice);
+    }
+
+    fn check_preempt_currnet(
+        rq: &mut CpuRunQueue,
+        pcb: &Arc<crate::process::ProcessControlBlock>,
+        wake_flags: WakeupFlags,
+    ) {
+        let curr = rq.current();
+        let mut se = curr.sched_info().sched_entity();
+        let mut pse = pcb.sched_info().sched_entity();
+
+        if unlikely(Arc::ptr_eq(&se, &pse)) {
+            return;
+        }
+
+        // TODO:https://code.dragonos.org.cn/xref/linux-6.6.21/kernel/sched/fair.c#8160
+
+        let _next_buddy_mark = if SCHED_FEATURES.contains(SchedFeature::NEXT_BUDDY)
+            && !wake_flags.contains(WakeupFlags::WF_FORK)
+        {
+            FairSchedEntity::for_each_in_group(&mut pse, |se| {
+                if !se.on_rq() {
+                    return (false, true);
+                }
+
+                if se.is_idle() {
+                    return (false, true);
+                }
+
+                se.cfs_rq().force_mut().next = Arc::downgrade(&se);
+
+                return (true, true);
+            });
+            true
+        } else {
+            false
+        };
+
+        if curr.flags().contains(ProcessFlags::NEED_SCHEDULE) {
+            return;
+        }
+
+        if unlikely(curr.sched_info().policy() == SchedPolicy::IDLE)
+            && likely(pcb.sched_info().policy() != SchedPolicy::IDLE)
+        {
+            rq.resched_current();
+            return;
+        }
+
+        if unlikely(pcb.sched_info().policy() != SchedPolicy::CFS)
+            || !SCHED_FEATURES.contains(SchedFeature::WAKEUP_PREEMPTION)
+        {
+            return;
+        }
+
+        Self::find_matching_se(&mut se, &mut pse);
+
+        let cse_is_idle = se.is_idle();
+        let pse_is_idle = pse.is_idle();
+
+        if cse_is_idle && !pse_is_idle {
+            rq.resched_current();
+            return;
+        }
+
+        if cse_is_idle != pse_is_idle {
+            return;
+        }
+
+        let cfs_rq = se.cfs_rq();
+        cfs_rq.force_mut().update_current();
+
+        if let Some((_, pick_se)) = cfs_rq.entities.get_first() {
+            if Arc::ptr_eq(pick_se, &pse) {
+                rq.resched_current();
+                return;
+            }
+        }
+    }
+
+    fn pick_task(rq: &mut CpuRunQueue) -> Option<Arc<crate::process::ProcessControlBlock>> {
+        let mut cfs_rq = Some(rq.cfs_rq());
+        if cfs_rq.as_ref().unwrap().nr_running == 0 {
+            return None;
+        }
+
+        let mut se;
+        loop {
+            let cfs = cfs_rq.unwrap();
+            let cfs = cfs.force_mut();
+            let curr = cfs.current();
+            if let Some(curr) = curr {
+                if curr.on_rq() {
+                    cfs.update_current();
+                } else {
+                    cfs.set_current(Weak::default());
+                }
+            }
+
+            se = cfs.pick_next_entity();
+            match se.clone() {
+                Some(val) => cfs_rq = val.my_cfs_rq.clone(),
+                None => {
+                    break;
+                }
+            }
+
+            if cfs_rq.is_none() {
+                break;
+            }
+        }
+
+        se.map(|se| se.pcb())
+    }
+
+    fn tick(_rq: &mut CpuRunQueue, pcb: Arc<crate::process::ProcessControlBlock>, queued: bool) {
+        let mut se = pcb.sched_info().sched_entity();
+
+        FairSchedEntity::for_each_in_group(&mut se, |se| {
+            let binding = se.clone();
+            let binding = binding.cfs_rq();
+            let cfs_rq = binding.force_mut();
+
+            cfs_rq.entity_tick(se, queued);
+            (true, true)
+        });
+    }
+
+    fn task_fork(pcb: Arc<ProcessControlBlock>) {
+        let rq = cpu_rq(smp_get_processor_id().data() as usize);
+        let se = pcb.sched_info().sched_entity();
+
+        let (rq, _guard) = rq.self_lock();
+
+        rq.update_rq_clock();
+
+        let binding = se.cfs_rq();
+        let cfs_rq = binding.force_mut();
+
+        if cfs_rq.current().is_some() {
+            cfs_rq.update_current();
+        }
+
+        cfs_rq.place_entity(se.clone(), EnqueueFlag::ENQUEUE_INITIAL);
+    }
+
+    fn pick_next_task(
+        rq: &mut CpuRunQueue,
+        prev: Option<Arc<ProcessControlBlock>>,
+    ) -> Option<Arc<ProcessControlBlock>> {
+        let mut cfs_rq = rq.cfs_rq();
+        if rq.nr_running == 0 {
+            return None;
+        }
+
+        if prev.is_none()
+            || (prev.is_some() && prev.as_ref().unwrap().sched_info().policy() != SchedPolicy::CFS)
+        {
+            if let Some(prev) = prev {
+                match prev.sched_info().policy() {
+                    SchedPolicy::RT => todo!(),
+                    SchedPolicy::FIFO => todo!(),
+                    SchedPolicy::CFS => todo!(),
+                    SchedPolicy::IDLE => IdleScheduler::put_prev_task(rq, prev),
+                }
+            }
+            let mut se;
+            loop {
+                match cfs_rq.pick_next_entity() {
+                    Some(s) => se = s,
+                    None => return None,
+                }
+
+                cfs_rq.force_mut().set_next_entity(&se);
+
+                match &se.my_cfs_rq {
+                    Some(q) => cfs_rq = q.clone(),
+                    None => break,
+                }
+            }
+
+            return Some(se.pcb());
+        }
+
+        let prev = prev.unwrap();
+        let se = cfs_rq.pick_next_entity();
+
+        if let Some(mut se) = se {
+            loop {
+                let curr = cfs_rq.current();
+                if let Some(current) = curr {
+                    if current.on_rq() {
+                        cfs_rq.force_mut().update_current()
+                    } else {
+                        cfs_rq.force_mut().set_current(Weak::default());
+                    }
+                }
+
+                match cfs_rq.pick_next_entity() {
+                    Some(e) => se = e,
+                    None => break,
+                }
+
+                if let Some(q) = se.my_cfs_rq.clone() {
+                    cfs_rq = q;
+                } else {
+                    break;
+                }
+            }
+
+            let p = se.pcb();
+
+            if !Arc::ptr_eq(&prev, &p) {
+                let mut pse = prev.sched_info().sched_entity();
+
+                while !(Arc::ptr_eq(&se.cfs_rq(), &pse.cfs_rq())
+                    && Arc::ptr_eq(&se.cfs_rq(), &cfs_rq))
+                {
+                    let se_depth = se.depth;
+                    let pse_depth = pse.depth;
+
+                    if se_depth <= pse_depth {
+                        pse.cfs_rq().force_mut().put_prev_entity(pse.clone());
+                        pse = pse.parent().unwrap();
+                    }
+
+                    if se_depth >= pse_depth {
+                        se.cfs_rq().force_mut().set_next_entity(&se);
+                        se = se.parent().unwrap();
+                    }
+                }
+
+                cfs_rq.force_mut().put_prev_entity(pse);
+                cfs_rq.force_mut().set_next_entity(&se);
+            }
+
+            return Some(p);
+        } else {
+            return None;
+        }
+    }
+
+    fn put_prev_task(_rq: &mut CpuRunQueue, prev: Arc<ProcessControlBlock>) {
+        let mut se = prev.sched_info().sched_entity();
+
+        FairSchedEntity::for_each_in_group(&mut se, |se| {
+            let cfs = se.cfs_rq();
+            cfs.force_mut().put_prev_entity(se);
+
+            return (true, true);
+        });
+    }
+}

+ 67 - 0
kernel/src/sched/idle.rs

@@ -0,0 +1,67 @@
+use super::Scheduler;
+
+pub struct IdleScheduler;
+
+impl Scheduler for IdleScheduler {
+    fn enqueue(
+        _rq: &mut super::CpuRunQueue,
+        _pcb: alloc::sync::Arc<crate::process::ProcessControlBlock>,
+        _flags: super::EnqueueFlag,
+    ) {
+        // the idle task is installed separately; nothing to enqueue
+    }
+
+    fn dequeue(
+        _rq: &mut super::CpuRunQueue,
+        _pcb: alloc::sync::Arc<crate::process::ProcessControlBlock>,
+        _flags: super::DequeueFlag,
+    ) {
+    }
+
+    fn yield_task(_rq: &mut super::CpuRunQueue) {}
+
+    fn check_preempt_currnet(
+        rq: &mut super::CpuRunQueue,
+        _pcb: &alloc::sync::Arc<crate::process::ProcessControlBlock>,
+        _flags: super::WakeupFlags,
+    ) {
+        rq.resched_current();
+    }
+
+    fn pick_task(
+        _rq: &mut super::CpuRunQueue,
+    ) -> Option<alloc::sync::Arc<crate::process::ProcessControlBlock>> {
+        panic!()
+    }
+
+    /// ### Called when the idle task has been picked
+    ///
+    /// Mainly does some bookkeeping
+    fn pick_next_task(
+        _rq: &mut super::CpuRunQueue,
+        _pcb: Option<alloc::sync::Arc<crate::process::ProcessControlBlock>>,
+    ) -> Option<alloc::sync::Arc<crate::process::ProcessControlBlock>> {
+        // TODO: Fixme
+        // do some accounting work
+        None
+    }
+
+    fn tick(
+        _rq: &mut super::CpuRunQueue,
+        _pcb: alloc::sync::Arc<crate::process::ProcessControlBlock>,
+        _queued: bool,
+    ) {
+        // Nothing to do
+    }
+
+    fn task_fork(_pcb: alloc::sync::Arc<crate::process::ProcessControlBlock>) {
+        todo!()
+    }
+
+    fn put_prev_task(
+        _rq: &mut super::CpuRunQueue,
+        _prev: alloc::sync::Arc<crate::process::ProcessControlBlock>,
+    ) {
+        // Nothing to do
+    }
+}

+ 969 - 25
kernel/src/sched/mod.rs

@@ -1,45 +1,269 @@
-pub mod cfs;
+pub mod clock;
 pub mod completion;
-pub mod core;
-pub mod rt;
-pub mod syscall;
+pub mod cputime;
+pub mod fair;
+pub mod idle;
+pub mod pelt;
+pub mod prio;
+
+use core::{
+    intrinsics::{likely, unlikely},
+    sync::atomic::{compiler_fence, fence, AtomicUsize, Ordering},
+};
+
+use alloc::{
+    boxed::Box,
+    collections::LinkedList,
+    sync::{Arc, Weak},
+    vec::Vec,
+};
+use system_error::SystemError;
+
+use crate::{
+    arch::{interrupt::ipi::send_ipi, CurrentIrqArch},
+    exception::{
+        ipi::{IpiKind, IpiTarget},
+        InterruptArch,
+    },
+    libs::{
+        lazy_init::Lazy,
+        spinlock::{SpinLock, SpinLockGuard},
+    },
+    mm::percpu::{PerCpu, PerCpuVar},
+    process::{ProcessControlBlock, ProcessFlags, ProcessManager, ProcessState, SchedInfo},
+    sched::idle::IdleScheduler,
+    smp::{core::smp_get_processor_id, cpu::ProcessorId},
+    time::{clocksource::HZ, timer::clock},
+};
+
+use self::{
+    clock::{ClockUpdataFlag, SchedClock},
+    cputime::{irq_time_read, CpuTimeFunc, IrqTime},
+    fair::{CfsRunQueue, CompletelyFairScheduler, FairSchedEntity},
+    prio::PrioUtil,
+};
+
+static mut CPU_IRQ_TIME: Option<Vec<&'static mut IrqTime>> = None;
+
+// Although the rq is per-CPU, load balancing needs to modify a remote CPU's rq, so a lock is still required
+static CPU_RUNQUEUE: Lazy<PerCpuVar<Arc<CpuRunQueue>>> = PerCpuVar::define_lazy();
+
+/// Total number of runnable tasks across all CPUs in the system.
+static CALCULATE_LOAD_TASKS: AtomicUsize = AtomicUsize::new(0);
+
+const LOAD_FREQ: usize = HZ as usize * 5 + 1;
+
+pub const SCHED_FIXEDPOINT_SHIFT: u64 = 10;
+#[allow(dead_code)]
+pub const SCHED_FIXEDPOINT_SCALE: u64 = 1 << SCHED_FIXEDPOINT_SHIFT;
+#[allow(dead_code)]
+pub const SCHED_CAPACITY_SHIFT: u64 = SCHED_FIXEDPOINT_SHIFT;
+#[allow(dead_code)]
+pub const SCHED_CAPACITY_SCALE: u64 = 1 << SCHED_CAPACITY_SHIFT;
+
+#[inline]
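+/// Note: this hands out a `&'static mut` with no synchronization; each CPU is
+/// expected to touch only its own entry.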
+pub fn cpu_irq_time(cpu: usize) -> &'static mut IrqTime {
+    unsafe { CPU_IRQ_TIME.as_mut().unwrap()[cpu] }
+}
+
+#[inline]
+pub fn cpu_rq(cpu: usize) -> Arc<CpuRunQueue> {
+    CPU_RUNQUEUE.ensure();
+    unsafe {
+        CPU_RUNQUEUE
+            .get()
+            .force_get(ProcessorId::new(cpu as u32))
+            .clone()
+    }
+}
+
+lazy_static! {
+    pub static ref SCHED_FEATURES: SchedFeature = SchedFeature::GENTLE_FAIR_SLEEPERS
+        | SchedFeature::START_DEBIT
+        | SchedFeature::LAST_BUDDY
+        | SchedFeature::CACHE_HOT_BUDDY
+        | SchedFeature::WAKEUP_PREEMPTION
+        | SchedFeature::NONTASK_CAPACITY
+        | SchedFeature::TTWU_QUEUE
+        | SchedFeature::SIS_UTIL
+        | SchedFeature::RT_PUSH_IPI
+        | SchedFeature::ALT_PERIOD
+        | SchedFeature::BASE_SLICE
+        | SchedFeature::UTIL_EST
+        | SchedFeature::UTIL_EST_FASTUP;
+}
+
+pub trait Scheduler {
+    /// ## Called when a task becomes runnable. Puts the sched entity (task) into the red-black tree and increments nr_running.
+    fn enqueue(rq: &mut CpuRunQueue, pcb: Arc<ProcessControlBlock>, flags: EnqueueFlag);
+
+    /// ## Called when a task is no longer runnable; the corresponding sched entity is removed from the red-black tree and nr_running is decremented.
+    fn dequeue(rq: &mut CpuRunQueue, pcb: Arc<ProcessControlBlock>, flags: DequeueFlag);
+
+    /// ## Voluntarily yield the CPU; essentially a dequeue immediately followed by an enqueue
+    fn yield_task(rq: &mut CpuRunQueue);
+
+    /// ## Check whether a newly runnable task can preempt the currently running one
+    fn check_preempt_currnet(
+        rq: &mut CpuRunQueue,
+        pcb: &Arc<ProcessControlBlock>,
+        flags: WakeupFlags,
+    );
+
+    /// ## Pick the most suitable task to run next
+    fn pick_task(rq: &mut CpuRunQueue) -> Option<Arc<ProcessControlBlock>>;
+
+    /// ## Pick the most suitable task to run next, given the previously running task
+    fn pick_next_task(
+        rq: &mut CpuRunQueue,
+        pcb: Option<Arc<ProcessControlBlock>>,
+    ) -> Option<Arc<ProcessControlBlock>>;
+
+    /// ## Called from the timer tick; may lead to a task switch. Drives runtime preemption.
+    fn tick(rq: &mut CpuRunQueue, pcb: Arc<ProcessControlBlock>, queued: bool);
+
+    /// ## Called at fork time when the task needs to join CFS
+    fn task_fork(pcb: Arc<ProcessControlBlock>);
+
+    fn put_prev_task(rq: &mut CpuRunQueue, prev: Arc<ProcessControlBlock>);
+}
 
 /// Scheduling policy
 #[allow(dead_code)]
-#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
 pub enum SchedPolicy {
-    /// Completely fair scheduling
-    CFS,
+    /// Real-time tasks
+    RT,
     /// First-in-first-out scheduling
     FIFO,
-    /// Round-robin scheduling
-    RR,
+    /// Completely fair scheduling
+    CFS,
+    /// IDLE
+    IDLE,
 }
 
-/// Scheduling priority
-#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
-pub struct SchedPriority(i32);
+#[allow(dead_code)]
+pub struct TaskGroup {
+    /// Sched entities managed by CFS, one per CPU
+    entitys: Vec<Arc<FairSchedEntity>>,
+    /// Per-CPU CFS run queues
+    cfs: Vec<Arc<CfsRunQueue>>,
+    /// Parent group
+    parent: Option<Arc<TaskGroup>>,
+
+    shares: u64,
+}
+
+#[derive(Debug, Default)]
+pub struct LoadWeight {
+    /// Load weight
+    pub weight: u64,
+    /// Inverse of weight, cached to make divisions cheap
+    pub inv_weight: u32,
+}
+
+impl LoadWeight {
+    /// Keeps the weight within a sensible range
+    pub const SCHED_FIXEDPOINT_SHIFT: u32 = 10;
+
+    pub const WMULT_SHIFT: u32 = 32;
+    pub const WMULT_CONST: u32 = !0;
+
+    pub const NICE_0_LOAD_SHIFT: u32 = Self::SCHED_FIXEDPOINT_SHIFT + Self::SCHED_FIXEDPOINT_SHIFT;
+
+    pub fn update_load_add(&mut self, inc: u64) {
+        self.weight += inc;
+        self.inv_weight = 0;
+    }
+
+    pub fn update_load_sub(&mut self, dec: u64) {
+        self.weight -= dec;
+        self.inv_weight = 0;
+    }
+
+    pub fn update_load_set(&mut self, weight: u64) {
+        self.weight = weight;
+        self.inv_weight = 0;
+    }
+
+    /// ## Update the cached inverse of the load weight
+    pub fn update_inv_weight(&mut self) {
+        // already up to date
+        if likely(self.inv_weight != 0) {
+            return;
+        }
 
-impl SchedPriority {
-    const MIN: i32 = 0;
-    const MAX: i32 = 139;
+        let w = Self::scale_load_down(self.weight);
 
-    /// Create a new scheduling priority
-    pub const fn new(priority: i32) -> Option<Self> {
-        if Self::validate(priority) {
-            Some(Self(priority))
+        if unlikely(w >= Self::WMULT_CONST as u64) {
+            // the high bits are set; clamp the inverse to 1
+            self.inv_weight = 1;
+        } else if unlikely(w == 0) {
+            // zero weight: use the maximum inverse
+            self.inv_weight = Self::WMULT_CONST;
         } else {
-            None
+            // compute the inverse
+            self.inv_weight = Self::WMULT_CONST / w as u32;
+        }
+    }
+
+    /// ## Scale a task's execution-time delta by its weight
+    ///
+    /// Formula: (delta_exec * (weight * self.inv_weight)) >> WMULT_SHIFT
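+    ///
+    /// Worked example (hypothetical nice-0 values): with both the entity
+    /// weight and `weight` at the NICE_0 load, `fact = scale_load_down(weight)
+    /// = 1024` and `inv_weight = WMULT_CONST / 1024`, so `fact * inv_weight`
+    /// is roughly 2^32 and the result is approximately `delta_exec` itself; a
+    /// nice-0 task's virtual runtime advances at wall-clock speed.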
+    pub fn calculate_delta(&mut self, delta_exec: u64, weight: u64) -> u64 {
+        // reduce precision
+        let mut fact = Self::scale_load_down(weight);
+
+        // keep the high 32 bits of fact
+        let mut fact_hi = (fact >> 32) as u32;
+        // remaining shift, reduced as precision is dropped
+        let mut shift = Self::WMULT_SHIFT;
+
+        self.update_inv_weight();
+
+        if unlikely(fact_hi != 0) {
+            // the high 32 bits still hold data: find the highest set bit,
+            // then shrink fact accordingly
+            let fs = 32 - fact_hi.leading_zeros();
+            shift -= fs;
+
+            // ensure the high 32 bits are all zero
+            fact >>= fs;
+        }
+
+        // fact is now guaranteed to fit in 32 bits
+        fact *= self.inv_weight as u64;
+
+        fact_hi = (fact >> 32) as u32;
+
+        if fact_hi != 0 {
+            // the high 32 bits still hold data: find the highest set bit,
+            // then shrink fact accordingly
+            let fs = 32 - fact_hi.leading_zeros();
+            shift -= fs;
+
+            // ensure the high 32 bits are all zero
+            fact >>= fs;
         }
+
+        return ((delta_exec as u128 * fact as u128) >> shift) as u64;
     }
 
-    /// Check whether the priority is valid
-    pub const fn validate(priority: i32) -> bool {
-        priority >= Self::MIN && priority <= Self::MAX
+    /// ## Scale the load weight down into a small range, trading precision for cheaper arithmetic
+    pub const fn scale_load_down(mut weight: u64) -> u64 {
+        if weight != 0 {
+            weight >>= Self::SCHED_FIXEDPOINT_SHIFT;
+
+            if weight < 2 {
+                weight = 2;
+            }
+        }
+        weight
     }
 
-    pub fn data(&self) -> i32 {
-        self.0
+    #[allow(dead_code)]
+    pub const fn scale_load(weight: u64) -> u64 {
+        weight << Self::SCHED_FIXEDPOINT_SHIFT
     }
 }
 
@@ -54,3 +278,723 @@ pub trait SchedArch {
     /// 注意区别于sched_init,这个函数只是做初始化时钟的工作等等。
     fn initial_setup_sched_local() {}
 }
+
+/// ## Per-CPU run queue, maintaining the rq state for each scheduler class
+#[allow(dead_code)]
+#[derive(Debug)]
+pub struct CpuRunQueue {
+    lock: SpinLock<()>,
+    lock_on_who: AtomicUsize,
+
+    cpu: usize,
+    clock_task: u64,
+    clock: u64,
+    prev_irq_time: u64,
+    clock_updata_flags: ClockUpdataFlag,
+
+    /// Overloaded (more than one runnable task)
+    overload: bool,
+
+    next_balance: u64,
+
+    /// Number of runnable tasks
+    nr_running: usize,
+
+    /// Number of uninterruptible (blocked) tasks
+    nr_uninterruptible: usize,
+
+    /// Time the load accounting was last updated
+    cala_load_update: usize,
+    cala_load_active: usize,
+
+    /// CFS run queue
+    cfs: Arc<CfsRunQueue>,
+
+    clock_pelt: u64,
+    lost_idle_time: u64,
+    clock_idle: u64,
+
+    cfs_tasks: LinkedList<Arc<FairSchedEntity>>,
+
+    /// Most recent scheduling statistics
+    sched_info: SchedInfo,
+
+    /// Task currently running on this run queue
+    current: Weak<ProcessControlBlock>,
+
+    idle: Weak<ProcessControlBlock>,
+}
+
+impl CpuRunQueue {
+    pub fn new(cpu: usize) -> Self {
+        Self {
+            lock: SpinLock::new(()),
+            lock_on_who: AtomicUsize::new(usize::MAX),
+            cpu,
+            clock_task: 0,
+            clock: 0,
+            prev_irq_time: 0,
+            clock_updata_flags: ClockUpdataFlag::empty(),
+            overload: false,
+            next_balance: 0,
+            nr_running: 0,
+            nr_uninterruptible: 0,
+            cala_load_update: (clock() + (5 * HZ + 1)) as usize,
+            cala_load_active: 0,
+            cfs: Arc::new(CfsRunQueue::new()),
+            clock_pelt: 0,
+            lost_idle_time: 0,
+            clock_idle: 0,
+            cfs_tasks: LinkedList::new(),
+            sched_info: SchedInfo::default(),
+            current: Weak::new(),
+            idle: Weak::new(),
+        }
+    }
+
+    /// This function must only be used with interrupts disabled!!!
+    /// Returns a mutable reference to the rq; the caller must keep the second
+    /// return value (the lock guard) alive for as long as the reference is
+    /// used, so this is effectively unsafe and correctness is on the caller.
+    /// In interrupt context, with interrupts disabled, it is safe.
+    pub fn self_lock(&self) -> (&mut Self, Option<SpinLockGuard<()>>) {
+        if self.lock.is_locked()
+            && smp_get_processor_id().data() as usize == self.lock_on_who.load(Ordering::SeqCst)
+        {
+            // already locked by this CPU, take it directly
+            (
+                unsafe { &mut *(self as *const Self as usize as *mut Self) },
+                None,
+            )
+        } else {
+            // otherwise take the lock first
+            let guard = self.lock();
+            (
+                unsafe { &mut *(self as *const Self as usize as *mut Self) },
+                Some(guard),
+            )
+        }
+    }
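+
+    // Usage sketch (assuming interrupts are already disabled by the caller):
+    //
+    //     let rq = cpu_rq(cpu);
+    //     let (rq, _guard) = rq.self_lock(); // keep `_guard` alive while using `rq`
+    //     rq.update_rq_clock();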
+
+    fn lock(&self) -> SpinLockGuard<()> {
+        let guard = self.lock.lock_irqsave();
+
+        // record which CPU took the lock
+        self.lock_on_who
+            .store(smp_get_processor_id().data() as usize, Ordering::SeqCst);
+
+        guard
+    }
+
+    pub fn enqueue_task(&mut self, pcb: Arc<ProcessControlBlock>, flags: EnqueueFlag) {
+        if !flags.contains(EnqueueFlag::ENQUEUE_NOCLOCK) {
+            self.update_rq_clock();
+        }
+
+        if !flags.contains(EnqueueFlag::ENQUEUE_RESTORE) {
+            let sched_info = pcb.sched_info().sched_stat.upgradeable_read_irqsave();
+            if sched_info.last_queued == 0 {
+                sched_info.upgrade().last_queued = self.clock;
+            }
+        }
+
+        match pcb.sched_info().policy() {
+            SchedPolicy::CFS => CompletelyFairScheduler::enqueue(self, pcb, flags),
+            SchedPolicy::FIFO => todo!(),
+            SchedPolicy::RT => todo!(),
+            SchedPolicy::IDLE => IdleScheduler::enqueue(self, pcb, flags),
+        }
+
+        // TODO:https://code.dragonos.org.cn/xref/linux-6.6.21/kernel/sched/core.c#239
+    }
+
+    pub fn dequeue_task(&mut self, pcb: Arc<ProcessControlBlock>, flags: DequeueFlag) {
+        // TODO:sched_core
+
+        if !flags.contains(DequeueFlag::DEQUEUE_NOCLOCK) {
+            self.update_rq_clock()
+        }
+
+        if !flags.contains(DequeueFlag::DEQUEUE_SAVE) {
+            let sched_info = pcb.sched_info().sched_stat.upgradeable_read_irqsave();
+
+            if sched_info.last_queued > 0 {
+                let delta = self.clock - sched_info.last_queued;
+
+                let mut sched_info = sched_info.upgrade();
+                sched_info.last_queued = 0;
+                sched_info.run_delay += delta as usize;
+
+                self.sched_info.run_delay += delta as usize;
+            }
+        }
+
+        match pcb.sched_info().policy() {
+            SchedPolicy::CFS => CompletelyFairScheduler::dequeue(self, pcb, flags),
+            SchedPolicy::FIFO => todo!(),
+            SchedPolicy::RT => todo!(),
+            SchedPolicy::IDLE => todo!(),
+        }
+    }
+
+    /// Activate a task: it will be enqueued on the run queue
+    pub fn activate_task(&mut self, pcb: &Arc<ProcessControlBlock>, mut flags: EnqueueFlag) {
+        if *pcb.sched_info().on_rq.lock_irqsave() == OnRq::Migrating {
+            flags |= EnqueueFlag::ENQUEUE_MIGRATED;
+        }
+
+        if flags.contains(EnqueueFlag::ENQUEUE_MIGRATED) {
+            todo!()
+        }
+
+        self.enqueue_task(pcb.clone(), flags);
+
+        *pcb.sched_info().on_rq.lock_irqsave() = OnRq::Queued;
+    }
+
+    /// Check whether the given task can preempt the currently running task
+    #[allow(clippy::comparison_chain)]
+    pub fn check_preempt_currnet(&mut self, pcb: &Arc<ProcessControlBlock>, flags: WakeupFlags) {
+        if pcb.sched_info().policy() == self.current().sched_info().policy() {
+            match self.current().sched_info().policy() {
+                SchedPolicy::CFS => {
+                    CompletelyFairScheduler::check_preempt_currnet(self, pcb, flags)
+                }
+                SchedPolicy::FIFO => todo!(),
+                SchedPolicy::RT => todo!(),
+                SchedPolicy::IDLE => IdleScheduler::check_preempt_currnet(self, pcb, flags),
+            }
+        } else if pcb.sched_info().policy() < self.current().sched_info().policy() {
+            // higher scheduling-class priority
+            self.resched_current();
+        }
+
+        if *self.current().sched_info().on_rq.lock_irqsave() == OnRq::Queued
+            && self.current().flags().contains(ProcessFlags::NEED_SCHEDULE)
+        {
+            self.clock_updata_flags
+                .insert(ClockUpdataFlag::RQCF_REQ_SKIP);
+        }
+    }
+
+    /// Deactivate a task: it will be removed from the run queue
+    pub fn deactivate_task(&mut self, pcb: Arc<ProcessControlBlock>, flags: DequeueFlag) {
+        *pcb.sched_info().on_rq.lock_irqsave() = if flags.contains(DequeueFlag::DEQUEUE_SLEEP) {
+            OnRq::None
+        } else {
+            OnRq::Migrating
+        };
+
+        self.dequeue_task(pcb, flags);
+    }
+
+    #[inline]
+    pub fn cfs_rq(&self) -> Arc<CfsRunQueue> {
+        self.cfs.clone()
+    }
+
+    /// Update the rq clock
+    pub fn update_rq_clock(&mut self) {
+        // this clock update needs to be skipped
+        if self
+            .clock_updata_flags
+            .contains(ClockUpdataFlag::RQCF_ACT_SKIP)
+        {
+            return;
+        }
+
+        let clock = SchedClock::sched_clock_cpu(self.cpu);
+        if clock < self.clock {
+            return;
+        }
+
+        let delta = clock - self.clock;
+        self.clock += delta;
+        // kerror!("clock {}", self.clock);
+        self.update_rq_clock_task(delta);
+    }
+
+    /// Update the task clock (clock_task), excluding time spent in IRQs
+    pub fn update_rq_clock_task(&mut self, mut delta: u64) {
+        let mut irq_delta = irq_time_read(self.cpu) - self.prev_irq_time;
+        // if self.cpu == 0 {
+        //     kerror!(
+        //         "cpu 0 delta {delta} irq_delta {} irq_time_read(self.cpu) {} self.prev_irq_time {}",
+        //         irq_delta,
+        //         irq_time_read(self.cpu),
+        //         self.prev_irq_time
+        //     );
+        // }
+        compiler_fence(Ordering::SeqCst);
+
+        if irq_delta > delta {
+            irq_delta = delta;
+        }
+
+        self.prev_irq_time += irq_delta;
+
+        delta -= irq_delta;
+
+        // todo: psi?
+
+        // send_to_default_serial8250_port(format!("\n{delta}\n",).as_bytes());
+        compiler_fence(Ordering::SeqCst);
+        self.clock_task += delta;
+        compiler_fence(Ordering::SeqCst);
+        // if self.cpu == 0 {
+        //     kerror!("cpu {} clock_task {}", self.cpu, self.clock_task);
+        // }
+        // todo: pelt?
+    }
+
+    /// Compute the change in the number of active (runnable + uninterruptible) tasks
+    fn calculate_load_fold_active(&mut self, adjust: usize) -> usize {
+        let mut nr_active = self.nr_running - adjust;
+        nr_active += self.nr_uninterruptible;
+        let mut delta = 0;
+
+        if nr_active != self.cala_load_active {
+            // the difference may be logically negative; wrapping keeps
+            // two's-complement semantics for the caller's atomic fetch_add
+            delta = nr_active.wrapping_sub(self.cala_load_active);
+            self.cala_load_active = nr_active;
+        }
+
+        delta
+    }
+
+    /// ## Update the global load accounting on the tick
+    pub fn calculate_global_load_tick(&mut self) {
+        if clock() < self.cala_load_update as u64 {
+            // not yet due for the next update, return directly
+            return;
+        }
+
+        let delta = self.calculate_load_fold_active(0);
+
+        if delta != 0 {
+            CALCULATE_LOAD_TASKS.fetch_add(delta, Ordering::SeqCst);
+        }
+
+        self.cala_load_update += LOAD_FREQ;
+    }
+
+    pub fn add_nr_running(&mut self, nr_running: usize) {
+        let prev = self.nr_running;
+
+        self.nr_running = prev + nr_running;
+        if prev < 2 && self.nr_running >= 2 && !self.overload {
+            self.overload = true;
+        }
+    }
+
+    pub fn sub_nr_running(&mut self, count: usize) {
+        self.nr_running -= count;
+    }
+
+    /// Are only SCHED_IDLE tasks runnable on this rq?
+    pub fn sched_idle_rq(&self) -> bool {
+        return unlikely(
+            self.nr_running == self.cfs.idle_h_nr_running as usize && self.nr_running > 0,
+        );
+    }
+
+    #[inline]
+    pub fn current(&self) -> Arc<ProcessControlBlock> {
+        self.current.upgrade().unwrap()
+    }
+
+    #[inline]
+    pub fn set_current(&mut self, pcb: Weak<ProcessControlBlock>) {
+        self.current = pcb;
+    }
+
+    #[inline]
+    pub fn set_idle(&mut self, pcb: Weak<ProcessControlBlock>) {
+        self.idle = pcb;
+    }
+
+    #[inline]
+    pub fn clock_task(&self) -> u64 {
+        self.clock_task
+    }
+
+    /// Request a reschedule of the current task
+    pub fn resched_current(&self) {
+        let current = self.current();
+
+        // already marked as needing a reschedule?
+        if unlikely(current.flags().contains(ProcessFlags::NEED_SCHEDULE)) {
+            return;
+        }
+
+        let cpu = self.cpu;
+
+        if cpu == smp_get_processor_id().data() as usize {
+            // assert!(
+            //     Arc::ptr_eq(&current, &ProcessManager::current_pcb()),
+            //     "rq current name {} process current {}",
+            //     current.basic().name().to_string(),
+            //     ProcessManager::current_pcb().basic().name().to_string(),
+            // );
+            // mark as needing a reschedule
+            ProcessManager::current_pcb()
+                .flags()
+                .insert(ProcessFlags::NEED_SCHEDULE);
+            return;
+        }
+
+        // send a reschedule IPI to the target CPU
+        send_resched_ipi(ProcessorId::new(cpu as u32));
+    }
+
+    /// Pick the next task to run
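+    ///
+    /// Fast path (as in Linux): when the previous task is CFS or idle and all
+    /// runnable tasks on this rq belong to CFS, pick straight from the CFS
+    /// queue and fall back to the idle task if it yields nothing.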
+    pub fn pick_next_task(&mut self, prev: Arc<ProcessControlBlock>) -> Arc<ProcessControlBlock> {
+        if likely(prev.sched_info().policy() >= SchedPolicy::CFS)
+            && self.nr_running == self.cfs.h_nr_running as usize
+        {
+            let p = CompletelyFairScheduler::pick_next_task(self, Some(prev.clone()));
+
+            if let Some(pcb) = p.as_ref() {
+                return pcb.clone();
+            } else {
+                // kerror!(
+                //     "pick idle cfs rq {:?}",
+                //     self.cfs_rq()
+                //         .entities
+                //         .iter()
+                //         .map(|x| x.1.pid)
+                //         .collect::<Vec<_>>()
+                // );
+                match prev.sched_info().policy() {
+                    SchedPolicy::FIFO => todo!(),
+                    SchedPolicy::RT => todo!(),
+                    SchedPolicy::CFS => CompletelyFairScheduler::put_prev_task(self, prev),
+                    SchedPolicy::IDLE => IdleScheduler::put_prev_task(self, prev),
+                }
+                // fall back to idle
+                return self.idle.upgrade().unwrap();
+            }
+        }
+
+        todo!()
+    }
+}
+
+bitflags! {
+    pub struct SchedFeature:u32 {
+        /// Only give sleepers 50% of their service deficit: a woken sleeper gets some credit but cannot hog the CPU.
+        const GENTLE_FAIR_SLEEPERS = 1 << 0;
+        /// Place new tasks ahead so that already-running tasks are not starved.
+        const START_DEBIT = 1 << 1;
+        /// Prefer the task we just woke, since it may access the waker's data, improving cache locality.
+        const NEXT_BUDDY = 1 << 2;
+        /// Prefer the task that ran last, since it may reuse the data it was just working on, improving cache locality.
+        const LAST_BUDDY = 1 << 3;
+        /// Consider a task's buddies cache-hot; reduces buddy migration and preserves cache locality.
+        const CACHE_HOT_BUDDY = 1 << 4;
+        /// Allow wakeups to preempt the current task.
+        const WAKEUP_PREEMPTION = 1 << 5;
+        /// Decrease CPU capacity based on time not spent running tasks.
+        const NONTASK_CAPACITY = 1 << 6;
+        /// Queue remote wakeups on the target CPU and handle them via the scheduler IPI, reducing run queue lock contention.
+        const TTWU_QUEUE = 1 << 7;
+        /// Try to limit pointless scanning of the last-level cache (LLC) domain on wakeup.
+        const SIS_UTIL = 1 << 8;
+        /// Reduce cross-CPU lock contention by sending IPIs for RT task migration.
+        const RT_PUSH_IPI = 1 << 9;
+        /// Enable estimated CPU utilization for scheduling decisions.
+        const UTIL_EST = 1 << 10;
+        const UTIL_EST_FASTUP = 1 << 11;
+        /// Enable the alternative scheduling period.
+        const ALT_PERIOD = 1 << 12;
+        /// Enable the base time slice.
+        const BASE_SLICE = 1 << 13;
+    }
+
+    pub struct EnqueueFlag: u8 {
+        const ENQUEUE_WAKEUP	= 0x01;
+        const ENQUEUE_RESTORE	= 0x02;
+        const ENQUEUE_MOVE	= 0x04;
+        const ENQUEUE_NOCLOCK	= 0x08;
+
+        const ENQUEUE_MIGRATED	= 0x40;
+
+        const ENQUEUE_INITIAL	= 0x80;
+    }
+
+    pub struct DequeueFlag: u8 {
+        const DEQUEUE_SLEEP		= 0x01;
+        const DEQUEUE_SAVE		= 0x02; /* Matches ENQUEUE_RESTORE */
+        const DEQUEUE_MOVE		= 0x04; /* Matches ENQUEUE_MOVE */
+        const DEQUEUE_NOCLOCK		= 0x08; /* Matches ENQUEUE_NOCLOCK */
+    }
+
+    pub struct WakeupFlags: u8 {
+        /* Wake flags. The first three directly map to some SD flag value */
+        const WF_EXEC         = 0x02; /* Wakeup after exec; maps to SD_BALANCE_EXEC */
+        const WF_FORK         = 0x04; /* Wakeup after fork; maps to SD_BALANCE_FORK */
+        const WF_TTWU         = 0x08; /* Wakeup;            maps to SD_BALANCE_WAKE */
+
+        const WF_SYNC         = 0x10; /* Waker goes to sleep after wakeup */
+        const WF_MIGRATED     = 0x20; /* Internal use, task got migrated */
+        const WF_CURRENT_CPU  = 0x40; /* Prefer to move the wakee to the current CPU. */
+    }
+
+    pub struct SchedMode: u8 {
+        /*
+        * Constants for the sched_mode argument of __schedule().
+        *
+        * The mode argument allows RT enabled kernels to differentiate a
+        * preemption from blocking on an 'sleeping' spin/rwlock. Note that
+        * SM_MASK_PREEMPT for !RT has all bits set, which allows the compiler to
+        * optimize the AND operation out and just check for zero.
+        */
+        /// Will not be re-queued during scheduling; needs an explicit wakeup
+        const SM_NONE			= 0x0;
+        /// Re-queued: the current task was preempted and will be scheduled again by the tick
+        const SM_PREEMPT		= 0x1;
+        /// RT-related
+        const SM_RTLOCK_WAIT		= 0x2;
+        /// By default the same as SM_PREEMPT
+        const SM_MASK_PREEMPT	= Self::SM_PREEMPT.bits;
+    }
+}
+
+#[derive(Copy, Clone, Debug, PartialEq)]
+pub enum OnRq {
+    Queued,
+    Migrating,
+    None,
+}
+
+impl ProcessManager {
+    pub fn update_process_times(user_tick: bool) {
+        let pcb = Self::current_pcb();
+        CpuTimeFunc::irqtime_account_process_tick(&pcb, user_tick, 1);
+
+        scheduler_tick();
+    }
+}
+
+/// ## Called on every timer tick
+pub fn scheduler_tick() {
+    fence(Ordering::SeqCst);
+    // get the current CPU index
+    let cpu_idx = smp_get_processor_id().data() as usize;
+
+    // get the current CPU's run queue
+    let rq = cpu_rq(cpu_idx);
+
+    let (rq, guard) = rq.self_lock();
+
+    // get the task currently running on this run queue
+    let current = rq.current();
+
+    // update the run queue clock
+    rq.update_rq_clock();
+
+    match current.sched_info().policy() {
+        SchedPolicy::CFS => CompletelyFairScheduler::tick(rq, current, false),
+        SchedPolicy::FIFO => todo!(),
+        SchedPolicy::RT => todo!(),
+        SchedPolicy::IDLE => IdleScheduler::tick(rq, current, false),
+    }
+
+    rq.calculate_global_load_tick();
+
+    drop(guard);
+    // TODO: handle load balancing
+}
+
+/// ## Perform scheduling
+/// Panics if preempt_count is not 0
+#[inline]
+pub fn schedule(sched_mod: SchedMode) {
+    let _guard = unsafe { CurrentIrqArch::save_and_disable_irq() };
+    assert!(ProcessManager::current_pcb().preempt_count() == 0);
+    __schedule(sched_mod);
+}
+
+/// ## Perform scheduling
+/// Unlike schedule(), this function does not check preempt_count,
+/// which makes it suitable for contexts such as the timer interrupt
+pub fn __schedule(sched_mod: SchedMode) {
+    let cpu = smp_get_processor_id().data() as usize;
+    let rq = cpu_rq(cpu);
+
+    let mut prev = rq.current();
+    if let ProcessState::Exited(_) = prev.clone().sched_info().inner_lock_read_irqsave().state() {
+        // entered schedule() from exit
+        prev = ProcessManager::current_pcb();
+    }
+
+    // TODO: hrtick_clear(rq);
+
+    let (rq, _guard) = rq.self_lock();
+
+    rq.clock_updata_flags = ClockUpdataFlag::from_bits_truncate(rq.clock_updata_flags.bits() << 1);
+
+    rq.update_rq_clock();
+    rq.clock_updata_flags = ClockUpdataFlag::RQCF_UPDATE;
+
+    // kBUG!(
+    //     "before cfs rq pcbs {:?}\nvruntimes {:?}\n",
+    //     rq.cfs
+    //         .entities
+    //         .iter()
+    //         .map(|x| { x.1.pcb().pid() })
+    //         .collect::<Vec<_>>(),
+    //     rq.cfs
+    //         .entities
+    //         .iter()
+    //         .map(|x| { x.1.vruntime })
+    //         .collect::<Vec<_>>(),
+    // );
+    // kwarn!(
+    //     "before cfs rq {:?} prev {:?}",
+    //     rq.cfs
+    //         .entities
+    //         .iter()
+    //         .map(|x| { x.1.pcb().pid() })
+    //         .collect::<Vec<_>>(),
+    //     prev.pid()
+    // );
+
+    // kerror!("prev pid {:?} {:?}", prev.pid(), prev.sched_info().policy());
+    if !sched_mod.contains(SchedMode::SM_MASK_PREEMPT)
+        && prev.sched_info().policy() != SchedPolicy::IDLE
+        && prev.sched_info().inner_lock_read_irqsave().is_mark_sleep()
+    {
+        // kwarn!("deactivate_task prev {:?}", prev.pid());
+        // TODO: signal handling is needed here
+        // https://code.dragonos.org.cn/xref/linux-6.6.21/kernel/sched/core.c?r=&mo=172979&fi=6578#6630
+        rq.deactivate_task(
+            prev.clone(),
+            DequeueFlag::DEQUEUE_SLEEP | DequeueFlag::DEQUEUE_NOCLOCK,
+        );
+    }
+
+    let next = rq.pick_next_task(prev.clone());
+
+    // kBUG!(
+    //     "after cfs rq pcbs {:?}\nvruntimes {:?}\n",
+    //     rq.cfs
+    //         .entities
+    //         .iter()
+    //         .map(|x| { x.1.pcb().pid() })
+    //         .collect::<Vec<_>>(),
+    //     rq.cfs
+    //         .entities
+    //         .iter()
+    //         .map(|x| { x.1.vruntime })
+    //         .collect::<Vec<_>>(),
+    // );
+
+    // kerror!("next {:?}", next.pid());
+
+    prev.flags().remove(ProcessFlags::NEED_SCHEDULE);
+    fence(Ordering::SeqCst);
+    if likely(!Arc::ptr_eq(&prev, &next)) {
+        rq.set_current(Arc::downgrade(&next));
+        // kwarn!(
+        //     "switch_process prev {:?} next {:?} sched_mode {sched_mod:?}",
+        //     prev.pid(),
+        //     next.pid()
+        // );
+
+        // send_to_default_serial8250_port(
+        //     format!(
+        //         "switch_process prev {:?} next {:?} sched_mode {sched_mod:?}\n",
+        //         prev.pid(),
+        //         next.pid()
+        //     )
+        //     .as_bytes(),
+        // );
+
+        // CurrentApic.send_eoi();
+        compiler_fence(Ordering::SeqCst);
+        #[cfg(target_arch = "x86_64")]
+        unsafe {
+            ProcessManager::switch_process(prev, next)
+        };
+        #[cfg(target_arch = "riscv64")]
+        todo!()
+    } else {
+        kwarn!(
+            "!!!switch_process {} {:?} to self ",
+            prev.basic().name(),
+            prev.pid(),
+        );
+
+        assert!(
+            Arc::ptr_eq(&ProcessManager::current_pcb(), &prev),
+            "{}",
+            ProcessManager::current_pcb().basic().name()
+        );
+    }
+}
+
+pub fn sched_fork(pcb: &Arc<ProcessControlBlock>) -> Result<(), SystemError> {
+    let mut prio_guard = pcb.sched_info().prio_data.write_irqsave();
+    let current = ProcessManager::current_pcb();
+
+    prio_guard.prio = current.sched_info().prio_data.read_irqsave().normal_prio;
+
+    if PrioUtil::dl_prio(prio_guard.prio) {
+        return Err(SystemError::EAGAIN_OR_EWOULDBLOCK);
+    } else if PrioUtil::rt_prio(prio_guard.prio) {
+        let policy = &pcb.sched_info().sched_policy;
+        *policy.write_irqsave() = SchedPolicy::RT;
+    } else {
+        let policy = &pcb.sched_info().sched_policy;
+        *policy.write_irqsave() = SchedPolicy::CFS;
+    }
+
+    pcb.sched_info()
+        .sched_entity()
+        .force_mut()
+        .init_entity_runnable_average();
+
+    Ok(())
+}
+
+pub fn sched_cgroup_fork(pcb: &Arc<ProcessControlBlock>) {
+    __set_task_cpu(pcb, smp_get_processor_id());
+    match pcb.sched_info().policy() {
+        SchedPolicy::RT => todo!(),
+        SchedPolicy::FIFO => todo!(),
+        SchedPolicy::CFS => CompletelyFairScheduler::task_fork(pcb.clone()),
+        SchedPolicy::IDLE => todo!(),
+    }
+}
+
+fn __set_task_cpu(pcb: &Arc<ProcessControlBlock>, cpu: ProcessorId) {
+    // TODO: Fixme There is not implement group sched;
+    let se = pcb.sched_info().sched_entity();
+    let rq = cpu_rq(cpu.data() as usize);
+    se.force_mut().set_cfs(Arc::downgrade(&rq.cfs));
+}
+
+#[inline(never)]
+pub fn sched_init() {
+    // initialize per-CPU variables
+    unsafe {
+        CPU_IRQ_TIME = Some(Vec::with_capacity(PerCpu::MAX_CPU_NUM as usize));
+        CPU_IRQ_TIME
+            .as_mut()
+            .unwrap()
+            .resize_with(PerCpu::MAX_CPU_NUM as usize, || Box::leak(Box::default()));
+
+        let mut cpu_runqueue = Vec::with_capacity(PerCpu::MAX_CPU_NUM as usize);
+        for cpu in 0..PerCpu::MAX_CPU_NUM as usize {
+            let rq = Arc::new(CpuRunQueue::new(cpu));
+            rq.cfs.force_mut().set_rq(Arc::downgrade(&rq));
+            cpu_runqueue.push(rq);
+        }
+
+        CPU_RUNQUEUE.init(PerCpuVar::new(cpu_runqueue).unwrap());
+    };
+}
+
+#[inline]
+pub fn send_resched_ipi(cpu: ProcessorId) {
+    send_ipi(IpiKind::KickCpu, IpiTarget::Specified(cpu));
+}

+ 260 - 0
kernel/src/sched/pelt.rs

@@ -0,0 +1,260 @@
+use core::intrinsics::unlikely;
+
+use alloc::sync::Arc;
+
+use crate::process::ProcessControlBlock;
+
+use super::{
+    fair::{CfsRunQueue, FairSchedEntity},
+    CpuRunQueue, LoadWeight, SchedPolicy, SCHED_CAPACITY_SCALE, SCHED_CAPACITY_SHIFT,
+};
+
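+/// Precomputed y^n scaled by 2^32 (the same table as Linux's pelt.c), with
+/// y^32 = 1/2: `decay_load(val, n)` computes `val * y^n` via one table lookup
+/// plus a halving per full 32-period span, e.g. decay_load(1024, 32) ≈ 512.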
+const RUNNABLE_AVG_Y_N_INV: [u32; 32] = [
+    0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6, 0xe0ccdeeb, 0xdbfbb796,
+    0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85, 0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46,
+    0xb504f333, 0xb123f581, 0xad583ee9, 0xa9a15ab4, 0xa5fed6a9, 0xa2704302, 0x9ef5325f, 0x9b8d39b9,
+    0x9837f050, 0x94f4efa8, 0x91c3d373, 0x8ea4398a, 0x8b95c1e3, 0x88980e80, 0x85aac367, 0x82cd8698,
+];
+
+pub const LOAD_AVG_PERIOD: u64 = 32;
+pub const LOAD_AVG_MAX: usize = 47742;
+pub const PELT_MIN_DIVIDER: usize = LOAD_AVG_MAX - 1024;
+
+#[derive(Debug, Default)]
+pub struct SchedulerAvg {
+    /// Time these averages were last updated
+    pub last_update_time: u64,
+    /// Sum of the load of all runnable tasks
+    pub load_sum: u64,
+    /// Sum of the runnable time of all tasks
+    pub runnable_sum: u64,
+    /// Sum of the running time of all tasks
+    pub util_sum: u64,
+    /// Contribution within the current (partial) period, used for the average CPU-utilization calculation
+    pub period_contrib: u32,
+
+    pub load_avg: usize,
+    pub runnable_avg: usize,
+    pub util_avg: usize,
+}
+
+impl SchedulerAvg {
+    #[inline]
+    pub fn get_pelt_divider(&self) -> usize {
+        return PELT_MIN_DIVIDER + self.period_contrib as usize;
+    }
+
+    pub fn update_load_sum(
+        &mut self,
+        now: u64,
+        load: u32,
+        mut runnable: u32,
+        mut running: u32,
+    ) -> bool {
+        if now < self.last_update_time {
+            self.last_update_time = now;
+            return false;
+        }
+
+        let mut delta = now - self.last_update_time;
+        delta >>= 10;
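+        // timestamps are in nanoseconds; >> 10 approximates dividing by 1024,
+        // so the sums advance in roughly-microsecond units and a full PELT
+        // period is 1024 of them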
+
+        if delta == 0 {
+            return false;
+        }
+
+        self.last_update_time += delta << 10;
+
+        if load == 0 {
+            runnable = 0;
+            running = 0;
+        }
+
+        self.accumulate_sum(delta, load, runnable, running) != 0
+    }
+
+    pub fn accumulate_sum(
+        &mut self,
+        mut delta: u64,
+        load: u32,
+        runnable: u32,
+        running: u32,
+    ) -> u64 {
+        let mut contrib = delta as u32;
+
+        delta += self.period_contrib as u64;
+
+        let periods = delta / 1024;
+
+        if periods > 0 {
+            self.load_sum = Self::decay_load(self.load_sum, periods);
+            self.runnable_sum = Self::decay_load(self.runnable_sum, periods);
+            self.util_sum = Self::decay_load(self.util_sum, periods);
+
+            delta %= 1024;
+            if load > 0 {
+                contrib = Self::accumulate_pelt_segments(
+                    periods,
+                    1024 - self.period_contrib,
+                    delta as u32,
+                );
+            }
+        }
+
+        self.period_contrib = delta as u32;
+
+        if load > 0 {
+            self.load_sum += contrib as u64 * load as u64;
+        }
+        if runnable > 0 {
+            // runnable scales contrib; widen to u64 before the shift so the
+            // 32-bit intermediate cannot overflow
+            self.runnable_sum += (runnable as u64 * contrib as u64) << SCHED_CAPACITY_SHIFT;
+        }
+
+        if running > 0 {
+            self.util_sum += (contrib << SCHED_CAPACITY_SHIFT) as u64;
+        }
+
+        return periods;
+    }
+
+    fn decay_load(mut val: u64, n: u64) -> u64 {
+        // after 63 full half-life periods any u64 value has decayed to zero
+        if unlikely(n > LOAD_AVG_PERIOD * 63) {
+            return 0;
+        }
+
+        let mut local_n = n;
+
+        if unlikely(local_n >= LOAD_AVG_PERIOD) {
+            val >>= local_n / LOAD_AVG_PERIOD;
+            local_n %= LOAD_AVG_PERIOD;
+        }
+
+        ((val as i128 * RUNNABLE_AVG_Y_N_INV[local_n as usize] as i128) >> 32) as u64
+    }
+
+    fn accumulate_pelt_segments(periods: u64, d1: u32, d3: u32) -> u32 {
+        /* y^0 == 1 */
+        let c3 = d3;
+
+        /*
+         * c1 = d1 y^p
+         */
+        let c1 = Self::decay_load(d1 as u64, periods) as u32;
+
+        /*
+         *            p-1
+         * c2 = 1024 \Sum y^n
+         *            n=1
+         *
+         *              inf        inf
+         *    = 1024 ( \Sum y^n - \Sum y^n - y^0 )
+         *              n=0        n=p
+         */
+        let c2 = LOAD_AVG_MAX as u32 - Self::decay_load(LOAD_AVG_MAX as u64, periods) as u32 - 1024;
+
+        return c1 + c2 + c3;
+    }
+
+    pub fn update_load_avg(&mut self, load: u64) {
+        let divider = self.get_pelt_divider();
+
+        self.load_avg = (load * self.load_sum) as usize / divider;
+        self.runnable_avg = self.runnable_sum as usize / divider;
+        self.util_avg = self.util_sum as usize / divider;
+    }
+
+    #[allow(dead_code)]
+    pub fn post_init_entity_util_avg(pcb: &Arc<ProcessControlBlock>) {
+        let se = pcb.sched_info().sched_entity();
+        let cfs_rq = se.cfs_rq();
+        let sa = &mut se.force_mut().avg;
+
+        // TODO: this is architecture-dependent
+        let cpu_scale = SCHED_CAPACITY_SCALE;
+
+        let cap = (cpu_scale as isize - cfs_rq.avg.util_avg as isize) / 2;
+
+        if pcb.sched_info().policy() != SchedPolicy::CFS {
+            sa.last_update_time = cfs_rq.cfs_rq_clock_pelt();
+        }
+
+        if cap > 0 {
+            if cfs_rq.avg.util_avg != 0 {
+                sa.util_avg = cfs_rq.avg.util_avg * se.load.weight as usize;
+                sa.util_avg /= cfs_rq.avg.load_avg + 1;
+
+                if sa.util_avg as isize > cap {
+                    sa.util_avg = cap as usize;
+                }
+            } else {
+                sa.util_avg = cap as usize;
+            }
+        }
+
+        sa.runnable_avg = sa.util_avg;
+    }
+}
+
+impl CpuRunQueue {
+    pub fn rq_clock_pelt(&self) -> u64 {
+        self.clock_pelt - self.lost_idle_time
+    }
+}
+
+impl CfsRunQueue {
+    pub fn cfs_rq_clock_pelt(&self) -> u64 {
+        if unlikely(self.throttled_count > 0) {
+            return self.throttled_clock_pelt - self.throttled_clock_pelt_time;
+        }
+
+        let rq = self.rq();
+        let (rq, _guard) = rq.self_lock();
+
+        return rq.rq_clock_pelt() - self.throttled_clock_pelt_time;
+    }
+}
+
+impl FairSchedEntity {
+    pub fn update_load_avg(&mut self, cfs_rq: &mut CfsRunQueue, now: u64) -> bool {
+        if self.avg.update_load_sum(
+            now,
+            self.on_rq as u32,
+            self.runnable() as u32,
+            cfs_rq.is_curr(&self.self_arc()) as u32,
+        ) {
+            self.avg
+                .update_load_avg(LoadWeight::scale_load_down(self.load.weight));
+
+            return true;
+        }
+
+        return false;
+    }
+}
+
+bitflags! {
+    pub struct UpdateAvgFlags: u8 {
+        /// Update task group information
+        const UPDATE_TG	= 0x1;
+
+        /// Skip the aging/load update
+        const SKIP_AGE_LOAD	= 0x2;
+        /// Perform an attach
+        const DO_ATTACH	= 0x4;
+        /// Perform a detach
+        const DO_DETACH	= 0x8;
+    }
+}
+
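+/// Saturating add that clamps the result at zero. The `&mut isize` must point
+/// at a real variable: `&mut (x as isize)` would only mutate a temporary copy.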
+pub fn add_positive(x: &mut isize, y: isize) {
+    let res = *x + y;
+    *x = res.max(0);
+}
+
+pub fn sub_positive(x: &mut usize, y: usize) {
+    if *x > y {
+        *x -= y;
+    } else {
+        *x = 0;
+    }
+}

+ 33 - 0
kernel/src/sched/prio.rs

@@ -0,0 +1,33 @@
+pub const MAX_NICE: i32 = 20;
+pub const MIN_NICE: i32 = -20;
+pub const NICE_WIDTH: i32 = MAX_NICE - MIN_NICE + 1;
+
+pub const MAX_RT_PRIO: i32 = 100;
+pub const MAX_PRIO: i32 = MAX_RT_PRIO + NICE_WIDTH;
+#[allow(dead_code)]
+pub const DEFAULT_PRIO: i32 = MAX_RT_PRIO + NICE_WIDTH / 2;
+
+pub const MAX_DL_PRIO: i32 = 0;
+pub struct PrioUtil;
+#[allow(dead_code)]
+impl PrioUtil {
+    #[inline]
+    pub fn nice_to_prio(nice: i32) -> i32 {
+        nice + DEFAULT_PRIO
+    }
+
+    #[inline]
+    pub fn prio_to_nice(prio: i32) -> i32 {
+        prio - DEFAULT_PRIO
+    }
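+
+    // With MAX_RT_PRIO = 100 and DEFAULT_PRIO = 120 as defined above,
+    // nice_to_prio(0) == 120, nice_to_prio(MIN_NICE) == 100 and
+    // nice_to_prio(MAX_NICE) == 140, i.e. MAX_PRIO - 1.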
+
+    #[inline]
+    pub fn dl_prio(prio: i32) -> bool {
+        return prio < MAX_DL_PRIO;
+    }
+
+    #[inline]
+    pub fn rt_prio(prio: i32) -> bool {
+        return prio < MAX_RT_PRIO;
+    }
+}

+ 0 - 235
kernel/src/sched/rt.rs

@@ -1,235 +0,0 @@
-use core::sync::atomic::compiler_fence;
-
-use alloc::{boxed::Box, collections::LinkedList, sync::Arc, vec::Vec};
-
-use crate::{
-    arch::cpu::current_cpu_id,
-    kBUG, kdebug,
-    libs::spinlock::SpinLock,
-    mm::percpu::PerCpu,
-    process::{ProcessControlBlock, ProcessFlags, ProcessManager},
-    smp::cpu::ProcessorId,
-};
-
-use super::{
-    core::{sched_enqueue, Scheduler},
-    SchedPolicy,
-};
-
-/// Global RT scheduler instance
-pub static mut RT_SCHEDULER_PTR: Option<Box<SchedulerRT>> = None;
-
-/// @brief 获取rt调度器实例的可变引用
-#[inline]
-pub fn __get_rt_scheduler() -> &'static mut SchedulerRT {
-    return unsafe { RT_SCHEDULER_PTR.as_mut().unwrap() };
-}
-
-/// @brief Initialize the RT scheduler
-pub unsafe fn sched_rt_init() {
-    kdebug!("rt scheduler init");
-    if RT_SCHEDULER_PTR.is_none() {
-        RT_SCHEDULER_PTR = Some(Box::new(SchedulerRT::new()));
-    } else {
-        kBUG!("Try to init RT Scheduler twice.");
-        panic!("Try to init RT Scheduler twice.");
-    }
-}
-/// @brief RT run queue (per-CPU)
-#[derive(Debug)]
-struct RTQueue {
-    /// Lock-protected doubly linked list of processes
-    locked_queue: SpinLock<LinkedList<Arc<ProcessControlBlock>>>,
-}
-
-impl RTQueue {
-    pub fn new() -> RTQueue {
-        RTQueue {
-            locked_queue: SpinLock::new(LinkedList::new()),
-        }
-    }
-    /// @brief Add a pcb to the queue
-    pub fn enqueue(&mut self, pcb: Arc<ProcessControlBlock>) {
-        let mut queue = self.locked_queue.lock_irqsave();
-
-        // The IDLE process is never enqueued
-        if pcb.pid().into() == 0 {
-            return;
-        }
-        queue.push_back(pcb);
-    }
-
-    /// @brief Pop a pcb from the head of the queue; returns None if the queue is empty
-    pub fn dequeue(&mut self) -> Option<Arc<ProcessControlBlock>> {
-        let res: Option<Arc<ProcessControlBlock>>;
-        let mut queue = self.locked_queue.lock_irqsave();
-        if queue.len() > 0 {
-            // Queue is not empty: return the next pcb to run
-            res = Some(queue.pop_front().unwrap());
-        } else {
-            // Queue is empty: return None
-            res = None;
-        }
-        return res;
-    }
-    pub fn enqueue_front(&mut self, pcb: Arc<ProcessControlBlock>) {
-        let mut queue = self.locked_queue.lock_irqsave();
-
-        // The IDLE process is never enqueued
-        if pcb.pid().into() == 0 {
-            return;
-        }
-        queue.push_front(pcb);
-    }
-
-    #[allow(dead_code)]
-    pub fn get_rt_queue_size(&mut self) -> usize {
-        let queue = self.locked_queue.lock_irqsave();
-        return queue.len();
-    }
-}
-
-/// @brief The RT scheduler
-pub struct SchedulerRT {
-    cpu_queue: Vec<Vec<&'static mut RTQueue>>,
-    load_list: Vec<&'static mut LinkedList<u64>>,
-}
-
-impl SchedulerRT {
-    const RR_TIMESLICE: isize = 100;
-    const MAX_RT_PRIO: isize = 100;
-
-    pub fn new() -> SchedulerRT {
-        // The number of cores is hard-coded for now
-        // todo: get the core count from the cpu module
-        let mut result = SchedulerRT {
-            cpu_queue: Default::default(),
-            load_list: Default::default(),
-        };
-
-        // Create run queues for each cpu core
-        for cpu_id in 0..PerCpu::MAX_CPU_NUM {
-            result.cpu_queue.push(Vec::new());
-            // Each CPU gets MAX_RT_PRIO priority queues
-            for _ in 0..SchedulerRT::MAX_RT_PRIO {
-                result.cpu_queue[cpu_id as usize].push(Box::leak(Box::new(RTQueue::new())));
-            }
-        }
-        // Create a load-statistics list for each cpu core
-        for _ in 0..PerCpu::MAX_CPU_NUM {
-            result.load_list.push(Box::leak(Box::default()));
-        }
-        return result;
-    }
-
-    /// @brief Pick the next runnable rt process
-    pub fn pick_next_task_rt(&mut self, cpu_id: ProcessorId) -> Option<Arc<ProcessControlBlock>> {
-        // Loop over the queues until a process is found
-        // This should be the number of priorities, not the number of CPUs; needs fixing
-        for i in 0..SchedulerRT::MAX_RT_PRIO {
-            let cpu_queue_i: &mut RTQueue = self.cpu_queue[cpu_id.data() as usize][i as usize];
-            let proc: Option<Arc<ProcessControlBlock>> = cpu_queue_i.dequeue();
-            if proc.is_some() {
-                return proc;
-            }
-        }
-        // Nothing found: return None
-        None
-    }
-
-    pub fn rt_queue_len(&mut self, cpu_id: ProcessorId) -> usize {
-        let mut sum = 0;
-        for prio in 0..SchedulerRT::MAX_RT_PRIO {
-            sum += self.cpu_queue[cpu_id.data() as usize][prio as usize].get_rt_queue_size();
-        }
-        return sum;
-    }
-
-    #[allow(dead_code)]
-    #[inline]
-    pub fn load_list_len(&mut self, cpu_id: u32) -> usize {
-        return self.load_list[cpu_id as usize].len();
-    }
-
-    pub fn enqueue_front(&mut self, pcb: Arc<ProcessControlBlock>) {
-        let cpu_id = current_cpu_id().data() as usize;
-        let priority = pcb.sched_info().priority().data() as usize;
-
-        self.cpu_queue[cpu_id][priority].enqueue_front(pcb);
-    }
-
-    pub fn timer_update_jiffies(&self) {
-        ProcessManager::current_pcb()
-            .sched_info()
-            .increase_rt_time_slice(-1);
-    }
-}
-
-impl Scheduler for SchedulerRT {
-    /// @brief Schedule on the current cpu.
-    /// Note that interrupts must be disabled before entering this function
-    fn sched(&mut self) -> Option<Arc<ProcessControlBlock>> {
-        ProcessManager::current_pcb()
-            .flags()
-            .remove(ProcessFlags::NEED_SCHEDULE);
-        // On the normal path a next pcb is always picked here; if it is None, an error must be raised
-        let cpu_id = current_cpu_id();
-        let proc: Arc<ProcessControlBlock> =
-            self.pick_next_task_rt(cpu_id).expect("No RT process found");
-        let priority = proc.sched_info().priority();
-        let policy = proc.sched_info().inner_lock_read_irqsave().policy();
-        match policy {
-            // Under the FIFO policy, a task keeps the cpu until a higher-priority task becomes ready (equal priority is not enough) or it gives up voluntarily (waiting on a resource)
-            SchedPolicy::FIFO => {
-                // If the picked process has lower priority than the current one, do not switch
-                if proc.sched_info().priority()
-                    <= ProcessManager::current_pcb().sched_info().priority()
-                {
-                    sched_enqueue(proc, false);
-                } else {
-                    // Put the current process back on the queue
-                    sched_enqueue(ProcessManager::current_pcb(), false);
-                    compiler_fence(core::sync::atomic::Ordering::SeqCst);
-                    return Some(proc);
-                }
-            }
-
-            // The RR policy has to account for time slices
-            SchedPolicy::RR => {
-                // Equal priority: consider switching
-                if proc.sched_info().priority()
-                    >= ProcessManager::current_pcb().sched_info().priority()
-                {
-                    // Check whether this process's time slice is used up; if so, reset it to the initial value and enqueue the process
-                    if proc.sched_info().rt_time_slice() <= 0 {
-                        proc.sched_info()
-                            .set_rt_time_slice(SchedulerRT::RR_TIMESLICE);
-                        proc.flags().insert(ProcessFlags::NEED_SCHEDULE);
-                        sched_enqueue(proc, false);
-                    }
-                    // The target process still has time slice left: switch to it
-                    else {
-                        // Put the current process back on the queue
-                        sched_enqueue(ProcessManager::current_pcb(), false);
-                        compiler_fence(core::sync::atomic::Ordering::SeqCst);
-                        return Some(proc);
-                    }
-                }
-                // curr has the higher priority, so it must be a real-time process; re-enqueue the picked process, at the head of the queue
-                else {
-                    self.cpu_queue[cpu_id.data() as usize][priority.data() as usize]
-                        .enqueue_front(proc);
-                }
-            }
-            _ => panic!("unsupported schedule policy"),
-        }
-        return None;
-    }
-
-    fn enqueue(&mut self, pcb: Arc<ProcessControlBlock>) {
-        let cpu_id = pcb.sched_info().on_cpu().unwrap();
-        let cpu_queue = &mut self.cpu_queue[cpu_id.data() as usize];
-        let priority = pcb.sched_info().priority().data() as usize;
-        cpu_queue[priority].enqueue(pcb);
-    }
-}

+ 0 - 23
kernel/src/sched/sched.h

@@ -1,23 +0,0 @@
-#pragma once
-
-#include <common/glib.h>
-#include <process/process.h>
-
-/*
- * Scheduling policies
- */
-#define SCHED_NORMAL 0
-#define SCHED_FIFO 1
-#define SCHED_RR 2
-#define SCHED_BATCH 3
-/* SCHED_ISO: reserved but not implemented yet */
-#define SCHED_IDLE 5
-#define SCHED_DEADLINE 6
-#define SCHED_MAX_POLICY_NUM SCHED_DEADLINE
-
-#define IS_VALID_SCHED_POLICY(_policy) ((_policy) > 0 && (_policy) <= SCHED_MAX_POLICY_NUM)
-
-// ================= Rust implementation =============
-
-extern void sched_init();
-extern void sched();

+ 0 - 39
kernel/src/sched/syscall.rs

@@ -1,39 +0,0 @@
-use system_error::SystemError;
-
-use crate::{
-    arch::CurrentIrqArch, exception::InterruptArch, process::ProcessManager,
-    smp::core::smp_get_processor_id, syscall::Syscall,
-};
-
-use super::core::{do_sched, CPU_EXECUTING};
-
-impl Syscall {
-    /// @brief System call that makes the system run the scheduler immediately
-    /// Note that this system call must not be issued from ring 3
-    #[inline(always)]
-    pub fn sched(from_user: bool) -> Result<usize, SystemError> {
-        let irq_guard = unsafe { CurrentIrqArch::save_and_disable_irq() };
-
-        // Permission check: refuse scheduling requests from user mode
-        if from_user {
-            return Err(SystemError::EPERM);
-        }
-        // Perform the switch uniformly according to the scheduling result
-        let pcb = do_sched();
-
-        if let Some(next_pcb) = pcb {
-            let current_pcb = ProcessManager::current_pcb();
-            // kdebug!("sched: current_pcb: {:?}, next_pcb: {:?}\n", current_pcb, next_pcb);
-            if current_pcb.pid() != next_pcb.pid() {
-                CPU_EXECUTING.set(smp_get_processor_id(), next_pcb.pid());
-                unsafe { ProcessManager::switch_process(current_pcb, next_pcb) };
-            }
-        }
-        drop(irq_guard);
-        return Ok(0);
-    }
-
-    pub fn sched_yield() -> Result<usize, SystemError> {
-        return Syscall::sched(false);
-    }
-}

+ 16 - 5
kernel/src/syscall/mod.rs

@@ -6,6 +6,7 @@ use core::{
 
 use crate::{
     arch::{ipc::signal::SigSet, syscall::nr::*},
+    driver::base::device::device_number::DeviceNumber,
     filesystem::vfs::syscall::{PosixStatfs, PosixStatx},
     libs::{futex::constant::FutexFlag, rand::GRandFlags},
     mm::syscall::MremapFlags,
@@ -13,8 +14,9 @@ use crate::{
     process::{
         fork::KernelCloneArgs,
         resource::{RLimit64, RUsage},
-        ProcessManager,
+        ProcessFlags, ProcessManager,
     },
+    sched::{schedule, SchedMode},
     syscall::user_access::check_and_clone_cstr,
 };
 
@@ -381,7 +383,11 @@ impl Syscall {
 
             SYS_GETPID => Self::getpid().map(|pid| pid.into()),
 
-            SYS_SCHED => Self::sched(frame.is_from_user()),
+            SYS_SCHED => {
+                kwarn!("syscall sched");
+                schedule(SchedMode::SM_NONE);
+                Ok(0)
+            }
             SYS_DUP => {
                 let oldfd: i32 = args[0] as c_int;
                 Self::dup(oldfd)
@@ -652,8 +658,6 @@ impl Syscall {
 
             #[cfg(target_arch = "x86_64")]
             SYS_MKNOD => {
-                use crate::driver::base::device::device_number::DeviceNumber;
-
                 let path = args[0];
                 let flags = args[1];
                 let dev_t = args[2];
@@ -1019,7 +1023,7 @@ impl Syscall {
                 Err(SystemError::ENOSYS)
             }
 
-            SYS_SCHED_YIELD => Self::sched_yield(),
+            // SYS_SCHED_YIELD => Self::sched_yield(),
             SYS_UNAME => {
                 let name = args[0] as *mut PosixOldUtsName;
                 Self::uname(name)
@@ -1028,6 +1032,13 @@ impl Syscall {
             _ => panic!("Unsupported syscall ID: {}", syscall_num),
         };
 
+        if ProcessManager::current_pcb()
+            .flags()
+            .contains(ProcessFlags::NEED_SCHEDULE)
+        {
+            schedule(SchedMode::SM_PREEMPT);
+        }
+
         return r;
     }
 

+ 3 - 2
kernel/src/time/clocksource.rs

@@ -15,7 +15,7 @@ use system_error::SystemError;
 use unified_init::macros::unified_init;
 
 use crate::{
-    arch::{sched::sched, CurrentIrqArch},
+    arch::CurrentIrqArch,
     exception::InterruptArch,
     init::initcall::INITCALL_LATE,
     kdebug, kinfo,
@@ -24,6 +24,7 @@ use crate::{
         kthread::{KernelThreadClosure, KernelThreadMechanism},
         ProcessControlBlock, ProcessManager,
     },
+    sched::{schedule, SchedMode},
 };
 
 use super::{
@@ -823,7 +824,7 @@ pub fn clocksource_watchdog_kthread() -> i32 {
         let irq_guard = unsafe { CurrentIrqArch::save_and_disable_irq() };
         ProcessManager::mark_sleep(true).expect("clocksource_watchdog_kthread:mark sleep failed");
         drop(irq_guard);
-        sched();
+        schedule(SchedMode::SM_NONE);
     }
     return 0;
 }
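
The watchdog kthread's sleep sequence is order-sensitive: the task marks itself sleeping before calling schedule(), so a wakeup racing in between puts it straight back on the run queue instead of being lost. A standalone sketch of the same mark-then-yield ordering using std threads (illustrative only; park() plays the role of schedule()):

```rust
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::Arc;
use std::thread;

fn main() {
    let sleeping = Arc::new(AtomicBool::new(false));
    let flag = sleeping.clone();

    let worker = thread::spawn(move || {
        // 1. Mark the sleep state first (the mark_sleep step).
        flag.store(true, Ordering::SeqCst);
        // 2. Then yield (the schedule step). park() may return spuriously
        //    or because unpark() already ran, so recheck the condition.
        while flag.load(Ordering::SeqCst) {
            thread::park();
        }
    });

    // Waker side: wait for the sleep mark, clear it, then wake. Because
    // the mark precedes the yield, the wakeup cannot be missed.
    while !sleeping.load(Ordering::SeqCst) {
        std::hint::spin_loop();
    }
    sleeping.store(false, Ordering::SeqCst);
    worker.thread().unpark();
    worker.join().unwrap();
}
```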

+ 1 - 0
kernel/src/time/jiffies.rs

@@ -18,6 +18,7 @@ lazy_static! {
 pub const JIFFIES_SHIFT: u32 = 8;
 pub const LATCH: u32 = (CLOCK_TICK_RATE + (HZ as u32) / 2) / HZ as u32;
 pub const ACTHZ: u32 = sh_div(CLOCK_TICK_RATE, LATCH, 8);
+pub const TICK_NESC: u32 = (NSEC_PER_SEC + (HZ as u32) / 2) / HZ as u32;
//TODO: write a test to make sure the clock tick interval matches reality (cross-check the two clock sources)
 pub const NSEC_PER_JIFFY: u32 = (((NSEC_PER_SEC as u64) << 8) / ACTHZ as u64) as u32;
 pub const fn sh_div(nom: u32, den: u32, lsh: u32) -> u32 {
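
The new TICK_NESC constant uses the same rounded-division idiom as LATCH: (n + d / 2) / d rounds to the nearest integer instead of truncating. A worked check of the idiom (the HZ values are illustrative, not necessarily DragonOS's configured tick rate):

```rust
// Round-to-nearest integer division, as used by LATCH and TICK_NESC.
fn rounded_div(n: u32, d: u32) -> u32 {
    (n + d / 2) / d
}

fn main() {
    // At 250 Hz the division is exact: 4_000_000 ns per tick.
    assert_eq!(rounded_div(1_000_000_000, 250), 4_000_000);
    // At 1024 Hz the true period is 976_562.5 ns; rounding picks the
    // nearer value, while plain truncation loses half a nanosecond
    // on every tick.
    assert_eq!(rounded_div(1_000_000_000, 1024), 976_563);
    assert_eq!(1_000_000_000u32 / 1024, 976_562);
}
```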

+ 3 - 2
kernel/src/time/sleep.rs

@@ -4,10 +4,11 @@ use alloc::{boxed::Box, sync::Arc};
 use system_error::SystemError;
 
 use crate::{
-    arch::{sched::sched, CurrentIrqArch, CurrentTimeArch},
+    arch::{CurrentIrqArch, CurrentTimeArch},
     exception::InterruptArch,
     include::bindings::bindings::useconds_t,
     process::ProcessManager,
+    sched::{schedule, SchedMode},
     time::timekeeping::getnstimeofday,
 };
 
@@ -53,7 +54,7 @@ pub fn nanosleep(sleep_time: TimeSpec) -> Result<TimeSpec, SystemError> {
     timer.activate();
 
     drop(irq_guard);
-    sched();
+    schedule(SchedMode::SM_NONE);
 
     let end_time = getnstimeofday();
     // Return the correct remaining time

+ 4 - 3
kernel/src/time/timer.rs

@@ -12,7 +12,7 @@ use alloc::{
 use system_error::SystemError;
 
 use crate::{
-    arch::{sched::sched, CurrentIrqArch},
+    arch::CurrentIrqArch,
     exception::{
         softirq::{softirq_vectors, SoftirqNumber, SoftirqVec},
         InterruptArch,
@@ -20,6 +20,7 @@ use crate::{
     kerror, kinfo,
     libs::spinlock::{SpinLock, SpinLockGuard},
     process::{ProcessControlBlock, ProcessManager},
+    sched::{schedule, SchedMode},
 };
 
 use super::{jiffies::NSEC_PER_JIFFY, timekeeping::update_wall_time};
@@ -258,7 +259,7 @@ pub fn schedule_timeout(mut timeout: i64) -> Result<i64, SystemError> {
         let irq_guard = unsafe { CurrentIrqArch::save_and_disable_irq() };
         ProcessManager::mark_sleep(true).ok();
         drop(irq_guard);
-        sched();
+        schedule(SchedMode::SM_PREEMPT);
         return Ok(MAX_TIMEOUT);
     } else if timeout < 0 {
         kerror!("timeout can't less than 0");
@@ -277,7 +278,7 @@ pub fn schedule_timeout(mut timeout: i64) -> Result<i64, SystemError> {
 
         drop(irq_guard);
 
-        sched();
+        schedule(SchedMode::SM_PREEMPT);
         let time_remaining: i64 = timeout - TIMER_JIFFIES.load(Ordering::SeqCst) as i64;
         if time_remaining >= 0 {
             // Woken up early: return the remaining time