scx_chaos/
main.rs

1// Copyright (c) Meta Platforms, Inc. and affiliates.
2
3// This software may be used and distributed according to the terms of the
4// GNU General Public License version 2.
5use scx_chaos::Builder;
6use scx_chaos::RequiresPpid;
7use scx_chaos::Scheduler;
8use scx_chaos::Trait;
9
10use scx_p2dq::SchedulerOpts as P2dqOpts;
11
12use anyhow::{Context, Result, anyhow};
13use clap::Parser;
14use log::info;
15use nix::unistd::Pid;
16
17use std::panic;
18use std::pin::Pin;
19use std::process::Command;
20use std::sync::Arc;
21use std::sync::Condvar;
22use std::sync::Mutex;
23use std::thread;
24use std::time::Duration;
25
26/// Randomly delay a process.
27#[derive(Debug, Parser)]
28pub struct RandomDelayArgs {
29    /// Chance of randomly delaying a process.
30    #[clap(long, requires = "random_delay_min_us")]
31    pub random_delay_frequency: Option<f64>,
32
33    /// Minimum time to add for random delay.
34    #[clap(long, requires = "random_delay_max_us")]
35    pub random_delay_min_us: Option<u64>,
36
37    /// Maximum time to add for random delay.
38    #[clap(long, requires = "random_delay_frequency")]
39    pub random_delay_max_us: Option<u64>,
40}
41
42/// scx_chaos: A general purpose sched_ext scheduler designed to amplify race conditions
43///
44/// WARNING: This scheduler is a very early alpha, and hasn't been production tested yet. The CLI
45/// in particular is likely very unstable and does not guarantee compatibility between versions.
46///
47/// scx_chaos is a general purpose scheduler designed to run apps with acceptable performance. It
48/// has a series of features designed to add latency in paths in an application. All control is
49/// through the CLI. Running without arguments will not attempt to introduce latency and can set a
50/// baseline for performance impact. The other command line arguments allow for specifying latency
51/// inducing behaviours which attempt to induce a crash.
52///
53/// Unlike most other schedulers, you can also run scx_chaos with a named target. For example:
54///     scx_chaos -- ./app_that_might_crash --arg1 --arg2
55/// In this mode the scheduler will automatically detach after the application exits, unless run
56/// with `--repeat-failure` where it will restart the application on failure.
57#[derive(Debug, Parser)]
58pub struct Args {
59    /// Whether to continue on failure of the command under test.
60    #[clap(long, action = clap::ArgAction::SetTrue, requires = "args")]
61    pub repeat_failure: bool,
62
63    /// Whether to continue on successful exit of the command under test.
64    #[clap(long, action = clap::ArgAction::SetTrue, requires = "args")]
65    pub repeat_success: bool,
66
67    /// Whether to focus on the named task and its children instead of the entire system. Only
68    /// takes effect if pid or args provided.
69    #[clap(long, default_value = "true", action = clap::ArgAction::Set)]
70    pub ppid_targeting: bool,
71
72    /// Enable verbose output, including libbpf details. Specify multiple
73    /// times to increase verbosity.
74    #[clap(short = 'v', long, action = clap::ArgAction::Count)]
75    pub verbose: u8,
76
77    #[command(flatten, next_help_heading = "Random Delays")]
78    pub random_delay: RandomDelayArgs,
79
80    #[command(flatten, next_help_heading = "General Scheduling")]
81    pub p2dq: P2dqOpts,
82
83    /// Stop the scheduler if specified process terminates
84    #[arg(
85        long,
86        short = 'p',
87        help_heading = "Test Command",
88        conflicts_with = "args"
89    )]
90    pub pid: Option<libc::pid_t>,
91
92    /// Program to run under the chaos scheduler
93    ///
94    /// Runs a program under test and tracks when it terminates, similar to most debuggers. Note
95    /// that the scheduler still attaches for every process on the system.
96    #[arg(
97        trailing_var_arg = true,
98        allow_hyphen_values = true,
99        help_heading = "Test Command"
100    )]
101    pub args: Vec<String>,
102}
103
104struct BuilderIterator<'a> {
105    args: &'a Args,
106    idx: u32,
107}
108
109impl<'a> From<&'a Args> for BuilderIterator<'a> {
110    fn from(args: &'a Args) -> BuilderIterator<'a> {
111        BuilderIterator { args, idx: 0 }
112    }
113}
114
115impl<'a> Iterator for BuilderIterator<'a> {
116    type Item = Builder<'a>;
117
118    fn next(&mut self) -> Option<Self::Item> {
119        self.idx += 1;
120
121        if self.idx > 1 {
122            None
123        } else {
124            let mut traits = vec![];
125
126            if let RandomDelayArgs {
127                random_delay_frequency: Some(frequency),
128                random_delay_min_us: Some(min_us),
129                random_delay_max_us: Some(max_us),
130            } = self.args.random_delay
131            {
132                traits.push(Trait::RandomDelays {
133                    frequency,
134                    min_us,
135                    max_us,
136                });
137            };
138
139            let requires_ppid = if self.args.ppid_targeting {
140                if let Some(p) = self.args.pid {
141                    Some(RequiresPpid::IncludeParent(Pid::from_raw(p)))
142                } else if !self.args.args.is_empty() {
143                    Some(RequiresPpid::ExcludeParent(Pid::this()))
144                } else {
145                    None
146                }
147            } else {
148                None
149            };
150
151            Some(Builder {
152                traits,
153                verbose: self.args.verbose,
154                p2dq_opts: &self.args.p2dq,
155                requires_ppid,
156            })
157        }
158    }
159}
160
161fn main() -> Result<()> {
162    let args = Arc::new(Args::parse());
163
164    let llv = match &args.verbose {
165        0 => simplelog::LevelFilter::Info,
166        1 => simplelog::LevelFilter::Debug,
167        _ => simplelog::LevelFilter::Trace,
168    };
169    simplelog::TermLogger::init(
170        llv,
171        simplelog::ConfigBuilder::new()
172            .set_time_level(simplelog::LevelFilter::Error)
173            .set_location_level(simplelog::LevelFilter::Off)
174            .set_target_level(simplelog::LevelFilter::Off)
175            .set_thread_level(simplelog::LevelFilter::Off)
176            .build(),
177        simplelog::TerminalMode::Stderr,
178        simplelog::ColorChoice::Auto,
179    )?;
180
181    if args.pid.is_some() {
182        return Err(anyhow!("args.pid is not yet implemented"));
183    }
184
185    let shutdown = Arc::new((Mutex::new(false), Condvar::new()));
186
187    ctrlc::set_handler({
188        let shutdown = shutdown.clone();
189        move || {
190            let (lock, cvar) = &*shutdown;
191            *lock.lock().unwrap() = true;
192            cvar.notify_all();
193        }
194    })
195    .context("Error setting Ctrl-C handler")?;
196
197    let scheduler_thread = thread::spawn({
198        let args = args.clone();
199        let shutdown = shutdown.clone();
200
201        move || -> Result<()> {
202            for builder in BuilderIterator::from(&*args) {
203                info!("{:?}", &builder);
204
205                let sched: Pin<Box<Scheduler>> = builder.try_into()?;
206
207                sched.observe(&shutdown, None)?;
208            }
209
210            Ok(())
211        }
212    });
213
214    let mut should_run_app = !args.args.is_empty();
215    while should_run_app {
216        let (cmd, vargs) = args.args.split_first().unwrap();
217
218        let mut child = Command::new(cmd).args(vargs).spawn()?;
219        loop {
220            should_run_app &= !*shutdown.0.lock().unwrap();
221
222            if scheduler_thread.is_finished() {
223                child.kill()?;
224                break;
225            }
226            if let Some(s) = child.try_wait()? {
227                if s.success() && args.repeat_success {
228                    should_run_app &= !*shutdown.0.lock().unwrap();
229                    if should_run_app {
230                        info!("app under test terminated successfully, restarting...");
231                    };
232                } else if s.success() {
233                    info!("app under test terminated successfully, exiting...");
234                    should_run_app = false;
235                } else {
236                    info!("TODO: report what the scheduler was doing when it crashed");
237                    should_run_app &= !*shutdown.0.lock().unwrap() && args.repeat_failure;
238                };
239
240                break;
241            };
242
243            thread::sleep(Duration::from_millis(100));
244        }
245    }
246
247    if !args.args.is_empty() {
248        let (lock, cvar) = &*shutdown;
249        *lock.lock().unwrap() = true;
250        cvar.notify_all();
251    }
252
253    match scheduler_thread.join() {
254        Ok(_) => {}
255        Err(e) => panic::resume_unwind(e),
256    };
257
258    Ok(())
259}