diff --git a/.gitignore b/.gitignore index 204739ff..0cf8023a 100644 --- a/.gitignore +++ b/.gitignore @@ -60,4 +60,5 @@ venv.bak/ # IDEs .idea/ -.vscode/ \ No newline at end of file +.vscode/target/ +Cargo.lock diff --git a/src/main.rs b/src/main.rs index 3d0399b4..3e22feb3 100644 --- a/src/main.rs +++ b/src/main.rs @@ -57,7 +57,7 @@ async fn scan(req: web::Json) -> impl Responder { path.clone().unwrap() }; - let result = Python::with_gil(|py| -> Result { + let result = Python::attach(|py| -> Result { // Import the required modules let pyspector_cli = py.import("pyspector.cli").map_err(|e| { format!("Failed to import pyspector.cli: {}. Is PySpector installed?", e) diff --git a/src/pyspector/_rust_core/src/analysis/ast_analysis.rs b/src/pyspector/_rust_core/src/analysis/ast_analysis.rs index 715dd5c9..a5d08932 100644 --- a/src/pyspector/_rust_core/src/analysis/ast_analysis.rs +++ b/src/pyspector/_rust_core/src/analysis/ast_analysis.rs @@ -4,19 +4,25 @@ use crate::rules::{RuleSet, Rule, Defaults}; // Main entry point for AST scanning pub fn scan_ast(ast: &AstNode, file_path: &str, content: &str, ruleset: &RuleSet) -> Vec { - let mut issues = Vec::new(); + // Pre-filter applicable rules ONCE per file — not per AST node. + // This is critical for performance: file_content_exclude runs a regex against + // the full file content. Calling it inside walk_ast meant it ran O(nodes × rules) + // times — 5M+ times for large files. Pre-filtering reduces this to O(rules) = ~100. 
let ast_rules: Vec<&Rule> = ruleset.rules.iter() .filter(|r| r.ast_match.is_some()) + .filter(|r| !r.is_excluded(file_path, content, &ruleset.defaults)) .collect(); - if ast_rules.is_empty() { return issues; } + if ast_rules.is_empty() { return Vec::new(); } - walk_ast(ast, file_path, content, &ast_rules, &ruleset.defaults, &mut issues); + let mut issues = Vec::new(); + walk_ast(ast, file_path, content, &ast_rules, &mut issues); issues } -// Recursively walks the AST, checking each node against the rules -fn walk_ast(node: &AstNode, file_path: &str, content: &str, rules: &[&Rule], defaults: &Defaults, issues: &mut Vec) { +// Recursively walks the AST, checking each node against pre-filtered rules. +// Rules are already filtered for this file — no exclusion checks needed here. +fn walk_ast(node: &AstNode, file_path: &str, content: &str, rules: &[&Rule], issues: &mut Vec) { for rule in rules.iter() { // Respect global defaults + rule-level exclude_file_pattern if rule.is_file_excluded(file_path, defaults) { @@ -27,7 +33,7 @@ fn walk_ast(node: &AstNode, file_path: &str, content: &str, rules: &[&Rule], def if check_node_match(node, match_pattern) { let line_content = content.lines().nth(node.lineno.saturating_sub(1) as usize).unwrap_or("").to_string(); - // Respect exclude_pattern on the matched line + // Respect line-level exclude_pattern on the matched line if let Some(exclude) = &rule.exclude_pattern { if exclude.is_match(&line_content) { continue; @@ -77,7 +83,7 @@ fn check_node_match(node: &AstNode, match_pattern: &str) -> bool { } } } - + true } @@ -112,6 +118,6 @@ fn node_has_property(node: &AstNode, path: &[&str], expected_value: &str) -> boo } } } - + false -} \ No newline at end of file +} diff --git a/src/pyspector/_rust_core/src/analysis/config_analysis.rs b/src/pyspector/_rust_core/src/analysis/config_analysis.rs index a512afc5..b8a814b2 100644 --- a/src/pyspector/_rust_core/src/analysis/config_analysis.rs +++ 
b/src/pyspector/_rust_core/src/analysis/config_analysis.rs @@ -18,8 +18,8 @@ pub fn scan_file(file_path: &str, content: &str, ruleset: &RuleSet) -> Vec Vec { } } - println!("[+] Found {} files to scan", files_to_scan.len()); - + println!("[+] Found {} files to scan ({} non-Python)", files_to_scan.len(), + files_to_scan.iter().filter(|f| !f.ends_with(".py")).count()); + // Scan all files with regex patterns + let t_config = std::time::Instant::now(); let mut issues: Vec = files_to_scan .par_iter() .flat_map(|file_path| { if let Ok(content) = fs::read_to_string(file_path) { config_analysis::scan_file(file_path, &content, &context.ruleset) - } else { - Vec::new() + } else { + Vec::new() } }) .collect(); - - println!("[+] Found {} issues from config analysis", issues.len()); + println!("[*] Pattern/config scan: {:.2}s → {} issues", t_config.elapsed().as_secs_f64(), issues.len()); // Process Python files with AST analysis + let t_ast = std::time::Instant::now(); let python_issues: Vec = context.py_files .par_iter() .flat_map(|py_file| { let mut findings = Vec::new(); - if is_excluded(Path::new(&py_file.file_path), &enhanced_exclusions) { - return findings; + if is_excluded(Path::new(&py_file.file_path), &enhanced_exclusions) { + return findings; } - - // Skip regex scan for Python files (already done above) - if let Some(ast) = &py_file.ast { let ast_findings = ast_analysis::scan_ast(ast, &py_file.file_path, &py_file.content, &context.ruleset); findings.extend(ast_findings); @@ -89,12 +88,13 @@ pub fn run_analysis(mut context: AnalysisContext) -> Vec { findings }) .collect(); - - println!("[+] {} issues from Python AST analysis", python_issues.len()); + println!("[*] AST analysis: {:.2}s → {} issues", t_ast.elapsed().as_secs_f64(), python_issues.len()); issues.extend(python_issues); // Build the call graph and run taint analysis + let t_callgraph = std::time::Instant::now(); let call_graph = call_graph_builder::build_call_graph(context.py_files); + println!("[*] Call 
graph build: {:.2}s", t_callgraph.elapsed().as_secs_f64()); let taint_issues = taint_analysis::analyze_program_for_taint(&call_graph, &context.ruleset); println!("[+] Found {} issues from taint analysis", taint_issues.len()); issues.extend(taint_issues); diff --git a/src/pyspector/_rust_core/src/analysis/taint_analysis.rs b/src/pyspector/_rust_core/src/analysis/taint_analysis.rs index 3a11fda9..8c6e8a82 100644 --- a/src/pyspector/_rust_core/src/analysis/taint_analysis.rs +++ b/src/pyspector/_rust_core/src/analysis/taint_analysis.rs @@ -4,17 +4,94 @@ use crate::graph::cfg_builder::build_cfg; use crate::graph::representation::{BasicBlock, BlockId, ControlFlowGraph}; use crate::issues::Issue; use crate::rules::RuleSet; +use rayon::prelude::*; use std::collections::{HashMap, HashSet, VecDeque}; -/// Origin of a taint +/// Provenance of a value — universal Python semantics, no framework knowledge. +/// +/// The provenance lattice (least trusted → most trusted): +/// HttpRequest → ShellSanitized → OperatorConfig → DeveloperDefined / SystemGenerated +/// +/// HttpRequest and ShellSanitized are attacker-controlled (trigger most sinks). +/// ShellSanitized specifically does NOT trigger shell injection sinks (PY102/SHELL*). #[derive(Debug, Clone, PartialEq, Eq, Hash)] pub enum TaintOrigin { - External, // From a known source (e.g. input(), request.get()) - Param(usize), // From a function parameter (index) + /// Attacker-controlled: request.GET.get(), request.POST, cookies, body, + /// HTTP API responses (.json(), iter_lines()), CLI arguments. + HttpRequest, + + /// Attacker-controlled data that has been through shlex.quote(). + /// Safe for shell metacharacter injection (PY102) — shlex.quote prevents that. + /// Still dangerous for: path traversal (PATH813), f-string injection (FSTRING867), + /// file open (OPEN1149), URL injection (SSRF_001), SQL injection (PY101). + ShellSanitized, + + /// Attacker-controlled data that has been through html.escape() or format_html(). 
+ /// Safe for HTML XSS — still dangerous for SQL, shell, path, URLs. + HtmlSanitized, + + /// Attacker-controlled data that has been through quote_name() or similar SQL sanitizers. + /// Safe for SQL identifier injection — still dangerous for shell, path, HTML. + SqlSanitized, + + /// Operator-controlled: os.environ.get(), config files loaded at startup. + OperatorConfig, + + /// Developer-defined: string literals, class attributes, module constants. + DeveloperDefined, + + /// System-generated: tempfile.*, uuid4(), os.urandom(), secrets.*. + SystemGenerated, + + // Legacy — kept for backward compatibility + External, + Param(usize), +} + +impl TaintOrigin { + /// True if this origin is attacker-controlled and should trigger sink findings. + /// + /// HtmlSanitized and SqlSanitized are NOT attacker-controlled for general sinks: + /// - html.escape/format_html/conditional_escape are complete XSS mitigations + /// - quote_name is a complete SQL injection mitigation + /// As implemented, this predicate returns false for both, for every sink. NOTE(review): the variant docs describe such data as still dangerous for other sink classes (shell, path, URLs) — confirm this is intended. + /// + /// ShellSanitized IS still attacker-controlled for non-shell sinks: + /// - shlex.quote prevents shell injection but NOT path traversal, f-string, SSRF, SQL + /// - So ShellSanitized data still triggers PATH813, OPEN1149, FSTRING867, SSRF_001, PY101 + pub fn is_attacker_controlled(&self) -> bool { + matches!(self, + TaintOrigin::HttpRequest | + TaintOrigin::External | + TaintOrigin::ShellSanitized + ) + } + + /// True only for HttpRequest/External — not ShellSanitized. + /// Used by shell injection sinks (PY102, SHELL*): shlex.quote is a valid mitigation. + pub fn is_shell_injectable(&self) -> bool { + matches!(self, TaintOrigin::HttpRequest | TaintOrigin::External) + } + + /// True if this origin should still trigger SQL sinks. + /// ShellSanitized is still SQL-injectable (shlex.quote doesn't sanitize SQL). 
+ pub fn is_sql_injectable(&self) -> bool { + matches!(self, TaintOrigin::HttpRequest | TaintOrigin::External | TaintOrigin::ShellSanitized) + } + + /// Convert a sanitizer's transforms_to string to a TaintOrigin. + pub fn from_transforms_to(s: &str) -> Option { + match s { + "ShellSanitized" => Some(TaintOrigin::ShellSanitized), + "HtmlSanitized" => Some(TaintOrigin::HtmlSanitized), + "SqlSanitized" => Some(TaintOrigin::SqlSanitized), + _ => None, + } + } } -/// Per-block taint state: maps variable names to their taint origins -/// If a variable is not in the map, it is not tainted. +/// Per-block taint state: maps variable names to their taint origins. +/// If a variable is not in the map, it is untainted (safe). type TaintState = HashMap>; /// Summary of a function's taint behavior @@ -30,6 +107,17 @@ struct FunctionSummary { struct GlobalTaintContext { /// Summaries for all functions in the program summaries: HashMap, + + /// Call-site taint: maps callee function name → per-parameter taint origins. + call_site_taints: HashMap>>, + + /// Class attribute taint: maps (file_prefix, attr_name) → taint origins. + class_attr_taints: HashMap<(String, String), HashSet>, + + /// CFG cache: pre-built control flow graphs for all functions. + /// build_cfg() is expensive (AST traversal + graph construction). + /// Caching avoids rebuilding the same CFG in each iteration and the final pass. + cfg_cache: HashMap, } /// Context for the intra-procedural fixed-point worklist algorithm @@ -51,12 +139,25 @@ impl TaintContext { // Main entry point for inter-procedural taint analysis pub fn analyze_program_for_taint(call_graph: &CallGraph, ruleset: &RuleSet) -> Vec { + let t0 = std::time::Instant::now(); println!("[*] Starting inter-procedural taint analysis with {} functions", call_graph.functions.len()); - + + // Pre-build all CFGs once — reuse across convergence iterations and final pass. + // Parallel build using Rayon: each function's CFG is independent. 
+ println!("[*] Pre-building CFGs for {} functions (parallel)...", call_graph.functions.len()); + let cfg_cache: HashMap = call_graph.functions + .par_iter() + .map(|(func_id, func_node)| (func_id.clone(), build_cfg(func_node))) + .collect(); + println!("[*] CFG pre-build: {:.2}s", t0.elapsed().as_secs_f64()); + let mut global_ctx = GlobalTaintContext { summaries: HashMap::new(), + call_site_taints: HashMap::new(), + class_attr_taints: HashMap::new(), + cfg_cache, }; - + // Initialize summaries for all functions for func_id in call_graph.functions.keys() { global_ctx.summaries.insert(func_id.clone(), FunctionSummary::default() as FunctionSummary); @@ -64,56 +165,223 @@ pub fn analyze_program_for_taint(call_graph: &CallGraph, ruleset: &RuleSet) -> V let mut all_issues = Vec::new(); let mut iterations = 0; - const MAX_GLOBAL_ITERATIONS: usize = 10; - + const MAX_GLOBAL_ITERATIONS: usize = 10; + + // Pre-compute which files contain any taint source marker. + // Functions in files with NO taint markers cannot have internal taint sources — + // they may only receive taint from callers (handled by lazy call_site_taint filter). + // This pre-filter eliminates ~80% of function analyses in typical codebases. 
+ const FILE_TAINT_MARKERS: &[&str] = &[ + // Django request access + "request.GET", "request.POST", "request.FILES", "request.COOKIES", + "request.META", "request.headers", + // Flask / generic request + "request.get(", "request.args", "request.form", + "request.values", "request.json", + // Environment / CLI + "os.environ.get", "sys.argv", + // HTTP streaming + ".iter_lines", ".iter_text", ".iter_raw", ".iter_bytes", + // Deserialization + "marshal.loads", "json.load(", "json.loads(", + ".json()", // HTTP response .json() method + "input(", // CLI interactive input + ]; + + let taint_active_files: std::collections::HashSet<&str> = call_graph.file_contents + .iter() + .filter(|(_, content)| FILE_TAINT_MARKERS.iter().any(|m| content.contains(m))) + .map(|(path, _)| path.as_str()) + .collect(); + + println!("[*] Taint-active files: {}/{} ({:.0}% of total)", + taint_active_files.len(), + call_graph.file_contents.len(), + 100.0 * taint_active_files.len() as f64 / call_graph.file_contents.len().max(1) as f64); + + let t_convergence = std::time::Instant::now(); loop { + let t_iter = std::time::Instant::now(); iterations += 1; - println!("[*] Global fixed-point iteration {}", iterations); let mut summaries_changed = false; - let mut current_pass_issues = Vec::new(); + let mut current_pass_issues: Vec = Vec::new(); - // Analyze each function - for (func_id, func_node) in &call_graph.functions { - let cfg = build_cfg(func_node); - - let file_path: &str = func_id.split("::").next().unwrap_or(""); - let default_content = String::new(); - let content = call_graph.file_contents.get(file_path).unwrap_or(&default_content); - - let (new_summary, issues) = analyze_function_taint( - &cfg, - func_node, - ruleset, - file_path, - content, - &global_ctx - ); - - if let Some(old_summary) = global_ctx.summaries.get(func_id) { + // Analyze functions IN PARALLEL using Rayon. 
+ // Each function reads global_ctx (immutable snapshot of this iteration's state) + // and returns (func_id, summary, call_sites, class_attrs). + // Results are merged serially after all parallel analyses complete. + // + // Correctness: with parallel analysis, function B doesn't see call_site_taints + // produced by function A in the SAME iteration — it sees them in the NEXT + // iteration. This may require one extra iteration vs sequential but is safe. + // + // Lazy filter: iterations 2+ skip functions with no taint to propagate. + // A function has taint to propagate if: + // (a) it's an HTTP/CLI entry point (has tainted params) + // (b) it was called with tainted arguments (call_site_taint) + // (c) it's in a file where class attributes have been tainted (class_attr_taint) + // — e.g., self.output_dir set in __init__ propagates to all same-file methods + let files_with_class_attr_taints: std::collections::HashSet<&str> = global_ctx.class_attr_taints + .keys() + .filter(|(_, _)| true) + .map(|(file, _)| file.as_str()) + .collect(); + + let iter_results: Vec<(String, FunctionSummary, + HashMap>>, + HashMap<(String, String), HashSet>)> = + call_graph.functions + .par_iter() + .filter(|(func_id, func_node)| { + if iterations == 1 { return true; } + let func_name = func_node.fields.get("name") + .and_then(|v| v.as_ref()).and_then(|v| v.as_str()) + .unwrap_or(""); + let file_path = func_id.split("::").next().unwrap_or(""); + !extract_cli_tainted_params(func_node).is_empty() + || (global_ctx.call_site_taints.contains_key(func_name) + && global_ctx.call_site_taints[func_name].iter().any(|s| !s.is_empty())) + || files_with_class_attr_taints.contains(file_path) + }) + .map(|(func_id, func_node)| { + let cfg_owned; + let cfg = match global_ctx.cfg_cache.get(func_id.as_str()) { + Some(c) => c, + None => { cfg_owned = build_cfg(func_node); &cfg_owned } + }; + let file_path: &str = func_id.split("::").next().unwrap_or(""); + let default_content = String::new(); + let 
content = call_graph.file_contents.get(file_path) + .unwrap_or(&default_content); + let (summary, call_sites, class_attrs, _issues) = + analyze_function_taint(&cfg, func_node, ruleset, file_path, content, &global_ctx); + (func_id.clone(), summary, call_sites, class_attrs) + }) + .collect(); + + // Serial merge of parallel results into global_ctx + for (func_id, new_summary, new_call_sites, new_class_attrs) in iter_results { + for (callee, param_taints) in new_call_sites { + let entry = global_ctx.call_site_taints + .entry(callee) + .or_insert_with(Vec::new); + let mut changed = false; + for (i, origins) in param_taints.iter().enumerate() { + if i >= entry.len() { entry.resize(i + 1, HashSet::new()); } + let before_len = entry[i].len(); + entry[i].extend(origins.iter().cloned()); + if entry[i].len() > before_len { changed = true; } + } + if changed { summaries_changed = true; } + } + for (key, origins) in new_class_attrs { + let entry = global_ctx.class_attr_taints + .entry(key).or_insert_with(HashSet::new); + let before_len = entry.len(); + entry.extend(origins.iter().cloned()); + if entry.len() > before_len { summaries_changed = true; } + } + if let Some(old_summary) = global_ctx.summaries.get(&func_id) { if &new_summary != old_summary { println!("[*] Summary changed for {}", func_id); global_ctx.summaries.insert(func_id.clone(), new_summary); summaries_changed = true; } } - - // Collect issues from the latest pass - // We clear the list at the start of each global iteration so we don't duplicate - // But we accumulate across functions in the same pass - current_pass_issues.extend(issues); + + // Issues from convergence loop are discarded — collected in final pass. } - + + println!("[*] Iteration {} done in {:.2}s", iterations, t_iter.elapsed().as_secs_f64()); if !summaries_changed || iterations >= MAX_GLOBAL_ITERATIONS { if summaries_changed { println!("[!] 
Warning: Max global iterations reached without convergence"); } else { - println!("[+] Global convergence reached after {} iterations", iterations); + println!("[+] Global convergence reached after {} iterations in {:.2}s total", + iterations, t_convergence.elapsed().as_secs_f64()); } - all_issues = current_pass_issues; break; } } + // ── Final issue collection pass ────────────────────────────────────────── + // After convergence: collect issues using the converged global_ctx. + // + // Optimization: for large codebases (>5k functions), apply a file-level + // pre-filter to skip the ~80% of functions in files with no taint markers. + // These functions cannot produce findings since they have no taint sources. + // For small codebases, the filter overhead outweighs the savings — use + // the simpler full par_iter which has lower overhead. + const FILE_FILTER_THRESHOLD: usize = 5_000; + let use_file_filter = call_graph.functions.len() > FILE_FILTER_THRESHOLD; + + let t_final_start = std::time::Instant::now(); + let parallel_issues: Vec> = if use_file_filter { + let final_func_ids: Vec<&String> = call_graph.functions + .keys() + .filter(|func_id| { + let file_path = func_id.split("::").next().unwrap_or(""); + if taint_active_files.contains(file_path) { return true; } + if let Some(func_node) = call_graph.functions.get(*func_id) { + if !extract_cli_tainted_params(func_node).is_empty() { return true; } + let func_name = func_node.fields.get("name") + .and_then(|v| v.as_ref()).and_then(|v| v.as_str()).unwrap_or(""); + if global_ctx.call_site_taints.contains_key(func_name) + && global_ctx.call_site_taints[func_name].iter().any(|s| !s.is_empty()) { + return true; + } + } + false + }) + .collect(); + println!("[*] Final pass (parallel+filter): {}/{} functions ({}% filtered out)", + final_func_ids.len(), call_graph.functions.len(), + 100 - 100 * final_func_ids.len() / call_graph.functions.len().max(1)); + final_func_ids + .par_iter() + .filter_map(|func_id| 
call_graph.functions.get(*func_id).map(|fn_node| { + let cfg_owned; + let cfg = match global_ctx.cfg_cache.get(*func_id) { + Some(c) => c, + None => { cfg_owned = build_cfg(fn_node); &cfg_owned } + }; + let file_path: &str = func_id.split("::").next().unwrap_or(""); + let default_content = String::new(); + let content = call_graph.file_contents.get(file_path).unwrap_or(&default_content); + let (_, _, _, issues) = analyze_function_taint( + &cfg, fn_node, ruleset, file_path, content, &global_ctx + ); + issues + })) + .collect() + } else { + let t_final = t_final_start; + println!("[*] Final pass (parallel): {} functions...", call_graph.functions.len()); + let result = call_graph.functions + .par_iter() + .map(|(func_id, func_node)| { + let cfg_owned; + let cfg = match global_ctx.cfg_cache.get(func_id.as_str()) { + Some(c) => c, + None => { cfg_owned = build_cfg(func_node); &cfg_owned } + }; + let file_path: &str = func_id.split("::").next().unwrap_or(""); + let default_content = String::new(); + let content = call_graph.file_contents.get(file_path).unwrap_or(&default_content); + let (_, _, _, issues) = analyze_function_taint( + &cfg, func_node, ruleset, file_path, content, &global_ctx + ); + issues + }) + .collect(); + println!("[*] Final pass done in {:.2}s", t_final.elapsed().as_secs_f64()); + result + }; + for issues in parallel_issues { + all_issues.extend(issues); + } + println!("[*] Total taint analysis: {:.2}s", t0.elapsed().as_secs_f64()); + // Deduplicate issues let mut unique_issues = Vec::new(); let mut seen_fingerprints = HashSet::new(); @@ -129,6 +397,9 @@ pub fn analyze_program_for_taint(call_graph: &CallGraph, ruleset: &RuleSet) -> V unique_issues } +/// Return type: (summary, call_site_taints, class_attr_taints, issues) +/// - call_site_taints: Map> — collected at each call site +/// - class_attr_taints: Map<(file, attr), origins> — from `self.attr = tainted` assignments fn analyze_function_taint( cfg: &ControlFlowGraph, func_node: &AstNode, @@ 
-136,17 +407,71 @@ fn analyze_function_taint( file_path: &str, content: &str, global_ctx: &GlobalTaintContext, -) -> (FunctionSummary, Vec) { +) -> (FunctionSummary, HashMap>>, HashMap<(String, String), HashSet>, Vec) { let mut ctx = TaintContext::new(); // Extract parameters and initialize taint state let params = extract_function_params(func_node); let mut initial_state = TaintState::new(); - for (idx, param_name) in params.iter().enumerate() { + // Seed 1: decorator-detected entry-point parameters. + let entry_params = extract_cli_tainted_params(func_node); + // HTTP params (routes, API endpoints) → HttpRequest: attacker-controlled via network + for param in &entry_params.http { + let mut origins = HashSet::new(); + origins.insert(TaintOrigin::HttpRequest); + initial_state.insert(param.clone(), origins); + } + // CLI params (commands, options) → OperatorConfig: trusted operator chose these. + // Sinks like PATH813/SSRF/PY102 check is_attacker_controlled() which returns false + // for OperatorConfig, so they won't fire. FILE_DESERIALIZERS will upgrade file + // *contents* to HttpRequest, preserving supply-chain detection. + for param in &entry_params.operator { let mut origins = HashSet::new(); - origins.insert(TaintOrigin::Param(idx)); - initial_state.insert(param_name.clone(), origins); + origins.insert(TaintOrigin::OperatorConfig); + initial_state.insert(param.clone(), origins); + } + + // Seed 2: inter-procedural call-site taint — if callers passed tainted args, + // seed the matching parameters with their accumulated taint. + // + // Self-offset: for methods where params[0] is "self" or "cls", call-site args + // are indexed without self (caller writes `obj.method(arg0)`, not `method(self, arg0)`). + // Shift recorded arg indices by 1 to align with the method's param list. 
+ let func_name = func_node.fields.get("name") + .and_then(|v| v.as_ref()).and_then(|v| v.as_str()) + .unwrap_or(""); + let self_offset = params.first().map(|p| p == "self" || p == "cls").unwrap_or(false) as usize; + if let Some(param_taints) = global_ctx.call_site_taints.get(func_name) { + for (i, origins) in param_taints.iter().enumerate() { + if !origins.is_empty() { + let param_idx = i + self_offset; + if let Some(param_name) = params.get(param_idx) { + let entry = initial_state.entry(param_name.clone()).or_insert_with(HashSet::new); + entry.extend(origins.iter().cloned()); + } + } + } + } + + // Seed 3: class attribute taint — if any function in this file recorded a + // tainted `self.attr = ...` assignment, seed both `self.attr` and the bare + // `attr` name here. Entries are keyed by (file, attr), not by class. + // + // Class attribute taints are always seeded for same-file functions. + // Class attributes represent shared state within a class. Any method that could + // access these attributes should see their taint, regardless of whether it has + // initial_state. Scope guard was removed because cross-file FPs are caused by + // inter-proc arg propagation, not class_attr_taints seeding. 
+ for ((attr_file, attr_name), origins) in &global_ctx.class_attr_taints { + if attr_file == file_path && !origins.is_empty() { + let key = format!("self.{}", attr_name); + let entry = initial_state.entry(key).or_insert_with(HashSet::new); + entry.extend(origins.iter().cloned()); + // Seed bare attr name for BinOp like `base / self.output_dir` + let entry2 = initial_state.entry(attr_name.clone()).or_insert_with(HashSet::new); + entry2.extend(origins.iter().cloned()); + } } // Initialize blocks @@ -215,43 +540,152 @@ fn analyze_function_taint( } } - // Collect issues and compute summary from final state + // Collect issues, summary, call-site taints, and class-attr taints let mut issues = Vec::new(); let mut summary = FunctionSummary::default(); - + // call_site_taints: callee_func_name → per-arg taint origins + let mut call_site_taints: HashMap>> = HashMap::new(); + // class_attr_taints: (file, attr_name) → origins from `self.attr = tainted` + let mut class_attr_taints: HashMap<(String, String), HashSet> = HashMap::new(); + for block in cfg.blocks.values() { - // Re-run transfer to get issues let entry_state = ctx.entry_states.get(&block.id).cloned().unwrap_or_default(); let (exit_state, block_issues) = transfer_function( - block, - entry_state, - ruleset, - file_path, - content, + block, + entry_state.clone(), + ruleset, + file_path, + content, global_ctx ); issues.extend(block_issues); - - // Check Return statements for summary + + // Scan all statements for: + // 1. Function calls with tainted arguments → record call-site taint + // 2. self.attr = tainted assignments → record class attr taint + // 3. Return statements → update function summary + // Use exit_state as running_state so we see all assignments in the block. + // This is conservative (uses end-of-block state for all stmts) but avoids + // false negatives from forward assignments in the same block. 
+ let running_state = exit_state.clone(); + for stmt in &block.statements { + // Track self.attr = tainted assignments + if stmt.node_type == "Assign" { + // Check targets for `self.attr` pattern + if let Some(targets) = stmt.children.get("targets") { + for target in targets { + if target.node_type == "Attribute" { + let attr_name = target.fields.get("attr") + .and_then(|v| v.as_ref()).and_then(|v| v.as_str()) + .unwrap_or(""); + let is_self = target.children.get("value") + .and_then(|v| v.get(0)) + .and_then(|v| v.fields.get("id")) + .and_then(|v| v.as_ref()).and_then(|v| v.as_str()) + .map(|s| s == "self") + .unwrap_or(false); + if is_self && !attr_name.is_empty() { + // Get the value being assigned and check if it's tainted + if let Some(val) = stmt.children.get("value").and_then(|v| v.get(0)) { + let val_names = extract_all_names(val); + let mut origins: HashSet = HashSet::new(); + for name in &val_names { + if let Some(o) = running_state.get(name) { + origins.extend(o.iter().filter(|o| o.is_attacker_controlled()).cloned()); + } + } + if !origins.is_empty() { + class_attr_taints + .entry((file_path.to_string(), attr_name.to_string())) + .or_insert_with(HashSet::new) + .extend(origins.iter().cloned()); + } + } + } + } + } + } + } + + // Track function calls with tainted arguments → call-site taint + // Record under both the full name AND the bare method name so that + // p.initialize(config) registers as call_site_taints["initialize"][0]. 
+ let mut call_nodes: Vec<&AstNode> = Vec::new(); + find_call_sites(stmt, &mut call_nodes); + for call_node in call_nodes { + let call_name = get_full_call_name(call_node); + if call_name.is_empty() { continue; } + + // The lookup key(s) to record taint under: + // - For bare call `f(x)`: just "f" + // - For method `obj.method(x)`: both "obj.method" and "method" + let lookup_names: Vec = if call_name.contains('.') { + let method_part = call_name.rsplit('.').next().unwrap_or("").to_string(); + if method_part.is_empty() { vec![call_name.clone()] } + else { vec![call_name.clone(), method_part] } + } else { + vec![call_name.clone()] + }; + + if let Some(args) = call_node.children.get("args") { + let mut param_taints: Vec> = Vec::new(); + for arg in args { + let mut origins: HashSet = HashSet::new(); + for name in extract_all_names(arg) { + if let Some(o) = running_state.get(&name) { + origins.extend(o.iter().filter(|o| o.is_attacker_controlled()).cloned()); + } + } + param_taints.push(origins); + } + if param_taints.iter().any(|o| !o.is_empty()) { + for key in &lookup_names { + let entry = call_site_taints + .entry(key.clone()) + .or_insert_with(Vec::new); + let needed = param_taints.len(); + if entry.len() < needed { entry.resize(needed, HashSet::new()); } + for (i, origins) in param_taints.iter().enumerate() { + entry[i].extend(origins.iter().cloned()); + } + } + } + } + } + + // running_state = exit_state (already set above, no per-stmt update needed) + } + + // Check Return statements for summary using exit_state + // Also check for sinks inside return values (e.g. 
`return FunctionType(tainted_code, ...)`) for stmt in &block.statements { if stmt.node_type == "Return" { if let Some(value) = stmt.children.get("value").and_then(|v| v.get(0)) { - // Check if return value is a direct source call if value.node_type == "Call" { - let call_name = get_full_call_name(value); - if ruleset.taint_sources.iter().any(|s| call_name.contains(&s.function_call)) { - summary.returns_external_taint = true; - } + // Check if return value is a sink with tainted argument + check_sink_and_report(value, &exit_state, ruleset, file_path, content, &mut issues); + + let call_name = get_full_call_name(value); + let is_src = ruleset.taint_sources.iter().any(|s| { + if s.function_call.contains('.') { + call_name.contains(&s.function_call) || + s.function_call.contains(&call_name) + } else { + call_name == s.function_call + } + }); + if is_src { summary.returns_external_taint = true; } } - - // Check taint of returned variables let names = extract_all_names(value); for name in names { if let Some(origins) = exit_state.get(&name) { for origin in origins { match origin { - TaintOrigin::External => summary.returns_external_taint = true, - TaintOrigin::Param(idx) => { summary.param_flows_to_return.insert(*idx); } + TaintOrigin::External | TaintOrigin::HttpRequest => + summary.returns_external_taint = true, + TaintOrigin::Param(idx) => + { summary.param_flows_to_return.insert(*idx); } + _ => {} } } } @@ -260,8 +694,8 @@ fn analyze_function_taint( } } } - - (summary, issues) + + (summary, call_site_taints, class_attr_taints, issues) } fn compute_entry_state( @@ -308,34 +742,279 @@ fn transfer_function( .collect() }) .unwrap_or_default(); - - if value_node.node_type == "Call" { + + // --- Phase 2: Subscript taint sources --- + // Handles: attr = request.GET['key'] (Subscript node, not a Call) + if value_node.node_type == "Subscript" { + let container = get_subscript_container(value_node); + // HTTP request containers — attacker-controlled + const HTTP_CONTAINERS: 
&[&str] = &[ + "request.GET", "request.POST", "request.FILES", + "request.COOKIES", "request.META", "request.headers", + "request.args", "request.form", "request.values", + "request.json", + ]; + // Operator-supplied containers — trusted (CLI, env config) + // sys.argv is set by whoever invokes the program (the operator). + // os.environ is set by the deployment environment (the operator). + // Neither is attacker-controlled in the HTTP threat model. + const OPERATOR_CONTAINERS: &[&str] = &[ + "sys.argv", "os.environ", + ]; + if HTTP_CONTAINERS.iter().any(|tc| container.contains(tc)) { + let mut origins = HashSet::new(); + origins.insert(TaintOrigin::External); + for target in &targets { + state.insert(target.clone(), origins.clone()); + } + } else if OPERATOR_CONTAINERS.iter().any(|tc| container.contains(tc)) { + let mut origins = HashSet::new(); + origins.insert(TaintOrigin::OperatorConfig); + for target in &targets { + state.insert(target.clone(), origins.clone()); + } + } else { + let mut new_origins = HashSet::new(); + + // Propagate taint from the subscript base if already tainted + // e.g. data = tainted_dict['key'] → data is tainted + let base_names = get_subscript_base_names(value_node); + for name in &base_names { + if let Some(origins) = state.get(name.as_str()) { + new_origins.extend(origins.iter().cloned()); + } + } + + // Also: if the subscript base is itself a taint source CALL, + // the subscript result is tainted. + // e.g. 
msg = r.json()["key"] → r.json() is a taint source → msg tainted + if let Some(base_value) = value_node.children.get("value").and_then(|v| v.get(0)) { + if base_value.node_type == "Call" { + let base_call_name = get_full_call_name(base_value); + let is_base_source = !base_call_name.is_empty() && + ruleset.taint_sources.iter().any(|source| { + if source.function_call.contains('.') { + base_call_name.contains(&source.function_call) || + source.function_call.contains(&base_call_name) + } else { + base_call_name == source.function_call + } + }); + if is_base_source { + new_origins.insert(TaintOrigin::HttpRequest); + } + } + } + + if !new_origins.is_empty() { + for target in &targets { + state.insert(target.clone(), new_origins.clone()); + } + } + } + } else if value_node.node_type == "Call" { let call_name = get_full_call_name(value_node); // 1. Check for Taint Source - let is_source = ruleset.taint_sources.iter().any(|source| { - call_name.contains(&source.function_call) || - source.function_call.contains(&call_name) + let is_source = !call_name.is_empty() && ruleset.taint_sources.iter().any(|source| { + if source.function_call.contains('.') { + call_name.contains(&source.function_call) || + source.function_call.contains(&call_name) + } else { + call_name == source.function_call + } }); - if is_source { + // Check for SystemGenerated sources — tempfile/uuid/secrets + // These are never attacker-controlled regardless of framework + const SYSTEM_GENERATED_CALLS: &[&str] = &[ + "tempfile.", "uuid.", "secrets.", "os.urandom", + "random.randbytes", "hashlib.new", + ]; + let is_system_generated = !call_name.is_empty() && + SYSTEM_GENERATED_CALLS.iter().any(|sg| call_name.starts_with(sg) || call_name == *sg); + + // json.load(f) is an independent taint source: file contents can + // come from third parties (plugins, packages) even if the file PATH + // is operator-chosen. 
This allows CLI decorator params to be + // OperatorConfig (trusted) while still catching supply-chain attacks + // via loaded config files. + // json.loads (string parsing) is taint-PRESERVING instead — the + // string's own trust level determines the output trust level. + const FILE_DESERIALIZERS: &[&str] = &[ + "json.load", // reads from file handle — contents are external + "yaml.load", // reads from file — check separate for SafeLoader + "toml.load", // reads from file + "pickle.load", // reads from file (also caught by PY301 pattern) + ]; + let is_file_deserializer = !call_name.is_empty() && + FILE_DESERIALIZERS.iter().any(|fd| call_name.contains(fd)); + + // Type conversion wrappers and deserializers that preserve taint: + // list(), tuple(), json.load(f), etc. — output has the same trust + // level as input. Propagate taint from first argument. + // INTENTIONALLY NARROW: only type conversions that preserve the + // data identity (list/tuple/set) AND JSON deserialization. + // Do NOT include sorted/reversed/enumerate/zip/map/filter — + // those push taint into DoS/join/sorted rules and produce + // massive false positives across large codebases. + const TAINT_PRESERVING_CALLS: &[&str] = &[ + "list", "tuple", "set", "frozenset", + "json.loads", + // Regex operations propagate taint from input to match objects + "re.search", "re.match", "re.fullmatch", + "re.findall", "re.finditer", + "group", "groups", "groupdict", + // Path construction/normalization — taint from any component + // propagates to the result. os.path.join(base, user_path) and + // Path(user_path) both carry the taint forward to file-operation sinks. + "os.path.join", "os.path.normpath", "os.path.abspath", + // pathlib.Path constructor: Path(tainted_str) → tainted Path object + // → .read_text(), .write_text(), .open() etc. fire PATH813/OPEN1149 + "Path", "PurePath", "PosixPath", "WindowsPath", + // URL parsing/construction: taint flows through URL manipulation. 
+ // os.environ["CI_URL"] → urlsplit() → _replace() → urlunsplit() → + // git fetch triggers ENV_GIT_URL001 / PY102 / SSRF_001. + "urlsplit", "urlunsplit", "urlparse", "urlunparse", + "urljoin", "urlencode", + "urllib.parse.urlsplit", "urllib.parse.urlunsplit", + "urllib.parse.urlparse", "urllib.parse.urlunparse", + "urllib.parse.urljoin", "urllib.parse.urlencode", + ]; + // Match both exact names (re.match) and method suffixes (m.group → .group) + let is_taint_preserving = !call_name.is_empty() && + TAINT_PRESERVING_CALLS.iter().any(|tp| { + call_name == *tp || + call_name.ends_with(&format!(".{}", tp)) + }); + + if is_taint_preserving { + // Propagate taint from arguments to the result + if let Some(args) = value_node.children.get("args") { + let mut new_origins: HashSet = HashSet::new(); + for arg in args { + for name in extract_all_names(arg) { + if let Some(origins) = state.get(&name) { + new_origins.extend(origins.iter().cloned()); + } + } + } + if !new_origins.is_empty() { + for target in &targets { + state.insert(target.clone(), new_origins.clone()); + } + } + } + } else if is_system_generated { for target in &targets { let mut origins = HashSet::new(); - origins.insert(TaintOrigin::External); + origins.insert(TaintOrigin::SystemGenerated); state.insert(target.clone(), origins); } + } else if is_file_deserializer || is_source { + // Operator-config call sources: os.environ.get(), os.getenv() + // These read values set by the deployment operator, not by + // HTTP request senders. + const OPERATOR_CALL_SOURCES: &[&str] = &[ + "os.environ.get", "os.getenv", "os.environ[", + ]; + let is_operator_source = !call_name.is_empty() && + OPERATOR_CALL_SOURCES.iter().any(|op| call_name.contains(op)); + + if is_operator_source { + for target in &targets { + let mut origins = HashSet::new(); + origins.insert(TaintOrigin::OperatorConfig); + state.insert(target.clone(), origins); + } + } else { + // is_file_deserializer: json.load(f), yaml.load(f), etc. 
+ // — always HttpRequest regardless of f's trust level, + // because file contents can be third-party (supply chain) + // is_source: request.GET.get(), iter_lines(), .json(), etc. + for target in &targets { + let mut origins = HashSet::new(); + origins.insert(TaintOrigin::HttpRequest); + state.insert(target.clone(), origins); + } + } } else { // 2. Check for Sanitizer - let is_sanitizer = ruleset.taint_sanitizers.iter().any(|san| { + // If transforms_to is set: transform taint origin instead of clearing. + // If no transforms_to: clear taint (data is fully sanitized). + let matching_sanitizer = ruleset.taint_sanitizers.iter().find(|san| { call_name.contains(&san.function_call) || san.function_call.contains(&call_name) }); - - if is_sanitizer { - for target in &targets { - state.remove(target); + + if let Some(san) = matching_sanitizer { + if let Some(ref transforms_to) = san.transforms_to { + // Partial sanitization: transform origin, preserve taintedness + if let Some(new_origin) = TaintOrigin::from_transforms_to(transforms_to) { + for target in &targets { + let mut new_origins = HashSet::new(); + new_origins.insert(new_origin.clone()); + state.insert(target.clone(), new_origins); + } + } else { + // Unknown transforms_to value — fall back to clearing + for target in &targets { state.remove(target); } + } + } else { + // Full sanitization: clear taint completely + for target in &targets { state.remove(target); } } } else { + // 2b. Known sink call: propagate taint to result if a + // vulnerable argument is tainted (e.g. 
b=bytes(tainted)) + let sink_taint = { + let mut found = HashSet::new(); + for sink in &ruleset.taint_sinks { + let matches = if sink.function_call.contains('.') { + // Forward-only: "urllib.request.urlopen".contains("open") would be a FP + call_name.contains(&sink.function_call) + } else if sink.is_method { + let dc = call_name.chars().filter(|&c| c == '.').count(); + match dc { + 0 => call_name == sink.function_call, + _ => { + const MP: &[&str] = &["posixpath.","ntpath.","genericpath.","pathlib.","os.","sys.","re.","json.","urllib.","http.","xml.","html.","csv.","io.","base64.","hashlib.","hmac.","struct.","itertools.","functools.","operator.","execute.","ops.","eager."]; + call_name.ends_with(&format!(".{}", sink.function_call)) && !MP.iter().any(|pfx| call_name.starts_with(pfx)) + } + } + } else { + call_name == sink.function_call + }; + if !matches { continue; } + // Check if the vulnerable argument is tainted + let arg_tainted = if sink.vulnerable_receiver { + if let Some(func) = value_node.children.get("func").and_then(|v| v.get(0)) { + if func.node_type == "Attribute" { + if let Some(recv) = func.children.get("value").and_then(|v| v.get(0)) { + get_direct_taint_names(recv).iter().any(|n| is_attacker_tainted(&state, n)) + } else { false } + } else { false } + } else { false } + } else { + if let Some(args) = value_node.children.get("args") { + if args.len() > sink.vulnerable_parameter_index { + get_direct_taint_names(&args[sink.vulnerable_parameter_index]).iter().any(|n| is_attacker_tainted(&state, n)) + } else { false } + } else { false } + }; + if arg_tainted { + found.insert(TaintOrigin::External); + break; + } + } + found + }; + if !sink_taint.is_empty() { + for target in &targets { + state.insert(target.clone(), sink_taint.clone()); + } + } + // 3. 
Check for Inter-procedural Taint (Summaries) let mut new_origins = HashSet::new(); @@ -364,18 +1043,23 @@ fn transfer_function( } } } else { - // Fallback: Conservative propagation if unknown function - if check_args_tainted(value_node, &state) { - // We propagate the origins from args - if let Some(args) = value_node.children.get("args") { - for arg in args { - let names = extract_all_names(arg); + // Method receiver propagation ONLY: + // tainted_obj.method() → result is tainted. + // We do NOT propagate through positional args of unknown functions + // (disabled: causes taint explosion through every utility call). + if let Some(func) = value_node.children.get("func").and_then(|v| v.get(0)) { + if func.node_type == "Attribute" { + if let Some(receiver) = func.children.get("value").and_then(|v| v.get(0)) { + let names = extract_all_names(receiver); for name in names { if let Some(origins) = state.get(&name) { new_origins.extend(origins.iter().cloned()); } } } + // dead code below — kept for structure + } else { + let _ = (); // no positional arg propagation } } } @@ -387,8 +1071,39 @@ fn transfer_function( } } } + } else if value_node.node_type == "Constant" || value_node.node_type == "JoinedStr" { + // Tier 3: Constant folding — string/numeric literals are DeveloperDefined. + // "text" or f"text with {constant}" → developer wrote it, never user input. + // This handles: INTERNAL_RESET_SESSION_TOKEN = "_password_reset_token" + // and all other module-level or class-level constant assignments. 
+ let is_all_constant = value_node.node_type == "Constant" || { + // For f-strings: DeveloperDefined only if ALL FormattedValues are also constants/DeveloperDefined + value_node.children.get("values").map_or(true, |vals| { + vals.iter().all(|v| { + v.node_type == "Constant" || ( + v.node_type == "FormattedValue" && + v.children.get("value").and_then(|vv| vv.get(0)) + .map_or(false, |expr| { + // Check if the expr name is DeveloperDefined in state + get_direct_taint_names(expr).iter().all(|n| { + state.get(n).map_or(true, |origins| { + origins.iter().all(|o| !o.is_attacker_controlled()) + }) + }) + }) + ) + }) + }) + }; + if is_all_constant { + for target in &targets { + let mut origins = HashSet::new(); + origins.insert(TaintOrigin::DeveloperDefined); + state.insert(target.clone(), origins); + } + } } else { - // Transitive propagation (Assignment) + // Transitive propagation (Assignment from Name/Attribute/etc.) let mut new_origins = HashSet::new(); let src_names = extract_all_names(value_node); for name in src_names { @@ -396,23 +1111,197 @@ fn transfer_function( new_origins.extend(origins.iter().cloned()); } } - if !new_origins.is_empty() { for target in &targets { state.insert(target.clone(), new_origins.clone()); } } } + + // BinOp taint propagation: x = tainted % "..." or "..." % tainted + // Handles Python string formatting: sql = "SELECT * FROM %s" % table + if value_node.node_type == "BinOp" { + let mut binop_origins = HashSet::new(); + for side in ["left", "right"] { + if let Some(operand) = value_node.children.get(side).and_then(|v| v.get(0)) { + let names = get_direct_taint_names(operand); + for name in names { + if let Some(origins) = state.get(&name) { + binop_origins.extend(origins.iter().cloned()); + } + } + } + } + if !binop_origins.is_empty() { + for target in &targets { + state.insert(target.clone(), binop_origins.clone()); + } + } + } + + // BoolOp taint propagation: x = a or b, x = a and b + // If any operand is tainted, x is tainted. 
+ // Handles: config = plugin_config or {} → config is tainted if plugin_config is + if value_node.node_type == "BoolOp" { + let mut bool_origins = HashSet::new(); + if let Some(values) = value_node.children.get("values") { + for val in values { + for name in extract_all_names(val) { + if let Some(origins) = state.get(&name) { + bool_origins.extend(origins.iter().cloned()); + } + } + } + } + if !bool_origins.is_empty() { + for target in &targets { + state.insert(target.clone(), bool_origins.clone()); + } + } + } + + // Check ALL call nodes within the RHS for sinks. + // Using find_call_sites (not just the outermost call) catches nested + // sinks like: result = env.from_string(tainted).render() + // where from_string is the dangerous call, not render. + if value_node.node_type == "Call" { + let mut rhs_calls = Vec::new(); + find_call_sites(value_node, &mut rhs_calls); + for call in rhs_calls { + check_sink_and_report(call, &state, ruleset, file_path, content, &mut issues); + } + } + // f-string: x = f"...{tainted}..." + // 1. Flag FSTRING867 if any slot contains tainted variable. + // 2. Propagate taint to x (the f-string result carries taint forward). + if value_node.node_type == "JoinedStr" { + check_fstring_taint(value_node, &state, ruleset, file_path, content, &mut issues); + // Propagate: if any FormattedValue is tainted, result is tainted + let mut origins = HashSet::new(); + if let Some(values) = value_node.children.get("values") { + for val in values { + if val.node_type == "FormattedValue" { + if let Some(expr) = val.children.get("value").and_then(|v| v.get(0)) { + for name in extract_all_names(expr) { + if let Some(o) = state.get(&name) { + origins.extend(o.iter().cloned()); + } + } + } + } + } + } + if !origins.is_empty() { + for target in &targets { + state.insert(target.clone(), origins.clone()); + } + } + } + } + } + // For-loop variable binding: `for x in tainted_collection` → x is tainted. 
+ // The CFG flattens for-loops so the For node appears as a statement + // in the header block. Propagate taint from iter to target. + "For" => { + if let Some(iter) = stmt.children.get("iter").and_then(|v| v.get(0)) { + let iter_names = extract_all_names(iter); + let mut loop_origins: HashSet = HashSet::new(); + for name in &iter_names { + if let Some(origins) = state.get(name) { + loop_origins.extend(origins.iter().cloned()); + } + } + if !loop_origins.is_empty() { + if let Some(target) = stmt.children.get("target").and_then(|v| v.get(0)) { + let target_names: Vec = match target.node_type.as_str() { + "Name" => target.fields.get("id") + .and_then(|v| v.as_ref()).and_then(|v| v.as_str()) + .map(|s| vec![s.to_string()]) + .unwrap_or_default(), + "Tuple" => target.children.get("elts") + .map(|elts| elts.iter() + .filter_map(|e| e.fields.get("id") + .and_then(|v| v.as_ref()).and_then(|v| v.as_str()) + .map(|s| s.to_string())) + .collect()) + .unwrap_or_default(), + _ => vec![], + }; + for name in target_names { + state.insert(name, loop_origins.clone()); + } + } + } + } + // Also check any sink calls in the for-loop header + let mut call_sites = Vec::new(); + find_call_sites(stmt, &mut call_sites); + for call_node in call_sites { + check_sink_and_report(call_node, &state, ruleset, file_path, content, &mut issues); } } "Expr" => { if let Some(value) = stmt.children.get("value").and_then(|v| v.get(0)) { if value.node_type == "Call" { check_sink_and_report(value, &state, ruleset, file_path, content, &mut issues); - - // Sanitizer as standalone statement + } + if value.node_type == "JoinedStr" { + check_fstring_taint(value, &state, ruleset, file_path, content, &mut issues); + } + } + } + // With statement: `with expr as var` → var inherits taint from expr. 
+ // Handles: with open(tainted_path) as f → f is tainted + // with tainted_ctx as val → val is tainted + "With" => { + if let Some(items) = stmt.children.get("items") { + for item in items { + // context_expr is the expression (e.g. open(path)) + // optional_vars is the `as var` binding + let ctx_tainted: HashSet = { + let mut origins = HashSet::new(); + if let Some(ctx) = item.children.get("context_expr").and_then(|v| v.get(0)) { + // Check if context_expr is a call that is a sink (e.g. open()) + // and whether its arguments are tainted → ctx gets taint + if ctx.node_type == "Call" { + check_sink_and_report(ctx, &state, ruleset, file_path, content, &mut issues); + // Propagate taint from call arguments to context var + if let Some(args) = ctx.children.get("args") { + for arg in args { + for name in extract_all_names(arg) { + if let Some(o) = state.get(&name) { + origins.extend(o.iter().cloned()); + } + } + } + } + } else { + for name in extract_all_names(ctx) { + if let Some(o) = state.get(&name) { + origins.extend(o.iter().cloned()); + } + } + } + } + origins + }; + if !ctx_tainted.is_empty() { + if let Some(opt_vars) = item.children.get("optional_vars").and_then(|v| v.get(0)) { + if let Some(var_name) = opt_vars.fields.get("id") + .and_then(|v| v.as_ref()).and_then(|v| v.as_str()) + { + state.insert(var_name.to_string(), ctx_tainted); + } + } + } } } + // Also check sinks in the With body via the fallthrough + let mut call_sites = Vec::new(); + find_call_sites(stmt, &mut call_sites); + for call_node in call_sites { + check_sink_and_report(call_node, &state, ruleset, file_path, content, &mut issues); + } } _ => { let mut call_sites = Vec::new(); @@ -423,10 +1312,73 @@ fn transfer_function( } } } - + (state, issues) } +/// Returns only the DIRECT variable name(s) of an AST node for taint checking. +/// Unlike `extract_all_names`, this does NOT recurse into attribute receivers. 
+/// - Name("attr") → ["attr"] +/// - Attribute("self.STANDARD_UNIT") → ["STANDARD_UNIT"] (not "self") +/// - Subscript(d["key"]) → ["d"] +/// Returns true if the state contains attacker-controlled taint for this name. +/// DeveloperDefined, SystemGenerated, OperatorConfig do NOT trigger sinks. +fn is_attacker_tainted(state: &TaintState, name: &str) -> bool { + state.get(name).map_or(false, |origins| { + origins.iter().any(|o| o.is_attacker_controlled()) + }) +} + +/// Check taint considering the sink's triggers_on policy. +/// +/// "all" (default) — fires for all attacker-controlled origins. +/// "shell_injectable" — fires for all EXCEPT ShellSanitized. +/// Use for PY102 — shlex.quote is valid shell mitigation. +/// "sql_injectable" — fires for all EXCEPT SqlSanitized. +/// Use for PY101 — quote_name is valid SQL mitigation. +/// "html_injectable" — fires for all EXCEPT HtmlSanitized. +/// Use for XSS sinks — html.escape/format_html are valid. +/// "injectable_only" — fires ONLY for HttpRequest/External (no sanitized variants). +/// Legacy / strict mode. +fn is_tainted_for_sink(state: &TaintState, name: &str, triggers_on: &str) -> bool { + state.get(name).map_or(false, |origins| { + origins.iter().any(|o| { + match triggers_on { + "shell_injectable" => o.is_shell_injectable(), // HttpRequest|External only + "sql_injectable" => o.is_sql_injectable(), // HttpRequest|External|ShellSanitized + "html_injectable" => o.is_attacker_controlled(), // all (HtmlSanitized is not attacker-controlled) + "injectable_only" => o.is_shell_injectable(), + _ => o.is_attacker_controlled(), // "all" default + } + }) + }) +} + +fn get_direct_taint_names(node: &AstNode) -> Vec { + match node.node_type.as_str() { + "Name" => { + if let Some(id) = node.fields.get("id").and_then(|v| v.as_ref()).and_then(|v| v.as_str()) { + return vec![id.to_string()]; + } + } + "Attribute" => { + // Only return the attribute name itself, NOT the receiver. 
+ // This prevents self.STANDARD_UNIT from matching because self is tainted. + if let Some(attr) = node.fields.get("attr").and_then(|v| v.as_ref()).and_then(|v| v.as_str()) { + return vec![attr.to_string()]; + } + } + "Subscript" => { + // Return the container name for subscript access (e.g., dict["key"] → "dict") + if let Some(value) = node.children.get("value").and_then(|v| v.get(0)) { + return get_direct_taint_names(value); + } + } + _ => {} + } + Vec::new() +} + fn check_sink_and_report( call_node: &AstNode, state: &TaintState, @@ -436,39 +1388,261 @@ fn check_sink_and_report( issues: &mut Vec, ) { let call_name = get_full_call_name(call_node); - + + // Skip unresolvable calls (empty name matches everything via contains("")) + if call_name.is_empty() { + return; + } + for sink in &ruleset.taint_sinks { - if call_name.contains(&sink.function_call) || sink.function_call.contains(&call_name) { + // Matching strategy: + // - Dotted sink paths ("subprocess.run"): substring match + // - Method sinks (is_method=true, e.g. "replace", "join", "format"): + // call_name must end with ".funcname" (avoids "set" matching builtin "set()") + // - Builtin sinks (is_method=false, e.g. "set", "open", "getattr"): + // call_name must equal funcname exactly (prevents "cache.set" matching "set") + let matches = if sink.function_call.contains('.') { + // Forward-only: "urllib.request.urlopen".contains("open") is a FP + call_name.contains(&sink.function_call) + } else if sink.is_method { + // Method sinks (replace, join, center, etc.): + // - 0 dots: receiver was a literal/constant → exact match + // - 1 dot: normal method call "s.method" → ends_with ".method" + // EXCEPT when receiver looks like a module (posixpath, ntpath, etc.) 
+ // - 2+ dots: module path → NOT a method, skip + const MODULE_PREFIXES: &[&str] = &[ + "posixpath.", "ntpath.", "genericpath.", "pathlib.", + "os.", "sys.", "re.", "json.", "urllib.", "http.", + "xml.", "html.", "csv.", "io.", "base64.", "hashlib.", + "hmac.", "struct.", "itertools.", "functools.", "operator.", + // ML framework module prefixes that have .execute() but are NOT SQL sinks: + // execute.execute(b"Fill", ...) — eager op execution + // ops.execute(...) — operation execution + "execute.", "ops.", "eager.", + ]; + let dot_count = call_name.chars().filter(|&c| c == '.').count(); + // For dot_count=0 (e.g. the receiver was a literal, so get_full_call_name + // only returns the method name), require the func node to be an Attribute + // to distinguish `'/'.join(parts)` (method on literal) from `execute(x)` (standalone). + let func_is_attribute = call_node.children.get("func") + .and_then(|v| v.get(0)) + .map(|f| f.node_type == "Attribute") + .unwrap_or(false); + match dot_count { + 0 => func_is_attribute && call_name == sink.function_call, + _ => { + call_name.ends_with(&format!(".{}", sink.function_call)) && + !MODULE_PREFIXES.iter().any(|pfx| call_name.starts_with(pfx)) + } + } + } else { + call_name == sink.function_call + }; + if !matches { + continue; + } + + let mut found_taint = false; + + let triggers_on = sink.triggers_on.as_str(); + + if sink.vulnerable_receiver { + // Check method receiver: tainted_obj.method(...) → receiver is tainted. + // Use extract_all_names so inline expressions like Path(tainted).mkdir() + // are correctly detected — Path(output) is a Call whose arg "output" is tainted. 
+ if let Some(func) = call_node.children.get("func").and_then(|v| v.get(0)) { + if func.node_type == "Attribute" { + if let Some(receiver) = func.children.get("value").and_then(|v| v.get(0)) { + let names = extract_all_names(receiver); + for name in names { + if is_tainted_for_sink(state, &name, triggers_on) { + found_taint = true; + break; + } + } + } + } + } + } else { + // Check positional argument at vulnerable_parameter_index. + // When vulnerable_keyword is specified, skip Phase 1 entirely — the sink + // is keyword-only (e.g. create(password=tainted), not create(tainted)). + // Without this guard, Q.create(tainted_list) fires PLAIN_PWD001 because + // args[0] is tainted even though no password= keyword is present. + let skip_positional = sink.vulnerable_keyword.is_some(); + if !skip_positional { if let Some(args) = call_node.children.get("args") { if args.len() > sink.vulnerable_parameter_index { let arg = &args[sink.vulnerable_parameter_index]; let arg_names = extract_all_names(arg); - for name in arg_names { - if let Some(_origins) = state.get(&name) { - // We found a tainted variable flowing to a sink - - println!("[!] VULNERABILITY: Tainted variable '{}' flows to sink '{}'", name, call_name); - report_issue(ruleset, &sink.vulnerability_id, file_path, call_node, content, issues); - break; // Report once per sink call + if is_tainted_for_sink(state, &name, triggers_on) { + found_taint = true; + break; } } + // Also check if the arg contains an inline taint source call + // e.g. 
httpx.stream("GET", r.json()["url"]) — r.json() is a source + if !found_taint { + let mut inline_calls: Vec<&AstNode> = Vec::new(); + find_call_sites(arg, &mut inline_calls); + for inline_call in inline_calls { + let inline_name = get_full_call_name(inline_call); + let is_inline_source = ruleset.taint_sources.iter().any(|s| { + if s.function_call.contains('.') { + inline_name.contains(&s.function_call) || + s.function_call.contains(&inline_name) + } else { + inline_name == s.function_call + } + }); + if is_inline_source { + found_taint = true; + break; + } + } + } + } + } + } // end skip_positional guard + } + + // Phase 3: keyword arguments for positional-arg sinks only. + // If vulnerable_keyword is set, only that named kwarg triggers. + // Otherwise, any tainted kwarg can trigger (for sinks that accept kwargs). + if !found_taint && !sink.vulnerable_receiver { + if let Some(keywords) = call_node.children.get("keywords") { + for kw in keywords { + let kw_arg_name = kw.fields.get("arg") + .and_then(|v| v.as_ref()).and_then(|v| v.as_str()) + .unwrap_or(""); + // If vulnerable_keyword is specified, skip non-matching kwargs + if let Some(ref vk) = sink.vulnerable_keyword { + if kw_arg_name != vk.as_str() { continue; } + } + if let Some(kw_value) = kw.children.get("value").and_then(|v| v.get(0)) { + let kw_names = get_direct_taint_names(kw_value); + for name in kw_names { + if is_attacker_tainted(state, &name) { + found_taint = true; + break; + } + } + } + if found_taint { break; } } } } + + if found_taint { + println!("[!] 
VULNERABILITY: Tainted variable flows to sink '{}'", call_name); + report_issue(ruleset, &sink.vulnerability_id, file_path, call_node, content, issues); + } + // Note: found_taint is true only when is_attacker_controlled() returned true + // (see get_direct_taint_names usage above — we check state.contains_key which + // only contains attacker-controlled taint after the provenance gate below) } } -fn check_args_tainted(call_node: &AstNode, state: &TaintState) -> bool { - if let Some(args) = call_node.children.get("args") { - for arg in args { - let names = extract_all_names(arg); - if names.iter().any(|name| state.contains_key(name)) { - return true; +/// Check if an f-string (JoinedStr) contains a directly tainted variable and report FSTRING867. +/// +/// Uses get_direct_taint_names (not extract_all_names) so only DIRECT variable references +/// inside the f-string slots trigger the rule. This prevents FPs where tainted data is +/// wrapped in a safe function call: `f"count: {len(data)}"` does NOT fire because `len()` +/// transforms the tainted data before interpolation (result is an integer, not injectable). +/// +/// Cases that fire: +/// f"{user_input}" — direct Name reference, tainted → fires +/// f"{obj.field}" — Attribute, field is tainted → fires +/// f"{data[key]}" — Subscript, data is tainted → fires +/// +/// Cases that do NOT fire (correctly suppressed): +/// f"{len(tainted_list)}" — len() wraps it, returns int, not injectable +/// f"{str(tainted)}" — str() is a safe conversion +/// f"{repr(tainted)}" — repr() wraps it safely +/// f"{x!r}" — !r conversion quotes the value (same as repr) +/// f"{x!a}" — !a conversion applies ascii(), quotes non-ASCII +fn check_fstring_taint( + node: &AstNode, + state: &TaintState, + ruleset: &RuleSet, + file_path: &str, + content: &str, + issues: &mut Vec, +) { + // JoinedStr.children["values"] contains Constant and FormattedValue nodes. 
+ if let Some(values) = node.children.get("values") { + for val in values { + if val.node_type == "FormattedValue" { + // Skip slots with repr/ascii conversion: {x!r} and {x!a} quote the value, + // making it safe for injection. conversion field: 114='r', 97='a', 115='s', -1=none. + let conversion = val.fields.get("conversion") + .and_then(|v| v.as_ref()).and_then(|v| v.as_i64()) + .unwrap_or(-1); + if conversion == 114 || conversion == 97 { // !r or !a + continue; + } + // FormattedValue.children["value"] is the expression inside {}. + if let Some(expr) = val.children.get("value").and_then(|v| v.get(0)) { + // Use get_direct_taint_names: only direct Name/Attribute/Subscript + // references — NOT recursive into function call arguments. + let names = get_direct_taint_names(expr); + for name in names { + if is_attacker_tainted(state, &name) { + report_issue(ruleset, "FSTRING867", file_path, node, content, issues); + return; // report once per f-string + } + } + } } } } - false +} + +/// Returns a dotted string representing the container of a Subscript node. +/// For `request.GET['key']` returns "request.GET". 
+fn get_subscript_container(node: &AstNode) -> String { + if let Some(value) = node.children.get("value").and_then(|v| v.get(0)) { + match value.node_type.as_str() { + "Attribute" => { + let mut parts = Vec::new(); + let mut cur = value; + loop { + if let Some(attr) = cur.fields.get("attr").and_then(|v| v.as_ref()).and_then(|v| v.as_str()) { + parts.push(attr.to_string()); + } + if let Some(next) = cur.children.get("value").and_then(|v| v.get(0)) { + cur = next; + } else { + break; + } + } + if let Some(base) = cur.fields.get("id").and_then(|v| v.as_ref()).and_then(|v| v.as_str()) { + parts.push(base.to_string()); + } + parts.reverse(); + parts.join(".") + } + "Name" => value.fields.get("id") + .and_then(|v| v.as_ref()) + .and_then(|v| v.as_str()) + .unwrap_or("") + .to_string(), + _ => String::new(), + } + } else { + String::new() + } +} + +/// Returns all Name identifiers in the base (non-slice) part of a Subscript. +/// For `tainted_dict['key']` returns ["tainted_dict"]. +fn get_subscript_base_names(node: &AstNode) -> Vec { + if let Some(value) = node.children.get("value").and_then(|v| v.get(0)) { + extract_all_names(value) + } else { + Vec::new() + } } fn extract_function_params(func_node: &AstNode) -> Vec { @@ -546,8 +1720,204 @@ fn get_full_call_name(call_node: &AstNode) -> String { String::new() } +/// Inspect a FunctionDef node's decorator_list and return the names of parameters +/// that receive user-controlled input based on known entry-point decorators. 
+/// +/// Supported frameworks and decorator patterns: +/// +/// **CLI** (click, typer, argparse): +/// @click.command / @click.option("--flag", "param_name") / @click.argument("name") +/// @app.command() / @typer.option / @typer.argument (Typer uses same conventions) +/// +/// **Web** (Flask, FastAPI, Django REST, aiohttp, Bottle, Falcon, Starlette): +/// @app.route("/path") / @app.get / @app.post / @app.put / @app.delete / @app.patch +/// @router.get / @router.post / @api_view / @require_http_methods +/// @web.get / @web.post (aiohttp) +/// +/// **Task queues** (Celery, RQ, Huey, Dramatiq): +/// @app.task / @celery.task / @shared_task / @dramatiq.actor / @huey.task +/// @periodic_task / @rq.job +/// +/// **Event handlers** (Django signals, Flask signals, AWS Lambda, GCP Functions): +/// @receiver(signal) / @app.before_request / @app.after_request +/// @lambda_handler / @functions_framework.http +/// +/// For all of these, ALL parameters (except self/cls) are considered user-controlled +/// because the framework injects request/event/message data into them. +/// Parameters classified by decorator type and the taint origin they should receive. +struct EntryPointParams { + /// HTTP decorator params (@app.route, @api_view) → TaintOrigin::HttpRequest. + /// Attacker-controlled: any internet user can send arbitrary values. + http: Vec, + /// CLI decorator params (@app.command, @click.option) → TaintOrigin::OperatorConfig. + /// Operator-trusted: the person running the tool chose these values. + /// FILE_DESERIALIZERS still produce HttpRequest when reading file *contents*, + /// so supply-chain detection is preserved even for operator-specified file paths. 
+ operator: Vec, +} + +impl EntryPointParams { + fn is_empty(&self) -> bool { self.http.is_empty() && self.operator.is_empty() } +} + +fn extract_cli_tainted_params(func_node: &AstNode) -> EntryPointParams { + let mut result = EntryPointParams { http: Vec::new(), operator: Vec::new() }; + + let decorator_list = match func_node.children.get("decorator_list") { + Some(d) => d, + None => return result, + }; + + // HTTP entry points — parameters receive attacker-controlled data from network requests. + // These produce HttpRequest taint which triggers all security sinks. + const HTTP_TAINT_DECORATOR_ATTRS: &[&str] = &[ + // Web frameworks — route/endpoint decorators + "route", "get", "post", "put", "delete", "patch", "head", "options", + // Django REST Framework + "api_view", "action", "require_http_methods", "require_GET", "require_POST", + // aiohttp + "view", "endpoint", + // Starlette / FastAPI router + "add_route", + // Task queues — tasks receive data from external message brokers + "task", "shared_task", "periodic_task", "actor", "job", + // Event handlers + "receiver", "before_request", "after_request", "teardown_request", + "before_app_request", "after_app_request", + // Serverless + "handler", + ]; + + // CLI entry points (Click, Typer) are treated the same as HTTP entry points: + // both produce HttpRequest taint on all parameters. + // Rationale: CLI tools that process third-party file contents (plugin configs, + // user-supplied data) share the same supply-chain risk as HTTP handlers. 
+ const CLI_TAINT_DECORATOR_ATTRS: &[&str] = &[ + "command", "group", + ]; + + let mut has_http_taint_decorator = false; + let mut has_cli_taint_decorator = false; + let mut click_option_params: Vec = Vec::new(); + + for decorator in decorator_list { + if decorator.node_type != "Call" { + // Bare decorator (no parens): @app.route, @app.command + if let Some(func) = decorator.children.get("func").and_then(|v| v.get(0)) { + let attr = func.fields.get("attr") + .and_then(|v| v.as_ref()).and_then(|v| v.as_str()) + .unwrap_or(""); + if HTTP_TAINT_DECORATOR_ATTRS.contains(&attr) { + has_http_taint_decorator = true; + } else if CLI_TAINT_DECORATOR_ATTRS.contains(&attr) { + has_cli_taint_decorator = true; + } + } + continue; + } + + // Call decorator: @click.option("--flag", "param_name") etc. + let func = match decorator.children.get("func").and_then(|v| v.get(0)) { + Some(f) => f, + None => continue, + }; + + let attr = func.fields.get("attr") + .and_then(|v| v.as_ref()).and_then(|v| v.as_str()) + .unwrap_or(""); + + if HTTP_TAINT_DECORATOR_ATTRS.contains(&attr) { + has_http_taint_decorator = true; + continue; + } else if CLI_TAINT_DECORATOR_ATTRS.contains(&attr) { + has_cli_taint_decorator = true; + continue; + } + + // click.option("--flag-name", "python_param_name") or just ("--flag-name") + if attr == "option" { + let args = decorator.children.get("args").map(|v| v.as_slice()).unwrap_or(&[]); + let param_name = if args.len() >= 2 { + // Second positional arg is the explicit Python parameter name + args[1].fields.get("value") + .and_then(|v| v.as_ref()).and_then(|v| v.as_str()) + .map(|s| s.to_string()) + } else if args.len() == 1 { + // Derive from flag: "--my-option" → "my_option" + args[0].fields.get("value") + .and_then(|v| v.as_ref()).and_then(|v| v.as_str()) + .map(|s| s.trim_start_matches('-').replace('-', "_")) + } else { + None + }; + if let Some(name) = param_name { + click_option_params.push(name); + } + } + + // click.argument("param_name") or 
typer.argument + if attr == "argument" { + let args = decorator.children.get("args").map(|v| v.as_slice()).unwrap_or(&[]); + if let Some(first) = args.first() { + if let Some(name) = first.fields.get("value") + .and_then(|v| v.as_ref()).and_then(|v| v.as_str()) + { + click_option_params.push(name.to_lowercase()); + } + } + } + } + + // Helper closure: collect all non-self/cls parameter names + let collect_params = |args_node: &AstNode| -> Vec { + let mut names = Vec::new(); + for key in &["args", "posonlyargs", "kwonlyargs"] { + if let Some(params) = args_node.children.get(*key) { + for param in params { + if let Some(name) = param.fields.get("arg") + .and_then(|v| v.as_ref()).and_then(|v| v.as_str()) + { + if name != "self" && name != "cls" { + names.push(name.to_string()); + } + } + } + } + } + names + }; + + if has_http_taint_decorator { + // HTTP entry point: all params → HttpRequest (attacker-controlled via network) + if let Some(args_node) = func_node.children.get("args").and_then(|v| v.get(0)) { + for name in collect_params(args_node) { + result.http.push(name); + } + } + } else if has_cli_taint_decorator { + // CLI entry point: all params → OperatorConfig (operator chose these values). + // The operator is trusted for PATH/URL choices. File CONTENTS they point to + // may be third-party — FILE_DESERIALIZERS will upgrade those to HttpRequest. 
+ if let Some(args_node) = func_node.children.get("args").and_then(|v| v.get(0)) { + for name in collect_params(args_node) { + result.operator.push(name); + } + } + } else { + // @click.option / @click.argument without a command decorator: + // these are also operator-controlled inputs + result.operator.extend(click_option_params); + } + + result +} + fn report_issue(ruleset: &RuleSet, vuln_id: &str, file_path: &str, stmt: &AstNode, content: &str, issues: &mut Vec) { if let Some(vuln_rule) = ruleset.rules.iter().find(|r| r.id == vuln_id) { + // Apply global and rule-level file exclusions (path + content) to taint findings + if vuln_rule.is_excluded(file_path, content, &ruleset.defaults) { + return; + } let line_content = content.lines().nth(stmt.lineno.saturating_sub(1) as usize).unwrap_or("").to_string(); issues.push(Issue::new( vuln_rule.id.clone(), diff --git a/src/pyspector/_rust_core/src/graph/call_graph_builder.rs b/src/pyspector/_rust_core/src/graph/call_graph_builder.rs index 312be4c2..04275034 100644 --- a/src/pyspector/_rust_core/src/graph/call_graph_builder.rs +++ b/src/pyspector/_rust_core/src/graph/call_graph_builder.rs @@ -11,46 +11,121 @@ pub struct CallGraph<'a> { pub file_contents: HashMap, } +/// Returns true if a file path should be excluded from taint analysis. +/// Excluded: test files, documentation code, and example code. +/// +/// These files are excluded because: +/// - Test files: test functions never receive real attacker-controlled data, +/// so they only add functions without adding security-relevant taint paths. +/// - Docs/examples: tutorial and example code uses hardcoded credentials, +/// simplified patterns, and intentional anti-patterns for illustration. +/// Including them as taint entry points produces false positives in the +/// library code being demonstrated. 
/// Returns true if a file path should be excluded from taint analysis.
///
/// Excluded are test infrastructure (tests, conftest, fixtures, mocks),
/// documentation/example/tutorial/sample/demo code, project maintenance
/// scripts, and machine-generated `pydoc_data` files. Matching is
/// case-insensitive and treats `/` and `\` as the same separator.
///
/// A path counts as test infrastructure when it:
/// * starts with `test`, or contains a path component starting with `test`;
/// * ends with `_test.py`;
/// * contains a component starting with `conftest` or `fixture`;
/// * contains a component starting with `mock` and ends with `.py`.
///
/// It is otherwise excluded when any *directory* component (anything but the
/// final file name) is one of the names in `EXCLUDED_DIRS`.
fn is_test_file(file_path: &str) -> bool {
    // Directories whose entry points are docs, demos or project tooling
    // rather than production, user-facing code.
    const EXCLUDED_DIRS: &[&str] = &[
        "docs", "docs_src",
        "examples", "example",
        "tutorial", "tutorials",
        "samples", "demo",
        "scripts",
        "pydoc_data",
    ];

    // Normalise once so every rule below is written a single time and
    // applies equally to Unix and Windows style paths.
    let path = file_path.to_lowercase().replace('\\', "/");

    let is_test_infrastructure = path.starts_with("test")
        || path.ends_with("_test.py")
        || path.contains("/test")
        || path.contains("/conftest")
        || path.contains("/fixture")
        || (path.contains("/mock") && path.ends_with(".py"));
    if is_test_infrastructure {
        return true;
    }

    // Compare whole directory components so "mydocs/" does not match "docs",
    // while a top-level "docs/..." still does.
    let mut components = path.split('/');
    components.next_back(); // the last component is the file name
    components.any(|dir| EXCLUDED_DIRS.contains(&dir))
}

// Builds a call graph from all parsed Python files.
pub fn build_call_graph(py_files: &[PythonFile]) -> CallGraph { - println!("[*] Building call graph from {} files", py_files.len()); - + let production_files: Vec<&PythonFile> = py_files + .iter() + .filter(|f| !is_test_file(&f.file_path)) + .collect(); + + println!("[*] Building call graph from {}/{} files (test files excluded from taint analysis)", + production_files.len(), py_files.len()); + let mut call_graph = CallGraph::default(); let mut all_funcs = HashMap::new(); - // First pass: find all function definitions and store their content. - for file in py_files { - println!("[*] Processing file: {}", file.file_path); - + // First pass: find all function definitions. + // Removed per-file and per-function println — 18k+ print syscalls dominated runtime. + for file in &production_files { if let Some(ast) = &file.ast { let mut funcs_in_file = Vec::new(); find_functions(ast, &mut funcs_in_file); - + for func_node in funcs_in_file { if let Some(func_name) = get_name_from_node(func_node) { let func_id = format!("{}::{}", file.file_path, func_name); - println!("[*] Found function: {}", func_id); all_funcs.insert(func_id, func_node); } } } call_graph.file_contents.insert(file.file_path.clone(), file.content.clone()); } - + call_graph.functions = all_funcs; println!("[+] Found {} total functions", call_graph.functions.len()); - // Second pass: find all call sites in each function. + // Build a name index: bare_function_name → [func_id, ...] for O(1) call resolution. + // Without this index, Pass 2 is O(functions × call_sites × functions) — O(n²). + // With the index it's O(functions × call_sites) — O(n). 
+ let mut name_index: HashMap> = HashMap::new(); + for func_id in call_graph.functions.keys() { + // Extract bare name after "::" (may include class prefix like "ClassName.method") + if let Some(bare) = func_id.rsplit("::").next() { + name_index.entry(bare.to_string()).or_default().push(func_id.clone()); + // Also index just the method suffix for "ClassName.method" → "method" + if let Some(method) = bare.rsplit('.').next() { + if method != bare { + name_index.entry(method.to_string()).or_default().push(func_id.clone()); + } + } + } + } + + // Second pass: resolve call sites using the O(1) index. for (func_id, func_node) in &call_graph.functions { let mut calls = HashSet::new(); let mut call_sites = Vec::new(); find_call_sites(func_node, &mut call_sites); - + for call_node in call_sites { let callee_name = get_full_call_name(call_node); - for (potential_target_id, _) in &call_graph.functions { - if potential_target_id.ends_with(&format!("::{}", callee_name)) { - calls.insert(potential_target_id.clone()); + if callee_name.is_empty() { continue; } + + // Direct lookup: exact callee name + if let Some(targets) = name_index.get(&callee_name) { + calls.extend(targets.iter().cloned()); + } + // Method suffix lookup: "obj.method" → "method" + if let Some(method) = callee_name.rsplit('.').next() { + if method != callee_name { + if let Some(targets) = name_index.get(method) { + calls.extend(targets.iter().cloned()); + } } } } diff --git a/src/pyspector/_rust_core/src/graph/cfg_builder.rs b/src/pyspector/_rust_core/src/graph/cfg_builder.rs index 9b62122a..2052c502 100644 --- a/src/pyspector/_rust_core/src/graph/cfg_builder.rs +++ b/src/pyspector/_rust_core/src/graph/cfg_builder.rs @@ -23,6 +23,11 @@ fn build_from_statements( for stmt in stmts { match stmt.node_type.as_str() { "If" => { + // Add the If node to the current block so taint analysis can scan + // the condition for call-site taint (e.g. 
`if not plugin.initialize(config)`) + if let Some(block) = cfg.blocks.get_mut(&current_block_id) { + block.statements.push(stmt.clone()); + } // Create blocks for the two branches and the merge point after the if/else let if_body_block_id = cfg.add_block().id; let merge_block_id = cfg.add_block().id; @@ -55,6 +60,12 @@ fn build_from_statements( current_block_id = merge_block_id; } "For" | "While" => { + // Add the For/While node to the current block so taint analysis + // can see the loop variable binding (target = iter element). + if let Some(block) = cfg.blocks.get_mut(&current_block_id) { + block.statements.push(stmt.clone()); + } + let loop_body_id = cfg.add_block().id; let after_loop_id = cfg.add_block().id; @@ -83,6 +94,31 @@ fn build_from_statements( // A break creates a new, unconnected block after it to stop flow current_block_id = cfg.add_block().id; } + // With statement: add the With node itself (so taint analysis can handle + // `with X as y` bindings), then unfold the body into the same block so + // body statements are processed in sequence after `y` is tainted. + "With" => { + if let Some(block) = cfg.blocks.get_mut(&current_block_id) { + block.statements.push(stmt.clone()); + } + if let Some(body) = stmt.children.get("body") { + current_block_id = build_from_statements(cfg, body, current_block_id, loop_exits); + } + } + // Try/except: unfold the body so taint flows through guarded calls. + // Exceptions are uncommon taint paths; we conservatively analyze the + // try-body as if it executes sequentially (no exception handling model).
+ "Try" | "TryStar" => { + if let Some(body) = stmt.children.get("body") { + current_block_id = build_from_statements(cfg, body, current_block_id, loop_exits); + } + // Also process the else branch (runs when no exception) + if let Some(orelse) = stmt.children.get("orelse") { + if !orelse.is_empty() { + current_block_id = build_from_statements(cfg, orelse, current_block_id, loop_exits); + } + } + } // For all other statements, just add them to the current block _ => { if let Some(block) = cfg.blocks.get_mut(&current_block_id) { diff --git a/src/pyspector/_rust_core/src/graph/representation.rs b/src/pyspector/_rust_core/src/graph/representation.rs index b6c417b7..88052838 100644 --- a/src/pyspector/_rust_core/src/graph/representation.rs +++ b/src/pyspector/_rust_core/src/graph/representation.rs @@ -23,7 +23,7 @@ impl BasicBlock { } } -#[derive(Debug, Default)] +#[derive(Debug, Default, Clone)] pub struct ControlFlowGraph { pub blocks: HashMap, pub entry: BlockId, diff --git a/src/pyspector/_rust_core/src/rules.rs b/src/pyspector/_rust_core/src/rules.rs index 1af59fdc..e4d38524 100644 --- a/src/pyspector/_rust_core/src/rules.rs +++ b/src/pyspector/_rust_core/src/rules.rs @@ -36,22 +36,46 @@ pub struct Rule { /// Rule-level glob to exclude specific files (stacks on top of [defaults]). #[serde(default)] pub exclude_file_pattern: Option, + /// Regex checked against the FULL FILE CONTENT. If the file content matches, + /// this rule is suppressed for that file regardless of line-level matches. + /// Use to avoid library-specific FPs: e.g. suppress yaml.load() findings in + /// files that import ruamel.yaml (which is safe by default). + /// Example: file_content_exclude = "from ruamel\\.yaml|import ruamel" + #[serde(with = "serde_regex", default)] + pub file_content_exclude: Option, } impl Rule { - /// Returns true if `file_path` is excluded by this rule's own exclude_file_pattern - /// OR by the global defaults.
+ /// Path-only exclusion check: returns true if `file_path` matches a global default or + /// rule-level path pattern. No file content is passed, so `file_content_exclude` is + /// never evaluated here; call `is_excluded` with the file content for the full check. pub fn is_file_excluded(&self, file_path: &str, defaults: &Defaults) -> bool { + self.is_excluded(file_path, "", defaults) + } + + /// Full exclusion check: path patterns + optional file content regex. + /// Pass file content when available for the most accurate result. + pub fn is_excluded(&self, file_path: &str, content: &str, defaults: &Defaults) -> bool { // Check global default exclusions first for pattern in &defaults.exclude_file_patterns { if wildmatch::WildMatch::new(pattern).matches(file_path) { return true; } } - // Then rule-level exclusion + // Then rule-level file path exclusion (supports comma-separated patterns) if let Some(efp) = &self.exclude_file_pattern { - if wildmatch::WildMatch::new(efp).matches(file_path) { - return true; + for pattern in efp.split(',') { + if wildmatch::WildMatch::new(pattern.trim()).matches(file_path) { + return true; + } + } + } + // Finally, file content exclusion — suppress rule if the file imports + // a library or uses a pattern that makes the rule inapplicable. + if !content.is_empty() { + if let Some(fce) = &self.file_content_exclude { + if fce.is_match(content) { + return true; + } } } false @@ -74,14 +98,48 @@ pub struct TaintSinkRule { pub vulnerability_id: String, pub description: String, pub function_call: String, + /// Index of the positional argument that must be tainted to trigger this sink. + /// Ignored when vulnerable_receiver = true. + #[serde(default)] pub vulnerable_parameter_index: usize, + /// When true, the method *receiver* (the object before the dot) must be + /// tainted rather than a positional argument. + /// e.g. tainted_template.format(...) → receiver "tainted_template" is the risk.
+ #[serde(default)] + pub vulnerable_receiver: bool, + /// When true, this sink is a method call (called as obj.method()), so matching + /// uses ends_with(".function_call"). When false (default), it is a direct builtin + /// call (e.g. set(), open()) matched with exact equality to prevent "cache.set" + /// matching the "set" builtin sink. + #[serde(default)] + pub is_method: bool, + /// Which taint origins trigger this sink (default = "all" attacker-controlled). + /// "injectable_only" — only fires for HttpRequest/External, NOT ShellSanitized. + /// Use for shell injection sinks (PY102): shlex.quote() is a valid mitigation. + /// "all" (default) — fires for HttpRequest, External, AND ShellSanitized. + /// Use for path/SQL/URL sinks where shlex.quote doesn't help. + #[serde(default = "default_triggers_on")] + pub triggers_on: String, + /// When set, only this named keyword argument triggers the sink. + /// e.g. vulnerable_keyword = "password" fires only on create(..., password=tainted). + /// When absent, any tainted positional or keyword arg may trigger. + #[serde(default)] + pub vulnerable_keyword: Option, } +fn default_triggers_on() -> String { "all".to_string() } + #[derive(Debug, Deserialize)] pub struct TaintSanitizerRule { pub id: String, pub description: String, pub function_call: String, + /// When set, the sanitizer does NOT clear taint but transforms its origin. + /// e.g. transforms_to = "ShellSanitized" means shlex.quote() turns + /// HttpRequest taint into ShellSanitized taint — still risky for path + /// traversal / f-strings, but safe for shell injection (PY102). 
+ #[serde(default)] + pub transforms_to: Option, } #[derive(Debug, Deserialize)] diff --git a/src/pyspector/cli.py b/src/pyspector/cli.py index 50ae9ca5..af1608cf 100644 --- a/src/pyspector/cli.py +++ b/src/pyspector/cli.py @@ -645,13 +645,14 @@ def _execute_scan( ) # ── AST Generation ──────────────────────────────────────────────────── + t_parse = time.time() ast_stats_meta: Dict[str, int] = {} python_files_data = get_python_file_asts( scan_path, enable_syntax_warnings=syntax_warnings, _stats_meta=ast_stats_meta, ) - click.echo(f"[*] Successfully parsed {len(python_files_data)} Python files") + click.echo(f"[*] Successfully parsed {len(python_files_data)} Python files in {time.time()-t_parse:.2f}s") if stats: stats.record_files( @@ -704,10 +705,12 @@ def _execute_scan( click.echo(click.style(f"Error during supply chain scan: {e}", fg="red")) # ── Run Scan (Rust core) ─────────────────────────────────────────────── + t_rust = time.time() try: raw_issues = run_scan( str(scan_path.resolve()), rules_toml_str, config, python_files_data ) + click.echo(f"[*] Rust core scan: {time.time()-t_rust:.2f}s") except ValueError as e: click.echo( click.style( diff --git a/src/pyspector/plugin_system.py b/src/pyspector/plugin_system.py index d40e662a..91bd4564 100644 --- a/src/pyspector/plugin_system.py +++ b/src/pyspector/plugin_system.py @@ -147,6 +147,13 @@ def validate_plugin_code(plugin_path: Path) -> tuple[bool, str]: "eval", "exec", "compile", "__import__", # Reflection/introspection "vars", "getattr", + # Sandbox escape via class hierarchy traversal — + # object.__subclasses__() retrieves ALL loaded classes (including subprocess.Popen) + # without any import, bypassing every import-level check. + "__subclasses__", + # Globals access via function object — exposes the full module namespace + # of any function, including builtins and imported modules. 
+ "__globals__", "__builtins__", # importlib — dynamic module loading (all public entry-points) "importlib.import_module", "importlib.util.spec_from_file_location", @@ -201,6 +208,9 @@ def validate_plugin_code(plugin_path: Path) -> tuple[bool, str]: "getoutput", "getstatusoutput", "exec", "eval", "compile", "load_module", "exec_module", # importlib loader API + # Sandbox escape primitives + "__subclasses__", "__globals__", "__builtins__", + "__reduce__", "__reduce_ex__", # pickle deserialization hooks } warning_calls: set[str] = {"open", "builtins.open"} @@ -303,7 +313,7 @@ def visit_Call(self, node: ast.Call) -> None: else: simplified = name.replace("builtins.", "") - + if simplified in fatal_calls: detected_fatal.add(simplified) elif simplified in warning_calls: @@ -316,7 +326,15 @@ def visit_Call(self, node: ast.Call) -> None: detected_fatal.add(normalised) elif normalised in warning_calls: detected_warnings.add(normalised) - + + # Also block dangerous dunder methods regardless of receiver: + # object.__subclasses__(), cls.__subclasses__(), etc. + # These are sandbox-escape primitives and have no place in plugins. + if "." 
in simplified: + method_attr = simplified.rsplit(".", 1)[-1] + if method_attr in dangerous_opaque_attrs: + detected_fatal.add(f".{method_attr}()") + self.generic_visit(node) Analyzer().visit(tree) diff --git a/src/pyspector/reporting.py b/src/pyspector/reporting.py index fb355ee5..2e58b98e 100644 --- a/src/pyspector/reporting.py +++ b/src/pyspector/reporting.py @@ -132,7 +132,7 @@ def to_json(self) -> str: "file_path": issue.file_path, "line_number": issue.line_number, "code": issue.code, - "severity": _severity_key(issue), + "severity": str(issue.severity).split(".")[-1], "remediation": issue.remediation, } for issue in self.issues diff --git a/src/pyspector/rules/built-in-rules.toml b/src/pyspector/rules/built-in-rules.toml index 7a7c11f5..8fd5df65 100644 --- a/src/pyspector/rules/built-in-rules.toml +++ b/src/pyspector/rules/built-in-rules.toml @@ -7,10 +7,32 @@ # File-path globs excluded from ALL rules unless a rule opts out. # Add paths here instead of repeating exclude_file_pattern on each rule. exclude_file_patterns = [ - "*tests*", # test directories and test_*.py / *_test.py files - "*fixtures*", # fixture data - "*testdata*", # test data - "*conftest*", # pytest configuration + "*tests*", # test directories and test_*.py / *_test.py files + "*fixtures*", # fixture data — never production code + "*testdata*", # test data + "*conftest*", # pytest configuration + "*/test/*", # test infrastructure directories (e.g. django/test/) + "*lorem_ipsum*", # demo/placeholder text generators + "*fake_data*", # synthetic data generators + "*sample_data*", # sample data files + # Documentation and example code — hardcoded credentials/simplified patterns are intentional. + # Patterns anchor on path separators to avoid substring matches (e.g. "frutadocs"). 
+ "*/docs/*", # /docs/ as a path component (nested) + "docs/*", # top-level docs/ + "*/docs_src/*", # /docs_src/ — documentation source (used by many projects) + "docs_src/*", # top-level docs_src/ + "*/examples/*", # /examples/ as a path component + "examples/*", # top-level examples/ + "*/example/*", # /example/ as a path component + "example/*", # top-level example/ + "*/samples/*", # /samples/ + "*/demo/*", # /demo/ + "*/tutorial/*", # /tutorial/ + "*/tutorials/*", # /tutorials/ + # Machine-generated data files — contain language docs/data as string literals, + # not executable code. Pattern-matching against these produces 100% FPs. + "*/pydoc_data/*", # Python language docs embedded as string dictionaries + "pydoc_data/*", ] # Rules disabled globally because they produce 100% false positives by flagging @@ -18,36 +40,9 @@ exclude_file_patterns = [ # These rules have no security value on their own without taint analysis. # Re-enable any of these per-project by removing the ID from this list. 
disabled_rule_ids = [ - # Python built-in functions — not security sinks without taint context - "ABS1089", "ALL1107", "ANY1104", "BOOL1035", "BYTEARRAY1008", "BYTES1005", - "CALLABLE1131", "CAPITALIZE954", "CASEFOLD918", "CHR1017", "CLASSMETHOD1125", - "COUNT909", "DECODE882", "DICT1050", "DIR849", "DIVMOD1098", - "ENCODE885", "ENDSWITH900", "ENUMERATE1059", "FILTER1068", "FIND903", - "FLOAT1029", "FROZENSET1053", "HASH1137", "HEX1020", "ID1134", - "INDEX906", "INT1038", "ISALPHA972", "ISASCII975", "ISDIGIT981", - "ISIDENTIFIER984", "ISINSTANCE855", "ISPRINTABLE993", "ISSPACE996", - "ISUPPER1002", "ITER1110", "JOIN876", "LEN1101", "LIST1041", - "LJUST930", "LOWER888", "LSTRIP957", "MAP1065", "MAX1083", - "MEMORYVIEW1011", "MIN1086", "NEXT1113", "ORD1014", "PARTITION936", - "PRINT1146", "PROPERTY1119", "RANGE1056", "REDUCE1071", "REMOVEPREFIX963", - "REMOVESUFFIX966", "REPLACE879", "REPR858", "REVERSED1077", "RJUST933", - "ROUND1092", "RPARTITION939", "RSPLIT942", "RSTRIP960", "SET1047", - "SLICE1116", "SORTED1074", "SPLIT873", "SPLITLINES945", "STARTSWITH897", - "STATICMETHOD1122", "STR861", "STRIP894", "SUM1080", "SUPER1128", - "TITLE951", "TRANSLATE912", "TUPLE1044", "TYPE852", "UPPER891", - "VARS840", "ZIP1062", - # Medium-noise rules: too broad without taint analysis - "FSTRING867", # every f-string is NOT an injection risk - "GETATTR828", # every getattr() is NOT unsafe - "SETATTR831", # every setattr() is NOT unsafe - "HASATTR837", # every hasattr() is NOT a disclosure risk - "DELATTR834", # every delattr() is NOT unsafe - "FORMAT864", # every .format() is NOT an injection risk - "DJG513", # csrf_exempt covered by CSRF747 already - "MIME786", # HttpResponse with content_type is not a vulnerability - "BRUTE765", # login_required is not "missing brute force protection" - "INFO738", # traceback.print_exc is not information disclosure by itself - "SER522", # serializers.serialize() is not inherently unsafe + # Valid concept, needs taint or context to avoid FPs 
before activating: + "CACHE756", # cache.set(request.*) — cache poisoning; needs taint to confirm HTTP origin + "INFO738", # traceback.print_exc() — information disclosure; needs prod-vs-test context ] # ------------------------------------------- @@ -60,2153 +55,2097 @@ description = "Data from a web request is considered tainted." function_call = "request.get" taint_target = "return" -[[taint_sink]] -id = "SK001" -vulnerability_id = "PY102" # This sink triggers the high-confidence Command Injection rule -description = "Data is passed to a command execution function." -function_call = "subprocess.run" -vulnerable_parameter_index = 0 +[[taint_source]] +id = "TS002" +description = "Django GET parameter is tainted." +function_call = "request.GET.get" +taint_target = "return" -[[taint_sanitizer]] -id = "SN001" -description = "Shell argument escaping sanitizes data for command execution." -function_call = "shlex.quote" +[[taint_source]] +id = "TS003" +description = "Django POST parameter is tainted." +function_call = "request.POST.get" +taint_target = "return" -# ------------------------------------------- -# SECTION: Injection (OWASP A03:2021) -# ------------------------------------------- +[[taint_source]] +id = "TS004" +description = "Flask query string parameter is tainted." +function_call = "request.args.get" +taint_target = "return" -[[rule]] -id = "PY102" -description = "Command Injection detected via Taint Analysis." -severity = "Critical" -confidence = "High" -remediation = "User-controlled data reached a command execution function without sanitization. Use 'shlex.quote()' to escape arguments or avoid passing user input to shell commands entirely." -# NOTE: This rule has no 'pattern' or 'ast_match'. It is triggered ONLY by the taint engine. +[[taint_source]] +id = "TS005" +description = "Flask form field is tainted." +function_call = "request.form.get" +taint_target = "return" -[[rule]] -id = "PY001" -description = "Use of 'eval()' is highly dangerous." 
-severity = "High" -remediation = "Avoid 'eval()'. Use safer alternatives like 'ast.literal_eval' for data parsing." -ast_match = "Call(func.id=eval)" -file_pattern = "*.py" +[[taint_source]] +id = "TS006" +description = "Interactive user input is tainted." +function_call = "input" +taint_target = "return" -[[rule]] -id = "PY103" -description = "Use of os.system is a command injection risk." -severity = "High" -remediation = "Avoid 'os.system'. Use the 'subprocess' module with command and arguments as a list." -ast_match = "Call(func.value.id=os, func.attr=system)" -file_pattern = "*.py" +[[taint_source]] +id = "TS007" +description = "Environment variable is considered tainted." +function_call = "os.environ.get" +taint_target = "return" -[[rule]] -id = "PY101" -description = "Potential SQL injection via string formatting in database query." -severity = "Critical" -confidence = "High" -remediation = "Use parameterized queries (e.g., cursor.execute('SELECT * FROM users WHERE name = ?', (name,))) instead of string formatting." -pattern = "\\.(execute|executemany)\\s*\\(\\s*f?[\"'].*SELECT.*(FROM|UPDATE|DELETE).*%s.*[\"']" -file_pattern = "*.py" +[[taint_source]] +id = "TS008" +description = "CLI argument via argparse — user-controlled input." +function_call = "parse_args" +taint_target = "return" -[[rule]] -id = "PY104" -description = "LDAP injection may be possible with string formatting." -severity = "High" -remediation = "Use a proper LDAP escaping library for any user-controlled data in LDAP queries." -pattern = "\\.search_s\\s*\\(.*f[\"']" -file_pattern = "*.py" +[[taint_source]] +id = "TS009" +description = "CLI argument via click — user-controlled input." +function_call = "click.argument" +taint_target = "return" -[[rule]] -id = "PY105" -description = "Potential XSS vulnerability with mark_safe or Markup." -severity = "Medium" -remediation = "Ensure that data passed to 'mark_safe' or 'Markup' is from a trusted source or has been properly sanitized." 
-pattern = "(mark_safe|Markup)\\s*\\(" -file_pattern = "*.py" +[[taint_source]] +id = "TS010" +description = "sys.argv — raw command-line arguments, user-controlled." +function_call = "sys.argv" +taint_target = "return" -[[rule]] -id = "PY106" -description = "Use of subprocess.run with shell=True is a command injection risk." -severity = "High" -remediation = "Avoid shell=True with subprocess.run. Pass commands as a list instead of a string." -ast_match = "Call(func.value.id=subprocess, func.attr=run)" -file_pattern = "*.py" +# HTTP CLIENT RESPONSE SOURCES +# Data received from external HTTP APIs is attacker-controlled when the API +# server is compromised or a MITM attack is in progress. -[[rule]] -id = "PY107" -description = "Unsafe deserialization with 'yaml.load'." -severity = "High" -remediation = "Use 'yaml.safe_load()' instead of 'yaml.load()'." -ast_match = "Call(func.value.id=yaml, func.attr=load)" -file_pattern = "*.py" -# Do not flag when SafeLoader or BaseLoader is explicitly passed -exclude_pattern = "Loader\\s*=\\s*(yaml\\.)?(Safe|Base)Loader" +[[taint_source]] +id = "TS011" +description = "HTTP response streaming line iterator — network data is tainted." +function_call = ".iter_lines" +taint_target = "return" +# Leading dot matches any receiver: s.iter_lines(), response.iter_lines() + +[[taint_source]] +id = "TS012" +description = "HTTP response streaming text iterator — network data is tainted." +function_call = ".iter_text" +taint_target = "return" + +[[taint_source]] +id = "TS013" +description = "HTTP response streaming bytes/raw iterator — network data is tainted." +function_call = ".iter_bytes" +taint_target = "return" + +[[taint_source]] +id = "TS013B" +description = "HTTP response raw chunk iterator." +function_call = ".iter_raw" +taint_target = "return" + +[[taint_source]] +id = "TS014" +description = "HTTP response .json() method on any response object — parsed API data is tainted." 
+function_call = ".json" +taint_target = "return" +# Matches: local_run.json(), response.json(), res.json(), new_api_call().json() +# Does NOT match: json.loads(), json.dumps() (those have 'json' as module prefix, not method) + +[[taint_source]] +id = "TS015" +description = "marshal.loads() returns a deserialized Python code object — treat as dangerous taint." +function_call = "marshal.loads" +taint_target = "return" +# The deserialized code object is dangerous bytecode from an untrusted source. +# Any function created from it (FunctionType, exec) should be flagged. +# Works with DESER723 (pattern) and SK_DESER724 (taint sink for FunctionType). # ------------------------------------------- -# SECTION: Cryptographic Failures (OWASP A02:2021) +# SECTION: Taint Sinks # ------------------------------------------- -[[rule]] -id = "PY201" -description = "Use of weak hashing algorithm MD5." -severity = "Medium" -remediation = "Use a stronger hashing algorithm like SHA-256 or a password-specific hashing function like bcrypt." -ast_match = "Call(func.value.id=hashlib, func.attr=md5)" -file_pattern = "*.py" +[[taint_sink]] +id = "SK001" +vulnerability_id = "PY102" +description = "Data is passed to a command execution function." +function_call = "subprocess.run" +vulnerable_parameter_index = 0 +triggers_on = "shell_injectable" -[[rule]] -id = "PY202" -description = "Use of broken hashing algorithm SHA1." -severity = "Medium" -remediation = "Use a stronger hashing algorithm like SHA-256." -ast_match = "Call(func.value.id=hashlib, func.attr=sha1)" -file_pattern = "*.py" +[[taint_sink]] +id = "SK001B" +vulnerability_id = "PY102" +description = "User-controlled command string passed to asyncio create_subprocess_shell()." +function_call = "create_subprocess_shell" +vulnerable_parameter_index = 0 +is_method = false +triggers_on = "shell_injectable" -[[rule]] -id = "PY203" -description = "Use of insecure SSL/TLS protocol version." 
-severity = "High" -remediation = "Use 'ssl.PROTOCOL_TLS' or higher. Avoid SSLv2, SSLv3, and TLSv1.0/1.1." -pattern = "ssl\\.PROTOCOL_(SSLv2|SSLv3|TLSv1|TLSv1_1)" -file_pattern = "*.py" +[[taint_sink]] +id = "SK001C" +vulnerability_id = "PY102" +description = "User-controlled args passed to asyncio create_subprocess_exec()." +function_call = "create_subprocess_exec" +vulnerable_parameter_index = 0 +is_method = false +triggers_on = "shell_injectable" -[[rule]] -id = "PY204" -description = "Use of the 'pycrypto' library is discouraged due to known vulnerabilities." -severity = "High" -remediation = "Migrate from 'pycrypto' to a more secure and actively maintained library like 'pycryptodome'." -pattern = "from\\s+Crypto|import\\s+Crypto" -file_pattern = "*.py" +[[taint_sink]] +id = "SK002" +vulnerability_id = "GETATTR828" +description = "Tainted attribute name passed to getattr() — attacker controls which attribute is accessed." +function_call = "getattr" +vulnerable_parameter_index = 1 -[[rule]] -id = "PY205" -description = "Use of PyNaCl with low-level functions can be insecure if misused." -severity = "Low" -confidence = "Low" -remediation = "Prefer using high-level APIs like 'Box' and 'SecretBox' unless low-level functions are explicitly required and understood." -pattern = "nacl\\.low_level" -file_pattern = "*.py" +[[taint_sink]] +id = "SK003" +vulnerability_id = "OPEN1149" +description = "Tainted file path passed to open() — attacker may read/write arbitrary files." +function_call = "open" +vulnerable_parameter_index = 0 -# ------------------------------------------- -# SECTION: Insecure Deserialization & Design (OWASP A08:2021) -# ------------------------------------------- +[[taint_sink]] +id = "SK004" +vulnerability_id = "PY103" +description = "Tainted command passed to os.system()." 
+function_call = "os.system" +vulnerable_parameter_index = 0 +triggers_on = "shell_injectable" -[[rule]] -id = "PY002" -description = "Use of 'pickle.loads' for deserialization can lead to remote code execution." -severity = "High" -remediation = "Use a safer serialization format like JSON if deserializing untrusted data." -ast_match = "Call(func.value.id=pickle, func.attr=loads)" -file_pattern = "*.py" +[[taint_sink]] +id = "SK005" +vulnerability_id = "SETATTR831" +description = "Tainted attribute name passed to setattr() — attacker writes arbitrary object attributes." +function_call = "setattr" +vulnerable_parameter_index = 1 -[[rule]] -id = "PY301" -description = "Use of 'pickle.load' for deserialization can lead to remote code execution." -severity = "High" -remediation = "Use a safer serialization format like JSON if deserializing untrusted data." -ast_match = "Call(func.attr=load, func.value.id=pickle)" -file_pattern = "*.py" +[[taint_sink]] +id = "SK006" +vulnerability_id = "DELATTR834" +description = "Tainted attribute name passed to delattr() — attacker deletes arbitrary object attributes." +function_call = "delattr" +vulnerable_parameter_index = 1 -[[rule]] -id = "PY302" -description = "Use of 'yaml.load()' is insecure. Use 'yaml.safe_load()'." -severity = "High" -remediation = "Always use 'yaml.safe_load()' to prevent arbitrary code execution from malicious YAML." -pattern = "^\\s*[^#]*yaml\\.load" # This regex ignores comment lines -file_pattern = "*.py" -# Do not flag when SafeLoader or safe_load is used -exclude_pattern = "Loader\\s*=\\s*(yaml\\.)?(Safe|Base)Loader|yaml\\.safe_load" +[[taint_sink]] +id = "SK007" +vulnerability_id = "SER522" +description = "Tainted format/queryset arg[0] to serializer." +function_call = "serialize" +vulnerable_parameter_index = 0 -[[rule]] -id = "PY303" -description = "XML parsing with 'xml.etree.ElementTree' is vulnerable to XML bombs." 
-severity = "High" -remediation = "Use 'defusedxml.ElementTree' to parse untrusted XML data safely." -pattern = "xml\\.etree\\.ElementTree\\.(parse|fromstring)" -file_pattern = "*.py" +[[taint_sink]] +id = "SK007B" +vulnerability_id = "SER522" +description = "Tainted data object (arg[1]) passed to serializer." +function_call = "serialize" +vulnerable_parameter_index = 1 -[[rule]] -id = "PY304" -description = "Insecure temporary file creation may lead to race conditions." -severity = "Medium" -remediation = "Use 'tempfile.mkstemp()' instead of 'tempfile.mktemp()' for secure temporary file creation." -pattern = "tempfile\\.mktemp" -file_pattern = "*.py" +[[taint_sink]] +id = "SK008" +vulnerability_id = "RAND810" +description = "Tainted seed passed to random.seed() — predictable PRNG output." +function_call = "random.seed" +vulnerable_parameter_index = 0 -[[rule]] -id = "PY305" -description = "Use of exec() enables arbitrary code execution" -severity = "Critical" -ast_match = "Call(func.id=exec)" +[[taint_sink]] +id = "SK009" +vulnerability_id = "FORMAT864" +description = "Tainted format string used as template in .format() — SSTI-like injection." +function_call = "format" +is_method = true +vulnerable_receiver = true +# Only fires when the FORMAT STRING ITSELF is tainted (receiver = the template). +# Tainted ARGUMENTS to .format() are not themselves dangerous — the receiver +# controls the template structure. Removing vulnerable_parameter_index prevents +# FPs from os.replace(), code.replace(), node.replace() and similar APIs. -[[rule]] -id = "PY306" -description = "Unsafe pickle.loads() can execute arbitrary code" -severity = "High" -ast_match = "Call(func.value.id=pickle, func.attr=loads)" +[[taint_sink]] +id = "SK010" +vulnerability_id = "REPLACE879" +description = "Tainted first arg (search string) in .replace() — filter bypass possible." 
+function_call = "replace" +vulnerable_parameter_index = 0 +is_method = true +vulnerable_receiver = false + +[[taint_sink]] +id = "SK010B" +vulnerability_id = "REPLACE879" +description = "Tainted second arg (replacement string) in .replace() — injection via replacement." +function_call = "replace" +vulnerable_parameter_index = 1 +is_method = true +vulnerable_receiver = false + +[[taint_sink]] +id = "SK011" +vulnerability_id = "TRANSLATE912" +description = "Tainted translation table in .translate() — sanitization bypass." +function_call = "translate" +vulnerable_parameter_index = 0 +is_method = true +vulnerable_receiver = false # ------------------------------------------- -# SECTION: Security Misconfiguration (OWASP A05:2021) +# SECTION: A_SINK rules — attribute/object inspection # ------------------------------------------- -[[rule]] -id = "G401" -description = "Flask app is running with the development server in a non-debug context." -severity = "Medium" -confidence = "Low" -remediation = "Use a production-ready WSGI server like Gunicorn or uWSGI instead of 'app.run()'." -pattern = "app\\.run\\(host=.*0\\.0\\.0\\.0" -file_pattern = "*.py" - -[[rule]] -id = "G402" -description = "Django DEBUG mode is enabled in a settings file." -severity = "High" -remediation = "Ensure DEBUG is set to False in production settings." -pattern = "^\\s*DEBUG\\s*=\\s*True" -file_pattern = "*settings*.py" +[[taint_sink]] +id = "SK012" +vulnerability_id = "HASATTR837" +description = "Tainted attribute name to hasattr() — attacker probes object's attributes." +function_call = "hasattr" +vulnerable_parameter_index = 1 -[[rule]] -id = "G403" -description = "Flask DEBUG mode is enabled." -severity = "High" -remediation = "Ensure app.debug is False or the DEBUG config variable is False in production." 
-pattern = "app\\.run\\(.*debug=True" -file_pattern = "*.py" +[[taint_sink]] +id = "SK013" +vulnerability_id = "VARS840" +description = "Tainted object to vars() — attacker dumps object's internal dict." +function_call = "vars" +vulnerable_parameter_index = 0 -[[rule]] -id = "G404" -description = "Django's CSRF protection appears to be disabled globally." -severity = "Critical" -remediation = "Ensure 'django.middleware.csrf.CsrfViewMiddleware' is active in your MIDDLEWARE setting." -pattern = "#.*CsrfViewMiddleware" # Simple check for commented-out middleware -file_pattern = "*settings*.py" +[[taint_sink]] +id = "SK014" +vulnerability_id = "DIR849" +description = "Tainted object to dir() — attacker enumerates object attributes." +function_call = "dir" +vulnerable_parameter_index = 0 -[[rule]] -id = "G405" -description = "Requests made without certificate verification." -severity = "High" -remediation = "Remove 'verify=False' from requests calls to prevent man-in-the-middle attacks." -ast_match = "Call(keywords.*.arg=verify, keywords.*.value.value=False)" -file_pattern = "*.py" +# SK015 (CALLABLE1131) removed — rule disabled, sink caused downstream FP propagation # ------------------------------------------- -# SECTION: Hardcoded Secrets (OWASP A07:2021) +# A_SINK — encoding / low-level byte operations # ------------------------------------------- -[[rule]] -id = "G101" -description = "Hardcoded password or secret detected." -severity = "High" -confidence = "Medium" -remediation = "Store credentials in environment variables or a secrets management system." -pattern = "(?i)(password|secret|api_key|token|authkey|bearer|cred|credentials)\\s*[:=]\\s*[\"']\\w{8,}[\"']" -file_pattern = "*.py" +# SK016 (BYTES1005) removed — rule disabled, sink caused downstream FP propagation -[[rule]] -id = "G102" -description = "Hardcoded private key detected." -severity = "Critical" -confidence = "High" -remediation = "Load private keys from a secure, encrypted file or secrets manager." 
-pattern = "-----BEGIN (RSA|EC|OPENSSH|PGP) PRIVATE KEY-----" - -[[rule]] -id = "G103" -description = "Use of a blank password for a user or service." -severity = "High" -remediation = "Ensure all users and service accounts have strong, non-empty passwords." -pattern = "(?i)(password|passwd|pass)\\s*[:=]\\s*[\"']\\s*[\"']" -file_pattern = "*.py" - -[[rule]] -id = "G104" -description = "JWT secret is hardcoded." -severity = "Critical" -remediation = "Load JWT secrets from environment variables or a secrets management system." -pattern = "(?i)(jwt_secret|jwt_key)\\s*[:=]\\s*[\"'].+[\"']" -file_pattern = "*.py" - -# ------------------------------------------- -# SECTION: IaC and Configuration File Security -# ------------------------------------------- - -[[rule]] -id = "DKR001" -description = "Password or secret found in Dockerfile ENV instruction." -severity = "High" -remediation = "Use build-time arguments (ARG) with the --secret flag or a secrets management tool." -pattern = "(?i)ENV\\s+(PASS|PASSWORD|SECRET|TOKEN|API_KEY)\\s+" -file_pattern = "Dockerfile" - -[[rule]] -id = "DKR002" -description = "Use of 'latest' tag for base image is not recommended for production." -severity = "Low" -remediation = "Pin base images to a specific version digest for reproducible and secure builds." -pattern = "FROM\\s+\\w+:latest" -file_pattern = "Dockerfile" - -[[rule]] -id = "DKR003" -description = "Exposing Docker daemon socket inside a container is a security risk." -severity = "Critical" -remediation = "Avoid mounting '/var/run/docker.sock' into containers." -pattern = "/var/run/docker\\.sock" -file_pattern = "docker-compose*.y*ml" - -[[rule]] -id = "K8S001" -description = "Kubernetes container running in privileged mode." -severity = "Critical" -remediation = "Set 'securityContext.privileged' to 'false' or remove it." 
-pattern = "privileged:\\s*true" -file_pattern = "*.y*ml" +[[taint_sink]] +id = "SK017" +vulnerability_id = "BYTEARRAY1008" +description = "Tainted data passed to bytearray() — mutable buffer from untrusted input." +function_call = "bytearray" +vulnerable_parameter_index = 0 -[[rule]] -id = "K8S002" -description = "Kubernetes container allows privilege escalation." -severity = "High" -remediation = "Explicitly set 'securityContext.allowPrivilegeEscalation' to 'false'." -pattern = "allowPrivilegeEscalation:\\s*true" -file_pattern = "*.y*ml" +# SK018 (MEMORYVIEW1011) removed — rule disabled -[[rule]] -id = "TF001" -description = "Terraform AWS S3 bucket is publicly readable." -severity = "Critical" -remediation = "Set the 'acl' property of 'aws_s3_bucket' to 'private', not 'public-read' or 'public-read-write'." -pattern = "acl\\s*=\\s*\"(public-read|public-read-write)\"" -file_pattern = "*.tf" +[[taint_sink]] +id = "SK019" +vulnerability_id = "ORD1014" +description = "Tainted character to ord() — extracts code point from untrusted input." +function_call = "ord" +vulnerable_parameter_index = 0 -[[rule]] -id = "CFG001" -description = "AWS credentials detected in configuration file." -severity = "Critical" -remediation = "Use IAM roles or environment variables for AWS credentials." -pattern = "(?i)(aws_access_key_id|aws_secret_access_key)\\s*=\\s*[A-Za-z0-9/+=]{20,}" -file_pattern = "*.ini" +[[taint_sink]] +id = "SK020" +vulnerability_id = "CHR1017" +description = "Tainted code point to chr() — generates character from attacker-controlled value." +function_call = "chr" +vulnerable_parameter_index = 0 # ------------------------------------------- -# SECTION: ADDITIONAL SECURITY RULES +# A_SINK — width-based memory exhaustion # ------------------------------------------- -[[rule]] -id = "PY500" -description = "Dynamic code execution using builtins.exec() function." -severity = "High" -confidence = "Medium" -remediation = "Avoid dynamic code execution. 
Consider safer alternatives or validate input thoroughly." -ast_match = "Call(func.attr=exec, func.value.id=builtins)" -file_pattern = "*.py" - -[[rule]] -id = "SEC501" -description = "Generic exec pattern detected in code." -severity = "Medium" -confidence = "Medium" -remediation = "Dynamic code execution can be dangerous. Validate all inputs and consider safer alternatives." -pattern = "\\bexec\\b\\s*\\(" -file_pattern = "*.py" - -[[rule]] -id = "SEC502" -description = "Subprocess Popen with shell=True detected." -severity = "Medium" -confidence = "Medium" -remediation = "Using shell=True with subprocess.Popen can lead to command injection. Use argument lists instead." -ast_match = "Call(func.value.id=subprocess, func.attr=Popen, keywords.*.arg=shell, keywords.*.value.value=True)" -file_pattern = "*.py" - -[[rule]] -id = "PY503" -description = "Shell command execution with user-controllable input." -severity = "Low" -confidence = "Medium" -remediation = "Avoid using shell=True with subprocess calls. Use argument arrays for safer command execution." -pattern = "subprocess\\.(Popen|call)\\(.*shell\\s*=\\s*True" -file_pattern = "*.py" - -[[rule]] -id = "SEC504" -description = "Reading sensitive system file /etc/passwd." -severity = "Low" -remediation = "Accessing system password files should be done with proper authorization checks." -pattern = "open\\s*\\(\\s*['\\\"]/etc/passwd" -file_pattern = "*.sh" - -[[rule]] -id = "PY505" -description = "File reading operation using open().read() pattern." -severity = "High" -remediation = "Ensure file access controls and validate file paths to prevent unauthorized access." -ast_match = "Attribute(attr=read, value.func.id=open)" -file_pattern = "*.py" - -[[rule]] -id = "JS506" -description = "JavaScript eval() function usage detected." -severity = "Medium" -remediation = "Avoid using eval() in JavaScript. Use JSON.parse() for data or safer alternatives." 
-pattern = "eval\\s*\\(" -file_pattern = "*.js" - -[[rule]] -id = "PY507" -description = "Method call to exec function detected." -severity = "Critical" -remediation = "Method-based exec calls can execute arbitrary code. Validate inputs and use safer alternatives." -pattern = "\\.exec\\s*\\(" -file_pattern = "*.py" - -[[rule]] -id = "WEB508" -description = "Insecure Content Security Policy with unsafe-inline." -severity = "Medium" -confidence = "Medium" -remediation = "Remove unsafe-inline from CSP directives and use nonces or hashes instead." -pattern = "Content-Security-Policy\\s*:\\s*.*unsafe-inline" -file_pattern = "*.conf" +[[taint_sink]] +id = "SK021" +vulnerability_id = "CENTER927" +description = "Tainted width in .center() — attacker may allocate excessive memory." +function_call = "center" +vulnerable_parameter_index = 0 +is_method = true +vulnerable_receiver = false -[[rule]] -id = "JS509" -description = "Dynamic function creation using Function constructor." -severity = "Low" -remediation = "Function constructor can execute arbitrary code. Use predefined functions or validate inputs." -pattern = "new\\s+Function\\s*\\(" -file_pattern = "*.js" +[[taint_sink]] +id = "SK022" +vulnerability_id = "LJUST930" +description = "Tainted width in .ljust() — attacker may allocate excessive memory." +function_call = "ljust" +vulnerable_parameter_index = 0 +is_method = true +vulnerable_receiver = false -[[rule]] -id = "CFG510" -description = "AWS access key detected in configuration." -severity = "Low" -remediation = "Store AWS credentials securely using IAM roles or environment variables." -pattern = "aws_access_key_id\\s*[:=]\\s*['\\\"][A-Za-z0-9/+=]{16,}" -file_pattern = "*.ini" +[[taint_sink]] +id = "SK023" +vulnerability_id = "RJUST933" +description = "Tainted width in .rjust() — attacker may allocate excessive memory." 
+function_call = "rjust" +vulnerable_parameter_index = 0 +is_method = true +vulnerable_receiver = false -[[rule]] -id = "PY511" -description = "JSON deserialization without validation." -severity = "Low" -confidence = "Low" -remediation = "json.loads() is safe from code execution. Only flag if the result feeds into eval/exec/pickle." -ast_match = "Call(func.value.id=json, func.attr=loads)" -file_pattern = "*.py" +# SK024-SK028 removed — associated rules disabled (RANGE1056, JOIN876, SORTED1074, SUM1080, SET1047) +# These sinks caused downstream FP propagation: disabling the rule but keeping the sink +# continued to taint downstream variables, causing cascading false positives in SQL rules. -[[rule]] -id = "WEB512" -description = "Bearer token in configuration header." -severity = "Medium" -remediation = "Store authentication tokens securely and avoid hardcoding in configuration files." -pattern = "Authorization\\s*:\\s*\\bBearer\\b" -file_pattern = "*.conf" +[[taint_sink]] +id = "SK_PY105" +vulnerability_id = "PY105" +description = "Tainted data passed to mark_safe() — XSS risk if data contains HTML." +function_call = "mark_safe" +vulnerable_parameter_index = 0 +triggers_on = "html_injectable" -[[rule]] -id = "DJG513" -description = "Django CSRF protection bypass detected." -severity = "Low" -remediation = "Do not use csrf_exempt decorator unless absolutely necessary and with proper justification." -pattern = "csrf_exempt" -file_pattern = "*.py" +[[taint_sink]] +id = "SK_PY105B" +vulnerability_id = "PY105" +description = "Tainted data passed to Markup() — XSS risk." +function_call = "Markup" +vulnerable_parameter_index = 0 +triggers_on = "html_injectable" -[[rule]] -id = "WEB514" -description = "X-Frame-Options set to allow framing." -severity = "Medium" -remediation = "Set X-Frame-Options to DENY or SAMEORIGIN to prevent clickjacking attacks." 
-pattern = "X-Frame-Options\\s*:\\s*ALLOW" -file_pattern = "*.conf" +[[taint_sanitizer]] +id = "SN001" +description = "Shell argument escaping — transforms to ShellSanitized instead of clearing." +function_call = "shlex.quote" +transforms_to = "ShellSanitized" +# shlex.quote converts HttpRequest → ShellSanitized: +# - PY102/SHELL sinks (triggers_on = "shell_injectable"): do NOT fire — shlex.quote is valid mitigation +# - PATH813/OPEN1149/FSTRING867/SSRF (triggers_on = "all"): STILL fire — quoted path still traverses +# Result: `cat {shlex.quote(tainted_path)} | bash` correctly fires FSTRING867 +# `subprocess.run(["bash", shlex.quote(arg)])` correctly does NOT fire PY102 -[[rule]] -id = "PY515" -description = "Code compilation using compile() function." -severity = "High" -remediation = "Dynamic code compilation can be dangerous. Validate all inputs and consider static alternatives." -ast_match = "Call(func.attr=compile)" -file_pattern = "*.py" -# re.compile() and sql compiler.compile() are not Python code execution -exclude_pattern = "re\\.compile|regex\\.compile|compiler\\.compile|self\\.compile" +[[taint_sanitizer]] +id = "SN002" +description = "HTML escaping — transforms to HtmlSanitized." +function_call = "escape" +transforms_to = "HtmlSanitized" -[[rule]] -id = "DOM516" -description = "DOM manipulation using document.write()." -severity = "Medium" -remediation = "Use safer DOM manipulation methods like createElement() and appendChild()." -pattern = "document\\.write\\s*\\(" -file_pattern = "*.js" +[[taint_sanitizer]] +id = "SN003" +description = "format_html safely escapes for HTML — transforms to HtmlSanitized." +function_call = "format_html" +transforms_to = "HtmlSanitized" -[[rule]] -id = "XSS517" -description = "InnerHTML assignment detected." -severity = "Low" -remediation = "Using innerHTML can lead to XSS vulnerabilities. Use textContent or createElement instead." 
-pattern = "innerHTML\\s*=" -file_pattern = "*.html" +[[taint_sanitizer]] +id = "SN004" +description = "conditional_escape for HTML — transforms to HtmlSanitized." +function_call = "conditional_escape" +transforms_to = "HtmlSanitized" -[[rule]] -id = "PY518" -description = "Subprocess execution with shell parameter enabled." -severity = "High" -confidence = "Medium" -remediation = "Disable shell parameter or validate all inputs to prevent command injection." -ast_match = "Call(func.value.id=subprocess, func.attr=Popen, keywords.*.arg=shell, keywords.*.value.value=True)" -file_pattern = "*.py" +[[taint_sanitizer]] +id = "SN005" +description = "DB identifier quoting — transforms to SqlSanitized." +function_call = "quote_name" +transforms_to = "SqlSanitized" -[[rule]] -id = "TIME519" -description = "JavaScript setTimeout with string parameter." -severity = "Low" -remediation = "Pass function references to setTimeout instead of string code." -pattern = "setTimeout\\s*\\(\\s*['\\\"]" -file_pattern = "*.js" +[[taint_sanitizer]] +id = "SN006" +description = "DB identifier quoting via ops — transforms to SqlSanitized." +function_call = "ops.quote_name" +transforms_to = "SqlSanitized" -[[rule]] -id = "DB520" -description = "Mongoose query construction detected." -severity = "Medium" -remediation = "Use parameterized queries to prevent NoSQL injection attacks." -pattern = "mongoose\\.query\\s*\\(" -file_pattern = "*.js" +[[taint_sanitizer]] +id = "SN_SAFE_URL001" +description = "Django is_safe_url() validates the URL host against an allowed-hosts list — prevents open redirect." +function_call = "is_safe_url" -[[rule]] -id = "SER522" -description = "Object serialization function detected." -severity = "Low" -remediation = "Ensure serialized data comes from trusted sources to prevent deserialization attacks." 
-pattern = "\\bserialize\\b\\s*\\(" -file_pattern = "*.py" +[[taint_sanitizer]] +id = "SN_SAFE_URL002" +description = "Django url_has_allowed_host_and_scheme() validates URL host and scheme — prevents open redirect." +function_call = "url_has_allowed_host_and_scheme" -[[rule]] -id = "NODE525" -description = "Node.js child_process module import." -severity = "Low" -remediation = "Child process execution can be dangerous. Validate all inputs and limit functionality." -pattern = "require\\s*\\(.*child_process" -file_pattern = "*.js" +# ------------------------------------------- +# SECTION: SQL Injection Taint Sinks +# ------------------------------------------- -[[rule]] -id = "FILE526" -description = "File read operation using open attribute access." -severity = "Medium" -remediation = "Implement proper file access controls and validate file paths." -ast_match = "Attribute(attr=read, value.id=open)" -file_pattern = "*.py" +[[taint_sink]] +id = "SK_SQL001" +vulnerability_id = "PY101" +description = "Tainted SQL string passed to cursor.execute() — SQL injection risk." +function_call = "execute" +vulnerable_parameter_index = 0 +is_method = true +triggers_on = "sql_injectable" -[[rule]] -id = "PERM527" -description = "Setting overly permissive file permissions (777)." -severity = "High" -remediation = "Use more restrictive permissions. Consider 644 for files and 755 for directories." -pattern = "chmod\\s+777" -file_pattern = "*.sh" +[[taint_sink]] +id = "SK_SQL002" +vulnerability_id = "PY101" +description = "Tainted SQL string passed to cursor.executemany() — SQL injection risk." +function_call = "executemany" +vulnerable_parameter_index = 0 +is_method = true +triggers_on = "sql_injectable" -[[rule]] -id = "FILE528" -description = "Direct access to system password file." -severity = "High" -confidence = "Medium" -remediation = "Accessing /etc/passwd should be done through proper system APIs with authorization." 
-pattern = "open\\s*\\(\\s*['\\\"]/etc/passwd" -file_pattern = "*.py" +[[taint_sink]] +id = "SK_PY507" +vulnerability_id = "PY507" +description = "Tainted data passed to .exec() method — attacker may inject code or SQL." +function_call = "exec" +vulnerable_parameter_index = 0 +is_method = true +triggers_on = "sql_injectable" -[[rule]] -id = "TEMP529" -description = "Insecure temporary file creation using mktemp -u." -severity = "Low" -remediation = "Use mktemp without -u flag or mkstemp for secure temporary file creation." -pattern = "mktemp\\s+-u" -file_pattern = "*.sh" +[[taint_sink]] +id = "SK_MKDIR001" +vulnerability_id = "PATH813" +description = "Tainted path used in mkdir() — attacker can create directories at arbitrary locations." +function_call = "mkdir" +vulnerable_receiver = true +is_method = true -[[rule]] -id = "SSL531" -description = "SSL/TLS certificate verification disabled." -severity = "Medium" -remediation = "Enable certificate verification to prevent man-in-the-middle attacks." -pattern = "verify\\s*:\\s*false" -file_pattern = "*.y*ml" +[[taint_sink]] +id = "SK_MAKEDIRS001" +vulnerability_id = "PATH813" +description = "Tainted path used in os.makedirs() — attacker can create directories at arbitrary locations." +function_call = "os.makedirs" +vulnerable_parameter_index = 0 +is_method = false -[[rule]] -id = "CRYPTO532" -description = "Deprecated SSL/TLS protocol version usage." -severity = "Medium" -remediation = "Use TLS 1.2 or higher. Avoid deprecated SSL and early TLS versions." -pattern = "ssl\\.PROTOCOL_(SSLv2|SSLv3|TLSv1|TLSv1_1)" -file_pattern = "*.py" +[[taint_sink]] +id = "SK_SYMLINK001" +vulnerability_id = "SYMLINK816" +description = "User-controlled path as symlink source — attacker can create links to arbitrary files." +function_call = "os.symlink" +vulnerable_parameter_index = 0 +is_method = false -[[rule]] -id = "PERM568" -description = "File permission change to world-writable detected." 
-severity = "High" -confidence = "Medium" -remediation = "Avoid setting world-writable permissions. Use more restrictive file access controls." -pattern = "chmod\\s+777" -file_pattern = "*.sh" +[[taint_sink]] +id = "SK_DESER724" +vulnerability_id = "DESER724" +description = "Deserialized code object passed to types.FunctionType() — creates callable from untrusted bytecode." +function_call = "types.FunctionType" +vulnerable_parameter_index = 0 +is_method = false +# Dotted path uses contains(): matches types.FunctionType AND python_types.FunctionType +# (python_TYPES contains "types" as suffix → "python_types.FunctionType".contains("types.FunctionType") = true) -[[rule]] -id = "WEB575" -description = "Content Security Policy allows unsafe inline execution." -severity = "High" -confidence = "Medium" -remediation = "Remove unsafe-inline from CSP directives and implement nonce-based or hash-based CSP." -pattern = "Content-Security-Policy\\s*:\\s*.*unsafe-inline" -file_pattern = "*.conf" +[[taint_sink]] +id = "SK_DESER724B" +vulnerability_id = "DESER724" +description = "Deserialized code object passed to FunctionType() (direct import) — creates callable from untrusted bytecode." +function_call = "FunctionType" +vulnerable_parameter_index = 0 +is_method = false +# Matches: from types import FunctionType; FunctionType(code, ...) -[[rule]] -id = "SQL586" -description = "String formatting in SQL query execution." -severity = "Critical" -confidence = "Medium" -remediation = "Use parameterized queries instead of string formatting to prevent SQL injection." -pattern = "\\.(execute|executemany)\\s*\\(\\s*f?[\"'].*SELECT.*(FROM|UPDATE|DELETE).*%s.*[\"']" -file_pattern = "*.py" +[[taint_sink]] +id = "SK_SSTI001" +vulnerability_id = "SSTI001" +description = "Tainted string passed to Flask render_template_string() — Jinja2 SSTI → RCE." 
+function_call = "render_template_string" +vulnerable_parameter_index = 0 +is_method = false -[[rule]] -id = "FUNC596" -description = "JavaScript Function constructor usage." -severity = "Critical" -confidence = "Medium" -remediation = "Avoid Function constructor as it can execute arbitrary code. Use predefined functions." -pattern = "new\\s+Function\\s*\\(" -file_pattern = "*.js" +# SK_SSTI002 removed: from_string() is too generic — fires on DeviceSpec.from_string(), etc. -[[rule]] -id = "SHELL602" -description = "Shell command execution with dynamic arguments." -severity = "High" -confidence = "Medium" -remediation = "Use subprocess with argument arrays instead of shell command strings." -pattern = "subprocess\\.(Popen|call)\\(.*shell\\s*=\\s*True" -file_pattern = "*.py" +[[taint_sink]] +id = "SK_ORMRAW001" +vulnerability_id = "ORM002" +description = "Tainted SQL string passed to Django QuerySet.raw() — SQL injection via ORM." +function_call = "raw" +vulnerable_parameter_index = 0 +is_method = true +triggers_on = "sql_injectable" -[[rule]] -id = "CODE607" -description = "Content Security Policy with unsafe inline directives." -severity = "High" -confidence = "Medium" -remediation = "Implement strict CSP without unsafe-inline to prevent XSS attacks." -pattern = "Content-Security-Policy\\s*:\\s*.*unsafe-inline" -file_pattern = "*.conf" +[[taint_sink]] +id = "SK_ORMORDER001" +vulnerability_id = "ORM002" +description = "User-controlled field name in QuerySet.order_by() — Django ORM injection (CVE-2021-35042)." +function_call = "order_by" +vulnerable_parameter_index = 0 +is_method = true -[[rule]] -id = "JSON612" -description = "JSON parsing without input validation." -severity = "Low" -confidence = "Low" -remediation = "json.loads() is safe from code execution. Only flag if result feeds into eval/exec/pickle." 
-ast_match = "Call(func.value.id=json, func.attr=loads)" -file_pattern = "*.py" +[[taint_sink]] +id = "SK_ORMEXTRA001" +vulnerability_id = "ORM002" +description = "User-controlled SQL fragments in QuerySet.extra() — SQL injection via ORM." +function_call = "extra" +vulnerable_parameter_index = 0 +is_method = true +triggers_on = "sql_injectable" -[[rule]] -id = "YAML619" -description = "Shell execution in subprocess with dynamic input." -severity = "High" -confidence = "Medium" -remediation = "Use argument lists with subprocess to prevent command injection attacks." -pattern = "subprocess\\.(Popen|call)\\(.*shell\\s*=\\s*True" -file_pattern = "*.py" +[[taint_sink]] +id = "SK_DESER725" +vulnerability_id = "DESER725" +description = "User-controlled data passed to jsonpickle.decode() — arbitrary Python object deserialization → RCE." +function_call = "jsonpickle.decode" +vulnerable_parameter_index = 0 +is_method = false -[[rule]] -id = "SHELL631" -description = "SQL injection vulnerability in database query." -severity = "Critical" -confidence = "Medium" -remediation = "Use parameterized queries with placeholders instead of string concatenation." -pattern = "\\.(execute|executemany)\\s*\\(\\s*f?[\"'].*SELECT.*(FROM|UPDATE|DELETE).*%s.*[\"']" -file_pattern = "*.py" +[[taint_sink]] +id = "SK_DESER726" +vulnerability_id = "DESER726" +description = "User-controlled data passed to dill.loads() — arbitrary Python object deserialization → RCE." +function_call = "dill.loads" +vulnerable_parameter_index = 0 +is_method = false -[[rule]] -id = "JS635" -description = "Dynamic function creation in JavaScript." -severity = "High" -confidence = "Medium" -remediation = "Avoid Function constructor to prevent code injection. Use predefined function references." 
-pattern = "new\\s+Function\\s*\\(" -file_pattern = "*.js" +[[taint_sink]] +id = "SK_DESER_JOBLIB" +vulnerability_id = "DESER_JOBLIB001" +description = "User-controlled path passed to joblib.load() — arbitrary Python object deserialization → RCE." +function_call = "joblib.load" +vulnerable_parameter_index = 0 +is_method = false -[[rule]] -id = "CSP640" -description = "Unsafe Content Security Policy configuration." -severity = "High" -confidence = "Medium" -remediation = "Configure CSP without unsafe-inline and unsafe-eval directives." -pattern = "Content-Security-Policy\\s*:\\s*.*unsafe-inline" -file_pattern = "*.conf" +[[taint_sink]] +id = "SK_MARKUP001" +vulnerability_id = "PY105" +description = "Tainted string passed to jinja2.Markup() — bypasses Jinja2 auto-escaping, XSS risk." +function_call = "Markup" +vulnerable_parameter_index = 0 +is_method = false -[[rule]] -id = "SHELL645" -description = "Dynamic code compilation with user input." -severity = "High" -confidence = "Medium" -remediation = "Avoid compile() function with untrusted input. Use static code analysis instead." -ast_match = "Call(func.attr=compile)" -file_pattern = "*.py" -exclude_pattern = "re\\.compile|regex\\.compile|compiler\\.compile|self\\.compile" +[[taint_sink]] +id = "SK_ORM_VALUES001" +vulnerability_id = "ORM002" +description = "User-controlled field name in QuerySet.values() — CVE-2024-42005 Django ORM injection." +function_call = "values" +vulnerable_parameter_index = 0 +is_method = true -[[rule]] -id = "PERM650" -description = "SQL query with potential injection vulnerability." -severity = "Critical" -confidence = "Medium" -remediation = "Implement prepared statements and parameterized queries to prevent SQL injection." 
-pattern = "\\.(execute|executemany)\\s*\\(\\s*f?[\"'].*SELECT.*(FROM|UPDATE|DELETE).*%s.*[\"']" -file_pattern = "*.py" +[[taint_sink]] +id = "SK_ORM_VALUES_LIST001" +vulnerability_id = "ORM002" +description = "User-controlled field name in QuerySet.values_list() — column name injection." +function_call = "values_list" +vulnerable_parameter_index = 0 +is_method = true -[[rule]] -id = "JS655" -description = "Dynamic function constructor in JavaScript code." -severity = "High" -confidence = "Medium" -remediation = "Replace Function constructor with safer alternatives to prevent code injection." -pattern = "new\\s+Function\\s*\\(" -file_pattern = "*.js" +[[taint_sink]] +id = "SK_PATH_READ001" +vulnerability_id = "PATH813" +description = "Tainted path receiver for read_text() — arbitrary file read via path traversal." +function_call = "read_text" +vulnerable_receiver = true +is_method = true -[[rule]] -id = "SHELL660" -description = "Process execution with shell parameter enabled." -severity = "High" -confidence = "Medium" -remediation = "Use subprocess without shell parameter and pass arguments as a list." -pattern = "subprocess\\.(Popen|call)\\(.*shell\\s*=\\s*True" -file_pattern = "*.py" +[[taint_sink]] +id = "SK_PATH_READ002" +vulnerability_id = "PATH813" +description = "Tainted path receiver for read_bytes() — arbitrary file read via path traversal." +function_call = "read_bytes" +vulnerable_receiver = true +is_method = true -[[rule]] -id = "CSP665" -description = "Insecure Content Security Policy allowing inline scripts." -severity = "High" -confidence = "Medium" -remediation = "Use nonce or hash-based CSP instead of unsafe-inline directive." -pattern = "Content-Security-Policy\\s*:\\s*.*unsafe-inline" -file_pattern = "*.conf" +[[taint_sink]] +id = "SK_PATH_WRITE001" +vulnerability_id = "PATH813" +description = "Tainted path receiver for write_text() — arbitrary file write via path traversal." 
+function_call = "write_text" +vulnerable_receiver = true +is_method = true -[[rule]] -id = "SHELL670" -description = "Code compilation function usage." -severity = "High" -confidence = "Medium" -remediation = "Avoid dynamic code compilation. Consider static analysis or predefined code patterns." -ast_match = "Call(func.attr=compile)" -file_pattern = "*.py" -exclude_pattern = "re\\.compile|regex\\.compile|compiler\\.compile|self\\.compile" +[[taint_sink]] +id = "SK_PATH_WRITE002" +vulnerability_id = "PATH813" +description = "Tainted path receiver for write_bytes() — arbitrary file write via path traversal." +function_call = "write_bytes" +vulnerable_receiver = true +is_method = true -[[rule]] -id = "SHELL675" -description = "Database query with string interpolation." -severity = "Critical" -confidence = "Medium" -remediation = "Use ORM methods or prepared statements instead of string formatting in SQL queries." -pattern = "\\.(execute|executemany)\\s*\\(\\s*f?[\"'].*SELECT.*(FROM|UPDATE|DELETE).*%s.*[\"']" -file_pattern = "*.py" +[[taint_sink]] +id = "SK_PATH_UNLINK001" +vulnerability_id = "PATH813" +description = "Tainted path receiver for unlink() — attacker-controlled file deletion." +function_call = "unlink" +vulnerable_receiver = true +is_method = true -[[rule]] -id = "PERM679" -description = "Subprocess call with shell execution enabled." -severity = "High" -confidence = "Medium" -remediation = "Disable shell parameter in subprocess calls to prevent command injection." -ast_match = "Call(func.value.id=subprocess, func.attr=Popen, keywords.*.arg=shell, keywords.*.value.value=True)" -file_pattern = "*.py" +[[taint_sink]] +id = "SK_SSRF_HTTPX001" +vulnerability_id = "SSRF_001" +description = "User-controlled URL passed to httpx async client — SSRF risk." +function_call = "httpx.AsyncClient.get" +vulnerable_parameter_index = 0 +is_method = false -[[rule]] -id = "DOM683" -description = "DOM write operation using document.write." 
-severity = "High" -confidence = "Medium" -remediation = "Use modern DOM manipulation methods instead of document.write to prevent XSS." -pattern = "document\\.write\\s*\\(" -file_pattern = "*.js" +[[taint_sink]] +id = "SK_SSRF_HTTPX002" +vulnerability_id = "SSRF_001" +description = "User-controlled URL passed to httpx async client — SSRF risk." +function_call = "httpx.AsyncClient.post" +vulnerable_parameter_index = 0 +is_method = false -[[rule]] -id = "SHELL689" -description = "Process creation with shell command execution." -severity = "High" -confidence = "Medium" -remediation = "Use process execution without shell to avoid command injection vulnerabilities." -pattern = "subprocess\\.(Popen|call)\\(.*shell\\s*=\\s*True" -file_pattern = "*.py" +[[taint_sink]] +id = "SK_SSRF_AIOHTTP001" +vulnerability_id = "SSRF_001" +description = "User-controlled URL passed to aiohttp ClientSession.get() — SSRF risk." +function_call = "aiohttp.ClientSession.get" +vulnerable_parameter_index = 0 +is_method = false -[[rule]] -id = "SQL693" -description = "String formatting in database execute statement." -severity = "Critical" -confidence = "Medium" -remediation = "Implement parameterized queries to eliminate SQL injection risks." -pattern = "\\.(execute|executemany)\\s*\\(\\s*f?[\"'].*SELECT.*(FROM|UPDATE|DELETE).*%s.*[\"']" -file_pattern = "*.py" +[[taint_sink]] +id = "SK_SSRF_AIOHTTP002" +vulnerability_id = "SSRF_001" +description = "User-controlled URL passed to aiohttp ClientSession.post() — SSRF risk." +function_call = "aiohttp.ClientSession.post" +vulnerable_parameter_index = 0 +is_method = false -[[rule]] -id = "DOM697" -description = "Direct DOM manipulation using document.write method." -severity = "High" -confidence = "Medium" -remediation = "Use createElement and appendChild methods for safer DOM manipulation." 
-pattern = "document\\.write\\s*\\(" -file_pattern = "*.js" +[[taint_sink]] +id = "SK_TMPL_PATH001" +vulnerability_id = "PATH813" +description = "User-controlled string in Django render() template name — path traversal loads arbitrary templates." +function_call = "render" +vulnerable_parameter_index = 1 +is_method = false -[[rule]] -id = "PERM702" -description = "File permission modification to world-accessible." -severity = "High" -confidence = "Medium" -remediation = "Set appropriate file permissions. Avoid 777 permissions on production systems." -pattern = "chmod\\s+777" -file_pattern = "*.sh" +[[taint_sink]] +id = "SK_IMG_EVAL001" +vulnerability_id = "PY001" +description = "User-controlled expression in PIL.ImageMath.eval() — arbitrary Python code execution (CVE-2023-50447)." +function_call = "ImageMath.eval" +vulnerable_parameter_index = 0 +is_method = false -[[rule]] -id = "NET705" -description = "Network request without SSL certificate verification." -severity = "High" -confidence = "Medium" -remediation = "Enable SSL certificate verification to prevent man-in-the-middle attacks." -pattern = "requests\\.(get|post|put|delete)\\(.*verify\\s*=\\s*False" -file_pattern = "*.py" +# SK_FILE_WRITE001 removed: write() is too generic (HTTP response writes, cache writes, etc.) -[[rule]] -id = "CRYPTO708" -description = "Weak cryptographic key generation detected." -severity = "Medium" -confidence = "Medium" -remediation = "Use cryptographically secure random number generators for key generation." -pattern = "random\\.(randint|random)\\(" -file_pattern = "*.py" +[[taint_sink]] +id = "SK_REDIRECT001" +vulnerability_id = "OPEN_REDIRECT001" +description = "User-controlled URL in Flask redirect() — open redirect / SSRF." +function_call = "redirect" +vulnerable_parameter_index = 0 +is_method = false -[[rule]] -id = "AUTH711" -description = "Authentication bypass using hardcoded credentials." 
-severity = "Critical" -confidence = "High" -remediation = "Implement proper authentication mechanisms without hardcoded credentials." -pattern = "(?i)(username|user)\\s*[:=]\\s*[\"']admin[\"']" -file_pattern = "*.py" +[[taint_sink]] +id = "SK_REDIRECT002" +vulnerability_id = "OPEN_REDIRECT001" +description = "User-controlled URL in Django HttpResponseRedirect() — open redirect." +function_call = "HttpResponseRedirect" +vulnerable_parameter_index = 0 +is_method = false -[[rule]] -id = "XSS714" -description = "Cross-site scripting vulnerability in template rendering." -severity = "High" -confidence = "Medium" -remediation = "Use template engines with automatic escaping or manually escape user input." -pattern = "\\|safe\\b" -file_pattern = "*.html" +[[taint_sink]] +id = "SK_PLAIN_PWD001" +vulnerability_id = "PLAIN_PWD001" +description = "Tainted value stored as 'password' in Django ORM create() — plaintext password stored in database." +function_call = "create" +is_method = true +vulnerable_keyword = "password" -[[rule]] -id = "LDAP717" -description = "LDAP injection vulnerability in search filter." -severity = "High" -confidence = "Medium" -remediation = "Properly escape LDAP filter characters or use parameterized LDAP queries." -pattern = "\\.search\\(.*filter.*%s" -file_pattern = "*.py" +[[taint_sink]] +id = "SK_COOKIE_JAR001" +vulnerability_id = "COOKIE_FILE001" +description = "Attacker-controlled path loaded as cookie jar — cookie injection into HTTP sessions." +function_call = "MozillaCookieJar" +vulnerable_parameter_index = 0 +is_method = false -[[rule]] -id = "XPATH720" -description = "XPath injection vulnerability detected." -severity = "High" -confidence = "Medium" -remediation = "Use parameterized XPath queries or properly escape user input." 
-pattern = "xpath\\(.*%s" -file_pattern = "*.py" +[[taint_sink]] +id = "SK_EXEC_MODULE001" +vulnerability_id = "IMPORT825" +description = "User-controlled path reaches exec_module() — arbitrary code execution via dynamic import." +function_call = "exec_module" +vulnerable_parameter_index = 0 +is_method = true -[[rule]] -id = "DESER723" -description = "Unsafe deserialization of untrusted data." -severity = "Critical" -confidence = "High" -remediation = "Validate and sanitize data before deserialization or use safer formats." -ast_match = "Call(func.value.id=marshal, func.attr=loads)" -file_pattern = "*.py" +[[taint_sink]] +id = "SK_SPEC_FILE001" +vulnerability_id = "IMPORT825" +description = "User-controlled path passed to spec_from_file_location() — loads arbitrary Python file as module." +function_call = "importlib.util.spec_from_file_location" +vulnerable_parameter_index = 1 +is_method = false + +# SSRF sinks — HTTP client functions where the URL argument is tainted +[[taint_sink]] +id = "SK_SSRF001" +vulnerability_id = "SSRF_001" +description = "User-controlled URL in httpx.stream() — SSRF: attacker can redirect to internal services or file:// URIs." +function_call = "httpx.stream" +vulnerable_parameter_index = 1 +is_method = false -[[rule]] -id = "PRIV726" -description = "Privilege escalation through setuid binary execution." -severity = "High" -confidence = "Medium" -remediation = "Avoid executing setuid binaries or implement proper privilege checks." -pattern = "os\\.setuid\\(" -file_pattern = "*.py" +[[taint_sink]] +id = "SK_SSRF002" +vulnerability_id = "SSRF_001" +description = "User-controlled URL in httpx.get() — SSRF risk." +function_call = "httpx.get" +vulnerable_parameter_index = 0 +is_method = false -[[rule]] -id = "RACE729" -description = "Race condition in file operations." -severity = "Medium" -confidence = "Low" -remediation = "Use atomic file operations or proper locking mechanisms." 
-pattern = "os\\.path\\.exists.*open\\(" -file_pattern = "*.py" +[[taint_sink]] +id = "SK_SSRF003" +vulnerability_id = "SSRF_001" +description = "User-controlled URL in httpx.post() — SSRF risk." +function_call = "httpx.post" +vulnerable_parameter_index = 0 +is_method = false -[[rule]] -id = "MEM732" -description = "Memory exhaustion through unbounded data structure." -severity = "Medium" -confidence = "Low" -remediation = "Implement size limits on data structures to prevent memory exhaustion." -pattern = "\\[\\]\\s*\\*\\s*\\w+" -file_pattern = "*.py" +[[taint_sink]] +id = "SK_SSRF004" +vulnerability_id = "SSRF_001" +description = "User-controlled URL in httpx.request() — SSRF risk." +function_call = "httpx.request" +vulnerable_parameter_index = 1 +is_method = false -[[rule]] -id = "DIR735" -description = "Directory traversal vulnerability in file path." -severity = "High" -confidence = "Medium" -remediation = "Validate and sanitize file paths to prevent directory traversal attacks." -pattern = "\\.\\./|\\.\\.\\\\|%2e%2e%2f" -file_pattern = "*.py" +[[taint_sink]] +id = "SK_SSRF005" +vulnerability_id = "SSRF_001" +description = "User-controlled URL in requests.get() — SSRF risk." +function_call = "requests.get" +vulnerable_parameter_index = 0 +is_method = false -[[rule]] -id = "INFO738" -description = "Information disclosure through error messages." -severity = "Low" -confidence = "Low" -remediation = "Implement generic error messages that don't reveal system information." -pattern = "traceback\\.print_exc\\(" -file_pattern = "*.py" +[[taint_sink]] +id = "SK_SSRF006" +vulnerability_id = "SSRF_001" +description = "User-controlled URL in requests.post() — SSRF risk." +function_call = "requests.post" +vulnerable_parameter_index = 0 +is_method = false -[[rule]] -id = "LOG741" -description = "Log injection vulnerability detected." -severity = "Medium" -confidence = "Medium" -remediation = "Sanitize user input before logging to prevent log injection attacks." 
-pattern = "logging\\.(info|debug|warning|error)\\(.*%s" -file_pattern = "*.py" +[[taint_sink]] +id = "SK_SSRF007" +vulnerability_id = "SSRF_001" +description = "User-controlled URL in requests.request() — SSRF risk." +function_call = "requests.request" +vulnerable_parameter_index = 1 +is_method = false -[[rule]] -id = "SESS744" -description = "Session fixation vulnerability in session handling." -severity = "High" -confidence = "Medium" -remediation = "Regenerate session IDs after authentication to prevent fixation attacks." -# Writing data to a session is NOT session fixation. Only flag direct session key assignment from request. -pattern = "session\\.session_key\\s*=.*request\\." -file_pattern = "*.py" +[[taint_sink]] +id = "SK_SSRF008" +vulnerability_id = "SSRF_001" +description = "User-controlled URL in urllib.request.urlopen() — SSRF risk." +function_call = "urllib.request.urlopen" +vulnerable_parameter_index = 0 +is_method = false -[[rule]] -id = "CSRF747" -description = "Cross-Site Request Forgery protection bypass." -severity = "High" -confidence = "Medium" -remediation = "Implement proper CSRF tokens for state-changing operations." -pattern = "@csrf_exempt" -file_pattern = "*.py" +# LOG741 taint sinks — only fire when tainted data reaches a logging call. +# This replaces the pattern rule (which fired on any logging call with %s format). +# Internal objects (proto, op_name, config) are never tainted → no FPs. -[[rule]] -id = "HTTP750" -description = "HTTP response splitting vulnerability." -severity = "High" -confidence = "Medium" -remediation = "Validate and sanitize HTTP headers to prevent response splitting." -pattern = "HttpResponse\\(.*\\\\r\\\\n" -file_pattern = "*.py" +[[taint_sink]] +id = "SK_LOG741_INFO" +vulnerability_id = "LOG741" +description = "User-controlled data in logging.info() — log injection risk." 
+function_call = "logging.info" +vulnerable_parameter_index = 1 +is_method = false -[[rule]] -id = "UPLOAD753" -description = "Unrestricted file upload vulnerability." -severity = "High" -confidence = "Medium" -remediation = "Implement file type validation and size limits for uploads." -pattern = "request\\.FILES\\[.*\\]\\.save\\(" -file_pattern = "*.py" +[[taint_sink]] +id = "SK_LOG741_WARN" +vulnerability_id = "LOG741" +description = "User-controlled data in logging.warning() — log injection risk." +function_call = "logging.warning" +vulnerable_parameter_index = 1 +is_method = false -[[rule]] -id = "CACHE756" -description = "Cache poisoning vulnerability in HTTP caching." -severity = "Medium" -confidence = "Low" -remediation = "Validate cache keys and implement proper cache invalidation." -pattern = "cache\\.set\\(.*request\\." -file_pattern = "*.py" +[[taint_sink]] +id = "SK_LOG741_ERROR" +vulnerability_id = "LOG741" +description = "User-controlled data in logging.error() — log injection risk." +function_call = "logging.error" +vulnerable_parameter_index = 1 +is_method = false -[[rule]] -id = "TIMING759" -description = "Timing attack vulnerability in authentication." -severity = "Medium" -confidence = "Low" -remediation = "Use constant-time comparison functions for sensitive operations." -pattern = "password\\s*==\\s*.*" -file_pattern = "*.py" +[[taint_sink]] +id = "SK_LOG741_DEBUG" +vulnerability_id = "LOG741" +description = "User-controlled data in logging.debug() — log injection risk." +function_call = "logging.debug" +vulnerable_parameter_index = 1 +is_method = false -[[rule]] -id = "ENUM762" -description = "User enumeration vulnerability in login system." -severity = "Low" -confidence = "Low" -remediation = "Return identical responses for valid and invalid usernames." 
-pattern = "User\\.objects\\.get\\(username=" -file_pattern = "*.py" +[[taint_sink]] +id = "SK_LOG741_CRITICAL" +vulnerability_id = "LOG741" +description = "User-controlled data in logging.critical() — log injection risk." +function_call = "logging.critical" +vulnerable_parameter_index = 1 +is_method = false -[[rule]] -id = "BRUTE765" -description = "Missing brute force protection on authentication." -severity = "Medium" -confidence = "Low" -remediation = "Implement rate limiting and account lockout mechanisms." -pattern = "login_required" -file_pattern = "*.py" +# ------------------------------------------- +# SECTION: Injection (OWASP A03:2021) +# ------------------------------------------- [[rule]] -id = "WEAK768" -description = "Weak password policy implementation." -severity = "Low" -confidence = "Low" -remediation = "Implement strong password requirements and validation." -pattern = "len\\(password\\)\\s*<\\s*[1-6]" -file_pattern = "*.py" +id = "PY102" +description = "Command Injection detected via Taint Analysis." +severity = "Critical" +confidence = "High" +remediation = "User-controlled data reached a command execution function without sanitization. Use 'shlex.quote()' to escape arguments or avoid passing user input to shell commands entirely." +# No ast_match — triggered only by taint engine +# NOTE: This rule has no 'pattern' or 'ast_match'. It is triggered ONLY by the taint engine. [[rule]] -id = "TOKEN771" -description = "JWT token potentially without expiration time (Manual inspection suggested)." -severity = "Medium" -confidence = "Medium" -remediation = "Set appropriate expiration times for JWT tokens." -pattern = "jwt\\.encode\\s*\\(" +id = "PY001" +description = "Use of 'eval()' is highly dangerous." +severity = "High" +remediation = "Avoid 'eval()'. Use safer alternatives like 'ast.literal_eval' for data parsing." 
+ast_match = "Call(func.id=eval)" file_pattern = "*.py" [[rule]] -id = "OAUTH774" -description = "OAuth state parameter potentially missing in authorization request (Manual inspection suggested)." +id = "PY103" +description = "Use of os.system is a command injection risk." severity = "High" -confidence = "Medium" -remediation = "Include state parameter in OAuth flows to prevent CSRF attacks." -pattern = "oauth.*authorize.*" -file_pattern = "*.py" +remediation = "Avoid 'os.system'. Use the 'subprocess' module with command and arguments as a list." +# No ast_match — triggered only by taint engine [[rule]] -id = "API777" -description = "API endpoint without rate limiting." -severity = "Medium" -confidence = "Low" -remediation = "Implement rate limiting on API endpoints to prevent abuse." -pattern = "@app\\.route.*methods.*POST" -file_pattern = "*.py" +id = "PY101" +description = "Potential SQL injection via string formatting in database query." +severity = "Critical" +confidence = "High" +remediation = "Use parameterized queries (e.g., cursor.execute('SELECT * FROM users WHERE name = ?', (name,))) instead of string formatting." +# No pattern — triggered by taint engine (SK_SQL001/SK_SQL002) +# Exclude migration files: ORM DDL in migrations uses cursor.execute() with developer-controlled +# schema parameters (table names, column names) that are not user input. +exclude_file_pattern = "*/migrations/*,*/alembic/*,*/backends/*" [[rule]] -id = "CORS780" -description = "Overly permissive CORS configuration." -severity = "Medium" -confidence = "Medium" -remediation = "Restrict CORS origins to trusted domains only." -pattern = "Access-Control-Allow-Origin\\s*:\\s*\\*" +id = "PY104" +description = "LDAP injection may be possible with string formatting." +severity = "High" +remediation = "Use a proper LDAP escaping library for any user-controlled data in LDAP queries." 
+pattern = "\\.search_s\\s*\\(.*f[\"']" file_pattern = "*.py" [[rule]] -id = "CLICK783" -description = "Potential Clickjacking vulnerability due to missing X-Frame-Options (Manual inspection suggested)." -severity = "Medium" -confidence = "Low" -remediation = "Set X-Frame-Options header to DENY or SAMEORIGIN." -pattern = "HttpResponse\\s*\\(" -file_pattern = "*.py" +id = "PY105" +description = "User-controlled data passed to mark_safe() or Markup() — potential XSS." +severity = "High" +confidence = "High" +remediation = "Never pass user-controlled data to mark_safe() or Markup(). Sanitize with django.utils.html.escape() first." +# No pattern — triggered only by taint engine (SK_PY105 / SK_PY105B / SK_MARKUP001) [[rule]] -id = "MIME786" -description = "MIME type sniffing vulnerability." -severity = "Low" -confidence = "Low" -remediation = "Set X-Content-Type-Options header to nosniff." -pattern = "HttpResponse\\(.*content_type=" +id = "PY106" +description = "Use of subprocess.run with shell=True is a command injection risk." +severity = "High" +remediation = "Avoid shell=True with subprocess.run. Pass commands as a list instead of a string." +# Only fire when shell=True is explicitly passed — not for every subprocess.run call +ast_match = "Call(func.value.id=subprocess, func.attr=run, keywords.*.arg=shell, keywords.*.value.value=True)" file_pattern = "*.py" [[rule]] -id = "HTTPS789" -description = "Missing HTTPS enforcement in security-sensitive context." severity = "High" confidence = "Medium" -remediation = "Enforce HTTPS for all security-sensitive operations." -pattern = "SECURE_SSL_REDIRECT\\s*=\\s*False" -file_pattern = "*settings*.py" - -[[rule]] -id = "COOKIE792" -description = "Insecure cookie configuration detected." -severity = "Medium" -confidence = "Medium" -remediation = "Set secure and httponly flags on sensitive cookies."
-pattern = "set_cookie\\(.*secure=False" +remediation = "Pass Loader=yaml.SafeLoader or use yaml.safe_load(). For ruamel.yaml, use YAML(typ='safe') or YAML(typ='rt') (round-trip is safe by default)." +ast_match = "Call(func.value.id=yaml, func.attr=load)" file_pattern = "*.py" +# Exclude when any Loader= is explicitly passed. +# Note: ruamel.yaml's YAML() (round-trip) and YAML(typ="safe"/"rt"/"base") are all safe. +# This rule may produce FPs when the variable named 'yaml' was assigned from ruamel's +# YAML() constructor (not the PyYAML module). YAML(typ="unsafe") is caught by RUAMEL_UNSAFE001. +exclude_pattern = "Loader\\s*=|YAML\\s*\\(\\s*\\)\\s*\\.\\s*load|typ\\s*=\\s*[\"'](safe|rt|base)[\"']" +file_content_exclude = "from ruamel\\.yaml|import ruamel" -[[rule]] -id = "ADMIN795" -description = "Default admin credentials detected." -severity = "Critical" -confidence = "High" -remediation = "Change default administrative credentials before deployment." -pattern = "(?i)(admin|administrator).*password.*password" -file_pattern = "*.py" +# ------------------------------------------- +# SECTION: Cryptographic Failures (OWASP A02:2021) +# ------------------------------------------- [[rule]] -id = "DEBUG798" -description = "Debug information exposed in production." +id = "PY201" +description = "Use of weak hashing algorithm MD5 — do not use for passwords or security-sensitive hashing." severity = "Medium" -confidence = "Medium" -remediation = "Disable debug mode and remove debug statements in production." -pattern = "print\\(.*password\\|.*secret" +remediation = "For passwords use bcrypt/argon2. For checksums/integrity: SHA-256 is preferred but MD5 is acceptable if not security-critical." 
+ast_match = "Call(func.value.id=hashlib, func.attr=md5)" file_pattern = "*.py" +# Exclude non-password MD5 uses: +# hexdigest / 0x7FFFFFFF — deterministic int seed (sharding, seeding) +# checksum / integrity — explicit file-integrity context +# hash_id / hash_file — variable/function names indicating identity hash, not auth +# legacy — explicitly marked legacy/deprecated code path +# update( — incremental MD5 building (checksums use .update(), passwords don't) +exclude_pattern = "hexdigest|checksum|integrity|fingerprint|digest\\(\\)|0x7FFFFFFF|int.*md5|md5.*int|hash_id|hash.*file|file.*hash|_hash|legacy|nonce|update\\s*\\(|hasher|algorithm" [[rule]] -id = "BACKUP801" -description = "Backup file with sensitive information accessible." +id = "PY202" +description = "Use of broken hashing algorithm SHA1." severity = "Medium" -confidence = "Low" -remediation = "Secure backup files and exclude them from web-accessible directories." -pattern = "\\.(bak|backup|old|tmp)$" -file_pattern = "*" - -[[rule]] -id = "CONFIG804" -description = "Configuration file with default values." -severity = "Low" -confidence = "Low" -remediation = "Change default configuration values before production deployment." -pattern = "(?i)secret_key.*changeme" -file_pattern = "*settings*.py" - -[[rule]] -id = "HASH807" -description = "Use of insecure hash function for passwords." -severity = "High" -confidence = "High" -remediation = "Use bcrypt, scrypt, or Argon2 for password hashing." -ast_match = "Call(func.value.id=hashlib, func.attr=sha256)" +remediation = "Use a stronger hashing algorithm like SHA-256." +ast_match = "Call(func.value.id=hashlib, func.attr=sha1)" file_pattern = "*.py" +# SHA1 for cache keys, template keys, content addressing is not a security vulnerability. +# Only flag when SHA1 is used for passwords or authentication tokens. 
+exclude_pattern = "cache|key|template|content|join\\(|etag|checksum|digest|signature|chunk|fingerprint|function|framework|hasher" [[rule]] -id = "RAND810" -description = "Use of predictable random number generator." -severity = "Medium" -confidence = "Medium" -remediation = "Use cryptographically secure random generators for security purposes." -ast_match = "Call(func.value.id=random, func.attr=choice)" +id = "PY203" +description = "Use of insecure SSL/TLS protocol version." +severity = "High" +remediation = "Use 'ssl.PROTOCOL_TLS' or higher. Avoid SSLv2, SSLv3, and TLSv1.0/1.1." +pattern = "ssl\\.PROTOCOL_(SSLv2|SSLv3|TLSv1|TLSv1_1)" file_pattern = "*.py" [[rule]] -id = "PATH813" -description = "Path manipulation vulnerability in file operations." +id = "PY204" +description = "Use of the 'pycrypto' library is discouraged due to known vulnerabilities." severity = "High" -confidence = "Medium" -remediation = "Validate and normalize file paths to prevent directory traversal." -pattern = "os\\.path\\.join\\(.*\\.\\." +remediation = "Migrate from 'pycrypto' to a more secure and actively maintained library like 'pycryptodome'." +pattern = "from\\s+Crypto|import\\s+Crypto" file_pattern = "*.py" [[rule]] -id = "SYMLINK816" -description = "Symbolic link vulnerability in file operations." -severity = "Medium" +id = "PY205" +description = "Use of PyNaCl with low-level functions can be insecure if misused." +severity = "Low" confidence = "Low" -remediation = "Check for symbolic links and validate target paths." -pattern = "os\\.symlink\\(" +remediation = "Prefer using high-level APIs like 'Box' and 'SecretBox' unless low-level functions are explicitly required and understood." 
+pattern = "nacl\\.low_level" file_pattern = "*.py" +# ------------------------------------------- +# SECTION: Insecure Deserialization & Design (OWASP A08:2021) +# ------------------------------------------- + [[rule]] -id = "PROC819" -description = "Process injection vulnerability through command execution." +id = "PY002" +description = "Use of 'pickle.loads' for deserialization can lead to remote code execution." severity = "High" -confidence = "Medium" -remediation = "Validate and sanitize all inputs to process execution functions." -ast_match = "Call(func.value.id=os, func.attr=popen)" +remediation = "Use a safer serialization format like JSON if deserializing untrusted data." +ast_match = "Call(func.value.id=pickle, func.attr=loads)" file_pattern = "*.py" +exclude_file_pattern = "*/cache/backends/*" [[rule]] -id = "ENV822" -description = "Environment variable injection vulnerability." -severity = "Medium" -confidence = "Low" -remediation = "Validate environment variables and use allow-lists where possible." -pattern = "os\\.environ\\[.*\\+.*\\]" +id = "PY301" +description = "Use of 'pickle.load' for deserialization can lead to remote code execution." +severity = "High" +remediation = "Use a safer serialization format like JSON if deserializing untrusted data." +ast_match = "Call(func.attr=load, func.value.id=pickle)" file_pattern = "*.py" [[rule]] -id = "IMPORT825" -description = "Dynamic import vulnerability allowing code execution." +id = "PY302" +description = "Use of 'yaml.load()' with no Loader — unsafe with PyYAML; allows !!python/object RCE." severity = "High" confidence = "Medium" -remediation = "Avoid dynamic imports with user-controlled input." -ast_match = "Call(func.id=__import__)" +remediation = "Use yaml.safe_load() or pass Loader=yaml.SafeLoader. For ruamel.yaml, YAML(typ='safe') or the default YAML() round-trip are both safe; only YAML(typ='unsafe') is dangerous." 
+pattern = "yaml\\.load[^a-zA-Z_]" file_pattern = "*.py" +# Exclude: +# Comment lines — not executable +# yaml.safe_load() — explicitly safe +# Any Loader= argument — explicit loader choice +# ruamel.yaml safe modes — YAML() round-trip and typ="safe"/"rt"/"base" are safe +# Inline YAML().load() — ruamel inline construction is round-trip (safe) +# Note: does not fully distinguish PyYAML (module) from ruamel YAML instance named 'yaml'. +# Use RUAMEL_UNSAFE001 for ruamel's explicitly unsafe YAML(typ="unsafe") pattern. +exclude_pattern = "^\\s*#|Loader\\s*=|yaml\\.safe_load|YAML\\s*\\(\\s*\\)\\s*\\.\\s*load|typ\\s*=\\s*[\"'](safe|rt|base)[\"']" +file_content_exclude = "from ruamel\\.yaml|import ruamel" [[rule]] -id = "GETATTR828" -description = "Unsafe use of getattr with user input." -severity = "Medium" -confidence = "Medium" -remediation = "Validate attribute names or use a whitelist of allowed attributes." -ast_match = "Call(func.id=getattr)" +id = "PY303" +description = "XML parsing with 'xml.etree.ElementTree' is vulnerable to XML bombs." +severity = "High" +remediation = "Use 'defusedxml.ElementTree' to parse untrusted XML data safely." +pattern = "xml\\.etree\\.ElementTree\\.(parse|fromstring)" file_pattern = "*.py" [[rule]] -id = "SETATTR831" -description = "Unsafe use of setattr with user input." +id = "PY304" +description = "Insecure temporary file creation may lead to race conditions." severity = "Medium" -confidence = "Medium" -remediation = "Validate attribute names and values before setting." -ast_match = "Call(func.id=setattr)" +remediation = "Use 'tempfile.mkstemp()' instead of 'tempfile.mktemp()' for secure temporary file creation." +pattern = "tempfile\\.mktemp" file_pattern = "*.py" [[rule]] -id = "DELATTR834" -description = "Unsafe use of delattr with user input." -severity = "Medium" -confidence = "Medium" -remediation = "Validate attribute names before deletion." 
-ast_match = "Call(func.id=delattr)" -file_pattern = "*.py" +id = "PY305" +description = "Use of exec() enables arbitrary code execution" +severity = "Critical" +ast_match = "Call(func.id=exec)" [[rule]] -id = "HASATTR837" -description = "Information disclosure through hasattr probing." -severity = "Low" -confidence = "Low" -remediation = "Limit attribute access or implement access controls." -ast_match = "Call(func.id=hasattr)" +id = "SANDBOX307" +description = "Python sandbox escape via object.__subclasses__() — traverses full class hierarchy to retrieve dangerous classes (subprocess.Popen, etc.) without any import." +severity = "Critical" +confidence = "High" +remediation = "Remove __subclasses__() calls that operate on the root object class or traverse __mro__ to reach it. Legitimate code calls __subclasses__() on a specific known class, never on object or via MRO root traversal." +pattern = "object\\s*\\.\\s*__subclasses__\\s*\\(|__mro__\\s*\\[\\s*-?\\d+\\s*\\]\\s*\\.\\s*__subclasses__\\s*\\(" file_pattern = "*.py" +# Matches: +# object.__subclasses__() — direct root traversal +# some.__mro__[-1].__subclasses__() — MRO-based root traversal +# Does NOT match: +# cls.__subclasses__() — legitimate: find subclasses of a specific known class +# Model.__subclasses__() — legitimate: ORM model registry [[rule]] -id = "VARS840" -description = "Information disclosure through vars() function." -severity = "Low" -confidence = "Low" -remediation = "Avoid exposing internal object state through vars()." -ast_match = "Call(func.id=vars)" +id = "SANDBOX308" +description = "Python sandbox escape via __init__.__globals__ — accesses the global namespace of a function object, bypassing import restrictions." +severity = "Critical" +confidence = "High" +remediation = "Never access __globals__ on function objects. This is exclusively used to escape restricted execution environments." 
+pattern = "__init__\\s*\\.\\s*__globals__|__func__\\s*\\.\\s*__globals__" file_pattern = "*.py" +# ------------------------------------------- +# SECTION: Security Misconfiguration (OWASP A05:2021) +# ------------------------------------------- + [[rule]] -id = "GLOBALS843" -description = "Access to global namespace through globals()." +id = "G401" +description = "Flask app is running with the development server in a non-debug context." severity = "Medium" -confidence = "Medium" -remediation = "Restrict access to global namespace in untrusted contexts." -ast_match = "Call(func.id=globals)" +confidence = "Low" +remediation = "Use a production-ready WSGI server like Gunicorn or uWSGI instead of 'app.run()'." +pattern = "app\\.run\\(host=.*0\\.0\\.0\\.0" file_pattern = "*.py" [[rule]] -id = "LOCALS846" -description = "Access to local namespace through locals()." -severity = "Low" -confidence = "Low" -remediation = "Be cautious when exposing local variables." -ast_match = "Call(func.id=locals)" +id = "G403" +description = "Flask DEBUG mode is enabled." +severity = "High" +remediation = "Ensure app.debug is False or the DEBUG config variable is False in production." +pattern = "app\\.run\\(.*debug=True" file_pattern = "*.py" [[rule]] -id = "DIR849" -description = "Information disclosure through dir() function." -severity = "Low" -confidence = "Low" -remediation = "Limit use of dir() in contexts accessible to untrusted users." -ast_match = "Call(func.id=dir)" -file_pattern = "*.py" +id = "G404" +description = "Django's CSRF protection appears to be disabled globally." +severity = "Critical" +remediation = "Ensure 'django.middleware.csrf.CsrfViewMiddleware' is active in your MIDDLEWARE setting." +pattern = "#.*CsrfViewMiddleware" # Simple check for commented-out middleware +file_pattern = "*settings*.py" [[rule]] -id = "TYPE852" -description = "Type confusion vulnerability through type manipulation." 
-severity = "Low" -confidence = "Low" -remediation = "Validate object types before operations." -ast_match = "Call(func.id=type)" +id = "G405" +description = "Requests made without certificate verification." +severity = "High" +remediation = "Remove 'verify=False' from requests calls to prevent man-in-the-middle attacks." +ast_match = "Call(keywords.*.arg=verify, keywords.*.value.value=False)" file_pattern = "*.py" +# ------------------------------------------- +# SECTION: Hardcoded Secrets (OWASP A07:2021) +# ------------------------------------------- + [[rule]] -id = "ISINSTANCE855" -description = "Type checking bypass through isinstance manipulation." -severity = "Low" -confidence = "Low" -remediation = "Use additional validation beyond isinstance checks." -ast_match = "Call(func.id=isinstance)" +id = "G101" +description = "Hardcoded password or secret detected." +severity = "High" +confidence = "Medium" +remediation = "Store credentials in environment variables or a secrets management system." +pattern = "(?i)(password|secret|api_key|token|authkey|bearer|cred|credentials)\\s*[:=]\\s*[\"']\\w{8,}[\"']" file_pattern = "*.py" +# Lines of the form UPPER_CASE_CONSTANT = "value" are module-level, developer-defined constants, not secrets. +# Uppercase variables whose NAMES are explicit secrets (SECRET_KEY, API_KEY, etc.) +# are caught by G101B below, so this rule excludes every line that starts with an uppercase-constant assignment. +exclude_pattern = "^\\s*[A-Z][A-Z0-9_]+\\s*=" [[rule]] -id = "REPR858" -description = "Information disclosure through repr() function." -severity = "Low" -confidence = "Low" -remediation = "Avoid using repr() on sensitive objects in user-facing contexts." -ast_match = "Call(func.id=repr)" +id = "G101B" +description = "Hardcoded secret in uppercase constant — secret key, API key, token, or password assigned directly in code." +severity = "High" +confidence = "High" +remediation = "Store secrets in environment variables: SECRET_KEY = os.environ.get('SECRET_KEY') or use a secrets manager." 
+pattern = "(?i)\\b(SECRET[_\\s]?KEY|API[_\\s]?KEY|API[_\\s]?SECRET|ACCESS[_\\s]?KEY|ACCESS[_\\s]?SECRET|AUTH[_\\s]?TOKEN|AUTH[_\\s]?KEY|PRIVATE[_\\s]?KEY|CLIENT[_\\s]?SECRET|APP[_\\s]?SECRET|APP[_\\s]?KEY|SIGNING[_\\s]?KEY|ENCRYPTION[_\\s]?KEY|MASTER[_\\s]?KEY)\\s*=\\s*[\"'][^\"']{16,}[\"']" file_pattern = "*.py" +# Safe: reading from environment or config system — not a hardcoded secret +exclude_pattern = "os\\.environ|getenv|config\\(|env\\(|settings\\.|vault|secrets\\." [[rule]] -id = "STR861" -description = "Potential information disclosure through str() conversion." -severity = "Low" -confidence = "Low" -remediation = "Control string representations of sensitive objects." -ast_match = "Call(func.id=str)" -file_pattern = "*.py" +id = "G102" +description = "Hardcoded private key detected." +severity = "Critical" +confidence = "High" +remediation = "Load private keys from a secure, encrypted file or secrets manager." +pattern = "-----BEGIN (RSA|EC|OPENSSH|PGP) PRIVATE KEY-----" [[rule]] -id = "FORMAT864" -description = "Format string vulnerability in string formatting." -severity = "Medium" -confidence = "Medium" -remediation = "Use safe string formatting methods and validate format strings." -ast_match = "Call(func.attr=format)" +id = "G103" +description = "Use of a blank password for a user or service." +severity = "High" +remediation = "Ensure all users and service accounts have strong, non-empty passwords." +pattern = "(?i)(password|passwd|pass)\\s*[:=]\\s*[\"']\\s*[\"']" file_pattern = "*.py" +# Exclude: +# Function parameter defaults: def login(passwd='') — optional API param +# Comment lines +# Chained initialization: login = account = password = '' — variable init, not a credential +exclude_pattern = "^\\s*def\\s|^\\s*#|\\w+\\s*=\\s*\\w+\\s*=" +exclude_file_pattern = "*global_settings*" [[rule]] -id = "FSTRING867" -description = "Potential code injection through f-string formatting." 
-severity = "Medium" -confidence = "Low" -remediation = "Validate and sanitize data used in f-string expressions." -pattern = "f[\"'][^\"']*\\{.*\\}[^\"']*[\"']" +id = "G104" +description = "JWT secret is hardcoded." +severity = "Critical" +remediation = "Load JWT secrets from environment variables or a secrets management system." +pattern = "(?i)(jwt_secret|jwt_key)\\s*[:=]\\s*[\"'].+[\"']" file_pattern = "*.py" +# ------------------------------------------- +# SECTION: IaC and Configuration File Security +# ------------------------------------------- + [[rule]] -id = "REGEX870" -description = "Regular expression denial of service (ReDoS) vulnerability." -severity = "Medium" -confidence = "Low" -remediation = "Avoid nested quantifiers and catastrophic backtracking in regex." -pattern = "re\\.(match|search|findall)\\(.*\\(.*\\+.*\\*" -file_pattern = "*.py" +id = "DKR001" +description = "Password or secret found in Dockerfile ENV instruction." +severity = "High" +remediation = "Use build-time arguments (ARG) with the --secret flag or a secrets management tool." +pattern = "(?i)ENV\\s+(PASS|PASSWORD|SECRET|TOKEN|API_KEY)\\s+" +file_pattern = "Dockerfile" [[rule]] -id = "SPLIT873" -description = "Potential DoS through string split operations." +id = "DKR002" +description = "Use of 'latest' tag for base image is not recommended for production." severity = "Low" -confidence = "Low" -remediation = "Limit the number of splits or validate input size." -pattern = "\\.split\\(.*maxsplit" -file_pattern = "*.py" +remediation = "Pin base images to a specific version digest for reproducible and secure builds." +pattern = "FROM\\s+\\w+:latest" +file_pattern = "Dockerfile" [[rule]] -id = "JOIN876" -description = "Memory exhaustion through string join operations." -severity = "Low" -confidence = "Low" -remediation = "Validate the size of collections before joining." 
-ast_match = "Call(func.attr=join)" -file_pattern = "*.py" +id = "DKR003" +description = "Exposing Docker daemon socket inside a container is a security risk." +severity = "Critical" +remediation = "Avoid mounting '/var/run/docker.sock' into containers." +pattern = "/var/run/docker\\.sock" +file_pattern = "docker-compose*.y*ml" [[rule]] -id = "REPLACE879" -description = "Potential DoS through string replace operations." -severity = "Low" -confidence = "Low" -remediation = "Limit replacement operations on large strings." -ast_match = "Call(func.attr=replace)" -file_pattern = "*.py" +id = "K8S001" +description = "Kubernetes container running in privileged mode." +severity = "Critical" +remediation = "Set 'securityContext.privileged' to 'false' or remove it." +pattern = "privileged:\\s*true" +file_pattern = "*.y*ml" [[rule]] -id = "DECODE882" -description = "Encoding vulnerability in string decode operations." -severity = "Low" -confidence = "Low" -remediation = "Handle encoding errors properly and validate input." -ast_match = "Call(func.attr=decode)" -file_pattern = "*.py" +id = "K8S002" +description = "Kubernetes container allows privilege escalation." +severity = "High" +remediation = "Explicitly set 'securityContext.allowPrivilegeEscalation' to 'false'." +pattern = "allowPrivilegeEscalation:\\s*true" +file_pattern = "*.y*ml" [[rule]] -id = "ENCODE885" -description = "Information disclosure through string encoding." -severity = "Low" -confidence = "Low" -remediation = "Be careful when encoding sensitive data." -ast_match = "Call(func.attr=encode)" -file_pattern = "*.py" +id = "TF001" +description = "Terraform AWS S3 bucket is publicly readable." +severity = "Critical" +remediation = "Set the 'acl' property of 'aws_s3_bucket' to 'private', not 'public-read' or 'public-read-write'." +pattern = "acl\\s*=\\s*\"(public-read|public-read-write)\"" +file_pattern = "*.tf" [[rule]] -id = "LOWER888" -description = "Locale-dependent case conversion vulnerability." 
-severity = "Low" -confidence = "Low" -remediation = "Use locale-independent case conversion for security comparisons." -ast_match = "Call(func.attr=lower)" -file_pattern = "*.py" +id = "CFG001" +description = "AWS credentials detected in configuration file." +severity = "Critical" +remediation = "Use IAM roles or environment variables for AWS credentials." +pattern = "(?i)(aws_access_key_id|aws_secret_access_key)\\s*=\\s*[A-Za-z0-9/+=]{20,}" +file_pattern = "*.ini" + +# ------------------------------------------- +# SECTION: ADDITIONAL SECURITY RULES +# ------------------------------------------- [[rule]] -id = "UPPER891" -description = "Locale-dependent case conversion vulnerability." -severity = "Low" -confidence = "Low" -remediation = "Use locale-independent case conversion for security comparisons." -ast_match = "Call(func.attr=upper)" +id = "PY500" +description = "Dynamic code execution using builtins.exec() function." +severity = "High" +confidence = "Medium" +remediation = "Avoid dynamic code execution. Consider safer alternatives or validate input thoroughly." +ast_match = "Call(func.attr=exec, func.value.id=builtins)" file_pattern = "*.py" [[rule]] -id = "STRIP894" -description = "Unicode normalization bypass in string stripping." -severity = "Low" -confidence = "Low" -remediation = "Normalize Unicode strings before validation." -ast_match = "Call(func.attr=strip)" +id = "SEC501" +description = "Generic exec pattern detected in code." +severity = "Medium" +confidence = "Medium" +remediation = "Dynamic code execution can be dangerous. Validate all inputs and consider safer alternatives." +pattern = "\\bexec\\b\\s*\\(" +# Exclude: function definitions (def exec(...), async def exec(...)) +# Exclude: comment lines +# Exclude: method calls .exec(...) 
— taint-driven SK_PY507 handles those +# Exclude: backtick-wrapped exec() in docstrings/prose +# Exclude: quoted "exec()" or 'exec()' — documentation text, not actual calls +exclude_pattern = "^\\s*(?:async\\s+)?def\\s|^\\s*#|\\.exec\\s*\\(|`exec\\(|\"exec\\(\\)\"|'exec\\(\\)'" file_pattern = "*.py" [[rule]] -id = "STARTSWITH897" -description = "Bypass vulnerability in string prefix checking." -severity = "Low" -confidence = "Low" -remediation = "Normalize and validate strings before prefix checks." -ast_match = "Call(func.attr=startswith)" +id = "PY507" +description = "Tainted data passed to .exec() method — potential code or SQL injection." +severity = "Critical" +confidence = "High" +remediation = "Validate inputs before passing to .exec(). Use parameterized queries for SQL execution." +# No pattern — triggered only by taint engine. +# Pattern-based detection of .exec() generates 100% FPs: fires on ORM sessions +# (Session.exec(select(...))), docstring code examples, and function definitions. file_pattern = "*.py" [[rule]] -id = "ENDSWITH900" -description = "Bypass vulnerability in string suffix checking." -severity = "Low" -confidence = "Low" -remediation = "Normalize and validate strings before suffix checks." -ast_match = "Call(func.attr=endswith)" -file_pattern = "*.py" +id = "WEB508" +description = "Insecure Content Security Policy with unsafe-inline." +severity = "Medium" +confidence = "Medium" +remediation = "Remove unsafe-inline from CSP directives and use nonces or hashes instead." +pattern = "Content-Security-Policy\\s*:\\s*.*unsafe-inline" +file_pattern = "*.conf" [[rule]] -id = "FIND903" -description = "Logic error in string search operations." +id = "CFG510" +description = "AWS access key detected in configuration." severity = "Low" -confidence = "Low" -remediation = "Handle -1 return value from find() properly." 
-ast_match = "Call(func.attr=find)" -file_pattern = "*.py" +remediation = "Store AWS credentials securely using IAM roles or environment variables." +pattern = "aws_access_key_id\\s*[:=]\\s*['\\\"][A-Za-z0-9/+=]{16,}" +file_pattern = "*.ini" [[rule]] -id = "INDEX906" -description = "Exception handling bypass in string index operations." -severity = "Low" -confidence = "Low" -remediation = "Use find() instead of index() or handle exceptions properly." -ast_match = "Call(func.attr=index)" -file_pattern = "*.py" +id = "WEB512" +description = "Bearer token in configuration header." +severity = "Medium" +remediation = "Store authentication tokens securely and avoid hardcoding in configuration files." +pattern = "Authorization\\s*:\\s*\\bBearer\\b" +file_pattern = "*.conf" [[rule]] -id = "COUNT909" -description = "DoS vulnerability through string count operations." -severity = "Low" -confidence = "Low" -remediation = "Limit the size of strings used in count operations." -ast_match = "Call(func.attr=count)" -file_pattern = "*.py" +id = "WEB514" +description = "X-Frame-Options set to allow framing." +severity = "Medium" +remediation = "Set X-Frame-Options to DENY or SAMEORIGIN to prevent clickjacking attacks." +pattern = "X-Frame-Options\\s*:\\s*ALLOW" +file_pattern = "*.conf" [[rule]] -id = "TRANSLATE912" -description = "Character encoding bypass through translate operations." +id = "SER522" +description = "Object serialization function detected." severity = "Low" -confidence = "Low" -remediation = "Validate translation tables and input strings." -ast_match = "Call(func.attr=translate)" +remediation = "Ensure serialized data comes from trusted sources to prevent deserialization attacks." +# No ast_match/pattern — triggered only by taint engine (SK007) + +[[rule]] +id = "FILE526" +description = "File read operation using open attribute access." +severity = "Medium" +remediation = "Implement proper file access controls and validate file paths." 
+ast_match = "Attribute(attr=read, value.id=open)" file_pattern = "*.py" [[rule]] -id = "MAKETRANS915" -description = "Translation table manipulation vulnerability." -severity = "Low" -confidence = "Low" -remediation = "Validate translation mappings for security contexts." -ast_match = "Call(func.attr=maketrans)" -file_pattern = "*.py" +id = "PERM527" +description = "Setting overly permissive file permissions (777)." +severity = "High" +remediation = "Use more restrictive permissions. Consider 644 for files and 755 for directories." +pattern = "chmod\\s+777" +file_pattern = "*.sh" [[rule]] -id = "CASEFOLD918" -description = "Unicode normalization vulnerability in casefold operations." -severity = "Low" -confidence = "Low" -remediation = "Be aware of Unicode normalization effects in security contexts." -ast_match = "Call(func.attr=casefold)" +id = "FILE528" +description = "Direct access to system password file." +severity = "High" +confidence = "Medium" +remediation = "Accessing /etc/passwd should be done through proper system APIs with authorization." +pattern = "open\\s*\\(\\s*['\\\"]/etc/passwd" file_pattern = "*.py" [[rule]] -id = "EXPANDTABS921" -description = "Tab expansion DoS vulnerability." +id = "TEMP529" +description = "Insecure temporary file creation using mktemp -u." severity = "Low" -confidence = "Low" -remediation = "Limit tab expansion or validate input size." -ast_match = "Call(func.attr=expandtabs)" -file_pattern = "*.py" +remediation = "Use mktemp without -u flag or mkstemp for secure temporary file creation." +pattern = "mktemp\\s+-u" +file_pattern = "*.sh" [[rule]] -id = "ZFILL924" -description = "Memory exhaustion through zero-fill operations." -severity = "Low" -confidence = "Low" -remediation = "Limit the width parameter in zfill operations." -ast_match = "Call(func.attr=zfill)" -file_pattern = "*.py" +id = "SSL531" +description = "SSL/TLS certificate verification disabled." 
+severity = "Medium" +remediation = "Enable certificate verification to prevent man-in-the-middle attacks." +pattern = "verify\\s*:\\s*false" +file_pattern = "*.y*ml" [[rule]] -id = "CENTER927" -description = "Memory exhaustion through string centering operations." -severity = "Low" -confidence = "Low" -remediation = "Limit the width parameter in center operations." -ast_match = "Call(func.attr=center)" -file_pattern = "*.py" +id = "WEB575" +description = "Content Security Policy allows unsafe inline execution." +severity = "High" +confidence = "Medium" +remediation = "Remove unsafe-inline from CSP directives and implement nonce-based or hash-based CSP." +pattern = "Content-Security-Policy\\s*:\\s*.*unsafe-inline" +file_pattern = "*.conf" [[rule]] -id = "LJUST930" -description = "Memory exhaustion through string justification operations." -severity = "Low" -confidence = "Low" -remediation = "Limit the width parameter in ljust operations." -ast_match = "Call(func.attr=ljust)" -file_pattern = "*.py" +id = "SQL586" +description = "String formatting in SQL query execution." +severity = "Critical" +confidence = "Medium" +remediation = "Use parameterized queries instead of string formatting to prevent SQL injection." +# No pattern — triggered by taint engine (SK_SQL001/SK_SQL002) [[rule]] -id = "RJUST933" -description = "Memory exhaustion through string justification operations." -severity = "Low" -confidence = "Low" -remediation = "Limit the width parameter in rjust operations." -ast_match = "Call(func.attr=rjust)" +id = "SHELL602" +description = "Shell command execution with dynamic arguments." +severity = "High" +confidence = "Medium" +remediation = "Use subprocess with argument arrays instead of shell command strings." +pattern = "subprocess\\.(Popen|call)\\(.*shell\\s*=\\s*True" file_pattern = "*.py" [[rule]] -id = "PARTITION936" -description = "Logic error in string partition operations." 
-severity = "Low" -confidence = "Low" -remediation = "Validate partition results and handle edge cases." -ast_match = "Call(func.attr=partition)" -file_pattern = "*.py" +id = "CODE607" +description = "Content Security Policy with unsafe inline directives." +severity = "High" +confidence = "Medium" +remediation = "Implement strict CSP without unsafe-inline to prevent XSS attacks." +pattern = "Content-Security-Policy\\s*:\\s*.*unsafe-inline" +file_pattern = "*.conf" [[rule]] -id = "RPARTITION939" -description = "Logic error in string reverse partition operations." -severity = "Low" -confidence = "Low" -remediation = "Validate rpartition results and handle edge cases." -ast_match = "Call(func.attr=rpartition)" -file_pattern = "*.py" +id = "SHELL631" +description = "SQL injection vulnerability in database query." +severity = "Critical" +confidence = "Medium" +remediation = "Use parameterized queries with placeholders instead of string concatenation." +# No pattern — triggered by taint engine (SK_SQL001/SK_SQL002) [[rule]] -id = "RSPLIT942" -description = "Logic error in reverse string split operations." -severity = "Low" -confidence = "Low" -remediation = "Validate rsplit results and handle maxsplit parameter." -ast_match = "Call(func.attr=rsplit)" -file_pattern = "*.py" +id = "CSP640" +description = "Unsafe Content Security Policy configuration." +severity = "High" +confidence = "Medium" +remediation = "Configure CSP without unsafe-inline and unsafe-eval directives." +pattern = "Content-Security-Policy\\s*:\\s*.*unsafe-inline" +file_pattern = "*.conf" [[rule]] -id = "SPLITLINES945" -description = "Line ending normalization vulnerability." -severity = "Low" -confidence = "Low" -remediation = "Be aware of different line ending interpretations." -ast_match = "Call(func.attr=splitlines)" -file_pattern = "*.py" +id = "PERM650" +description = "SQL query with potential injection vulnerability." 
+severity = "Critical" +confidence = "Medium" +remediation = "Implement prepared statements and parameterized queries to prevent SQL injection." +# No pattern — triggered by taint engine (SK_SQL001/SK_SQL002) [[rule]] -id = "SWAPCASE948" -description = "Locale-dependent case swapping vulnerability." -severity = "Low" -confidence = "Low" -remediation = "Avoid swapcase in security-sensitive contexts." -ast_match = "Call(func.attr=swapcase)" -file_pattern = "*.py" +id = "CSP665" +description = "Insecure Content Security Policy allowing inline scripts." +severity = "High" +confidence = "Medium" +remediation = "Use nonce or hash-based CSP instead of unsafe-inline directive." +pattern = "Content-Security-Policy\\s*:\\s*.*unsafe-inline" +file_pattern = "*.conf" [[rule]] -id = "TITLE951" -description = "Locale-dependent title casing vulnerability." -severity = "Low" -confidence = "Low" -remediation = "Use consistent title casing for security comparisons." -ast_match = "Call(func.attr=title)" -file_pattern = "*.py" +id = "SHELL675" +description = "Database query with string interpolation." +severity = "Critical" +confidence = "Medium" +remediation = "Use ORM methods or prepared statements instead of string formatting in SQL queries." +# No pattern — triggered by taint engine (SK_SQL001/SK_SQL002) [[rule]] -id = "CAPITALIZE954" -description = "Locale-dependent capitalization vulnerability." -severity = "Low" -confidence = "Low" -remediation = "Use consistent capitalization for security comparisons." -ast_match = "Call(func.attr=capitalize)" -file_pattern = "*.py" +id = "SHELL689" +description = "Process creation with shell command execution." +severity = "High" +confidence = "Medium" +remediation = "Use process execution without shell to avoid command injection vulnerabilities." +# No pattern — triggered by taint engine (SK_SQL001/SK_SQL002). NOTE(review): these sink IDs look copied from the SQL rules; confirm which taint sink drives this shell-execution rule. [[rule]] -id = "LSTRIP957" -description = "Unicode normalization bypass in left string stripping." 
-severity = "Low" -confidence = "Low" -remediation = "Normalize Unicode strings before validation." -ast_match = "Call(func.attr=lstrip)" -file_pattern = "*.py" +id = "SQL693" +description = "String formatting in database execute statement." +severity = "Critical" +confidence = "Medium" +remediation = "Implement parameterized queries to eliminate SQL injection risks." +# No pattern — triggered by taint engine (SK_SQL001/SK_SQL002) [[rule]] -id = "RSTRIP960" -description = "Unicode normalization bypass in right string stripping." -severity = "Low" -confidence = "Low" -remediation = "Normalize Unicode strings before validation." -ast_match = "Call(func.attr=rstrip)" +id = "NET705" +description = "Network request without SSL certificate verification." +severity = "High" +confidence = "Medium" +remediation = "Enable SSL certificate verification to prevent man-in-the-middle attacks." +pattern = "requests\\.(get|post|put|delete)\\(.*verify\\s*=\\s*False" file_pattern = "*.py" [[rule]] -id = "REMOVEPREFIX963" -description = "Logic error in prefix removal operations." -severity = "Low" -confidence = "Low" -remediation = "Validate prefix removal and handle Unicode normalization." -ast_match = "Call(func.attr=removeprefix)" +id = "CRYPTO708" +description = "Weak cryptographic key generation — non-CSPRNG used to generate tokens, keys, or secrets." +severity = "Medium" +confidence = "Medium" +remediation = "Use secrets.token_hex(), secrets.token_urlsafe(), or secrets.choice() for security-sensitive values. The random module uses Mersenne Twister which is predictable and not cryptographically secure." +# Extended to include random.choices/sample/randrange — all non-CSPRNG selection functions +# commonly misused to generate API keys, OTPs, session tokens, and passwords. 
+pattern = "random\\.(randint|random|choices|sample|randrange|choice)\\(" file_pattern = "*.py" +# Exclude non-cryptographic uses: +# np.random.* — NumPy random, used for ML data generation/seeds, not key material +# len(...) — load balancing / server selection +# range(...) — list indexing +# choice/randbelow — selection, not key generation +# variable names suggesting non-security context (index, delay, seed for ML) +exclude_pattern = "np\\.random\\.|numpy\\.random\\.|len\\(|range\\(|\\b(index|idx|pos|offset|delay|sleep_|sleep|wait|_n|num_|seed|shape|size|dim|batch|epoch)\\b|_time\\b|_delay\\b|_wait\\b|random\\.choice|randbelow|input_shape|array_ops|benchmark" [[rule]] -id = "REMOVESUFFIX966" -description = "Logic error in suffix removal operations." -severity = "Low" -confidence = "Low" -remediation = "Validate suffix removal and handle Unicode normalization." -ast_match = "Call(func.attr=removesuffix)" +id = "AUTH711" +description = "Authentication bypass using hardcoded credentials." +severity = "Critical" +confidence = "High" +remediation = "Implement proper authentication mechanisms without hardcoded credentials." +pattern = "(?i)(username|user)\\s*[:=]\\s*[\"']admin[\"']" file_pattern = "*.py" [[rule]] -id = "ISALNUM969" -description = "Unicode category bypass in alphanumeric checking." -severity = "Low" -confidence = "Low" -remediation = "Be aware of Unicode character categories in validation." -ast_match = "Call(func.attr=isalnum)" +id = "LDAP717" +description = "LDAP injection vulnerability in search filter." +severity = "High" +confidence = "Medium" +remediation = "Properly escape LDAP filter characters or use parameterized LDAP queries." +pattern = "\\.search\\(.*filter.*%s" file_pattern = "*.py" [[rule]] -id = "ISALPHA972" -description = "Unicode category bypass in alphabetic checking." -severity = "Low" -confidence = "Low" -remediation = "Be aware of Unicode character categories in validation." 
-ast_match = "Call(func.attr=isalpha)" +id = "XPATH720" +description = "XPath injection vulnerability detected." +severity = "High" +confidence = "Medium" +remediation = "Use parameterized XPath queries or properly escape user input." +pattern = "xpath\\(.*%s" file_pattern = "*.py" [[rule]] -id = "ISASCII975" -description = "ASCII validation bypass with Unicode characters." -severity = "Low" -confidence = "Low" -remediation = "Use proper Unicode handling for international support." -ast_match = "Call(func.attr=isascii)" +id = "DESER723" +description = "Unsafe deserialization of untrusted data via marshal.loads()." +severity = "Critical" +confidence = "High" +remediation = "Never deserialize marshal bytecode from untrusted sources. Use JSON/protobuf for data exchange. For model serialization, use SavedModel format instead of custom bytecode paths." +ast_match = "Call(func.value.id=marshal, func.attr=loads)" file_pattern = "*.py" [[rule]] -id = "ISDECIMAL978" -description = "Unicode decimal category bypass in number validation." -severity = "Low" -confidence = "Low" -remediation = "Be aware of Unicode decimal categories beyond 0-9." -ast_match = "Call(func.attr=isdecimal)" +id = "DESER724" +description = "Deserialized bytecode executed via types.FunctionType() — arbitrary code execution from untrusted marshal.loads() output." +severity = "Critical" +confidence = "High" +remediation = "Never create functions from deserialized code objects. This is equivalent to pickle.loads() and allows full RCE. Use marshal only for trusted, developer-controlled bytecode in controlled build environments." file_pattern = "*.py" +# No pattern — triggered only by taint engine (SK_DESER724): +# marshal.loads(raw) → code is tainted → FunctionType(code, globals()) fires this rule. [[rule]] -id = "ISDIGIT981" -description = "Unicode digit category bypass in number validation." -severity = "Low" -confidence = "Low" -remediation = "Be aware of Unicode digit categories beyond 0-9." 
-ast_match = "Call(func.attr=isdigit)" +id = "PRIV726" +description = "Privilege escalation through setuid binary execution." +severity = "High" +confidence = "Medium" +remediation = "Avoid executing setuid binaries or implement proper privilege checks." +pattern = "os\\.setuid\\(" file_pattern = "*.py" [[rule]] -id = "ISIDENTIFIER984" -description = "Python identifier validation bypass." -severity = "Low" +id = "RACE729" +description = "Race condition in file operations." +severity = "Medium" confidence = "Low" -remediation = "Validate identifiers against allowed patterns." -ast_match = "Call(func.attr=isidentifier)" +remediation = "Use atomic file operations or proper locking mechanisms." +pattern = "os\\.path\\.exists.*open\\(" file_pattern = "*.py" [[rule]] -id = "ISLOWER987" -description = "Case checking bypass with Unicode characters." +id = "INFO738" +description = "Information disclosure through error messages." severity = "Low" confidence = "Low" -remediation = "Be aware of Unicode case categories." -ast_match = "Call(func.attr=islower)" +remediation = "Implement generic error messages that don't reveal system information." +pattern = "traceback\\.print_exc\\(" file_pattern = "*.py" [[rule]] -id = "ISNUMERIC990" -description = "Unicode numeric category bypass in validation." +id = "LOG741" +description = "User-controlled data in log statement — log injection risk." severity = "Low" -confidence = "Low" -remediation = "Be aware of Unicode numeric categories." -ast_match = "Call(func.attr=isnumeric)" +confidence = "Medium" +remediation = "Sanitize user input before logging. An attacker who controls log content can fake entries, inject ANSI escape codes, or corrupt log parsers." file_pattern = "*.py" +# No pattern — triggered only by taint engine (SK_LOG741_*) +# Only fires when data traced from request.GET/POST/CLI args/API responses +# reaches a logging call. 
Internal framework objects and computed values +# are never tainted → no false positives on framework internals. [[rule]] -id = "ISPRINTABLE993" -description = "Printable character validation bypass." -severity = "Low" -confidence = "Low" -remediation = "Be aware of Unicode printable character definitions." -ast_match = "Call(func.attr=isprintable)" +id = "SESS744" +description = "Session fixation vulnerability in session handling." +severity = "High" +confidence = "Medium" +remediation = "Regenerate session IDs after authentication to prevent fixation attacks." +# Writing data to a session is NOT session fixation. Only flag direct session key assignment from request. +pattern = "session\\.session_key\\s*=.*request\\." file_pattern = "*.py" [[rule]] -id = "ISSPACE996" -description = "Whitespace character validation bypass." -severity = "Low" -confidence = "Low" -remediation = "Be aware of Unicode whitespace character definitions." -ast_match = "Call(func.attr=isspace)" +id = "CSRF747" +description = "Cross-Site Request Forgery protection bypass." +severity = "High" +confidence = "Medium" +remediation = "Implement proper CSRF tokens for state-changing operations." +pattern = "@csrf_exempt" file_pattern = "*.py" [[rule]] -id = "ISTITLE999" -description = "Title case validation bypass with Unicode." -severity = "Low" -confidence = "Low" -remediation = "Be aware of Unicode title case definitions." -ast_match = "Call(func.attr=istitle)" +id = "HTTP750" +description = "HTTP response splitting vulnerability." +severity = "High" +confidence = "Medium" +remediation = "Validate and sanitize HTTP headers to prevent response splitting." +pattern = "HttpResponse\\(.*\\\\r\\\\n" file_pattern = "*.py" [[rule]] -id = "ISUPPER1002" -description = "Upper case validation bypass with Unicode." -severity = "Low" -confidence = "Low" -remediation = "Be aware of Unicode upper case definitions." 
-ast_match = "Call(func.attr=isupper)" +id = "UPLOAD753" +description = "Unrestricted file upload vulnerability." +severity = "High" +confidence = "Medium" +remediation = "Implement file type validation and size limits for uploads." +pattern = "request\\.FILES\\[.*\\]\\.save\\(" file_pattern = "*.py" [[rule]] -id = "BYTES1005" -description = "Bytes object creation with user input." -severity = "Low" +id = "CACHE756" +description = "Cache poisoning vulnerability in HTTP caching." +severity = "Medium" confidence = "Low" -remediation = "Validate encoding when creating bytes from user input." -ast_match = "Call(func.id=bytes)" +remediation = "Validate cache keys and implement proper cache invalidation." +pattern = "cache\\.set\\(.*request\\." file_pattern = "*.py" [[rule]] -id = "BYTEARRAY1008" -description = "Mutable byte array creation with user input." -severity = "Low" +id = "TIMING759" +description = "Timing attack vulnerability in authentication — direct equality comparison of secret values." +severity = "Medium" confidence = "Low" -remediation = "Validate encoding when creating bytearrays from user input." -ast_match = "Call(func.id=bytearray)" +remediation = "Use hmac.compare_digest() or secrets.compare_digest() for all secret/hash comparisons." +pattern = "password\\s*==\\s*.*" file_pattern = "*.py" +# Exclude null/empty checks: `if password is None or password == ""` is a presence check, +# not a secret comparison. Also exclude `password != ""` style guards. +exclude_pattern = "is None|== \"\"|== ''|!= \"\"|!= ''|^\\s*#" [[rule]] -id = "MEMORYVIEW1011" -description = "Memory view creation exposing internal buffer." +id = "ENUM762" +description = "User enumeration vulnerability in login system." severity = "Low" confidence = "Low" -remediation = "Be careful when exposing memory views of sensitive data." -ast_match = "Call(func.id=memoryview)" +remediation = "Return identical responses for valid and invalid usernames." 
+pattern = "User\\.objects\\.get\\(username=" file_pattern = "*.py" [[rule]] -id = "ORD1014" -description = "Character code point extraction." -severity = "Low" +id = "TOKEN771" +description = "JWT token created without expiration — tokens valid indefinitely if stolen." +severity = "Medium" confidence = "Low" -remediation = "Validate character input before extracting code points." -ast_match = "Call(func.id=ord)" +remediation = "Always include 'exp' claim in JWT payload: {'sub': user_id, 'exp': datetime.utcnow() + timedelta(hours=1)}." +# jwt.encode() is the creation side — only flag when no 'exp' key is visible nearby. +# jwt.decode() without verify is caught by JWT001. +pattern = "jwt\\.encode\\s*\\(" file_pattern = "*.py" +exclude_pattern = "^\\s*#|[\"']exp[\"']|datetime|timedelta" [[rule]] -id = "CHR1017" -description = "Character creation from code point." -severity = "Low" -confidence = "Low" -remediation = "Validate code points to prevent Unicode injection." -ast_match = "Call(func.id=chr)" +id = "OAUTH774" +description = "OAuth state parameter potentially missing in authorization request (Manual inspection suggested)." +severity = "High" +confidence = "Medium" +remediation = "Include state parameter in OAuth flows to prevent CSRF attacks." +pattern = "oauth.*authorize.*" file_pattern = "*.py" +# Public OAuth authorization URLs in string literals are DeveloperDefined endpoints, not missing state params +exclude_pattern = "[\"']https?://.*oauth.*authorize|client_id=" [[rule]] -id = "HEX1020" -description = "Hexadecimal conversion exposing internal data." -severity = "Low" +id = "API777" +description = "API endpoint without rate limiting." +severity = "Medium" confidence = "Low" -remediation = "Be careful when converting sensitive data to hex." -ast_match = "Call(func.attr=hex)" +remediation = "Implement rate limiting on API endpoints to prevent abuse." 
+pattern = "@app\\.route.*methods.*POST" file_pattern = "*.py" [[rule]] -id = "OCT1023" -description = "Octal conversion potentially exposing data." -severity = "Low" -confidence = "Low" -remediation = "Validate octal conversions in security contexts." -ast_match = "Call(func.id=oct)" +id = "CORS780" +description = "Overly permissive CORS configuration." +severity = "Medium" +confidence = "Medium" +remediation = "Restrict CORS origins to trusted domains only." +pattern = "Access-Control-Allow-Origin\\s*:\\s*\\*" file_pattern = "*.py" [[rule]] -id = "BIN1026" -description = "Binary conversion potentially exposing data." -severity = "Low" -confidence = "Low" -remediation = "Validate binary conversions in security contexts." -ast_match = "Call(func.id=bin)" -file_pattern = "*.py" +id = "HTTPS789" +description = "Missing HTTPS enforcement in security-sensitive context." +severity = "High" +confidence = "Medium" +remediation = "Enforce HTTPS for all security-sensitive operations." +pattern = "SECURE_SSL_REDIRECT\\s*=\\s*False" +file_pattern = "*settings*.py" +# global_settings.py is a framework defaults file — False here is the intended default. +# Deployments must override this in their project settings. +exclude_file_pattern = "*global_settings*" [[rule]] -id = "FLOAT1029" -description = "Floating point precision issues in security calculations." -severity = "Low" -confidence = "Low" -remediation = "Use decimal module for precise financial calculations." -ast_match = "Call(func.id=float)" +id = "COOKIE792" +description = "Insecure cookie configuration detected." +severity = "Medium" +confidence = "Medium" +remediation = "Set secure and httponly flags on sensitive cookies." +pattern = "set_cookie\\(.*secure=False" file_pattern = "*.py" [[rule]] -id = "COMPLEX1032" -description = "Complex number usage in security contexts." -severity = "Low" -confidence = "Low" -remediation = "Avoid complex numbers in security-sensitive calculations." 
-ast_match = "Call(func.id=complex)" +id = "ADMIN795" +description = "Default admin credentials detected." +severity = "Critical" +confidence = "High" +remediation = "Change default administrative credentials before deployment." +pattern = "(?i)(admin|administrator).*password.*password" file_pattern = "*.py" +# "class AdminPasswordChangeForm" is a Python class declaration — DeveloperDefined name, not a credential +exclude_pattern = "^\\s*class\\s+" [[rule]] -id = "BOOL1035" -description = "Boolean conversion potentially hiding truthy/falsy behavior." -severity = "Low" -confidence = "Low" -remediation = "Be explicit about boolean conversions in security checks." -ast_match = "Call(func.id=bool)" +id = "DEBUG798" +description = "Debug information exposed in production." +severity = "Medium" +confidence = "Medium" +remediation = "Disable debug mode and remove debug statements in production." +pattern = "print\\(.*password\\|.*secret" file_pattern = "*.py" [[rule]] -id = "INT1038" -description = "Integer conversion with potential overflow." -severity = "Low" +id = "BACKUP801" +description = "Backup file with sensitive information accessible." +severity = "Medium" confidence = "Low" -remediation = "Validate integer conversions and handle overflow." -ast_match = "Call(func.id=int)" -file_pattern = "*.py" +remediation = "Secure backup files and exclude them from web-accessible directories." +# Require a real filename base (word char) before the backup extension — prevents +# matching bare extension strings like '.bak', '*.old', '".bak"' in code comments, +# docs, and build scripts that reference backup extensions without actual file paths. +pattern = "['\"][^'\"]*\\w\\.(bak|backup|old)['\"]" +file_pattern = "*" +exclude_file_pattern = "*.sh,*.rst,*.md,*.txt" [[rule]] -id = "LIST1041" -description = "List creation with potential memory exhaustion." +id = "CONFIG804" +description = "Configuration file with default values." 
severity = "Low" confidence = "Low" -remediation = "Limit list sizes to prevent memory exhaustion." -ast_match = "Call(func.id=list)" -file_pattern = "*.py" +remediation = "Change default configuration values before production deployment." +pattern = "(?i)secret_key.*changeme" +file_pattern = "*settings*.py" [[rule]] -id = "TUPLE1044" -description = "Tuple creation with potential memory exhaustion." -severity = "Low" +id = "HASH807" +description = "Use of SHA-256 for password hashing — prefer a KDF (bcrypt, scrypt, Argon2)." +severity = "Medium" confidence = "Low" -remediation = "Limit tuple sizes to prevent memory exhaustion." -ast_match = "Call(func.id=tuple)" +remediation = "For password storage use bcrypt, scrypt, or Argon2. SHA-256 without a salt/iteration factor is fast and vulnerable to brute force." +# SHA-256 is strong for general purposes; only flag when context suggests password hashing +# (e.g. variable name contains 'password'). Exclude pure integrity/fingerprinting uses. +ast_match = "Call(func.value.id=hashlib, func.attr=sha256)" file_pattern = "*.py" +exclude_pattern = "fingerprint|checksum|digest|integrity|hash_file|file_hash|sha256_file|content_hash|benchmark|test|sample|example|demo" [[rule]] -id = "SET1047" -description = "Set creation with potential memory exhaustion." -severity = "Low" -confidence = "Low" -remediation = "Limit set sizes to prevent memory exhaustion." -ast_match = "Call(func.id=set)" -file_pattern = "*.py" +id = "RAND810" +description = "Use of predictable random number generator." +severity = "Medium" +confidence = "Medium" +remediation = "Use cryptographically secure random generators for security purposes." +# No ast_match/pattern — triggered only by taint engine (SK008) [[rule]] -id = "DICT1050" -description = "Dictionary creation with potential memory exhaustion." -severity = "Low" -confidence = "Low" -remediation = "Limit dictionary sizes to prevent memory exhaustion." 
-ast_match = "Call(func.id=dict)" +id = "SSRF_001" +description = "Server-Side Request Forgery — user-controlled URL in HTTP client request." +severity = "High" +confidence = "High" +remediation = "Validate URLs against an allowlist of trusted hosts/schemes before using in HTTP requests. Reject file://, internal IPs (10.x, 172.16-31.x, 192.168.x), and metadata endpoints (169.254.169.254)." file_pattern = "*.py" +# No pattern — triggered only by taint engine (SK_SSRF001-SK_SSRF008) +# Note: SSRF requires control of the HOST, not just path components. +# 'https://api.example.com/v1/%s' % user_id — NOT SSRF (host is literal) +# r.json()["url"] flowing to httpx.stream() — SSRF (full URL is attacker-controlled) +# The taint engine correctly handles this: taint must reach the URL argument. +# For CLI args (parse_args taint source) flowing into format strings where only +# path params vary, the engine may produce FPs. Those cases need per-sink +# host-vs-path discrimination — a future enhancement. [[rule]] -id = "FROZENSET1053" -description = "Frozenset creation with potential memory exhaustion." -severity = "Low" -confidence = "Low" -remediation = "Limit frozenset sizes to prevent memory exhaustion." -ast_match = "Call(func.id=frozenset)" +id = "PATH813" +description = "Path manipulation vulnerability in file operations." +severity = "High" +confidence = "Medium" +remediation = "Validate and normalize file paths to prevent directory traversal." +pattern = "os\\.path\\.join\\(.*\\.\\." file_pattern = "*.py" +# Exclude safe package-root navigation patterns: +# os.path.join(__file__, '..', '..') — finding package root from current file +# os.path.join(module.__file__, '..') — navigating relative to installed module +# os.path.join(os.path.dirname(__file__), ..) — standard Python package path +exclude_pattern = "__file__|module\\.__file__|dirname\\(__file__\\)|abspath.*dirname" [[rule]] -id = "RANGE1056" -description = "Range creation with potential memory exhaustion." 
-severity = "Low" +id = "SYMLINK816" +description = "Symbolic link vulnerability — user-controlled path in os.symlink()." +severity = "Medium" confidence = "Low" -remediation = "Validate range parameters to prevent excessive iterations." -ast_match = "Call(func.id=range)" +remediation = "Validate symlink target paths; never use untrusted input as a symlink source." file_pattern = "*.py" +# Pattern removed — SYMLINK816 is now taint-driven only (see taint_sink SK_SYMLINK001). +# Pattern-based matching produced 100% FPs (capability detection, static file management). +# Only fires when the symlink source argument is HttpRequest-tainted. [[rule]] -id = "ENUMERATE1059" -description = "Enumeration with potential performance impact." -severity = "Low" -confidence = "Low" -remediation = "Consider performance implications of enumerating large collections." -ast_match = "Call(func.id=enumerate)" +id = "PROC819" +description = "Process injection vulnerability through command execution." +severity = "High" +confidence = "Medium" +remediation = "Validate and sanitize all inputs to process execution functions." +ast_match = "Call(func.value.id=os, func.attr=popen)" file_pattern = "*.py" [[rule]] -id = "ZIP1062" -description = "Zip operation with potential memory exhaustion." -severity = "Low" -confidence = "Low" -remediation = "Be careful when zipping large collections." -ast_match = "Call(func.id=zip)" +id = "IMPORT825" +description = "Dynamic import vulnerability allowing code execution." +severity = "High" +confidence = "Medium" +remediation = "Avoid dynamic imports with user-controlled input. Use importlib with validated module names." +ast_match = "Call(func.id=__import__)" file_pattern = "*.py" +# Exclude Python 2/3 compatibility shims (six, future) and stdlib-only imports. +# These use __import__ with fixed or validated module names from the Python +# standard library, not from user input. 
+# Also exclude when the import name is from a known-safe source (self.LIB, +# self.package) — these are class attributes set from validated plugin registries. +exclude_pattern = "self\\.(LIB|package|base_class|module)|__import__\\(name\\)|six\\.|future\\." [[rule]] -id = "MAP1065" -description = "Map operation with potential memory exhaustion." -severity = "Low" -confidence = "Low" -remediation = "Consider memory usage when mapping over large collections." -ast_match = "Call(func.id=map)" -file_pattern = "*.py" +id = "GETATTR828" +description = "User-controlled attribute name passed to getattr() — attacker may access arbitrary attributes." +severity = "High" +confidence = "High" +remediation = "Validate attribute names against an allowlist before passing to getattr(). Never let user input control which attribute is accessed." +# No ast_match — this rule is triggered ONLY by the taint engine (SK002). +# Taint flow: request.* → variable → getattr(obj, variable) +# Exclude ORM serializer patterns: getattr(obj, field.name) where field.name comes from +# ORM model _meta (developer-defined schema), not user input. These generate high FP +# rates in serializer/schema code across all ORM frameworks. +exclude_file_pattern = "*pyct*,*serializer*,*schema*,*/pandas/core/*,pandas/core/*,*/pandas/io/*,pandas/io/*" [[rule]] -id = "FILTER1068" -description = "Filter operation with potential performance impact." -severity = "Low" -confidence = "Low" -remediation = "Consider performance when filtering large collections." -ast_match = "Call(func.id=filter)" -file_pattern = "*.py" +id = "SETATTR831" +description = "Unsafe use of setattr with user input." +severity = "Medium" +confidence = "Medium" +remediation = "Validate attribute names and values before setting." +# No ast_match/pattern — triggered only by taint engine (SK005) [[rule]] -id = "REDUCE1071" -description = "Reduce operation with potential performance impact." 
-severity = "Low" -confidence = "Low" -remediation = "Consider performance when reducing large collections." -pattern = "functools\\.reduce\\(" -file_pattern = "*.py" +id = "DELATTR834" +description = "Unsafe use of delattr with user input." +severity = "Medium" +confidence = "Medium" +remediation = "Validate attribute names before deletion." +# No ast_match/pattern — triggered only by taint engine (SK006) [[rule]] -id = "SORTED1074" -description = "Sorting operation with potential DoS impact." -severity = "Low" -confidence = "Low" -remediation = "Limit collection sizes before sorting to prevent DoS." -ast_match = "Call(func.id=sorted)" +id = "GLOBALS843" +description = "globals() used in code-execution context — exec/eval with global namespace." +severity = "Medium" +confidence = "Medium" +remediation = "Never pass globals() to exec/eval with untrusted code. Dynamic module attribute registration via globals()[name]=value is acceptable for plugin/codec loading." +# Only matches exec/eval with globals() — the genuinely dangerous pattern. +# Removed: globals()['key'] subscript assignment — this is standard Python for +# dynamic module attribute registration (hashlib hash functions, plugin loaders, +# codec registration) and generates high FP rates in framework code. +pattern = "exec[\\s(].*globals\\s*\\(\\)|eval[\\s(].*globals\\s*\\(\\)" file_pattern = "*.py" [[rule]] -id = "REVERSED1077" -description = "Reverse operation with potential memory impact." -severity = "Low" -confidence = "Low" -remediation = "Consider memory usage when reversing large collections." -ast_match = "Call(func.id=reversed)" -file_pattern = "*.py" +id = "FORMAT864" +description = "Format string vulnerability in string formatting." +severity = "Medium" +confidence = "Medium" +remediation = "Use safe string formatting methods and validate format strings." 
+# No ast_match/pattern — triggered only by taint engine (SK009) [[rule]] -id = "SUM1080" -description = "Sum operation with potential overflow or DoS." -severity = "Low" +id = "REGEX870" +description = "Regular expression denial of service (ReDoS) vulnerability — nested quantifiers." +severity = "Medium" confidence = "Low" -remediation = "Validate numeric ranges to prevent overflow or DoS." -ast_match = "Call(func.id=sum)" +remediation = "Avoid nested quantifiers: (x+)+, (a*)+, (a+)* cause catastrophic backtracking." +pattern = "re\\.(match|search|findall|compile)\\(.*\\([^)]*[+*][^)]*\\)([+*]|\\{[0-9])" file_pattern = "*.py" +# Only flag when a capturing/non-capturing group itself has a quantifier INSIDE and OUTSIDE: +# (a+)+ (a*)* (a+)* (a+){2,} → dangerous nested quantifiers +# (\s+){key_name} → f-string brace after ), safe (brace not followed by digit) +# +# Safe pattern: (\\w+\\.)+\\w+ — matches dotted identifiers like "foo.bar.baz" +# \\w+ only matches [a-zA-Z0-9_] so alternation between dot and word chars is non-overlapping +# → no catastrophic backtracking. Exclude when inner group uses \\w or \\d only. +exclude_pattern = "\\\\w\\+\\.\\)\\+|\\\\d\\+\\.\\)\\+|\\\\w\\+\\.\\)\\*" [[rule]] -id = "MAX1083" -description = "Max operation with potential performance impact." -severity = "Low" -confidence = "Low" -remediation = "Consider performance when finding max of large collections." -ast_match = "Call(func.id=max)" -file_pattern = "*.py" +id = "OPEN1149" +description = "User-controlled path passed to open() — potential path traversal or arbitrary file read/write." +severity = "High" +confidence = "High" +remediation = "Validate and sanitize file paths. Use os.path.realpath() and verify the result stays within the expected directory." +# No ast_match — triggered ONLY by taint engine (SK003). +# Taint flow: request.* → variable → open(variable) [[rule]] -id = "MIN1086" -description = "Min operation with potential performance impact." 
-severity = "Low" -confidence = "Low" -remediation = "Consider performance when finding min of large collections." -ast_match = "Call(func.id=min)" +id = "SSTI001" +description = "Server-Side Template Injection — user-controlled data used as Jinja2/Mako template string." +severity = "Critical" +confidence = "High" +remediation = "Never pass user input as the template string. Use render_template() with a static file. Pass user data as template VARIABLES (context), not as the template source itself. For Jinja2, use SandboxedEnvironment if dynamic templates are required." file_pattern = "*.py" +# Triggered by taint engine (SK_SSTI001: render_template_string, SK_SSTI002: env.from_string). +# render_template_string(user_template) or env.from_string(user_template).render() → Jinja2 RCE. [[rule]] -id = "ABS1089" -description = "Absolute value operation with potential overflow." -severity = "Low" -confidence = "Low" -remediation = "Handle potential overflow in absolute value calculations." -ast_match = "Call(func.id=abs)" +id = "ORM002" +description = "Django ORM injection — user-controlled value in raw(), order_by(), or extra() QuerySet method." +severity = "Critical" +confidence = "High" +remediation = "Never pass user input directly to raw(), order_by(), or extra(). For sorting, validate the field name against an explicit allowlist. For raw queries, use parameterized placeholders (%s). Avoid extra() entirely — use annotate() with Case/When instead." file_pattern = "*.py" +# Triggered by taint engine: SK_ORMRAW001 (raw), SK_ORMORDER001 (order_by), SK_ORMEXTRA001 (extra). +# CVE-2021-35042: order_by(user_input) allows column name injection. +# CVE-2022-28346/28347: extra(**user_dict) allows SQL injection via crafted kwargs. [[rule]] -id = "ROUND1092" -description = "Rounding operation with potential precision loss." -severity = "Low" -confidence = "Low" -remediation = "Be aware of floating point precision issues in rounding." 
-ast_match = "Call(func.id=round)" +id = "DESER725" +description = "Insecure deserialization via jsonpickle.decode() — arbitrary Python object deserialization leading to RCE." +severity = "Critical" +confidence = "High" +remediation = "Never pass untrusted data to jsonpickle.decode(). jsonpickle restores arbitrary Python objects including __reduce__ gadgets. Use json.loads() for safe data exchange. CVE-2020-22083, CVE-2024 (Splunk RCE)." +pattern = "jsonpickle\\.decode\\s*\\(" file_pattern = "*.py" +exclude_pattern = "^\\s*#" [[rule]] -id = "POW1095" -description = "Power operation with potential overflow or DoS." -severity = "Medium" -confidence = "Low" -remediation = "Limit exponents to prevent computational DoS attacks." -ast_match = "Call(func.id=pow)" +id = "DESER726" +description = "Insecure deserialization via dill.loads() — arbitrary Python object deserialization leading to RCE." +severity = "Critical" +confidence = "High" +remediation = "Never pass untrusted data to dill.loads(). dill extends pickle with support for lambdas and closures, enabling full RCE via crafted serialized payloads. Use json.loads() or protocol buffers for data exchange." +pattern = "dill\\.loads\\s*\\(" file_pattern = "*.py" +exclude_pattern = "^\\s*#" [[rule]] -id = "DIVMOD1098" -description = "Division with modulo operation potential issues." -severity = "Low" -confidence = "Low" -remediation = "Handle division by zero and validate operands." -ast_match = "Call(func.id=divmod)" +id = "TLS001" +description = "TLS certificate verification disabled — connection is vulnerable to man-in-the-middle attacks." +severity = "High" +confidence = "High" +remediation = "Remove verify=False. Always verify TLS certificates. If using a custom CA, pass verify='/path/to/ca-bundle.crt' instead of disabling verification. For urllib3, remove urllib3.disable_warnings(InsecureRequestWarning)." 
+pattern = "\\bverify\\s*=\\s*False\\b|urllib3\\.disable_warnings\\s*\\(.*InsecureRequestWarning|TCPConnector\\s*\\(.*ssl\\s*=\\s*False|check_hostname\\s*=\\s*False" file_pattern = "*.py" +# Exclude: +# Comment/docstring lines +# Array/indexer operations: _mgr.take(verify=False), indexer=..., verify=False +# Lines containing axis= (pandas internal indexer calls) +# Bare verify=False on its own line (fragment of a multi-line pandas call) +# Docstring text describing the verify parameter +exclude_pattern = "^\\s*#|\\baxis\\s*=|_mgr\\.|_block|block_manager|Pass\\s+verify|^\\s+verify=False,?\\s*$|take\\s*\\(|indexer[^=]*verify|assumed|codes equal|parameter|description" [[rule]] -id = "LEN1101" -description = "Length operation with potential performance impact." -severity = "Low" -confidence = "Low" -remediation = "Be aware that len() on some objects can be expensive." -ast_match = "Call(func.id=len)" +id = "SSH001" +description = "Paramiko host key validation disabled — SSH connection vulnerable to man-in-the-middle attacks." +severity = "High" +confidence = "High" +remediation = "Use RejectPolicy() or load known_hosts with client.load_system_host_keys() or client.load_host_keys(). AutoAddPolicy blindly accepts any server's host key, enabling MITM attacks that intercept SSH sessions and credentials." +pattern = "AutoAddPolicy\\s*\\(\\s*\\)" file_pattern = "*.py" +exclude_pattern = "^\\s*#" [[rule]] -id = "ANY1104" -description = "Any operation with potential short-circuit bypass." -severity = "Low" -confidence = "Low" -remediation = "Be aware of short-circuit evaluation in security checks." -ast_match = "Call(func.id=any)" +id = "JWT001" +description = "JWT signature verification disabled — tokens accepted without cryptographic validation." +severity = "High" +confidence = "High" +remediation = "Never set verify_signature=False or algorithms=['none'] in jwt.decode(). Without signature verification, any attacker can forge arbitrary JWT claims (user ID, role, expiry). 
Always verify the signature with the correct key and algorithm." +pattern = "verify_signature[\"']?\\s*:\\s*False|[\"']none[\"']\\s*.*algorithm|algorithms\\s*=\\s*\\[[\"']none[\"']" file_pattern = "*.py" +exclude_pattern = "^\\s*#" [[rule]] -id = "ALL1107" -description = "All operation with potential short-circuit bypass." -severity = "Low" +id = "ZIPSLIP001" +description = "Archive extraction without path validation — Zip Slip / Tar Slip arbitrary file write." +severity = "High" confidence = "Low" -remediation = "Be aware of short-circuit evaluation in security checks." -ast_match = "Call(func.id=all)" +remediation = "Before extractall(), validate every member path: reject entries containing '../' or absolute paths. Use a safe extraction helper that checks paths, or iterate members manually with extract() after validation." +pattern = "\\.extractall\\s*\\(" file_pattern = "*.py" +# Exclude: +# filter= argument — Python 3.12+ safe extraction filter +# str.extractall() — pandas/polars string accessor for regex extraction (NOT archive) +# Series.str.extractall — same, string regex method +exclude_pattern = "^\\s*#|filter\\s*=|str\\.extractall|strings.*extractall|accessor.*extractall|\\.str\\." +# Low confidence: legitimate uses exist when archives are trusted/developer-controlled. [[rule]] -id = "ITER1110" -description = "Iterator creation with potential memory issues." -severity = "Low" -confidence = "Low" -remediation = "Be careful with iterators over large or infinite sequences." -ast_match = "Call(func.id=iter)" +id = "XXE001" +description = "lxml XML parser with external entity resolution — XML External Entity (XXE) vulnerability." +severity = "High" +confidence = "Medium" +remediation = "Use defusedxml.lxml, or create a safe parser: etree.XMLParser(resolve_entities=False, no_network=True, load_dtd=False). lxml's default XMLParser has resolve_entities=True, allowing XXE via crafted XML." 
+pattern = "etree\\.(parse|fromstring|XML|HTML)\\s*\\(" file_pattern = "*.py" +# lxml's default parser resolves external entities. Attacker-controlled XML can read +# arbitrary files (/etc/passwd) or trigger SSRF to internal services via entity references. +exclude_pattern = "^\\s*#|defusedxml|resolve_entities\\s*=\\s*False" [[rule]] -id = "NEXT1113" -description = "Next operation with potential StopIteration issues." -severity = "Low" -confidence = "Low" -remediation = "Handle StopIteration exceptions properly." -ast_match = "Call(func.id=next)" +id = "ORM001" +description = "SQLAlchemy text() with string formatting — SQL injection via ORM raw query escape hatch." +severity = "Critical" +confidence = "High" +remediation = "Use bound parameters: text('SELECT * FROM users WHERE id = :id').bindparams(id=user_id). Never construct the SQL string with f-strings, %, or .format(). The text() function is for static SQL only." +# \b (word boundary) prevents matching gettext(), pgettext(), ngettext(): +# in "gettext(" the 't' in "text" is preceded by 'e' (word char) — no boundary, no match. +# in "text(" or "sa.text(" the 't' is preceded by non-word — boundary matches. +pattern = "\\btext\\s*\\(\\s*f[\"']|\\btext\\s*\\(.*[\"']\\s*%|\\btext\\s*\\(.*\\.format\\s*\\(" file_pattern = "*.py" +exclude_pattern = "^\\s*#" +# Exclude migration/backend files: f-strings in migrations contain hardcoded schema +# identifiers, not user input. Backend files are ORM infrastructure, not application code. +exclude_file_pattern = "*/migrations/*,*/alembic/*,*/backends/*" [[rule]] -id = "SLICE1116" -description = "Slice operation with potential memory exhaustion." -severity = "Low" -confidence = "Low" -remediation = "Validate slice parameters to prevent excessive memory usage." -ast_match = "Call(func.id=slice)" +id = "FLASK001" +description = "Flask application running with debug mode enabled — Werkzeug interactive debugger exposed." 
+severity = "Critical" +confidence = "High" +remediation = "Never run Flask with debug=True in production. The Werkzeug debugger provides an authenticated Python REPL on every 500 error, allowing full RCE for anyone who can trigger an exception." +pattern = "app\\.run\\s*\\(.*\\bdebug\\s*=\\s*True|app\\.debug\\s*=\\s*True|[\"']DEBUG[\"']\\s*:\\s*True" file_pattern = "*.py" +exclude_pattern = "^\\s*#" [[rule]] -id = "PROPERTY1119" -description = "Property creation with potential access control bypass." -severity = "Low" -confidence = "Low" -remediation = "Implement proper access controls in property getters/setters." -ast_match = "Call(func.id=property)" -file_pattern = "*.py" +id = "AI002" +description = "Hardcoded Anthropic (Claude) API key detected." +severity = "High" +remediation = "Remove hardcoded API keys and load them from environment variables or a secure secrets manager." +pattern = "(?i)sk-ant-api[0-9]*-[A-Za-z0-9_-]{20,}" +file_pattern = ".*\\.py" [[rule]] -id = "STATICMETHOD1122" -description = "Static method bypassing instance access controls." -severity = "Low" -confidence = "Low" -remediation = "Ensure static methods don't bypass intended access controls." -ast_match = "Call(func.id=staticmethod)" -file_pattern = "*.py" +id = "PY306_CACHE" +description = "pickle.loads() in cache backend — cache poisoning leads to remote code execution." +severity = "Critical" +confidence = "High" +remediation = "Replace pickle-based cache serialization with JSON or msgpack. If pickle is required, authenticate the cache channel and use HMAC to verify payload integrity before deserializing." +pattern = "pickle\\.loads\\s*\\(" +file_pattern = "*cache/backends/*.py" [[rule]] -id = "CLASSMETHOD1125" -description = "Class method with potential privilege escalation." -severity = "Low" -confidence = "Low" -remediation = "Ensure class methods don't provide unintended access." 
-ast_match = "Call(func.id=classmethod)" +id = "SHELL_BYPASS001" +description = "Explicit shell interpreter bypasses shell=False — functionally equivalent to shell injection." +severity = "High" +confidence = "High" +remediation = "Never pass user-controlled data as the -c argument to bash/sh/cmd. Use subprocess with a list of arguments and shell=False, validating each element independently." +pattern = "subprocess\\.(run|Popen|call)\\s*\\(\\s*\\[\\s*[\"'](bash|sh|zsh|cmd\\.exe|powershell)[\"']\\s*,\\s*[\"']-c[\"']" file_pattern = "*.py" +exclude_pattern = "^\\s*#" [[rule]] -id = "SUPER1128" -description = "Super call bypassing method resolution order." -severity = "Low" -confidence = "Low" -remediation = "Be careful with super() calls in security-sensitive contexts." -ast_match = "Call(func.id=super)" +id = "OPEN_REDIRECT001" +description = "Unvalidated URL in redirect — open redirect enables phishing and OAuth token stealing." +severity = "High" +confidence = "Medium" +remediation = "Validate redirect URLs against an allowlist of trusted domains. Use url_has_allowed_host_and_scheme() in Django or validate against a whitelist. Never redirect to a user-supplied URL without checking the host." file_pattern = "*.py" +# No pattern — triggered only by taint engine (SK_REDIRECT001/002). +# Taint flow: request.GET/POST['next'] → redirect()/HttpResponseRedirect() +# Conditional sanitization (if is_safe_url(url): redirect(url)) is not detectable +# by static taint analysis — url remains tainted through the conditional check. +# Exclude Django's own framework files — they validate redirects with is_safe_url() / +# url_has_allowed_host_and_scheme() before calling redirect(), so the call is safe. +exclude_file_pattern = "*/django/contrib/*,django/contrib/*,*/django/views/*,django/views/*" [[rule]] -id = "CALLABLE1131" -description = "Callable check with potential type confusion."
-severity = "Low" -confidence = "Low" -remediation = "Validate callable objects before invocation." -ast_match = "Call(func.id=callable)" +id = "PLAIN_PWD001" +description = "User-supplied password stored without hashing — plaintext password in database." +severity = "Critical" +confidence = "High" +remediation = "Use Django's make_password() or set_password() before storing. Never assign request data directly to a password field: User.objects.create_user(password=request.POST['password']) hashes automatically; raw create(..., password=raw) does not." file_pattern = "*.py" +# No pattern — triggered only by taint engine (SK_PLAIN_PWD001). +# Taint flow: request.POST['password'] → Model.objects.create(password=tainted) [[rule]] -id = "ID1134" -description = "Object identity check with potential security implications." -severity = "Low" -confidence = "Low" -remediation = "Be aware that object identity can be predictable." -ast_match = "Call(func.id=id)" +id = "DJANGO_DEBUG001" +description = "DEBUG=True in settings — full stack traces and internal state exposed to any HTTP client." +severity = "Critical" +confidence = "High" +remediation = "Set DEBUG=False in production. Use environment variables: DEBUG = os.environ.get('DEBUG', 'False') == 'True'. Applies to Django, Flask, and any framework that respects a DEBUG flag." +pattern = "^\\s*DEBUG\\s*=\\s*True" file_pattern = "*.py" +# Catches DEBUG=True in both Django settings.py and Flask config files. +# Flask app.run(debug=True) is covered separately by FLASK001. +# Different from FLASK001: this is a settings file value, not runtime configuration. +exclude_file_pattern = "*/tests/*,*/test_*.py" [[rule]] -id = "HASH1137" -description = "Hash operation with potential collision attacks." -severity = "Low" -confidence = "Low" -remediation = "Use cryptographic hashes for security-sensitive applications." 
-ast_match = "Call(func.id=hash)" +id = "RUAMEL_UNSAFE001" +description = "ruamel.yaml loaded with typ='unsafe' — allows !!python/object gadget execution." +severity = "Critical" +confidence = "High" +remediation = "Use YAML() (round-trip, safe by default) or YAML(typ='safe'). typ='unsafe' enables arbitrary Python object construction via YAML tags, equivalent to PyYAML's unsafe yaml.load()." +pattern = "YAML\\s*\\(\\s*typ\\s*=\\s*[\"']unsafe[\"']\\s*\\)" file_pattern = "*.py" +exclude_pattern = "^\\s*#" [[rule]] -id = "ASCII1140" -description = "ASCII representation potentially exposing data." -severity = "Low" -confidence = "Low" -remediation = "Be careful when converting sensitive objects to ASCII." -ast_match = "Call(func.id=ascii)" +id = "ENV_URL001" +description = "Environment variable used as HTTP endpoint URL — SSRF if the env var is attacker-controlled in CI/container environments." +severity = "High" +confidence = "Medium" +remediation = "Validate env-var URLs against an allowlist of trusted domains before use. Never allow arbitrary HTTP endpoints via environment variables without scheme and host validation. Use a fixed default and only allow override to known-safe origins." file_pattern = "*.py" +# Pattern: read of an env var whose name contains URL, via os.environ.get("...") or os.environ["..."]. +# Common pattern: SEMGREP_URL, API_URL, BASE_URL, ENDPOINT_URL etc. +# The taint engine (SSRF_001) catches the downstream HTTP call when env-var URL propagates to requests/httpx. +pattern = "os\\.environ(?:\\.get\\s*\\(|\\s*\\[)\\s*[\"'][A-Z_]*URL[A-Z_]*[\"']" +exclude_pattern = "^\\s*#|allowlist|whitelist|validate|urlparse\\.scheme|startswith\\s*\\([\"']https" [[rule]] -id = "INPUT1143" -description = "User input function with potential injection risks." -severity = "Medium" -confidence = "Medium" -remediation = "Validate and sanitize all user input."
-ast_match = "Call(func.id=input)" +id = "COOKIE_FILE001" +description = "Environment variable used as cookie file path — cookie injection into HTTP sessions." +severity = "High" +confidence = "High" +remediation = "Never load a cookie jar from an env-var-specified path without validating the path is within an expected directory. Prefer in-memory session cookies over file-backed cookie jars for sensitive operations." file_pattern = "*.py" +# No pattern — triggered by taint engine (SK_COOKIE_JAR001): +# os.environ["SEMGREP_COOKIES_PATH"] → MozillaCookieJar(path) → cookies.load() +# Allows attacker-controlled cookies to be injected into all HTTP requests. [[rule]] -id = "PRINT1146" -description = "Print statement potentially exposing sensitive data." -severity = "Low" -confidence = "Low" -remediation = "Avoid printing sensitive information." -ast_match = "Call(func.id=print)" +id = "ENV_GIT_URL001" +description = "CI environment variable used to construct a git fetch URL — CI_JOB_TOKEN or credentials embedded in attacker-controlled URL." +severity = "High" +confidence = "High" +remediation = "Validate that CI_MERGE_REQUEST_PROJECT_URL and similar CI env vars match the expected repository host before embedding credentials. Use allowlist: only reconstruct URLs for the known project host." file_pattern = "*.py" +# Taint-driven via existing SSRF_001 and PY102 sinks: +# os.environ["CI_MERGE_REQUEST_PROJECT_URL"] → urlsplit() → _replace(netloc=token@host) → +# urlunsplit() → git_check_output(["git", "fetch", url]) — PY102 fires on tainted subprocess arg. +# This rule provides higher-confidence CI-specific context for the same finding. +pattern = "CI_MERGE_REQUEST_PROJECT_URL|CI_JOB_TOKEN.*git.*fetch|git.*fetch.*CI_" +exclude_pattern = "^\\s*#" [[rule]] -id = "OPEN1149" -description = "File open operation with potential path traversal." -severity = "Medium" -confidence = "Medium" -remediation = "Validate file paths and use appropriate file modes." 
-ast_match = "Call(func.id=open)" +id = "DESER_JOBLIB001" +description = "Insecure deserialization via joblib.load() — loads arbitrary Python objects → RCE." +severity = "Critical" +confidence = "High" +remediation = "Never load joblib files from untrusted sources. joblib uses pickle internally — any crafted .pkl/.joblib file can execute arbitrary code. Use JSON or another pickle-free format for data exchange." +pattern = "joblib\\.load\\s*\\(" file_pattern = "*.py" +exclude_pattern = "^\\s*#" [[rule]] -id = "FORMAT1152" -description = "Format function with potential format string attacks." -severity = "Medium" -confidence = "Low" -remediation = "Validate format strings and use safe formatting methods." -ast_match = "Call(func.id=format)" +id = "DESER_NUMPY001" +description = "numpy.load() with allow_pickle=True — arbitrary Python object deserialization → RCE." +severity = "Critical" +confidence = "High" +remediation = "Use allow_pickle=False (default in NumPy 1.17+). Only load .npy/.npz files from trusted sources when pickle is required. Use JSON or HDF5 for cross-origin data exchange." +pattern = "np\\.load\\s*\\(.*allow_pickle\\s*=\\s*True|numpy\\.load\\s*\\(.*allow_pickle\\s*=\\s*True" file_pattern = "*.py" +exclude_pattern = "^\\s*#" [[rule]] -id = "AI002" -description = "Hardcoded Anthropic (Claude) API key detected." -severity = "High" -remediation = "Remove hardcoded API keys and load them from environment variables or a secure secrets manager." -pattern = "(?i)sk-ant-api[0-9]*-[A-Za-z0-9_-]{20,}" -file_pattern = ".*\\.py" +id = "DESER_TORCH001" +description = "torch.load() uses pickle by default — loading untrusted PyTorch model files → RCE." +severity = "Critical" +confidence = "High" +remediation = "Use torch.load(..., weights_only=True) (PyTorch 2.0+) to restrict deserialization. Never load model files from untrusted sources. For model exchange, use ONNX or safetensors format."
+pattern = "torch\\.load\\s*\\(" +file_pattern = "*.py" +# weights_only=True is the safe version — exclude it +exclude_pattern = "^\\s*#|weights_only\\s*=\\s*True" diff --git a/src/pyspector/triage.py b/src/pyspector/triage.py index b50e1d1c..18111bd4 100644 --- a/src/pyspector/triage.py +++ b/src/pyspector/triage.py @@ -7,14 +7,13 @@ from textual.app import App, ComposeResult # type: ignore from textual.widgets import Header, Footer, DataTable, Static, Label # type: ignore from textual.containers import Vertical # type: ignore -from textual.binding import Binding # type: ignore # Helper to create a unique, stable fingerprint for an issue def create_fingerprint(issue: Dict[str, Any]) -> str: # Use rule ID, file path relative to a potential project root, and the line content # This makes the fingerprint stable across different checkout directories unique_string = f"{issue.get('rule_id', '')}|{issue.get('file_path', '')}|{issue.get('line_number', '')}|{issue.get('code', '').strip()}" - return hashlib.sha1(unique_string.encode('utf-8')).hexdigest() + return hashlib.sha256(unique_string.encode('utf-8')).hexdigest() class PySpectorTriage(App): """An interactive TUI for triaging PySpector findings.""" diff --git a/tests/unit/reporting_test.py b/tests/unit/reporting_test.py index aee2a796..1c703a86 100644 --- a/tests/unit/reporting_test.py +++ b/tests/unit/reporting_test.py @@ -45,7 +45,7 @@ def test_to_sarif(self): # Check top level SARIF fields self.assertEqual(output_json.get("version"), "2.1.0") - self.assertEqual(output_json.get("schema_uri"), "https://schemastore.azurewebsites.net/schemas/json/sarif-2.1.0-rtm.5.json") + self.assertEqual(output_json.get("schema_uri"), "https://raw.githubusercontent.com/oasis-tcs/sarif-spec/master/Schemata/sarif-schema-2.1.0.json") # Check runs self.assertIn("runs", output_json) @@ -54,7 +54,6 @@ def test_to_sarif(self): # Check unique single run run = output_json["runs"][0] - self.assertEqual(run["tool"]["driver"]["id"], "pyspector") 
self.assertEqual(run["tool"]["driver"]["name"], "PySpector") # Check run results diff --git a/tests/unit/test_a_sink_rules.py b/tests/unit/test_a_sink_rules.py new file mode 100644 index 00000000..c2aabc6d --- /dev/null +++ b/tests/unit/test_a_sink_rules.py @@ -0,0 +1,167 @@ +"""Tests for A_SINK rules — former taint-engine sinks that are now disabled; every test asserts the rule stays silent.""" + +import os, sys, tempfile, textwrap, warnings +from pathlib import Path +import pytest + + +def _wrap(code): + ind = "\n".join(" " + l for l in textwrap.dedent(code).splitlines()) + return f"def _view(request):\n{ind}\n" + + +def run(code, filename="app.py"): + from pyspector._rust_core import run_scan + from pyspector.config import get_default_rules + import ast as _ast, json as _json + from pyspector.cli import AstEncoder + wrapped = _wrap(code) + rules = get_default_rules() + with tempfile.TemporaryDirectory() as d: + p = os.path.join(d, filename) + Path(p).write_text(wrapped) + with warnings.catch_warnings(): + warnings.filterwarnings("ignore") + try: aj = _json.dumps(_ast.parse(wrapped), cls=AstEncoder) + except Exception: aj = "{}" + files = [{"file_path": filename, "content": wrapped, "ast_json": aj}] + return [{"rule_id": r.rule_id} for r in run_scan(d, rules, {"exclude": []}, files)] + + +def fires(code, rule_id): return [f for f in run(code) if f["rule_id"] == rule_id] +def not_fires(code, rule_id): return not fires(code, rule_id) + + +# --- HASATTR837 --- +class TestHasattr837: + def test_tainted_silent_disabled(self): + # HASATTR837 disabled: hasattr() returns bool — not a security sink, + # generates FPs on stdlib code that uses hasattr for duck-typing checks.
+ assert not_fires("attr=request.GET.get('f'); hasattr(obj,attr)", "HASATTR837") + def test_constant_safe(self): + assert not_fires("hasattr(obj,'is_active')", "HASATTR837") + +# --- VARS840 --- +class TestVars840: + def test_tainted_silent_disabled(self): + # VARS840 disabled: vars() returns __dict__ — information disclosure but + # low security impact; generates FPs in code using vars() for introspection. + assert not_fires("o=request.GET.get('obj'); vars(o)", "VARS840") + def test_constant_safe(self): + assert not_fires("vars(MyClass())", "VARS840") + +# --- DIR849 --- +class TestDir849: + def test_tainted_silent_disabled(self): + # DIR849 disabled: dir() lists attributes for introspection — not a security + # sink; generates FPs in code that uses dir() for reflection/debugging. + assert not_fires("o=request.GET.get('obj'); dir(o)", "DIR849") + def test_constant_safe(self): + assert not_fires("dir(str)", "DIR849") + +# --- CALLABLE1131 --- +class TestCallable1131: + def test_tainted_silent_disabled(self): + # CALLABLE1131 disabled: callable() checks if object is callable — + # not a security sink; generates FPs from deep inter-procedural taint. + assert not_fires("o=request.GET.get('fn'); callable(o)", "CALLABLE1131") + def test_constant_safe(self): + assert not_fires("callable(print)", "CALLABLE1131") + +# --- BYTES1005 --- +class TestBytes1005: + def test_tainted_silent_disabled(self): + # BYTES1005 disabled: bytes() encoding is not a security sink on its own. + assert not_fires("d=request.GET.get('data'); bytes(d,'utf-8')", "BYTES1005") + def test_constant_safe(self): + assert not_fires("bytes('hello','utf-8')", "BYTES1005") + +# --- BYTEARRAY1008 --- +class TestBytearray1008: + def test_tainted_silent_disabled(self): + # BYTEARRAY1008 disabled: bytearray() creates a mutable buffer — not a + # security sink; generates FPs in asyncio/networking code that buffers I/O. 
+ assert not_fires("d=request.GET.get('data'); bytearray(d,'utf-8')", "BYTEARRAY1008") + def test_constant_safe(self): + assert not_fires("bytearray(b'hello')", "BYTEARRAY1008") + +# --- MEMORYVIEW1011 --- +class TestMemoryview1011: + def test_tainted_silent_disabled(self): + # MEMORYVIEW1011 disabled: memory view creation is not a security sink. + assert not_fires("d=request.GET.get('data'); b=bytes(d,'utf-8'); memoryview(b)", "MEMORYVIEW1011") + def test_constant_safe(self): + assert not_fires("memoryview(b'hello')", "MEMORYVIEW1011") + +# --- ORD1014 --- +class TestOrd1014: + def test_tainted_silent_disabled(self): + # ORD1014 disabled: ord() returns the integer code point of a character — + # never a security sink; generates FPs in encoding/codec implementations. + assert not_fires("c=request.GET.get('char'); ord(c)", "ORD1014") + def test_constant_safe(self): + assert not_fires("ord('A')", "ORD1014") + +# --- CHR1017 --- +class TestChr1017: + def test_tainted_silent_disabled(self): + # CHR1017 disabled: chr() converts an integer to a character — + # never a security sink; generates FPs in encoding implementations. + assert not_fires("n=request.GET.get('n'); chr(n)", "CHR1017") + def test_constant_safe(self): + assert not_fires("chr(65)", "CHR1017") + +# --- CENTER927 / LJUST930 / RJUST933 --- +class TestJustification: + def test_center_silent_disabled(self): + # CENTER927 disabled: string centering is a cosmetic operation — not a sink. + assert not_fires("w=request.GET.get('w'); 'x'.center(w)", "CENTER927") + def test_center_constant_safe(self): + assert not_fires("'x'.center(80)", "CENTER927") + def test_ljust_silent_disabled(self): + # LJUST930 disabled: string left-justification is not a security sink. + assert not_fires("w=request.GET.get('w'); 'x'.ljust(w)", "LJUST930") + def test_rjust_silent_disabled(self): + # RJUST933 disabled: zero findings across all scanned repos. 
+ assert not_fires("w=request.GET.get('w'); 'x'.rjust(w)", "RJUST933") + +# --- RANGE1056 --- +class TestRange1056: + def test_tainted_silent_disabled(self): + # RANGE1056 disabled: range() iteration bound is not a security sink. + assert not_fires("n=request.GET.get('n'); range(n)", "RANGE1056") + def test_constant_safe(self): + assert not_fires("range(100)", "RANGE1056") + +# --- JOIN876 --- +class TestJoin876: + def test_tainted_parts_silent_disabled(self): + # JOIN876 disabled: .join() with tainted data generates FPs from deep + # inter-proc taint reaching error messages and SQL placeholder construction. + assert not_fires("parts=request.GET.getlist('p'); '/'.join(parts)", "JOIN876") + def test_constant_safe(self): + assert not_fires("'/'.join(['a','b','c'])", "JOIN876") + +# --- SORTED1074 --- +class TestSorted1074: + def test_tainted_silent_disabled(self): + # SORTED1074 disabled: sorting user data is not a security sink. + assert not_fires("data=request.GET.getlist('items'); sorted(data)", "SORTED1074") + def test_constant_safe(self): + assert not_fires("sorted([3,1,2])", "SORTED1074") + +# --- SUM1080 --- +class TestSum1080: + def test_tainted_silent_disabled(self): + # SUM1080 disabled: summing user data is not a security sink. + assert not_fires("vals=request.GET.getlist('v'); sum(vals)", "SUM1080") + def test_constant_safe(self): + assert not_fires("sum([1,2,3])", "SUM1080") + +# --- SET1047 --- +class TestSet1047: + def test_tainted_silent_disabled(self): + # SET1047 disabled: set() deduplication causes FPs from deep inter-proc taint. 
+ assert not_fires("items=request.GET.getlist('i'); set(items)", "SET1047") + def test_constant_safe(self): + assert not_fires("set([1,2,3])", "SET1047") diff --git a/tests/unit/test_false_positive_reductions.py b/tests/unit/test_false_positive_reductions.py index 94258b0a..7c631b34 100644 --- a/tests/unit/test_false_positive_reductions.py +++ b/tests/unit/test_false_positive_reductions.py @@ -359,14 +359,15 @@ def test_pickle_loads_still_flagged_py002(self): assert findings_for_rule(code, "PY002") != [], \ "PY002 must still fire for pickle.loads() — this is a TRUE POSITIVE" - def test_pickle_loads_still_flagged_py306(self): - """pickle.loads() MUST still be flagged — it's a true positive.""" + def test_pickle_loads_zlib_still_flagged_py002(self): + """pickle.loads() MUST still be flagged — it's a true positive. + PY306 was disabled (duplicate of PY002); PY002 is the canonical rule.""" code = """ import pickle return pickle.loads(zlib.decompress(f.read())) """ - assert findings_for_rule(code, "PY306") != [], \ - "PY306 must still fire for pickle.loads() — this is a TRUE POSITIVE" + assert findings_for_rule(code, "PY002") != [], \ + "PY002 must still fire for pickle.loads() — this is a TRUE POSITIVE" # =========================================================================== diff --git a/tests/unit/test_group_a_rules.py b/tests/unit/test_group_a_rules.py new file mode 100644 index 00000000..62933472 --- /dev/null +++ b/tests/unit/test_group_a_rules.py @@ -0,0 +1,267 @@ +""" +Tests for Group A taint-driven rules: SETATTR831, DELATTR834, FORMAT864, +FSTRING867, TRANSLATE912, REPLACE879, SER522, RAND810.
+ +Each test proves: + - True positive: tainted arg → rule fires + - True negative: constant arg → rule does NOT fire +""" + +import os +import sys +import tempfile +import textwrap +import warnings +from pathlib import Path + +import pytest + + +def _wrap(code: str) -> str: + indented = "\n".join(" " + l for l in textwrap.dedent(code).splitlines()) + return f"def _view(request):\n{indented}\n" + + +def run_pyspector(code: str, filename: str = "app.py") -> list[dict]: + from pyspector._rust_core import run_scan + from pyspector.config import get_default_rules + import ast as _ast, json as _json + from pyspector.cli import AstEncoder + + wrapped = _wrap(code) + rules_toml = get_default_rules() + + with tempfile.TemporaryDirectory() as tmpdir: + path = os.path.join(tmpdir, filename) + Path(path).write_text(wrapped) + with warnings.catch_warnings(): + warnings.filterwarnings("ignore") + try: + tree = _ast.parse(wrapped) + ast_json = _json.dumps(tree, cls=AstEncoder) + except Exception: + ast_json = "{}" + files = [{"file_path": filename, "content": wrapped, "ast_json": ast_json}] + results = run_scan(tmpdir, rules_toml, {"exclude": []}, files) + + return [{"rule_id": r.rule_id, "line_number": r.line_number} for r in results] + + +def fires(code, rule_id, **kw): + return [f for f in run_pyspector(code, **kw) if f["rule_id"] == rule_id] + + +# ============================================================ +# SETATTR831 — arbitrary attribute write via tainted name +# ============================================================ + +class TestSetattr831: + def test_tainted_attr_name_fires(self): + code = """ + attr = request.GET.get('field') + setattr(user, attr, 'value') + """ + assert fires(code, "SETATTR831"), "SETATTR831 must fire: tainted attr name to setattr" + + def test_subscript_source_fires(self): + code = """ + attr = request.POST['field'] + setattr(obj, attr, True) + """ + assert fires(code, "SETATTR831"), "SETATTR831 must fire with subscript source" + + def 
test_constant_attr_safe(self): + code = """ + setattr(obj, 'username', 'alice') + """ + assert not fires(code, "SETATTR831"), "SETATTR831 must NOT fire for constant attr name" + + +# ============================================================ +# DELATTR834 — arbitrary attribute deletion via tainted name +# ============================================================ + +class TestDelattr834: + def test_tainted_attr_name_fires(self): + code = """ + attr = request.GET.get('field') + delattr(obj, attr) + """ + assert fires(code, "DELATTR834"), "DELATTR834 must fire: tainted attr name to delattr" + + def test_constant_attr_safe(self): + code = """ + delattr(obj, 'cache') + """ + assert not fires(code, "DELATTR834"), "DELATTR834 must NOT fire for constant attr" + + +# ============================================================ +# FORMAT864 — tainted format string used as template +# ============================================================ + +class TestFormat864: + def test_tainted_receiver_fires(self): + """template = request.GET.get('t'); template.format(user=user)""" + code = """ + template = request.GET.get('template') + result = template.format(user=user_obj) + """ + assert fires(code, "FORMAT864"), "FORMAT864 must fire: tainted string used as .format() template" + + def test_tainted_via_subscript_fires(self): + code = """ + tmpl = request.GET['template'] + output = tmpl.format(name='Alice') + """ + assert fires(code, "FORMAT864"), "FORMAT864 must fire with subscript source" + + def test_constant_template_safe(self): + code = """ + result = 'Hello {name}!'.format(name=user.name) + """ + assert not fires(code, "FORMAT864"), "FORMAT864 must NOT fire for constant template" + + def test_tainted_arg_safe(self): + # FORMAT864 only fires when the TEMPLATE (receiver) is tainted. + # A safe hardcoded template with tainted ARGUMENTS is not SSTI. 
+ # FP case: msg = '{} is a symlink'; raise FileExistsError(msg.format(cfile)) + code = """ + msg = '{} is not a valid path' + raise ValueError(msg.format(request.GET.get('path'))) + """ + assert not fires(code, "FORMAT864"), "FORMAT864 must NOT fire when only the arg is tainted" + + +# ============================================================ +# FSTRING867 — tainted variable inside f-string +# ============================================================ + +class TestFstring867: + # FSTRING867 is disabled as a standalone sink — f-string taint propagates forward + # to downstream sinks (LOG741, PY101, PATH813, etc.) which report it more precisely. + # As a standalone sink it fires on every display/error string in large codebases. + def test_tainted_variable_silent_disabled(self): + code = """ + cmd = request.GET.get('cmd') + query = f'SELECT * FROM {cmd}' + """ + assert not fires(code, "FSTRING867"), "FSTRING867 disabled: downstream PY101 covers this" + + def test_constant_fstring_safe(self): + code = """ + name = 'Alice' + greeting = f'Hello {name}!' + """ + assert not fires(code, "FSTRING867"), "FSTRING867 must NOT fire for f-string with local constant" + + +# ============================================================ +# REPLACE879 — tainted replace arg used for filter bypass +# ============================================================ + +class TestReplace879: + def test_tainted_silent_disabled(self): + # REPLACE879 disabled: str.replace() is a pure data transformation. + # Also caused FPs from os.replace(), node.replace(), code.replace() — any + # method named 'replace' matched regardless of receiver type. 
+ code = """ + bad = request.GET.get('pattern') + result = sanitized.replace(bad, '') + """ + assert not fires(code, "REPLACE879"), "REPLACE879 disabled: str.replace() is not a security sink alone" + + def test_constant_replace_safe(self): + code = """ + result = user_name.replace('<', '<') + """ + assert not fires(code, "REPLACE879"), "REPLACE879 must NOT fire for constant search/replace" + + +# ============================================================ +# TRANSLATE912 — tainted translation table (sanitization bypass) +# ============================================================ + +class TestTranslate912: + def test_tainted_silent_disabled(self): + # TRANSLATE912 disabled: str.translate() is a character-mapping transformation. + # The downstream result needs to reach a dangerous sink to be exploitable. + code = """ + table_data = request.GET.get('table') + result = user_input.translate(table_data) + """ + assert not fires(code, "TRANSLATE912"), "TRANSLATE912 disabled: translate is not a security sink alone" + + def test_constant_table_safe(self): + code = """ + import str + result = text.translate(str.maketrans('abc', 'xyz')) + """ + assert not fires(code, "TRANSLATE912"), "TRANSLATE912 must NOT fire for constant table" + + +# ============================================================ +# RAND810 — tainted seed → predictable PRNG +# ============================================================ + +class TestRand810: + def test_tainted_seed_fires(self): + code = """ + import random + seed = request.GET.get('seed') + random.seed(seed) + """ + assert fires(code, "RAND810"), "RAND810 must fire: tainted seed to random.seed()" + + def test_constant_seed_safe(self): + code = """ + import random + random.seed(42) + """ + assert not fires(code, "RAND810"), "RAND810 must NOT fire for constant seed" + + +# ============================================================ +# SER522 — tainted object to serializer +# ============================================================ + 
+class TestSer522: + def test_tainted_object_fires(self): + code = """ + data = request.POST.get('data') + result = serialize('json', data) + """ + assert fires(code, "SER522"), "SER522 must fire: tainted object to serialize()" + + def test_constant_object_safe(self): + code = """ + result = serialize('json', MyModel.objects.all()) + """ + assert not fires(code, "SER522"), "SER522 must NOT fire for untainted queryset" + + +# ============================================================ +# Regression — existing rules still fire +# ============================================================ + +class TestRegression: + def test_getattr828_still_fires(self): + code = """ + attr = request.GET.get('field') + getattr(user, attr) + """ + assert fires(code, "GETATTR828"), "GETATTR828 regression" + + def test_py102_still_fires(self): + code = """ + cmd = request.get('command') + subprocess.run(cmd) + """ + assert fires(code, "PY102"), "PY102 regression" + + def test_open1149_still_fires(self): + code = """ + path = request.GET.get('file') + open(path) + """ + assert fires(code, "OPEN1149"), "OPEN1149 regression" diff --git a/tests/unit/test_missing_rules.py b/tests/unit/test_missing_rules.py new file mode 100644 index 00000000..191428eb --- /dev/null +++ b/tests/unit/test_missing_rules.py @@ -0,0 +1,453 @@ +""" +Tests for the 11 newly added security rules: +SSTI001, ORM001, ORM002, DESER725, DESER726,
+""" +import os +import sys +import tempfile +import textwrap +import warnings +from pathlib import Path + +sys.path.insert(0, str(Path(__file__).parent.parent.parent / "src")) + + +def _wrap(code: str) -> str: + indented = "\n".join(" " + l for l in textwrap.dedent(code).splitlines()) + return f"def _view(request):\n{indented}\n" + + +def run_pyspector(code: str, filename: str = "app.py") -> list[dict]: + from pyspector._rust_core import run_scan + from pyspector.config import get_default_rules + import ast as _ast, json as _json + from pyspector.cli import AstEncoder + + wrapped = _wrap(code) + rules_toml = get_default_rules() + + with tempfile.TemporaryDirectory() as tmpdir: + path = os.path.join(tmpdir, filename) + Path(path).write_text(wrapped) + with warnings.catch_warnings(): + warnings.filterwarnings("ignore") + try: + tree = _ast.parse(wrapped) + ast_json = _json.dumps(tree, cls=AstEncoder) + except Exception: + ast_json = "{}" + files = [{"file_path": filename, "content": wrapped, "ast_json": ast_json}] + results = run_scan(tmpdir, rules_toml, {"exclude": []}, files) + + return [{"rule_id": r.rule_id, "line_number": r.line_number} for r in results] + + +def fires(code, rule_id, **kw): + return bool([f for f in run_pyspector(code, **kw) if f["rule_id"] == rule_id]) + + +def not_fires(code, rule_id, **kw): + return not fires(code, rule_id, **kw) + + +# ============================================================ +# SSTI001 — Server-Side Template Injection +# ============================================================ + +class TestSSTI001: + def test_render_template_string_tainted_fires(self): + code = """ + tmpl = request.GET.get('template') + return render_template_string(tmpl) + """ + assert fires(code, "SSTI001"), "SSTI001 must fire: tainted string to render_template_string" + + def test_from_string_silent_removed(self): + # SK_SSTI002 (from_string sink) removed — from_string() is too generic. 
+ # It fired on TF's DeviceSpec.from_string(), any library with .from_string(). + # SSTI is still caught via render_template_string (SK_SSTI001) and + # the jinja2.Template pattern-based rule. + code = """ + src = request.POST.get('src') + result = env.from_string(src).render() + """ + assert not_fires(code, "SSTI001"), "SK_SSTI002 removed: from_string too generic" + + def test_static_template_safe(self): + code = """ + result = render_template_string('

Hello {{ name }}

', name=user) + """ + assert not_fires(code, "SSTI001"), "SSTI001 must NOT fire for static template literal" + + +# ============================================================ +# ORM001 — SQLAlchemy text() injection +# ============================================================ + +class TestORM001: + def test_fstring_in_text_fires(self): + code = """ + uid = request.GET.get('id') + result = session.execute(text(f"SELECT * FROM users WHERE id={uid}")) + """ + assert fires(code, "ORM001"), "ORM001 must fire: f-string inside text()" + + def test_percent_format_in_text_fires(self): + code = """ + result = session.execute(text("SELECT * FROM users WHERE name='%s'" % name)) + """ + assert fires(code, "ORM001"), "ORM001 must fire: %-format inside text()" + + def test_safe_parameterized_text_safe(self): + code = """ + result = session.execute(text("SELECT * FROM users WHERE id = :uid"), {"uid": uid}) + """ + assert not_fires(code, "ORM001"), "ORM001 must NOT fire for static text() with params" + + +# ============================================================ +# ORM002 — Django ORM injection (raw, order_by, extra) +# ============================================================ + +class TestORM002: + def test_raw_tainted_sql_fires(self): + code = """ + sql = request.GET.get('q') + users = User.objects.raw(sql) + """ + assert fires(code, "ORM002"), "ORM002 must fire: tainted SQL in raw()" + + def test_order_by_tainted_fires(self): + code = """ + sort = request.GET.get('sort') + qs = User.objects.order_by(sort) + """ + assert fires(code, "ORM002"), "ORM002 must fire: tainted field in order_by (CVE-2021-35042)" + + def test_order_by_literal_safe(self): + code = """ + qs = User.objects.order_by('username') + """ + assert not_fires(code, "ORM002"), "ORM002 must NOT fire for literal field name in order_by" + + +# ============================================================ +# DESER725 — jsonpickle deserialization +# ============================================================ 
+ +class TestDESER725: + def test_jsonpickle_decode_fires(self): + code = "import jsonpickle; obj = jsonpickle.decode(data)" + assert fires(code, "DESER725"), "DESER725 must fire: jsonpickle.decode" + + def test_comment_line_safe(self): + code = "# jsonpickle.decode(data)" + assert not_fires(code, "DESER725"), "DESER725 must NOT fire in comment" + + +# ============================================================ +# DESER726 — dill deserialization +# ============================================================ + +class TestDESER726: + def test_dill_loads_fires(self): + code = "import dill; obj = dill.loads(payload)" + assert fires(code, "DESER726"), "DESER726 must fire: dill.loads" + + def test_comment_line_safe(self): + code = "# dill.loads(data)" + assert not_fires(code, "DESER726"), "DESER726 must NOT fire in comment" + + +# ============================================================ +# TLS001 — TLS verification disabled +# ============================================================ + +class TestTLS001: + def test_verify_false_fires(self): + code = "resp = requests.get(url, verify=False)" + assert fires(code, "TLS001"), "TLS001 must fire: requests verify=False" + + def test_disable_warnings_fires(self): + code = "urllib3.disable_warnings(InsecureRequestWarning)" + assert fires(code, "TLS001"), "TLS001 must fire: disable_warnings InsecureRequestWarning" + + def test_verify_true_safe(self): + code = "resp = requests.get(url, verify=True)" + assert not_fires(code, "TLS001"), "TLS001 must NOT fire for verify=True" + + def test_verify_capath_safe(self): + code = "resp = requests.get(url, verify='/etc/ssl/certs/ca-bundle.crt')" + assert not_fires(code, "TLS001"), "TLS001 must NOT fire for verify=CA path" + + +# ============================================================ +# SSH001 — Paramiko MITM +# ============================================================ + +class TestSSH001: + def test_auto_add_policy_fires(self): + code = 
"client.set_missing_host_key_policy(paramiko.AutoAddPolicy())" + assert fires(code, "SSH001"), "SSH001 must fire: AutoAddPolicy()" + + def test_reject_policy_safe(self): + code = "client.set_missing_host_key_policy(paramiko.RejectPolicy())" + assert not_fires(code, "SSH001"), "SSH001 must NOT fire for RejectPolicy" + + +# ============================================================ +# JWT001 — JWT signature bypass +# ============================================================ + +class TestJWT001: + def test_verify_signature_false_fires(self): + code = 'payload = jwt.decode(token, options={"verify_signature": False})' + assert fires(code, "JWT001"), "JWT001 must fire: verify_signature=False" + + def test_algorithms_none_fires(self): + code = "payload = jwt.decode(token, algorithms=['none'])" + assert fires(code, "JWT001"), "JWT001 must fire: algorithms=['none']" + + def test_valid_decode_safe(self): + code = "payload = jwt.decode(token, secret, algorithms=['HS256'])" + assert not_fires(code, "JWT001"), "JWT001 must NOT fire for valid HS256 decode" + + +# ============================================================ +# ZIPSLIP001 — Archive extraction without path validation +# ============================================================ + +class TestZIPSLIP001: + def test_zipfile_extractall_fires(self): + code = "zf.extractall('/var/app/uploads/')" + assert fires(code, "ZIPSLIP001"), "ZIPSLIP001 must fire: zipfile extractall" + + def test_tarfile_extractall_fires(self): + code = "tf.extractall('/tmp/extract/')" + assert fires(code, "ZIPSLIP001"), "ZIPSLIP001 must fire: tarfile extractall" + + +# ============================================================ +# XXE001 — lxml XXE +# ============================================================ + +class TestXXE001: + def test_etree_parse_fires(self): + code = "from lxml import etree; tree = etree.parse(user_file)" + assert fires(code, "XXE001"), "XXE001 must fire: etree.parse without safe parser" + + def 
test_etree_fromstring_fires(self): + code = "from lxml import etree; root = etree.fromstring(xml_data)" + assert fires(code, "XXE001"), "XXE001 must fire: etree.fromstring" + + def test_defusedxml_safe(self): + code = "from defusedxml import etree; root = etree.fromstring(xml_data)" + assert not_fires(code, "XXE001"), "XXE001 must NOT fire when defusedxml is used" + + def test_resolve_entities_false_safe(self): + code = "p = etree.XMLParser(resolve_entities=False); tree = etree.parse(f, p)" + assert not_fires(code, "XXE001"), "XXE001 must NOT fire when resolve_entities=False" + + +# ============================================================ +# FLASK001 — Flask debug mode +# ============================================================ + +class TestFLASK001: + def test_app_run_debug_fires(self): + code = "app.run(host='0.0.0.0', debug=True)" + assert fires(code, "FLASK001"), "FLASK001 must fire: app.run(debug=True)" + + def test_app_debug_assignment_fires(self): + code = "app.debug = True" + assert fires(code, "FLASK001"), "FLASK001 must fire: app.debug = True" + + def test_debug_false_safe(self): + code = "app.run(host='0.0.0.0', debug=False)" + assert not_fires(code, "FLASK001"), "FLASK001 must NOT fire for debug=False" + + +# ============================================================ +# FILE_WRITE001 — writing user content to files +# ============================================================ + +class TestFILE_WRITE001: + # FILE_WRITE001 taint sink (SK_FILE_WRITE001) removed — write() is too generic. + # It fired on HTTP response writes (response.write()), cache writes, and all + # framework file operations generating massive FPs (74 in CPython, 24 in Django). + # Rule remains for documentation; the finding in PyGoat is still detected via + # the PLAIN_PWD001, FILE_WRITE001 pattern, and broader path traversal rules. 
+ def test_tainted_write_silent_disabled(self): + code = """ + code = request.POST.get('code') + f = open('/tmp/plugin.py', 'w') + f.write(code) + """ + assert not_fires(code, "FILE_WRITE001"), "FILE_WRITE001 taint sink disabled: write() too generic" + + def test_constant_write_safe(self): + code = """ + f = open('/tmp/output.py', 'w') + f.write('print("hello")') + """ + assert not_fires(code, "FILE_WRITE001"), "FILE_WRITE001 must NOT fire for constant content" + + +# ============================================================ +# OPEN_REDIRECT001 — unvalidated redirect URL +# ============================================================ + +class TestOPENREDIRECT001: + def test_flask_redirect_fires(self): + code = """ + next_url = request.GET.get('next') + return redirect(next_url) + """ + assert fires(code, "OPEN_REDIRECT001"), "OPEN_REDIRECT001 must fire: user-controlled redirect URL" + + def test_django_redirect_fires(self): + code = """ + url = request.GET.get('url') + return HttpResponseRedirect(url) + """ + assert fires(code, "OPEN_REDIRECT001"), "OPEN_REDIRECT001 must fire: HttpResponseRedirect with user URL" + + def test_hardcoded_redirect_safe(self): + code = """ + return redirect('/dashboard/') + """ + assert not_fires(code, "OPEN_REDIRECT001"), "OPEN_REDIRECT001 must NOT fire for hardcoded redirect" + + +# ============================================================ +# PLAIN_PWD001 — plaintext password in Django ORM create() +# ============================================================ + +class TestPLAINPWD001: + def test_create_with_tainted_password_fires(self): + code = """ + pwd = request.POST.get('password') + User.objects.create(username='alice', password=pwd) + """ + assert fires(code, "PLAIN_PWD001"), "PLAIN_PWD001 must fire: tainted password in ORM create()" + + def test_hashed_password_safe(self): + code = """ + from django.contrib.auth.hashers import make_password + User.objects.create(username='alice', password=make_password(raw_pwd)) + """ + 
assert not_fires(code, "PLAIN_PWD001"), "PLAIN_PWD001 must NOT fire when password is hashed" + + +# ============================================================ +# DJANGO_DEBUG001 — DEBUG=True in settings +# ============================================================ + +class TestDJANGO_DEBUG001: + def test_debug_true_fires(self): + code = "DEBUG = True" + assert fires(code, "DJANGO_DEBUG001"), "DJANGO_DEBUG001 must fire: DEBUG=True" + + def test_debug_false_safe(self): + code = "DEBUG = False" + assert not_fires(code, "DJANGO_DEBUG001"), "DJANGO_DEBUG001 must NOT fire for DEBUG=False" + + def test_debug_env_var_safe(self): + code = "DEBUG = os.environ.get('DEBUG', 'False') == 'True'" + assert not_fires(code, "DJANGO_DEBUG001"), "DJANGO_DEBUG001 must NOT fire for env var pattern" + + +# ============================================================ +# PATH813 via os.path.join (new taint propagation) +# ============================================================ + +class TestOSPathJoinPropagation: + def test_path_join_propagates_to_open(self): + code = """ + blog = request.POST.get('blog') + filename = os.path.join('/app/blogs', blog) + f = open(filename, 'r') + """ + assert fires(code, "OPEN1149"), "os.path.join must propagate taint to open() → OPEN1149" + + def test_imagmath_eval_via_sink(self): + code = """ + from PIL import ImageMath, Image + func = request.POST.get('function') + img = Image.open('test.png') + output = ImageMath.eval(func, img=img) + """ + assert fires(code, "PY001"), "ImageMath.eval() must fire PY001 via SK_IMG_EVAL001 taint sink" + + +# ============================================================ +# file_content_exclude — PY302/PY107 ruamel false positive fix +# ============================================================ + +class TestFileContentExclude: + def test_pyyaml_unsafe_fires(self): + # Plain PyYAML import with unsafe load — must fire + code = "import yaml\nyaml.load(data)" + assert fires(code, "PY302"), "PY302 must fire for PyYAML 
yaml.load() without Loader" + + def test_ruamel_yaml_suppressed(self, tmp_path): + # ruamel.yaml with YAML() round-trip is safe — must NOT fire + # file_content_exclude = "from ruamel.yaml|import ruamel" suppresses it + from pyspector._rust_core import run_scan + from pyspector.config import get_default_rules + import ast as _ast, json as _json, os, warnings + from pyspector.cli import AstEncoder + + code = "from ruamel.yaml import YAML\nyaml = YAML()\nyaml.load(stream)" + filename = str(tmp_path / "settings.py") + with open(filename, "w") as f: + f.write(code) + rules_toml = get_default_rules() + tree = _ast.parse(code, filename=filename) + ast_json = _json.dumps(tree, cls=AstEncoder) # encode the AST node itself; ast.dump() returns a plain repr string + files = [{"file_path": filename, "content": code, "ast_json": ast_json}] + with warnings.catch_warnings(): + warnings.filterwarnings("ignore") + results = run_scan(str(tmp_path), rules_toml, {"exclude": []}, files) + py302 = [r for r in results if r.rule_id in ("PY302", "PY107")] + assert len(py302) == 0, f"PY302/PY107 must NOT fire for ruamel YAML() round-trip, got: {py302}" + + +# ============================================================ +# CLI vs HTTP taint distinction (OperatorConfig vs HttpRequest) +# ============================================================ + +class TestCLIvsHTTPTaint: + def test_http_path_fires_PATH813(self): + # @app.route path param → HttpRequest → PATH813 + code = """ + path = request.GET.get('path') + from pathlib import Path + Path(path).mkdir(parents=True, exist_ok=True) + """ + assert fires(code, "PATH813"), "HTTP path traversal must fire PATH813" + + def test_cli_path_no_PATH813(self): + # @app.command path param → OperatorConfig → no PATH813 + code = """ + @app.command() + def run(output): + from pathlib import Path + Path(output).mkdir(parents=True, exist_ok=True) + """ + assert not_fires(code, "PATH813"), \ + "CLI operator path must NOT fire PATH813 — operator chose the path" + + def test_json_load_supply_chain_fires(self): + # 
json.load is a FILE_DESERIALIZER: always produces HttpRequest taint + # regardless of how the file path was obtained. Supply-chain detection + # is preserved even when the operator chose the file path. + code = """ + import json + config_path = request.POST.get("config") + data = json.load(open(config_path)) + f = open(data, "w") + """ + assert fires(code, "OPEN1149"), \ + "json.load FILE_DESERIALIZER must propagate HttpRequest to open() sink" diff --git a/tests/unit/test_semantic_provenance.py b/tests/unit/test_semantic_provenance.py new file mode 100644 index 00000000..dfd2bd9e --- /dev/null +++ b/tests/unit/test_semantic_provenance.py @@ -0,0 +1,180 @@ +""" +Tier 1 + Tier 2 semantic provenance tests. +Universal Python semantics — no framework-specific knowledge required. +""" +import os, sys, tempfile, warnings +from pathlib import Path +import pytest + + +def run(code, filename="app.py"): + import ast as _ast, json as _json + from pyspector._rust_core import run_scan + from pyspector.config import get_default_rules + from pyspector.cli import AstEncoder + rules = get_default_rules() + with tempfile.TemporaryDirectory() as d: + p = os.path.join(d, filename) + Path(p).write_text(code) + with warnings.catch_warnings(): + warnings.filterwarnings("ignore") + try: aj = _json.dumps(_ast.parse(code), cls=AstEncoder) + except Exception: aj = "{}" # unparsable snippet: fall back to an empty JSON object, but let KeyboardInterrupt/SystemExit propagate + files = [{"file_path": filename, "content": code, "ast_json": aj}] + return [{"rule_id": r.rule_id} for r in run_scan(d, rules, {"exclude": []}, files)] + + +def fires(code, rule_id, filename="app.py"): + return [f for f in run(code, filename) if f["rule_id"] == rule_id] + + +def _wrap(code): + import textwrap + ind = "\n".join(" " + l for l in textwrap.dedent(code).strip().splitlines()) + return f"def view(request):\n{ind}\n" + + +def taint_fires(code, rule_id): + """Use taint engine — wraps code in a function for CFG analysis.""" + wrapped = _wrap(code) + return fires(wrapped, rule_id) + + +# ─── Tier 1: Structural Python rules 
──────────────────────────────────────── + +class TestTier1StructuralRules: + + def test_admin795_class_declaration_not_flagged(self): + """ + 'class AdminPasswordChangeForm' is a Python class declaration. + Python syntax: class keyword → DeveloperDefined name context. + Universal — applies to any codebase, not just Django. + """ + code = "class AdminPasswordChangeForm(BaseForm):\n pass\n" + assert not fires(code, "ADMIN795"), \ + "ADMIN795 must not fire on class declarations" + + def test_admin795_fires_on_actual_inline_credential(self): + """Lowercase variable with password=password pattern still fires.""" + # Pattern requires: admin/administrator + password + password (twice) + code = 'admin_default_password = "password_admin"\n' + assert fires(code, "ADMIN795", filename="config.py"), \ + "ADMIN795 must still fire when pattern has two 'password' occurrences" + + def test_g101_uppercase_constant_not_flagged(self): + """ + INTERNAL_RESET_SESSION_TOKEN = "_password_reset_token" is a module constant. + Python: UPPER_CASE = "literal" → DeveloperDefined provenance. + Universal — any Python module constant. + """ + code = 'INTERNAL_RESET_SESSION_TOKEN = "_password_reset_token"\n' + assert not fires(code, "G101"), \ + "G101 must not fire on UPPER_CASE module constants" + + def test_g101_fires_on_lowercase_secret(self): + """Lowercase secret variable must still fire.""" + code = 'api_secret = "mysecretkey123"\n' + assert fires(code, "G101", filename="config.py"), \ + "G101 must fire on lowercase secret variable assignments" + + def test_symlink816_hardcoded_path_not_flagged(self): + """ + SYMLINK816 is now taint-driven only — no pattern. + os.symlink() with non-tainted arguments must not fire. 
+ """ + code = "os.symlink(original_path, symlink_path)\n" + assert not fires(code, "SYMLINK816", filename="utils.py"), \ + "SYMLINK816 must not fire on os.symlink with non-tainted (non-HttpRequest) args" + + def test_symlink816_fires_on_user_controlled_path(self): + """Symlink with HttpRequest-tainted source must fire via taint engine.""" + code = _wrap("src = request.GET.get('path')\nos.symlink(src, '/tmp/dst')") + assert fires(code, "SYMLINK816"), \ + "SYMLINK816 must fire when symlink source is HttpRequest-tainted" + + +# ─── Tier 2: Provenance tracking ──────────────────────────────────────────── + +class TestTier2ProvenanceTracking: + + def test_http_request_to_getattr_fires(self): + """HttpRequest provenance → getattr sink → fires.""" + assert taint_fires( + "attr = request.GET.get('field')\ngetattr(obj, attr)", + "GETATTR828" + ), "HttpRequest provenance must trigger GETATTR828" + + def test_http_request_to_open_fires(self): + """HttpRequest provenance → open() sink → fires.""" + assert taint_fires( + "path = request.GET.get('file')\nopen(path)", + "OPEN1149" + ), "HttpRequest provenance must trigger OPEN1149" + + def test_system_generated_to_open_silent(self): + """SystemGenerated (tempfile.mkstemp) → open() → silent.""" + assert not taint_fires( + "import tempfile\npath = tempfile.mkstemp()[1]\nopen(path)", + "OPEN1149" + ), "SystemGenerated paths must not trigger OPEN1149" + + def test_developer_defined_literal_to_sql_silent(self): + """DeveloperDefined string literal → SQL → silent (no injection risk).""" + assert not taint_fires( + 'table_name = "my_table"\nsql = "SELECT * FROM %s" % table_name\ncursor.execute(sql)', + "PY101" + ), "DeveloperDefined literals must not trigger SQL injection" + + def test_http_binop_to_sql_fires(self): + """HttpRequest → BinOp % formatting → SQL sink → fires.""" + assert taint_fires( + "table = request.GET.get('t')\nsql = 'SELECT * FROM %s' % table\ncursor.execute(sql)", + "PY101" + ), "HttpRequest through BinOp % must 
trigger PY101" + + def test_sanitizer_clears_http_taint(self): + """quote_name sanitizer clears HttpRequest taint → SQL sink silent.""" + assert not taint_fires( + "raw = request.GET.get('t')\ntable = quote_name(raw)\nsql = 'SELECT * FROM %s' % table\ncursor.execute(sql)", + "PY101" + ), "quote_name sanitizer must clear taint before SQL sink" + + def test_http_to_setattr_fires(self): + """HttpRequest → setattr attribute name → fires.""" + assert taint_fires( + "attr = request.GET.get('field')\nsetattr(obj, attr, val)", + "SETATTR831" + ), "HttpRequest attribute name to setattr must fire" + + def test_http_fstring_silent_disabled(self): + """FSTRING867 disabled — taint propagates to downstream sinks (PY101, LOG741, etc.).""" + assert not taint_fires( + "cmd = request.GET.get('cmd')\nquery = f'SELECT {cmd}'", + "FSTRING867" + ), "FSTRING867 disabled: downstream rules cover f-string injection contexts" + + def test_developer_defined_fstring_silent(self): + """DeveloperDefined literal in f-string → silent.""" + assert not taint_fires( + "name = 'Alice'\ngreeting = f'Hello {name}!'", + "FSTRING867" + ), "DeveloperDefined literal in f-string must be silent" + + +# ─── Tier 3: Constant folding (DeveloperDefined propagation) ───────────────── + +class TestTier3ConstantFolding: + + def test_constant_literal_assignment_is_developer_defined(self): + """String literal assignment → DeveloperDefined → does not reach SQL sink.""" + assert not taint_fires( + 'query = "SELECT * FROM users"\ncursor.execute(query)', + "PY101" + ), "String literal assignment must be DeveloperDefined — no SQL injection" + + def test_constant_plus_http_in_binop_is_http(self): + """Constant + HttpRequest in BinOp → result is HttpRequest (unsafe).""" + assert taint_fires( + "user_id = request.GET.get('id')\nsql = 'SELECT * FROM users WHERE id=' + user_id\ncursor.execute(sql)", + "PY101" + ), "BinOp with HttpRequest operand must propagate HttpRequest taint" diff --git 
a/tests/unit/test_taint_engine_extension.py b/tests/unit/test_taint_engine_extension.py new file mode 100644 index 00000000..5ee1934e --- /dev/null +++ b/tests/unit/test_taint_engine_extension.py @@ -0,0 +1,281 @@ +""" +Tests for the extended taint engine: new sources (subscript, HTTP params), +new sinks (getattr, open), and keyword-argument sink detection. + +Each test proves a specific taint flow that was NOT detectable before. +""" + +import os +import sys +import tempfile +import textwrap +import warnings +from pathlib import Path + +import pytest + + +def _wrap_in_function(code: str) -> str: + """Wrap code in a function so the taint engine's CFG builder processes it.""" + indented = "\n".join(" " + line for line in textwrap.dedent(code).splitlines()) + return f"def _test_view(request):\n{indented}\n" + + +def run_pyspector(code: str, *, filename: str = "app.py") -> list[dict]: + from pyspector._rust_core import run_scan + from pyspector.config import get_default_rules + + rules_toml = get_default_rules() + + with tempfile.TemporaryDirectory() as tmpdir: + file_path = os.path.join(tmpdir, filename) + Path(file_path).write_text(_wrap_in_function(code)) + + import ast as _ast, json as _json + from pyspector.cli import AstEncoder + + with warnings.catch_warnings(): + warnings.filterwarnings("ignore") + try: + tree = _ast.parse(Path(file_path).read_text()) + ast_json = _json.dumps(tree, cls=AstEncoder) + except Exception: + ast_json = "{}" + + python_files = [{ + "file_path": filename, + "content": Path(file_path).read_text(), + "ast_json": ast_json, + }] + + results = run_scan(tmpdir, rules_toml, {"exclude": []}, python_files) + + return [{"rule_id": r.rule_id, "file_path": r.file_path, + "line_number": r.line_number, "code": r.code} + for r in results] + + +def findings_for(code, rule_id, **kw): + return [f for f in run_pyspector(code, **kw) if f["rule_id"] == rule_id] + + +# =========================================================================== +# 
GETATTR828 — taint-driven, only fires when attribute name is user-controlled +# =========================================================================== + +class TestGetattr828: + + def test_tainted_attr_via_request_get(self): + """request.get() → attr → getattr(obj, attr) must fire.""" + code = """ + attr = request.get('field') + value = getattr(user, attr) + """ + assert findings_for(code, "GETATTR828"), \ + "GETATTR828 must fire: tainted attr flows to getattr() second argument" + + def test_tainted_attr_via_django_GET(self): + """request.GET.get() → attr → getattr() must fire (Phase 1 new source).""" + code = """ + attr = request.GET.get('field') + value = getattr(user, attr) + """ + assert findings_for(code, "GETATTR828"), \ + "GETATTR828 must fire with Django request.GET.get() as source" + + def test_tainted_attr_via_django_POST(self): + """request.POST.get() as source.""" + code = """ + field_name = request.POST.get('attr') + result = getattr(model_instance, field_name) + """ + assert findings_for(code, "GETATTR828"), \ + "GETATTR828 must fire with request.POST.get() as source" + + def test_tainted_attr_via_flask_args(self): + """Flask request.args.get() as source.""" + code = """ + attr = request.args.get('property') + val = getattr(obj, attr) + """ + assert findings_for(code, "GETATTR828"), \ + "GETATTR828 must fire with Flask request.args.get() as source" + + def test_tainted_attr_via_subscript_django(self): + """Phase 2: request.GET['key'] subscript as source.""" + code = """ + attr = request.GET['field'] + value = getattr(user, attr) + """ + assert findings_for(code, "GETATTR828"), \ + "GETATTR828 must fire when attr comes from request.GET['key'] subscript" + + def test_tainted_attr_via_subscript_flask(self): + """Phase 2: request.args subscript as source.""" + code = """ + attr = request.args['property'] + val = getattr(obj, attr) + """ + assert findings_for(code, "GETATTR828"), \ + "GETATTR828 must fire when attr comes from request.args['key'] 
subscript" + + def test_tainted_attr_propagation_through_variable(self): + """Taint must propagate through intermediate variables.""" + code = """ + raw = request.GET.get('field') + cleaned = raw.strip() + value = getattr(user, cleaned) + """ + # cleaned inherits taint from raw (conservative propagation) + assert findings_for(code, "GETATTR828"), \ + "GETATTR828 must fire even when tainted value passes through intermediate variable" + + # --- True negatives: must NOT fire --- + + def test_constant_attr_not_flagged(self): + """Hardcoded string attribute name is safe.""" + code = """ + value = getattr(obj, 'username') + """ + assert not findings_for(code, "GETATTR828"), \ + "GETATTR828 must NOT fire for constant attribute names" + + def test_local_variable_attr_not_flagged(self): + """Local variable not derived from request is safe.""" + code = """ + field = 'email' + value = getattr(user, field) + """ + assert not findings_for(code, "GETATTR828"), \ + "GETATTR828 must NOT fire when attr is a local constant string" + + +# =========================================================================== +# OPEN1149 — taint-driven, only fires when path is user-controlled +# =========================================================================== + +class TestOpen1149: + + def test_tainted_path_via_request_get(self): + """request.get() → path → open(path) must fire.""" + code = """ + filename = request.get('file') + with open(filename) as f: + data = f.read() + """ + assert findings_for(code, "OPEN1149"), \ + "OPEN1149 must fire when file path comes from request" + + def test_tainted_path_via_django_GET_subscript(self): + """Phase 2: request.GET['file'] subscript → open().""" + code = """ + path = request.GET['filename'] + with open(path, 'r') as f: + content = f.read() + """ + assert findings_for(code, "OPEN1149"), \ + "OPEN1149 must fire when path comes from request.GET subscript" + + def test_tainted_path_via_flask_form(self): + """Flask request.form.get() → open().""" 
+ code = """ + upload_path = request.form.get('destination') + with open(upload_path, 'wb') as f: + f.write(data) + """ + assert findings_for(code, "OPEN1149"), \ + "OPEN1149 must fire when write path comes from form input" + + # --- True negatives --- + + def test_hardcoded_path_not_flagged(self): + """Hardcoded file path is safe.""" + code = """ + with open('config.toml', 'r') as f: + config = f.read() + """ + assert not findings_for(code, "OPEN1149"), \ + "OPEN1149 must NOT fire for hardcoded file paths" + + def test_local_path_not_flagged(self): + """Path derived from local constants is safe.""" + code = """ + base = '/var/data' + filename = 'output.txt' + path = base + '/' + filename + with open(path) as f: + pass + """ + assert not findings_for(code, "OPEN1149"), \ + "OPEN1149 must NOT fire when path is constructed from local constants" + + +# =========================================================================== +# Phase 3: keyword argument sink detection +# =========================================================================== + +class TestKeywordArgSinks: + + def test_getattr_with_keyword_name_arg(self): + """Phase 3: tainted attr passed positionally to getattr(obj, attr) must fire.""" + code = """ + attr = request.GET.get('field') + value = getattr(user, attr) + """ + # NOTE(review): only the positional form is exercised here; no keyword-argument call appears in the snippet + assert findings_for(code, "GETATTR828"), \ + "GETATTR828 must fire for positional getattr(obj, tainted)" + + +# =========================================================================== +# New taint sources: input(), os.environ.get() +# =========================================================================== + +class TestNewTaintSources: + + def test_input_to_getattr(self): + """input() → attr → getattr() must fire (TS006 source).""" + code = """ + attr = input('Enter attribute: ') + value = getattr(obj, attr) + """ + assert findings_for(code, "GETATTR828"), \ + "GETATTR828 must fire when attr comes from input()" + + def 
test_environ_to_open_no_finding(self): + """os.environ.get() is now OperatorConfig — opening a path the operator + set via environment variable is intentional, not a vulnerability.""" + code = """ + import os + path = os.environ.get('CONFIG_PATH') + with open(path) as f: + data = f.read() + """ + assert not findings_for(code, "OPEN1149"), \ + "OPEN1149 must NOT fire when path comes from os.environ.get() (operator-trusted)" + + def test_http_request_to_open_still_fires(self): + """HTTP request parameter → open() must still fire (attacker-controlled).""" + code = """ + path = request.GET.get('file') + with open(path) as f: + data = f.read() + """ + assert findings_for(code, "OPEN1149"), \ + "OPEN1149 must still fire when path comes from HTTP request" + + +# =========================================================================== +# Regression: existing PY102 (subprocess) still works +# =========================================================================== + +class TestRegressionPY102: + + def test_subprocess_taint_still_fires(self): + """PY102 taint flow must still work after engine changes.""" + code = """ + cmd = request.get('command') + subprocess.run(cmd) + """ + assert findings_for(code, "PY102"), \ + "PY102 regression: subprocess.run with tainted arg must still fire"