scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (28546B)
      1 {
      2   "paper": {
      3     "title": "STELP: Secure Transpilation and Execution of LLM-Generated Programs",
      4     "authors": [
      5       "Swapnil Shinde",
      6       "Sahil Wadhwa",
      7       "Andy Luo",
      8       "Akshay Gupta",
      9       "Mohammad Shahed Sorower"
     10     ],
     11     "year": 2026,
     12     "venue": "arXiv",
     13     "arxiv_id": "2601.05467",
     14     "doi": "10.48550/arXiv.2601.05467"
     15   },
     16   "scan_version": 2,
     17   "active_modules": ["experimental_rigor", "data_leakage"],
     18   "methodology_tags": ["benchmark-eval"],
     19   "key_findings": "STELP is a transpiler-inspired engine that intercepts LLM-generated Python code, validates its AST against a configurable safe grammar subset, and executes a secured version with runtime controls. On the authors' InjectedHumanEval benchmark (634 samples, 12 CWE types), STELP achieves a perfect True Block Rate (1.0) and a True Allow Rate of 0.981, well above Meta's CodeShield (TBR 0.68, TAR 0.93), although no statistical significance test is reported. STELP adds a median latency overhead of 0.19ms and achieves 100% correctness on a filtered subset of the Python-Code-Execution-Output dataset. A feedback loop using Llama 3.3-70B repaired 90.2% of blocked code samples in fewer than 2 retries.",
     20   "checklist": {
     21     "artifacts": {
     22       "code_released": {
     23         "applies": true,
     24         "answer": false,
     25         "justification": "No STELP source code is released. The paper provides no repository URL for the engine itself. Only the InjectedHumanEval dataset is linked (tinyurl.com/24aebhmr)."
     26       },
     27       "data_released": {
     28         "applies": true,
     29         "answer": true,
     30         "justification": "The InjectedHumanEval dataset is released at https://tinyurl.com/24aebhmr. The Python-Code-Execution-Output dataset is publicly available on HuggingFace (Diwank 2024)."
     31       },
     32       "environment_specified": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "The latency evaluation mentions '16GB of RAM, and a 10-core CPU' but no requirements.txt, Dockerfile, Python version, or library versions are provided."
     36       },
     37       "reproduction_instructions": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "No step-by-step reproduction instructions are provided. Without the source code or detailed setup guide, results cannot be replicated."
     41       }
     42     },
     43     "statistical_methodology": {
     44       "confidence_intervals_or_error_bars": {
     45         "applies": true,
     46         "answer": false,
     47         "justification": "The main safety results (TBR=1.0, TAR=0.981) are reported as point estimates with no confidence intervals. Table 2 reports descriptive statistics (mean, median, std dev, IQR) for latency but not CIs."
     48       },
     49       "significance_tests": {
     50         "applies": true,
     51         "answer": false,
     52         "justification": "The paper claims STELP 'significantly outperforms' CodeShield based on comparing raw numbers (TBR 1.0 vs 0.68, TAR 0.981 vs 0.93) without any statistical significance test."
     53       },
     54       "effect_sizes_reported": {
     55         "applies": true,
     56         "answer": true,
     57         "justification": "Absolute metrics with baseline context are provided: STELP TBR=1.0 vs CodeShield TBR=0.68, TAR=0.981 vs 0.93. The magnitude of improvement is clear from these paired comparisons."
     58       },
     59       "sample_size_justified": {
     60         "applies": true,
     61         "answer": false,
     62         "justification": "The paper gives no justification for its sample sizes: 634 in InjectedHumanEval, 361 for correctness, and 262 for latency. There is no power analysis or discussion of whether these sizes are adequate."
     63       },
     64       "variance_reported": {
     65         "applies": true,
     66         "answer": true,
     67         "justification": "Table 2 reports std dev (46.26ms) and IQR (0.17ms) for latency across 30 executions per sample. The safety evaluation is deterministic (rule-based AST analysis), so variance is not applicable there."
     68       }
     69     },
     70     "evaluation_design": {
     71       "baselines_included": {
     72         "applies": true,
     73         "answer": true,
     74         "justification": "STELP is compared against Meta's CodeShield on TBR and TAR metrics. Tables 4 and 5 provide feature and CWE coverage comparisons."
     75       },
     76       "baselines_contemporary": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "CodeShield (Meta PurpleLlama, 2024) is a recent and relevant baseline for static analysis of LLM-generated code."
     80       },
     81       "ablation_study": {
     82         "applies": true,
     83         "answer": false,
     84         "justification": "STELP has three main components (AST Processor, Safe Code Generator, Feedback Generator) but no ablation study isolates the contribution of each component."
     85       },
     86       "multiple_metrics": {
     87         "applies": true,
     88         "answer": true,
     89         "justification": "The evaluation uses TBR, TAR (safety), correctness rate, and multiple latency statistics (mean, median, per-statement type breakdown). Four distinct evaluation dimensions."
     90       },
     91       "human_evaluation": {
     92         "applies": true,
     93         "answer": false,
     94         "justification": "Human annotators validated the InjectedHumanEval dataset creation, but no human evaluation of STELP's outputs (e.g., whether blocking decisions are correct beyond binary labels, or whether transpiled code preserves semantics) was conducted."
     95       },
     96       "held_out_test_set": {
     97         "applies": true,
     98         "answer": false,
     99         "justification": "The STELP configuration was 'manually developed' as 'the minimum set of permissions that allows the benign InjectedHumanEval samples to run' — meaning the configuration was tuned directly on the test data with no separate dev/test split."
    100       },
    101       "per_category_breakdown": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "Table 1 breaks down unsafe samples by CWE type. Table 6 shows STELP exceptions by type. Table 3 provides per-statement latency. Table 5 compares CWE coverage per tool."
    105       },
    106       "failure_cases_discussed": {
    107         "applies": true,
    108         "answer": false,
    109         "justification": "The paper mentions 3 safe samples incorrectly blocked (TAR=0.981) and 8% of feedback loop samples exceeding 10 retries, but does not analyze specific failure cases or show examples of incorrect blocking."
    110       },
    111       "negative_results_reported": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "Several negative results are reported: TAR=0.981 (3 safe samples blocked), worst-case 520ms latency, 8% of feedback loop samples exceeded 10 retries, and the paper acknowledges STELP is currently Python-only."
    115       }
    116     },
    117     "claims_and_evidence": {
    118       "abstract_claims_supported": {
    119         "applies": true,
    120         "answer": true,
    121         "justification": "The abstract's claim that STELP 'outperforms an existing method by a significant margin' is supported by TBR 1.0 vs 0.68 and TAR 0.981 vs 0.93 on InjectedHumanEval. The 'human-validated dataset of 634 samples' is documented in the Evaluation section."
    122       },
    123       "causal_claims_justified": {
    124         "applies": true,
    125         "answer": false,
    126         "justification": "The paper claims 'STELP's dynamic validation of a safe grammar subset is a more robust approach than traditional static pattern matching' — a causal claim about why STELP outperforms CodeShield. No ablation or controlled experiment isolates the design choice (dynamic vs static) as the cause of improvement."
    127       },
    128       "generalization_bounded": {
    129         "applies": true,
    130         "answer": false,
    131         "justification": "The title says 'LLM-Generated Programs' (general) and abstract claims STELP can handle 'production systems,' but evaluation is Python-only on synthetic benchmarks (InjectedHumanEval, Python-Code-Execution-Output). No real production code or other languages tested."
    132       },
    133       "alternative_explanations_discussed": {
    134         "applies": true,
    135         "answer": false,
    136         "justification": "No alternative explanations for STELP's performance are discussed. For example, the possibility that InjectedHumanEval's synthetic injections are formulaic and easy to detect (vs. real adversarial code) is not considered."
    137       },
    138       "proxy_outcome_distinction": {
    139         "applies": true,
    140         "answer": false,
    141         "justification": "The paper measures TBR/TAR on a synthetic benchmark but claims STELP 'secures autonomous production AI systems.' The gap between benchmark safety (synthetic CWE injections) and actual production security is not acknowledged."
    142       }
    143     },
    144     "setup_transparency": {
    145       "model_versions_specified": {
    146         "applies": true,
    147         "answer": true,
    148         "justification": "The paper specifies 'Llama3.3 70B Instruct' for code injection and 'Llama 3-70B' / 'Llama 3.3-70B' for the feedback loop. These are specific model identifiers."
    149       },
    150       "prompts_provided": {
    151         "applies": true,
    152         "answer": true,
    153         "justification": "Figure 5 provides the full injection prompt template used for InjectedHumanEval creation. Figures 9 and 10 provide the feedback generation and code repair prompts. These are actual prompt text, not just descriptions."
    154       },
    155       "hyperparameters_reported": {
    156         "applies": true,
    157         "answer": false,
    158         "justification": "No temperature, top-p, max_tokens, or other sampling parameters are reported for any of the LLM calls (Llama3.3 for injection, Llama 3-70B for feedback)."
    159       },
    160       "scaffolding_described": {
    161         "applies": true,
    162         "answer": true,
    163         "justification": "STELP's architecture is described in detail: AST Processor (parser + validator), Safe Code Generator and Executor (recursive AST traversal with controls), Feedback Generator (error classification + LLM summarization). Figures 1-4 illustrate the flow with examples."
    164       },
    165       "data_preprocessing_documented": {
    166         "applies": true,
    167         "answer": false,
    168         "justification": "InjectedHumanEval creation is documented (LLM injection → human review → 634 samples). However, the correctness subset selection ('361 code samples compatible with our evaluation configuration') and latency subset (262 samples) have no explained filtering criteria."
    169       }
    170     },
    171     "limitations_and_scope": {
    172       "limitations_section_present": {
    173         "applies": true,
    174         "answer": false,
    175         "justification": "There is no dedicated limitations section. The 'Conclusion and Future Work' mentions Python-only scope and plans for Java/SQL, but does not substantively discuss limitations of the current evaluation or approach."
    176       },
    177       "threats_to_validity_specific": {
    178         "applies": true,
    179         "answer": false,
    180         "justification": "No threats to validity are discussed. The paper does not address whether synthetic CWE injections represent real-world threats, whether the configuration was overfit to the benchmark, or whether the results generalize beyond the tested setting."
    181       },
    182       "scope_boundaries_stated": {
    183         "applies": true,
    184         "answer": false,
    185         "justification": "The paper mentions Python-only scope and future language extensions but does not explicitly state what the results do NOT show (e.g., no adversarial robustness testing, no real production deployment, no multi-language support)."
    186       }
    187     },
    188     "data_integrity": {
    189       "raw_data_available": {
    190         "applies": true,
    191         "answer": true,
    192         "justification": "The InjectedHumanEval dataset is available at the provided tinyurl link. The Python-Code-Execution-Output dataset is public on HuggingFace."
    193       },
    194       "data_collection_described": {
    195         "applies": true,
    196         "answer": true,
    197         "justification": "InjectedHumanEval creation is described: HumanEval base → LLM injection with Llama3.3 70B using specific prompts (Figure 5) → human annotation review → final dataset of 164 safe + 470 unsafe samples across 12 CWE types."
    198       },
    199       "recruitment_methods_described": {
    200         "applies": true,
    201         "answer": false,
    202         "justification": "Human annotators validated the InjectedHumanEval dataset but are not described — no mention of how many annotators, their expertise, inter-annotator agreement, or selection criteria."
    203       },
    204       "data_pipeline_documented": {
    205         "applies": true,
    206         "answer": false,
    207         "justification": "The InjectedHumanEval pipeline is partially documented. However, the paper evaluates safety on all 634 InjectedHumanEval samples, correctness on 361 Python-Code-Execution-Output samples, and latency on 262 samples, with no explanation of the differing sample counts or of the filtering criteria."
    208       }
    209     },
    210     "conflicts_of_interest": {
    211       "funding_disclosed": {
    212         "applies": true,
    213         "answer": false,
    214         "justification": "No funding disclosure or acknowledgments section. All authors are affiliated with Capital One but no explicit funding statement is provided."
    215       },
    216       "affiliations_disclosed": {
    217         "applies": true,
    218         "answer": true,
    219         "justification": "All authors list Capital One as their affiliation, with a note that some work was 'performed while at Capital One.' The corporate affiliation is clearly stated."
    220       },
    221       "funder_independent_of_outcome": {
    222         "applies": true,
    223         "answer": false,
    224         "justification": "Capital One employees developed and evaluated STELP, presumably for Capital One's production use. The employer has a direct interest in the tool being effective, making the funder non-independent of the outcome."
    225       },
    226       "financial_interests_declared": {
    227         "applies": true,
    228         "answer": false,
    229         "justification": "No competing interests statement or financial disclosure is present in the paper."
    230       }
    231     },
    232     "contamination": {
    233       "training_cutoff_stated": {
    234         "applies": true,
    235         "answer": false,
    236         "justification": "The feedback loop evaluation uses Llama 3.3-70B on HumanEval-derived tasks. The training data cutoff for Llama 3.3 is not stated, despite HumanEval being published in 2021 and likely in the training data."
    237       },
    238       "train_test_overlap_discussed": {
    239         "applies": true,
    240         "answer": false,
    241         "justification": "InjectedHumanEval is derived from HumanEval (2021). Llama 3.3 was trained after 2021 and likely saw HumanEval solutions. The 90.2% repair rate may be inflated if the LLM already knows the solutions. This overlap is not discussed."
    242       },
    243       "benchmark_contamination_addressed": {
    244         "applies": true,
    245         "answer": false,
    246         "justification": "HumanEval has been publicly available since 2021. Using it as the base for InjectedHumanEval means any model trained after 2021 may have memorized solutions, but contamination risk is not addressed."
    247       }
    248     },
    249     "human_studies": {
    250       "pre_registered": {
    251         "applies": false,
    252         "answer": false,
    253         "justification": "No human participants in the study. Human annotators performed dataset quality control but were not research subjects."
    254       },
    255       "irb_or_ethics_approval": {
    256         "applies": false,
    257         "answer": false,
    258         "justification": "No human participants. The study evaluates a software tool on benchmarks."
    259       },
    260       "demographics_reported": {
    261         "applies": false,
    262         "answer": false,
    263         "justification": "No human participants in the study."
    264       },
    265       "inclusion_exclusion_criteria": {
    266         "applies": false,
    267         "answer": false,
    268         "justification": "No human participants in the study."
    269       },
    270       "randomization_described": {
    271         "applies": false,
    272         "answer": false,
    273         "justification": "No human participants in the study."
    274       },
    275       "blinding_described": {
    276         "applies": false,
    277         "answer": false,
    278         "justification": "No human participants in the study."
    279       },
    280       "attrition_reported": {
    281         "applies": false,
    282         "answer": false,
    283         "justification": "No human participants in the study."
    284       }
    285     },
    286     "cost_and_practicality": {
    287       "inference_cost_reported": {
    288         "applies": true,
    289         "answer": true,
    290         "justification": "Latency is reported in detail: median 0.19ms overhead, mean 4.93ms, per-statement breakdowns (Table 3), and worst case 520ms. The latency evaluation used 30 runs per sample on specified hardware."
    291       },
    292       "compute_budget_stated": {
    293         "applies": true,
    294         "answer": false,
    295         "justification": "Hardware is mentioned ('16GB of RAM, and a 10-core CPU') but total compute budget (e.g., total GPU/API hours for LLM calls in benchmark creation and feedback loop evaluation) is not stated."
    296       }
    297     },
    298     "experimental_rigor": {
    299       "seed_sensitivity_reported": {
    300         "applies": true,
    301         "answer": false,
    302         "justification": "No seed sensitivity analysis for the stochastic LLM components (Llama 3.3 for injection, feedback loop). The 30 latency runs are timing runs, not seed variation."
    303       },
    304       "number_of_runs_stated": {
    305         "applies": true,
    306         "answer": true,
    307         "justification": "The latency evaluation explicitly states '30 executions on both STELP and Python's native execution engine.' The feedback loop evaluation reports aggregate results without stating number of runs."
    308       },
    309       "hyperparameter_search_budget": {
    310         "applies": true,
    311         "answer": false,
    312         "justification": "The STELP configuration was 'manually developed' for the evaluation. No systematic search budget, number of configurations tried, or tuning methodology is described."
    313       },
    314       "best_config_selection_justified": {
    315         "applies": true,
    316         "answer": false,
    317         "justification": "The configuration was tuned on the test data itself: 'a single STELP configuration containing the minimum set of permissions that allows the benign InjectedHumanEval samples to run.' No validation set was used for selection."
    318       },
    319       "multiple_comparison_correction": {
    320         "applies": false,
    321         "answer": false,
    322         "justification": "No statistical significance tests are performed, so multiple comparison correction is not applicable."
    323       },
    324       "self_comparison_bias_addressed": {
    325         "applies": true,
    326         "answer": false,
    327         "justification": "The authors created both the tool (STELP) and the benchmark (InjectedHumanEval) and evaluated one against the other. No acknowledgment of author-evaluation bias or independent evaluation."
    328       },
    329       "compute_budget_vs_performance": {
    330         "applies": true,
    331         "answer": false,
    332         "justification": "Latency overhead is reported (Table 2) but performance is not plotted as a function of compute budget. No analysis of how changing STELP's configuration complexity affects the safety-latency tradeoff."
    333       },
    334       "benchmark_construct_validity": {
    335         "applies": true,
    336         "answer": false,
    337         "justification": "No discussion of whether InjectedHumanEval (synthetic CWE injections into simple coding tasks) actually represents the threat landscape of real LLM-generated code in production. The benchmark's construct validity is not questioned."
    338       },
    339       "scaffold_confound_addressed": {
    340         "applies": false,
    341         "answer": false,
    342         "justification": "STELP is the tool being evaluated, not a scaffold confound in model comparison. No model-vs-model comparison is made where scaffolding would confound results."
    343       }
    344     },
    345     "data_leakage": {
    346       "temporal_leakage_addressed": {
    347         "applies": true,
    348         "answer": false,
    349         "justification": "HumanEval (2021) was published before Llama 3.3's training. The 90.2% feedback loop repair rate may benefit from Llama 3.3 having seen HumanEval solutions. Temporal leakage is not discussed."
    350       },
    351       "feature_leakage_addressed": {
    352         "applies": true,
    353         "answer": false,
    354         "justification": "Not discussed. The feedback loop provides detailed error information from STELP, which could make repair trivially easy compared to real-world scenarios where such feedback is unavailable."
    355       },
    356       "non_independence_addressed": {
    357         "applies": true,
    358         "answer": false,
    359         "justification": "InjectedHumanEval is derived directly from HumanEval. The safe samples ARE HumanEval samples. Independence between training data and test data is not discussed."
    360       },
    361       "leakage_detection_method": {
    362         "applies": true,
    363         "answer": false,
    364         "justification": "No leakage detection or prevention method is applied. No canary strings, membership inference, or decontamination analysis."
    365       }
    366     }
    367   },
    368   "claims": [
    369     {
    370       "claim": "STELP achieves a perfect True Block Rate (1.0) for blocking unsafe code on InjectedHumanEval.",
    371       "evidence": "Evaluation section: 470 unsafe samples all blocked. Table 6 breaks down blocking by exception type (FunctionNotAllowedError: 311, WhileTrueError: 70, etc.).",
    372       "supported": "moderate"
    373     },
    374     {
    375       "claim": "STELP achieves a True Allow Rate of 0.981 for executing safe code on InjectedHumanEval.",
    376       "evidence": "Evaluation section: 161/164 safe samples executed correctly. Three safe samples were incorrectly blocked.",
    377       "supported": "moderate"
    378     },
    379     {
    380       "claim": "STELP significantly outperforms CodeShield on safety metrics (TBR 1.0 vs 0.68, TAR 0.981 vs 0.93).",
    381       "evidence": "Static Code Analysis Comparison section compares results on InjectedHumanEval. Tables 4 and 5 compare feature coverage.",
    382       "supported": "moderate"
    383     },
    384     {
    385       "claim": "STELP achieves 100% correctness on the Python-Code-Execution-Output test dataset.",
    386       "evidence": "Correctness section: tested on 361 samples 'compatible with our evaluation configuration,' all produced correct output matching native Python execution.",
    387       "supported": "weak"
    388     },
    389     {
    390       "claim": "STELP adds negligible latency overhead (median 0.19ms) compared to native Python execution.",
    391       "evidence": "Table 2: median execution time increase 0.19ms across 262 samples with 30 runs each. Table 3 provides per-statement breakdowns.",
    392       "supported": "strong"
    393     },
    394     {
    395       "claim": "90.2% of unsafe code samples were successfully repaired by the feedback loop in under 2 retries.",
    396       "evidence": "Feedback Loop section: Llama 3.3-70B used for code repair on InjectedHumanEval. 8% exceeded 10 retries, attributed to LLM limitations.",
    397       "supported": "weak"
    398     }
    399   ],
    400   "red_flags": [
    401     {
    402       "flag": "Configuration tuned on test data",
    403       "detail": "The STELP configuration used for safety evaluation was 'manually developed' as 'the minimum set of permissions that allows the benign InjectedHumanEval samples to run.' This means the configuration was directly fit to the test benchmark, creating circularity: the tool was configured to work on the exact data it was evaluated on, with no held-out validation."
    404     },
    405     {
    406       "flag": "Self-created benchmark evaluating own system",
    407       "detail": "The authors created both STELP and InjectedHumanEval, then evaluated one against the other. The synthetic CWE injections were designed by the same team that built the defense. No independent evaluation or red-teaming by external parties."
    408     },
    409     {
    410       "flag": "Company evaluating own product",
    411       "detail": "All authors are Capital One employees evaluating a Capital One tool. No independent replication or third-party evaluation. The corporate interest in STELP being effective is not acknowledged."
    412     },
    413     {
    414       "flag": "Unexplained subset filtering",
    415       "detail": "Each evaluation dimension uses a different sample set: all 634 InjectedHumanEval samples for safety, 361 'compatible' Python-Code-Execution-Output samples for correctness, and 262 samples for latency. The filtering criteria for the correctness and latency sets are not documented, raising questions about whether the excluded samples would expose weaknesses."
    416     },
    417     {
    418       "flag": "No adversarial robustness evaluation",
    419       "detail": "Unsafe samples were generated by LLM injection of known CWE patterns — a formulaic approach. No testing against adaptive adversaries who might craft code specifically to bypass STELP's AST analysis while remaining malicious."
    420     },
    421     {
    422       "flag": "Contamination risk in feedback evaluation",
    423       "detail": "The 90.2% feedback repair rate uses Llama 3.3-70B on HumanEval-derived code. Llama 3.3 likely saw HumanEval solutions during training, which would inflate the repair success rate — the model may be recovering memorized solutions rather than genuinely repairing code from feedback."
    424     }
    425   ],
    426   "cited_papers": [
    427     {
    428       "title": "Evaluating Large Language Models Trained on Code",
    429       "authors": ["Mark Chen", "Jerry Tworek", "Heewoo Jun"],
    430       "year": 2021,
    431       "arxiv_id": "2107.03374",
    432       "relevance": "Introduces HumanEval benchmark, which serves as the base for InjectedHumanEval and is central to LLM code generation evaluation."
    433     },
    434     {
    435       "title": "Program Synthesis with Large Language Models",
    436       "authors": ["Jacob Austin", "Augustus Odena", "Maxwell Nye"],
    437       "year": 2021,
    438       "arxiv_id": "2108.07732",
    439       "relevance": "Early benchmark for LLM code generation capabilities, referenced as containing potentially unsafe code for execution."
    440     },
    441     {
    442       "title": "CodeShield: Shield against LLM generated insecure code (PurpleLlama)",
    443       "authors": ["Meta"],
    444       "year": 2024,
    445       "relevance": "The primary baseline — static analysis tool for LLM-generated code security that STELP claims to significantly outperform."
    446     },
    447     {
    448       "title": "MetaGPT: Meta Programming for Multi-Agent Collaborative Framework",
    449       "authors": ["Sirui Hong", "Xiawu Zheng", "Jonathan Chen"],
    450       "year": 2023,
    451       "arxiv_id": "2308.00352",
    452       "relevance": "Multi-agent code generation framework representing the type of system STELP aims to secure."
    453     },
    454     {
    455       "title": "ChatDev: Communicative Agents for Software Development",
    456       "authors": ["Chen Qian", "Wei Liu", "Hongzhang Liu"],
    457       "year": 2023,
    458       "arxiv_id": "2307.07924",
    459       "relevance": "Multi-agent software development system where LLM-generated code needs safety controls like STELP."
    460     },
    461     {
    462       "title": "Executable Code Actions Elicit Better LLM Agents",
    463       "authors": ["Xingyao Wang", "Yangyi Chen", "Lifan Yuan"],
    464       "year": 2024,
    465       "arxiv_id": "2402.01030",
    466       "relevance": "Proposes executable code as a unified action space for LLM agents — directly motivates STELP's safe execution engine."
    467     },
    468     {
    469       "title": "The Impact of AI on Developer Productivity: Evidence from GitHub Copilot",
    470       "authors": ["Sida Peng", "Eirini Kalliamvakou", "Peter Cihon", "Mert Demirer"],
    471       "year": 2023,
    472       "arxiv_id": "2302.06590",
    473       "relevance": "Empirical study of AI coding assistant productivity impact, relevant to the agentic code generation ecosystem STELP targets."
    474     },
    475     {
    476       "title": "LLMs Cannot Reliably Identify and Reason About Security Vulnerabilities (Yet?)",
    477       "authors": ["Saad Ullah", "Mingji Han", "Saurabh Pujar"],
    478       "year": 2024,
    479       "arxiv_id": "2312.12575",
    480       "relevance": "Demonstrates LLM limitations in security vulnerability detection, motivating non-LLM approaches like STELP's AST analysis."
    481     },
    482     {
    483       "title": "Large Language Models and Code Security: A Systematic Literature Review",
    484       "authors": ["Edin Basic", "Alberto Giaretta"],
    485       "year": 2025,
    486       "arxiv_id": "2412.15004",
    487       "relevance": "Systematic review of LLM code security — directly relevant to survey scope on code generation safety."
    488     },
    489     {
    490       "title": "The Llama 3 Herd of Models",
    491       "authors": ["Aaron Grattafiori", "Abhimanyu Dubey"],
    492       "year": 2024,
    493       "arxiv_id": "2407.21783",
    494       "relevance": "The LLM family used in STELP's evaluation for code injection and feedback generation."
    495     },
    496     {
    497       "title": "Large Language Model Based Multi-Agents: A Survey of Progress and Challenges",
    498       "authors": ["Taicheng Guo", "Xiuying Chen", "Yaqi Wang"],
    499       "year": 2024,
    500       "arxiv_id": "2402.01680",
    501       "relevance": "Survey of multi-agent LLM systems that STELP aims to secure, relevant to agentic AI survey scope."
    502     },
    503     {
    504       "title": "A Survey on Code Generation with LLM-Based Agents",
    505       "authors": ["Yanzhen Dong", "Xin Jiang", "Jiahui Qian"],
    506       "year": 2025,
    507       "arxiv_id": "2508.00083",
    508       "relevance": "Survey of LLM-based code generation agents, directly relevant to the systems STELP targets."
    509     }
    510   ]
    511 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs