scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (31453B)
      1 {
      2   "paper": {
      3     "title": "The Perplexity Paradox: Why Code Compresses Better Than Math in LLM Prompts",
      4     "authors": ["Warren Johnson"],
      5     "year": 2026,
      6     "venue": "arXiv",
      7     "arxiv_id": "2602.15843"
      8   },
      9   "scan_version": 2,
     10   "active_modules": ["experimental_rigor", "data_leakage"],
     11   "methodology_tags": ["benchmark-eval"],
     12   "key_findings": "The paper validates the 'perplexity paradox' mechanism: code syntax tokens have 79× higher perplexity than content words and are preserved by compression algorithms, while numerical values in math problems have low perplexity and are pruned despite being task-critical. Signature injection recovers +34pp in pass rate (Cohen's h = 0.890) at aggressive compression ratios. TAAC, a task-aware adaptive compression algorithm, achieves 95.6% quality preservation with 21.8% cost savings, outperforming fixed-ratio compression. Length-controlled analysis (ANCOVA) confirms the code vs. CoT compression dichotomy is independent of prompt length.",
     13   "checklist": {
     14     "artifacts": {
     15       "code_released": {
     16         "applies": true,
     17         "answer": true,
     18         "justification": "The paper provides a GitHub URL: 'We release our extended benchmark suite, per-token analysis tools, and TAAC implementation at https://github.com/micoverde/taac-llm-compression' (Abstract, Section 10)."
     19       },
     20       "data_released": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "The paper uses publicly available benchmarks (HumanEval, MBPP, GSM8K, MATH, ARC-Challenge, MMLU-STEM) and claims to release the extended benchmark suite at the GitHub repository."
     24       },
     25       "environment_specified": {
     26         "applies": true,
     27         "answer": false,
     28         "justification": "No requirements.txt, Dockerfile, conda environment file, or library version specifications are provided in the paper. The software environment needed to reproduce the experiments is not documented."
     29       },
     30       "reproduction_instructions": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "No step-by-step reproduction instructions are included in the paper. A GitHub URL is provided but the paper itself does not contain commands to run or a 'Reproducing Results' section."
     34       }
     35     },
     36     "statistical_methodology": {
     37       "confidence_intervals_or_error_bars": {
     38         "applies": true,
     39         "answer": true,
     40         "justification": "95% confidence intervals are reported throughout: Wilson score CIs in Table 5 for MBPP pass rates, 95% CIs for Cohen's d in Table 3, and the ANCOVA results include standard error estimates."
     41       },
     42       "significance_tests": {
     43         "applies": true,
     44         "answer": true,
     45         "justification": "Multiple significance tests are used: ANCOVA F-tests (Table 1, p < .001), two-way ANOVA (Table 2, p = .0002), Cochran-Armitage trend test (p < 0.001), Kolmogorov-Smirnov test for bin matching (D = 0.089, p = .312), and TOST equivalence testing is proposed."
     46       },
     47       "effect_sizes_reported": {
     48         "applies": true,
     49         "answer": true,
     50         "justification": "Cohen's d with interpretations at each compression level (Table 3), Cohen's h = 0.890 for signature preservation (Section 8.3), partial eta-squared (η²) values in Tables 1 and 2, and percentage point differences throughout."
     51       },
     52       "sample_size_justified": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "No power analysis or formal justification for sample sizes. The paper states n=1,800 for MBPP, n=488 for signature experiments, n=723 for perplexity analysis, and 220 for TAAC evaluation, but does not justify why these sizes were chosen or whether they provide adequate statistical power."
     56       },
     57       "variance_reported": {
     58         "applies": true,
     59         "answer": false,
     60         "justification": "Std dev is reported for perplexity analysis (Table 12) but this is across token categories, not experimental runs. The main experimental results (Tables 5, 14) report only Wilson score CIs derived from point estimates, not variance across repeated experimental runs. No mention of multiple runs or seed-level variation for pass rate experiments."
     61       }
     62     },
     63     "evaluation_design": {
     64       "baselines_included": {
     65         "applies": true,
     66         "answer": true,
     67         "justification": "TAAC is compared against four baselines: uncompressed baseline (r=1.0), fixed r=0.7, fixed r=0.6, and task-based fixed compression (Table 15). The signature experiment compares baseline vs. signature injection conditions. Multi-algorithm comparison includes Random compression as a control."
     68       },
     69       "baselines_contemporary": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "Compression baselines include LLMLingua-2 (Pan et al., 2024), LLMLingua-1 (Jiang et al., 2023), and Selective Context (Li et al., 2023b), which are contemporary methods. Prior task-aware methods ATACompressor (Huang et al., 2024) and TACO-RL (Shi et al., 2024) are discussed."
     73       },
     74       "ablation_study": {
     75         "applies": true,
     76         "answer": false,
     77         "justification": "The paper briefly mentions 'Component ablation shows quality gating contributes most to quality preservation' (Section 8.5) but does not present the ablation results in a table or with detailed data. The actual ablation study is referenced but not shown."
     78       },
     79       "multiple_metrics": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "Multiple metrics are used: pass rate, quality retention, cost savings (Table 15), Cohen's d and Cohen's h effect sizes, η² for variance explained, NameError/AssertionError rates (Table 7), and perplexity measurements."
     83       },
     84       "human_evaluation": {
     85         "applies": true,
     86         "answer": false,
     87         "justification": "No human evaluation is included. All evaluation is automated via test suite pass/fail on benchmarks. Human evaluation could assess compression quality beyond binary pass/fail (e.g., code readability, partial correctness)."
     88       },
     89       "held_out_test_set": {
     90         "applies": true,
     91         "answer": true,
     92         "justification": "For TAAC, the quality predictor is trained on Phase 1 experiments (~50K samples) and evaluated on a separate 'synthetic validation set (220 prompts: 100 code, 100 CoT, 20 hybrid)' (Section 8.5). The main benchmark experiments use standard test sets without tuning."
     93       },
     94       "per_category_breakdown": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "Results are broken down by compression ratio (Tables 5, 11, 14), by task type (code vs. CoT in Table 11), by error type (Table 7), by token category (Table 12), and by algorithm type (Section 6)."
     98       },
     99       "failure_cases_discussed": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "Error type analysis in Table 7 shows specific failure modes: 86.1% NameError at baseline (function identity collapse), AssertionError after signature injection (logic failures). The perplexity paradox analysis explains why compression fails for math tasks (numbers pruned despite being task-critical)."
    103       },
    104       "negative_results_reported": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "MBPP results show severe degradation at aggressive compression (3.7% at r=0.3, Table 14). Table 15 shows TAAC achieves lower cost savings (21.8%) than fixed r=0.6 (41.2%), demonstrating a tradeoff. Fixed r=0.7 and Task-Based Fixed strategies are shown as dominated (Table 15, Figure 4)."
    108       }
    109     },
    110     "claims_and_evidence": {
    111       "abstract_claims_supported": {
    112         "applies": true,
    113         "answer": false,
    114         "justification": "The abstract claims validation 'across six code benchmarks (HumanEval, MBPP, HumanEval+, MultiPL-E in Python/JavaScript/Java)' but the paper only shows detailed results for MBPP. HumanEval+, MultiPL-E Python/JavaScript/Java results are not presented in any results table. The abstract also claims '7%' improvement for TAAC; actual data shows 6.5pp (95.6% vs 89.1%). The cross-language generalization claim is unsupported by the presented results."
    115       },
    116       "causal_claims_justified": {
    117         "applies": true,
    118         "answer": true,
    119         "justification": "The signature preservation experiment (Section 8.3) is a controlled intervention with clear before/after comparison, providing causal evidence for the perplexity paradox mechanism. The ANCOVA and bin-matched sampling (Section 3) use appropriate causal inference designs to rule out the length confound."
    120       },
    121       "generalization_bounded": {
    122         "applies": true,
    123         "answer": false,
    124         "justification": "The title claims broadly 'Why Code Compresses Better Than Math in LLM Prompts' but results are from specific benchmarks (primarily MBPP for code) with three specific models (Claude 3 Haiku, DeepSeek-Chat, GPT-4o-mini). The abstract claims cross-language generalization across Python/JavaScript/Java but only Python results are shown. The limitations section partially bounds scope but the title and abstract overclaim."
    125       },
    126       "alternative_explanations_discussed": {
    127         "applies": true,
    128         "answer": true,
    129         "justification": "Section 3 is entirely devoted to ruling out prompt length as an alternative explanation for the code vs. CoT dichotomy, using ANCOVA and bin-matched sampling. Section 6 addresses whether the pattern is an artifact of a specific compression algorithm. The paper systematically tests alternative explanations."
    130       },
    131       "proxy_outcome_distinction": {
    132         "applies": true,
    133         "answer": true,
    134         "justification": "The paper measures pass@k rates on benchmarks and frames results in terms of 'quality preservation' under compression. The measurements (pass/fail on test suites) closely match the claims (compression's effect on code generation quality). The proxy-outcome gap is minimal and appropriately scoped."
    135       }
    136     },
    137     "setup_transparency": {
    138       "model_versions_specified": {
    139         "applies": true,
    140         "answer": false,
    141         "justification": "Section 4.3 lists 'Claude 3 Haiku, DeepSeek-Chat, GPT-4o-mini' without specific version identifiers, snapshot dates, or API versions. 'DeepSeek-Chat' is especially vague. Per schema guidance, marketing names without snapshot dates do not count as specified versions."
    142       },
    143       "prompts_provided": {
    144         "applies": true,
    145         "answer": false,
    146         "justification": "No actual prompt text is shown in the paper. The paper describes what the compression algorithms do and references a GitHub repository, but does not include the actual prompts or system instructions used for LLM evaluation in the paper or appendix."
    147       },
    148       "hyperparameters_reported": {
    149         "applies": true,
    150         "answer": false,
    151         "justification": "No LLM inference hyperparameters are reported (temperature, top-p, max tokens). Compression ratios are specified, and TAAC's architecture parameters are described, but the inference settings for the underlying LLMs are absent."
    152       },
    153       "scaffolding_described": {
    154         "applies": false,
    155         "answer": false,
    156         "justification": "No agentic scaffolding is used. The paper evaluates prompt compression on standard benchmark problems without multi-step agent workflows."
    157       },
    158       "data_preprocessing_documented": {
    159         "applies": true,
    160         "answer": true,
    161         "justification": "The paper documents stratified sampling of 500 problems from MBPP's 974 (Section 4.3), the bin-matched sampling procedure with KS validation (Section 3.2.2), and the 12-category token classification methodology (Section 5.2, Table 8)."
    162       }
    163     },
    164     "limitations_and_scope": {
    165       "limitations_section_present": {
    166         "applies": true,
    167         "answer": true,
    168         "justification": "Section 9.3 'Limitations' provides four specific limitation bullet points covering code benchmark scope, pilot model dependency, TAAC training distribution, and missing agentic/multi-turn evaluation."
    169       },
    170       "threats_to_validity_specific": {
    171         "applies": true,
    172         "answer": true,
    173         "justification": "The limitations are specific to this study: 'Code benchmarks focus on function completion; longer code files may exhibit different patterns,' 'Perplexity analysis uses a single pilot model; different model families may yield different patterns,' and 'We do not evaluate agentic or multi-turn scenarios.'"
    174       },
    175       "scope_boundaries_stated": {
    176         "applies": true,
    177         "answer": true,
    178         "justification": "The limitations section explicitly states what was NOT tested: longer code files, different model families for perplexity analysis, distribution shift for the quality predictor, and agentic/multi-turn scenarios."
    179       }
    180     },
    181     "data_integrity": {
    182       "raw_data_available": {
    183         "applies": true,
    184         "answer": false,
    185         "justification": "The paper claims to release tools and implementation but does not explicitly state that raw experimental data (individual trial results, per-token perplexity measurements) is available. The GitHub repository contains 'benchmark suite, per-token analysis tools, and TAAC implementation' but not explicitly raw data."
    186       },
    187       "data_collection_described": {
    188         "applies": true,
    189         "answer": true,
    190         "justification": "The experimental design sections describe data collection: MBPP benchmark characteristics (Table 4), trial counts per condition (Section 4.3), perplexity token analysis methodology (Section 5.2), and signature preservation experiment design (Section 4.5)."
    191       },
    192       "recruitment_methods_described": {
    193         "applies": false,
    194         "answer": false,
    195         "justification": "No human participants. Data sources are standard public benchmarks (MBPP, HumanEval, GSM8K, etc.)."
    196       },
    197       "data_pipeline_documented": {
    198         "applies": true,
    199         "answer": false,
    200         "justification": "The pipeline from raw model outputs to reported metrics is not fully documented. The experimental design specifies 9,000 total trials (Section 4.3) but results show only 1,800 (Table 5) without explaining how trials were aggregated or which models' results are shown. How pass/fail was determined from model outputs is not described."
    201       }
    202     },
    203     "conflicts_of_interest": {
    204       "funding_disclosed": {
    205         "applies": true,
    206         "answer": true,
    207         "justification": "The acknowledgments section states: 'Computational resources were provided by Microsoft Azure.' The funding/compute source is disclosed."
    208       },
    209       "affiliations_disclosed": {
    210         "applies": true,
    211         "answer": true,
    212         "justification": "Author affiliation is listed as 'Bona Opera Studios, Sammamish, WA, USA' with an email address. The paper does not evaluate products from the author's organization."
    213       },
    214       "funder_independent_of_outcome": {
    215         "applies": true,
    216         "answer": true,
    217         "justification": "Microsoft Azure provided compute resources. While Microsoft has LLM products, the paper does not evaluate Azure-specific products and the funder has no direct financial stake in whether prompt compression works better for code than math."
    218       },
    219       "financial_interests_declared": {
    220         "applies": true,
    221         "answer": false,
    222         "justification": "No competing interests or financial interests statement is present in the paper. There is no disclosure of patents, equity, or other financial interests related to prompt compression technology."
    223       }
    224     },
    225     "contamination": {
    226       "training_cutoff_stated": {
    227         "applies": true,
    228         "answer": false,
    229         "justification": "No training data cutoff dates are stated for any of the three models used (Claude 3 Haiku, DeepSeek-Chat, GPT-4o-mini). This is critical since benchmarks like MBPP (2021) and HumanEval (2021) predate these models."
    230       },
    231       "train_test_overlap_discussed": {
    232         "applies": true,
    233         "answer": false,
    234         "justification": "No discussion of whether the models' training data includes MBPP, HumanEval, or other benchmark problems. All benchmarks used (2018-2021) were public well before the evaluated models (Claude 3 Haiku, DeepSeek-Chat, GPT-4o-mini) were trained."
    235       },
    236       "benchmark_contamination_addressed": {
    237         "applies": true,
    238         "answer": false,
    239         "justification": "HumanEval (2021), MBPP (2021), GSM8K (2021), MATH (2021), and ARC (2018) were all published years before the models' training cutoffs. Contamination risk is significant but not discussed anywhere in the paper."
    240       }
    241     },
    242     "human_studies": {
    243       "pre_registered": {
    244         "applies": false,
    245         "answer": false,
    246         "justification": "No human participants in the study. All experiments involve automated benchmark evaluation of LLM compression."
    247       },
    248       "irb_or_ethics_approval": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "No human participants in the study."
    252       },
    253       "demographics_reported": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants in the study."
    257       },
    258       "inclusion_exclusion_criteria": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants in the study."
    262       },
    263       "randomization_described": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants in the study."
    267       },
    268       "blinding_described": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants in the study."
    272       },
    273       "attrition_reported": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human participants in the study."
    277       }
    278     },
    279     "cost_and_practicality": {
    280       "inference_cost_reported": {
    281         "applies": true,
    282         "answer": true,
    283         "justification": "Costs are reported for each experimental phase: MBPP validation '$1.52' (Section 4.3), Phase 1 '$0.08', Phase 2 '$0.59', Phase 3 '$5.35' (Section 6.2). TAAC cost savings are quantified at 21.8% (Table 15). General API cost context ($3-$15 per million tokens) is provided in Section 1."
    284       },
    285       "compute_budget_stated": {
    286         "applies": true,
    287         "answer": true,
    288         "justification": "Estimated runtime of 6.2 hours for MBPP validation (Section 4.3), per-phase cost estimates are given, and computational resources from Microsoft Azure are acknowledged. Compression algorithm latencies are reported: LLMLingua-2 ~150ms, LLMLingua-1 ~3s, Selective Context ~500ms."
    289       }
    290     },
    291     "experimental_rigor": {
    292       "seed_sensitivity_reported": {
    293         "applies": true,
    294         "answer": false,
    295         "justification": "No mention of random seeds or seed sensitivity analysis. Results appear to be from single experimental runs without assessing variability across seeds."
    296       },
    297       "number_of_runs_stated": {
    298         "applies": true,
    299         "answer": true,
    300         "justification": "Trial counts are explicitly stated: n=1,800 for MBPP validation, n=488 for signature preservation, n=723 for perplexity analysis, 220 prompts for TAAC evaluation, N=600 for length-controlled analysis."
    301       },
    302       "hyperparameter_search_budget": {
    303         "applies": true,
    304         "answer": false,
    305         "justification": "TAAC includes a trained 2-layer MLP quality predictor and a DistilBERT classifier, but no hyperparameter search budget is reported for training these components. The compression ratios are pre-specified but model training details are missing."
    306       },
    307       "best_config_selection_justified": {
    308         "applies": true,
    309         "answer": true,
    310         "justification": "TAAC thresholds are justified with explicit rationale: r*_code = 0.65 as 'conservative buffer above the r = 0.6 cliff,' r*_cot = 0.80 as 'minimal compression for reasoning tasks' (Table 10). The compression ratios tested are pre-specified from prior work, not selected post-hoc."
    311       },
    312       "multiple_comparison_correction": {
    313         "applies": true,
    314         "answer": false,
    315         "justification": "The paper runs numerous statistical tests (ANCOVA, two-way ANOVA, KS test, Cochran-Armitage, multiple Cohen's d calculations across 6 compression levels) without applying any multiple comparison correction (Bonferroni, Holm, or Benjamini-Hochberg)."
    316       },
    317       "self_comparison_bias_addressed": {
    318         "applies": true,
    319         "answer": false,
    320         "justification": "TAAC is the authors' own system, compared against baselines the authors implemented. No discussion of author-evaluation bias or independent evaluation. The comparison in Table 15 is entirely self-administered."
    321       },
    322       "compute_budget_vs_performance": {
    323         "applies": true,
    324         "answer": true,
    325         "justification": "Table 15 and Figure 4 explicitly show performance (quality preservation) as a function of cost savings for each strategy. Compression algorithm latencies are also reported (LLMLingua-2 ~150ms, LLMLingua-1 ~3s, Selective Context ~500ms per prompt)."
    326       },
    327       "benchmark_construct_validity": {
    328         "applies": true,
    329         "answer": false,
    330         "justification": "No discussion of whether MBPP, HumanEval, or other benchmarks actually measure the capabilities the paper claims to evaluate. The paper uses these benchmarks without questioning construct validity or comparing with alternative evaluation methods."
    331       },
    332       "scaffold_confound_addressed": {
    333         "applies": false,
    334         "answer": false,
    335         "justification": "No scaffolding is involved. The experiments evaluate prompt compression on standard benchmarks without agent scaffolding."
    336       }
    337     },
    338     "data_leakage": {
    339       "temporal_leakage_addressed": {
    340         "applies": true,
    341         "answer": false,
    342         "justification": "All benchmarks used (HumanEval 2021, MBPP 2021, GSM8K 2021, MATH 2021, ARC 2018) predate the evaluated models (Claude 3 Haiku, DeepSeek-Chat, GPT-4o-mini) by years. Temporal leakage is not discussed despite being a clear risk."
    343       },
    344       "feature_leakage_addressed": {
    345         "applies": true,
    346         "answer": false,
    347         "justification": "No discussion of whether the compressed prompts or evaluation setup leaks answer information. The signature injection experiment deliberately provides additional information but this is not framed as a leakage consideration."
    348       },
    349       "non_independence_addressed": {
    350         "applies": true,
    351         "answer": false,
    352         "justification": "No discussion of independence between benchmark problems and model training data. MBPP and HumanEval problems are publicly available and likely in the training data of the evaluated models."
    353       },
    354       "leakage_detection_method": {
    355         "applies": true,
    356         "answer": false,
    357         "justification": "No leakage detection methods are applied. No canary strings, membership inference tests, n-gram overlap analysis, or decontamination pipelines are mentioned."
    358       }
    359     }
    360   },
    361   "claims": [
    362     {
    363       "claim": "The compression threshold (r ≥ 0.6) generalizes beyond HumanEval to MBPP, with pass rates degrading systematically from 54.7% at r=1.0 to 3.7% at r=0.3.",
    364       "evidence": "MBPP validation with n=1,800 trials across 6 compression ratios (Table 5, Section 4.4). Cochran-Armitage trend test confirms linear trend (p < 0.001).",
    365       "supported": "strong"
    366     },
    367     {
    368       "claim": "The code vs. CoT compression dichotomy is independent of prompt length, reflecting task structure rather than prompt brevity.",
    369       "evidence": "ANCOVA controlling for length: F(5, 2019) = 57.84, p = .000108, η² = .081 (Table 1). Length-matched samples show larger interaction effect (η² = .102 vs .081, Table 2).",
    370       "supported": "strong"
    371     },
    372     {
    373       "claim": "Python syntax tokens exhibit 79× higher perplexity than content words, explaining their preservation under compression, while numbers have low perplexity despite being task-critical.",
    374       "evidence": "Per-token perplexity analysis of n=723 tokens (Table 12, Figure 2). Python syntax mean PPL = 928,636 vs content words 11,697. Numbers PPL = 9,195. Kept vs removed tokens: 71,000× difference.",
    375       "supported": "moderate"
    376     },
    377     {
    378       "claim": "Signature injection recovers +34 percentage points in pass rate under aggressive compression, with NameError rates dropping from 86.1% to 6.1%.",
    379       "evidence": "Controlled experiment with n=488 pooled trials across 3 compression ratios (Tables 6, 7, 13). Cohen's h = 0.890 (very large effect). Consistent across r=0.3, 0.4, 0.5.",
    380       "supported": "strong"
    381     },
    382     {
    383       "claim": "TAAC achieves 95.6% quality preservation with 21.8% cost savings, outperforming fixed-ratio compression by 6.5 percentage points in quality.",
    384       "evidence": "Comparison against 4 baselines on 220-prompt synthetic validation set (Table 15, Figure 4). Pareto-optimal on cost-quality frontier.",
    385       "supported": "moderate"
    386     },
    387     {
    388       "claim": "The compression threshold generalizes across six code benchmarks and programming languages (Python, JavaScript, Java).",
    389       "evidence": "The abstract and Section 1 claim cross-language generalization, but detailed results are only presented for MBPP (Python). HumanEval+, MultiPL-E Python/JavaScript/Java results are not shown in any results table.",
    390       "supported": "weak"
    391     }
    392   ],
    393   "red_flags": [
    394     {
    395       "flag": "Abstract overclaims benchmark coverage",
    396       "detail": "The abstract lists 'six code benchmarks (HumanEval, MBPP, HumanEval+, MultiPL-E in Python/JavaScript/Java)' but detailed results are only presented for MBPP. The claimed cross-language generalization across Python/JavaScript/Java is not supported by presented data."
    397     },
    398     {
    399       "flag": "Complete absence of contamination analysis",
    400       "detail": "All benchmarks used (HumanEval 2021, MBPP 2021, GSM8K 2021, MATH 2021, ARC 2018) were publicly available for years before the evaluated models (Claude 3 Haiku, DeepSeek-Chat, GPT-4o-mini) were trained. No contamination discussion, training cutoffs, or decontamination measures are mentioned. This is a fundamental threat to validity for a benchmark-eval paper."
    401     },
    402     {
    403       "flag": "Trial count discrepancy",
    404       "detail": "Section 4.3 describes a design with '9,000 trials' (500 problems × 6 ratios × 3 models) but Table 5 only reports 1,800 trials (300 per ratio). The discrepancy between designed and reported trials is unexplained—unclear whether only one model's results are shown or whether the design was modified."
    405     },
    406     {
    407       "flag": "Small TAAC evaluation set",
    408       "detail": "TAAC's main evaluation (Table 15) uses only 220 prompts (100 code, 100 CoT, 20 hybrid). This is quite small for validating an adaptive system, and the 'synthetic validation set' construction is not described."
    409     },
    410     {
    411       "flag": "No seed sensitivity or repeated runs",
    412       "detail": "All results appear to be from single experimental runs. No random seed analysis or repeated trials to assess result stability. Wilson CIs capture sampling uncertainty but not experimental variability."
    413     },
    414     {
    415       "flag": "Self-citing non-peer-reviewed prior work as foundation",
    416       "detail": "The entire paper builds on 'Compress or Route?' (Johnson, 2026), a Zenodo publication by the same sole author. The foundational findings (r ≥ 0.6 threshold) cannot be independently verified from peer-reviewed sources."
    417     }
    418   ],
    419   "cited_papers": [
    420     {
    421       "title": "LLMLingua: Compressing Prompts for Accelerated Inference of Large Language Models",
    422       "authors": ["H. Jiang", "Q. Wu", "C.-Y. Lin", "Y. Yang", "L. Qiu"],
    423       "year": 2023,
    424       "relevance": "Core prompt compression method using perplexity-based token pruning; baseline algorithm in this paper's comparisons."
    425     },
    426     {
    427       "title": "LLMLingua-2: Data Distillation for Efficient and Faithful Task-Agnostic Prompt Compression",
    428       "authors": ["Z. Pan", "Q. Wu", "H. Jiang"],
    429       "year": 2024,
    430       "relevance": "Learned BERT-based compression method used as primary compression algorithm in experiments."
    431     },
    432     {
    433       "title": "FrugalGPT: How to Use Large Language Models While Reducing Cost and Improving Performance",
    434       "authors": ["L. Chen", "M. Zaharia", "J. Zou"],
    435       "year": 2023,
    436       "arxiv_id": "2305.05176",
    437       "relevance": "Foundational work on LLM cost reduction through cascading model strategies; related cost-optimization approach."
    438     },
    439     {
    440       "title": "RouteLLM: Learning to Route LLMs with Preference Data",
    441       "authors": ["I. Ong", "A. Almahairi", "V. Wu"],
    442       "year": 2024,
    443       "arxiv_id": "2406.18665",
    444       "relevance": "LLM routing using preference data for cost-quality optimization; complementary approach to compression."
    445     },
    446     {
    447       "title": "Evaluating Large Language Models Trained on Code",
    448       "authors": ["M. Chen", "J. Tworek", "H. Jun"],
    449       "year": 2021,
    450       "arxiv_id": "2107.03374",
    451       "relevance": "Introduced HumanEval benchmark and pass@k metric for code generation evaluation; key benchmark used in this paper."
    452     },
    453     {
    454       "title": "Program Synthesis with Large Language Models",
    455       "authors": ["J. Austin", "A. Odena", "M. Nye"],
    456       "year": 2021,
    457       "arxiv_id": "2108.07732",
    458       "relevance": "Introduced MBPP benchmark for Python code generation; primary validation benchmark in this paper."
    459     },
    460     {
    461       "title": "SWE-bench: Can Language Models Resolve Real-World GitHub Issues?",
    462       "authors": ["C. E. Jimenez", "J. Yang", "A. Wettig"],
    463       "year": 2024,
    464       "arxiv_id": "2310.06770",
    465       "relevance": "Real-world code generation benchmark for GitHub issue resolution; referenced as more complex code evaluation."
    466     },
    467     {
    468       "title": "MultiPL-E: A Scalable and Polyglot Approach to Benchmarking Neural Code Generation",
    469       "authors": ["F. Cassano", "J. Gouwar", "D. Nguyen"],
    470       "year": 2023,
    471       "relevance": "Multi-language code generation benchmark; claimed as part of cross-language validation but results not shown."
    472     },
    473     {
    474       "title": "Efficient Memory Management for Large Language Model Serving with PagedAttention",
    475       "authors": ["W. Kwon", "Z. Li", "S. Zhuang"],
    476       "year": 2023,
    477       "relevance": "Systems-level LLM serving optimization through memory management; complementary efficiency approach."
    478     },
    479     {
    480       "title": "FlashAttention: Fast and Memory-Efficient Exact Attention with IO-Awareness",
    481       "authors": ["T. Dao", "D. Fu", "S. Ermon", "A. Rudra", "C. Ré"],
    482       "year": 2022,
    483       "relevance": "Foundational work on efficient attention computation for transformer inference."
    484     },
    485     {
    486       "title": "Chain-of-Thought Prompting Elicits Reasoning in Large Language Models",
    487       "authors": ["J. Wei", "X. Wang", "D. Schuurmans"],
    488       "year": 2022,
    489       "relevance": "Introduced chain-of-thought prompting; the CoT reasoning task type is a core experimental condition in this paper."
    490     },
    491     {
    492       "title": "Is Your Code Generated by ChatGPT Really Correct? Rigorous Evaluation of Large Language Models for Code Generation",
    493       "authors": ["J. Liu", "C. S. Xia", "Y. Wang", "L. Zhang"],
    494       "year": 2024,
    495       "relevance": "Introduced HumanEval+ with additional test cases for rigorous code generation evaluation."
    496     }
    497   ]
    498 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs