scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (25093B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Interactive Code Generation via Test-Driven User-Intent Formalization",
      6     "authors": [
      7       "Shuvendu K. Lahiri",
      8       "Sarah Fakhoury",
      9       "Aaditya Naik",
     10       "Georgios Sakkas",
     11       "Saikat Chakraborty",
     12       "Madanlal Musuvathi",
     13       "Jeevana Priya Inala",
     14       "Piali Choudhury",
     15       "Curtis von Veh",
     16       "Chenglong Wang",
     17       "Jianfeng Gao"
     18     ],
     19     "year": 2022,
     20     "venue": "arXiv.org",
     21     "arxiv_id": "2208.05950",
     22     "doi": "10.48550/arXiv.2208.05950"
     23   },
     24   "checklist": {
     25     "claims_and_evidence": {
     26       "abstract_claims_supported": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "The abstract's claimed improvements (22.49–37.71% for MBPP, 24.79–53.98% for HumanEval) are directly backed by Figures 5–6 and the results in Section V.",
     30         "source": "haiku"
     31       },
     32       "causal_claims_justified": {
     33         "applies": true,
     34         "answer": true,
     35         "justification": "The paper makes causal claims ('each component contributes to improving effectiveness') and supports them with ablation studies in Section V-C, systematically removing each component.",
     36         "source": "haiku"
     37       },
     38       "generalization_bounded": {
     39         "applies": true,
     40         "answer": true,
     41         "justification": "Section VI explicitly states findings 'may not generalize to a different set of programs across different languages and problem domains' and that MBPP/HumanEval 'may not be representative of real-world software development.'",
     42         "source": "haiku"
     43       },
     44       "alternative_explanations_discussed": {
     45         "applies": true,
     46         "answer": false,
     47         "justification": "The paper does not discuss alternative explanations for why the approach works beyond the intended mechanism; ablations show component contributions but don't consider whether, for example, the benefit comes purely from increased sample diversity rather than user feedback.",
     48         "source": "haiku"
     49       },
     50       "proxy_outcome_distinction": {
     51         "applies": true,
     52         "answer": true,
     53         "justification": "The paper explicitly defines oracle simulation as a proxy for user intent (Definition IV.1) and acknowledges in Section VI that it 'cannot account for the cognitive effort of users undertaking the coding tasks.'",
     54         "source": "haiku"
     55       }
     56     },
     57     "limitations_and_scope": {
     58       "limitations_section_present": {
     59         "applies": true,
     60         "answer": true,
     61         "justification": "Section VI 'THREATS' is a dedicated threats-to-validity section covering generalization, model stability, and interaction simulation.",
     62         "source": "haiku"
     63       },
     64       "threats_to_validity_specific": {
     65         "applies": true,
     66         "answer": true,
     67         "justification": "Threats are specific: benchmark scope (MBPP/HumanEval may not represent real-world code), Codex API non-determinism threatening replicability, and oracle simulation not capturing user cognitive burden.",
     68         "source": "haiku"
     69       },
     70       "scope_boundaries_stated": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "The paper explicitly bounds scope to Python programming on MBPP and HumanEval, single-function completion tasks, and simulation-based (not real-user) evaluation.",
     74         "source": "haiku"
     75       }
     76     },
     77     "conflicts_of_interest": {
     78       "funding_disclosed": {
     79         "applies": true,
     80         "answer": false,
     81         "justification": "No funding disclosure is present in the paper; while Microsoft Research affiliation is stated, there is no explicit funding statement.",
     82         "source": "haiku"
     83       },
     84       "affiliations_disclosed": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "All author affiliations are clearly disclosed: Microsoft Research, University of Pennsylvania, and UC San Diego.",
     88         "source": "haiku"
     89       },
     90       "funder_independent_of_outcome": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "The majority of authors are Microsoft Research employees evaluating TICODER, a tool they built; this is not independent evaluation.",
     94         "source": "haiku"
     95       },
     96       "financial_interests_declared": {
     97         "applies": true,
     98         "answer": false,
     99         "justification": "No competing interests or financial interests statement appears anywhere in the paper.",
    100         "source": "haiku"
    101       }
    102     },
    103     "scope_and_framing": {
    104       "key_terms_defined": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Key terms are defined precisely: ITDCG workflow, pass@k@m and accept@m metrics (Table I, Section II-C), and the formal program definition (Definition II.1).",
    108         "source": "haiku"
    109       },
    110       "intended_contribution_clear": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "The paper explicitly states three contributions in the introduction: abstract ITDCG algorithm, TICODER implementation, and empirical evaluation showing per-component improvements.",
    114         "source": "haiku"
    115       },
    116       "engagement_with_prior_work": {
    117         "applies": true,
    118         "answer": true,
    119         "justification": "Section VII engages substantively with AlphaCode, CodeT, PBE literature, oracle-guided synthesis (OGIS), and interactive program synthesis, explaining how TICODER differs from each.",
    120         "source": "haiku"
    121       }
    122     }
    123   },
    124   "type_checklist": {
    125     "empirical": {
    126       "artifacts": {
    127         "code_released": {
    128           "applies": true,
    129           "answer": false,
    130           "justification": "The paper states 'we aim to mitigate this by releasing model generated output in the near future'—this is a promise of future release, not an actual release.",
    131           "source": "haiku"
    132         },
    133         "data_released": {
    134           "applies": true,
    135           "answer": true,
    136           "justification": "MBPP and HumanEval are standard public benchmarks, used as released apart from a minor documented modification to HumanEval docstrings.",
    137           "source": "haiku"
    138         },
    139         "environment_specified": {
    140           "applies": true,
    141           "answer": false,
    142           "justification": "Only API access parameters (temperature, top_p, token limits) are given; no requirements file, Dockerfile, or dependency specification is provided.",
    143           "source": "haiku"
    144         },
    145         "reproduction_instructions": {
    146           "applies": true,
    147           "answer": false,
    148           "justification": "No step-by-step instructions for reproduction are included; the algorithm is described formally but code is not released and no runnable instructions are provided.",
    149           "source": "haiku"
    150         }
    151       },
    152       "statistical_methodology": {
    153         "confidence_intervals_or_error_bars": {
    154           "applies": true,
    155           "answer": false,
    156           "justification": "No confidence intervals or error bars are reported; only single expected-value pass@k@m results appear in all figures and tables.",
    157           "source": "haiku"
    158         },
    159         "significance_tests": {
    160           "applies": true,
    161           "answer": false,
    162           "justification": "No statistical significance tests are used for any comparative claims; improvements are reported as raw percentage differences without p-values.",
    163           "source": "haiku"
    164         },
    165         "effect_sizes_reported": {
    166           "applies": true,
    167           "answer": true,
    168           "justification": "Absolute percentage improvements are reported throughout (e.g., 22.49% improvement for MBPP pass@1@1) with clear baseline context.",
    169           "source": "haiku"
    170         },
    171         "sample_size_justified": {
    172           "applies": true,
    173           "answer": false,
    174           "justification": "Sample sizes are inherited from existing benchmarks (MBPP: 427, HumanEval: 164) without any power analysis or justification for adequacy.",
    175           "source": "haiku"
    176         },
    177         "variance_reported": {
    178           "applies": true,
    179           "answer": false,
    180           "justification": "No variance, standard deviation, or spread is reported for any results; all results are single-point expected values.",
    181           "source": "haiku"
    182         }
    183       },
    184       "evaluation_design": {
    185         "baselines_included": {
    186           "applies": true,
    187           "answer": true,
    188           "justification": "Multiple baselines are included: Codex (t=0.8 and t=0), Baseline TICODER (no mutation/ranking), CodeT, IdealRanking, and IdealTests.",
    189           "source": "haiku"
    190         },
    191         "baselines_contemporary": {
    192           "applies": true,
    193           "answer": true,
    194           "justification": "CodeT and AlphaCode are contemporary (2022) state-of-the-art methods for LLM-based code generation with test-based ranking.",
    195           "source": "haiku"
    196         },
    197         "ablation_study": {
    198           "applies": true,
    199           "answer": true,
    200           "justification": "Section V-C contains a thorough ablation study removing each component individually (code prompt, single-assert, dynamic mutation, test ranking, code ranking) across both benchmarks.",
    201           "source": "haiku"
    202         },
    203         "multiple_metrics": {
    204           "applies": true,
    205           "answer": true,
    206           "justification": "The evaluation uses pass@k@m (for k∈{1,2,5,10} and m∈{0,1,2,5}) and accept@m metrics across both benchmarks.",
    207           "source": "haiku"
    208         },
    209         "human_evaluation": {
    210           "applies": true,
    211           "answer": false,
    212           "justification": "No human evaluation is conducted; user interaction is entirely simulated via oracle (reference implementations). A user study is mentioned only as future work.",
    213           "source": "haiku"
    214         },
    215         "held_out_test_set": {
    216           "applies": true,
    217           "answer": true,
    218           "justification": "Both benchmarks include hidden test sets (Tp) used to evaluate whether generated code is correct, distinct from the tests generated by TICODER.",
    219           "source": "haiku"
    220         },
    221         "per_category_breakdown": {
    222           "applies": true,
    223           "answer": false,
    224           "justification": "No per-category or per-task-type breakdown is provided; ablation tables break down by system component, not by problem type or difficulty category.",
    225           "source": "haiku"
    226         },
    227         "failure_cases_discussed": {
    228           "applies": true,
    229           "answer": true,
    230           "justification": "The paper discusses the significant gap vs IdealTests (16.62% and 35.57% behind for MBPP/HumanEval) and notes specific cases where removing dynamic mutation improves some HumanEval metrics.",
    231           "source": "haiku"
    232         },
    233         "negative_results_reported": {
    234           "applies": true,
    235           "answer": true,
    236           "justification": "The paper honestly reports that the default TICODER configuration is outperformed by CodeT on HumanEval pass@1@1, and that an alternate non-default configuration performs better on that metric.",
    237           "source": "haiku"
    238         }
    239       },
    240       "setup_transparency": {
    241         "model_versions_specified": {
    242           "applies": true,
    243           "answer": true,
    244           "justification": "The model is specified as 'OpenAI's Codex code-davinci-002 model,' a specific versioned endpoint.",
    245           "source": "haiku"
    246         },
    247         "prompts_provided": {
    248           "applies": true,
    249           "answer": true,
    250           "justification": "Figure 3 shows concrete example code and test prompts with actual content; the prompt structure including prefix, description, header, and prompt body is fully illustrated.",
    251           "source": "haiku"
    252         },
    253         "hyperparameters_reported": {
    254           "applies": true,
    255           "answer": true,
    256           "justification": "Temperature (0.8), top_p (0.95), max generation length (300 tokens), number of code suggestions (100), and number of test suggestions (50) are all stated in Section IV-C.",
    257           "source": "haiku"
    258         },
    259         "scaffolding_described": {
    260           "applies": true,
    261           "answer": true,
    262           "justification": "Algorithm 1 (InteractiveTestDrivenCodeGen) is described in formal pseudocode with all components (QueryLLM, SyntacticMutateTests, DynMutateTests, RankTests, RankCodes) defined.",
    263           "source": "haiku"
    264         },
    265         "data_preprocessing_documented": {
    266           "applies": true,
    267           "answer": true,
    268           "justification": "The modification to HumanEval (removing non-hidden input-output examples from docstrings) is documented and the rationale explained in Section IV-B.",
    269           "source": "haiku"
    270         }
    271       },
    272       "data_integrity": {
    273         "raw_data_available": {
    274           "applies": true,
    275           "answer": false,
    276           "justification": "Raw Codex outputs are promised for future release but not available at time of submission.",
    277           "source": "haiku"
    278         },
    279         "data_collection_described": {
    280           "applies": true,
    281           "answer": true,
    282           "justification": "The paper describes querying Codex once per problem and caching outputs for all experiments, with specific parameters for code and test generation.",
    283           "source": "haiku"
    284         },
    285         "recruitment_methods_described": {
    286           "applies": false,
    287           "answer": false,
    288           "justification": "No human participants; oracle simulation replaces user interaction entirely.",
    289           "source": "haiku"
    290         },
    291         "data_pipeline_documented": {
    292           "applies": true,
    293           "answer": true,
    294           "justification": "The pipeline is documented: benchmark datasets → Codex API query (cached once) → test mutation → ranking → oracle simulation → metric computation.",
    295           "source": "haiku"
    296         }
    297       },
    298       "contamination": {
    299         "training_cutoff_stated": {
    300           "applies": true,
    301           "answer": false,
    302           "justification": "Codex's training data cutoff is not stated; the paper uses Codex on HumanEval without addressing when Codex was trained relative to benchmark creation.",
    303           "source": "haiku"
    304         },
    305         "train_test_overlap_discussed": {
    306           "applies": true,
    307           "answer": false,
    308           "justification": "Potential overlap between Codex training data and HumanEval/MBPP benchmarks is never discussed, despite HumanEval being created specifically to evaluate Codex.",
    309           "source": "haiku"
    310         },
    311         "benchmark_contamination_addressed": {
    312           "applies": true,
    313           "answer": false,
    314           "justification": "HumanEval was publicly released with the Codex paper in 2021 and likely overlaps with Codex training data; this contamination risk is not addressed.",
    315           "source": "haiku"
    316         }
    317       },
    318       "human_studies": {
    319         "pre_registered": {
    320           "applies": false,
    321           "answer": false,
    322           "justification": "No human participants in the study.",
    323           "source": "haiku"
    324         },
    325         "irb_or_ethics_approval": {
    326           "applies": false,
    327           "answer": false,
    328           "justification": "No human participants in the study.",
    329           "source": "haiku"
    330         },
    331         "demographics_reported": {
    332           "applies": false,
    333           "answer": false,
    334           "justification": "No human participants in the study.",
    335           "source": "haiku"
    336         },
    337         "inclusion_exclusion_criteria": {
    338           "applies": false,
    339           "answer": false,
    340           "justification": "No human participants in the study.",
    341           "source": "haiku"
    342         },
    343         "randomization_described": {
    344           "applies": false,
    345           "answer": false,
    346           "justification": "No human participants in the study.",
    347           "source": "haiku"
    348         },
    349         "blinding_described": {
    350           "applies": false,
    351           "answer": false,
    352           "justification": "No human participants in the study.",
    353           "source": "haiku"
    354         },
    355         "attrition_reported": {
    356           "applies": false,
    357           "answer": false,
    358           "justification": "No human participants in the study.",
    359           "source": "haiku"
    360         }
    361       },
    362       "cost_and_practicality": {
    363         "inference_cost_reported": {
    364           "applies": true,
    365           "answer": false,
    366           "justification": "No inference cost, API cost, or latency figures are reported; the paper uses Codex API but does not quantify computational expense.",
    367           "source": "haiku"
    368         },
    369         "compute_budget_stated": {
    370           "applies": true,
    371           "answer": false,
    372           "justification": "No total computational budget is stated anywhere in the paper.",
    373           "source": "haiku"
    374         }
    375       }
    376     }
    377   },
    378   "claims": [
    379     {
    380       "claim": "TICODER improves pass@1 code generation accuracy by 22.49 percentage points (1 query) to 37.71 percentage points (5 queries) over baseline Codex on MBPP.",
    381       "evidence": "Figure 6a shows pass@1@1 of 70.73% vs Codex baseline 48.24%; pass@1@5 of 85.95% vs baseline 48.24%.",
    382       "supported": "strong"
    383     },
    384     {
    385       "claim": "TICODER improves pass@1 code generation accuracy by 24.79 percentage points (1 query) to 53.98 percentage points (5 queries) over baseline Codex on HumanEval.",
    386       "evidence": "Figure 6b shows pass@1@1 of 55.28% vs Codex baseline 30.49%; pass@1@5 of 84.47% vs baseline 30.49%.",
    387       "supported": "strong"
    388     },
    389     {
    390       "claim": "Test ranking is the most impactful component; random test selection degrades performance most severely.",
    391       "evidence": "Tables II and III show removing test ranking (using random) yields the largest drops: pass@1@1 drops from 70.72% to 63.23% on MBPP and 55.27% to 48.44% on HumanEval.",
    392       "supported": "strong"
    393     },
    394     {
    395       "claim": "TICODER outperforms CodeT with only 1 user interaction on MBPP and 2 interactions on HumanEval.",
    396       "evidence": "Figure 6: TICODER pass@1@1 of 70.73% vs CodeT 63.70% on MBPP; TICODER pass@1@2 of 68.94% vs CodeT 58.54% on HumanEval.",
    397       "supported": "strong"
    398     },
    399     {
    400       "claim": "TICODER generates user-intent-consistent tests for 87.12% of MBPP and 95.73% of HumanEval examples within 10 queries.",
    401       "evidence": "Figure 7 shows cumulative accept@m curves reaching 87.12% (MBPP) and 95.73% (HumanEval) at m=10.",
    402       "supported": "strong"
    403     },
    404     {
    405       "claim": "Dynamic test mutation improves test pool quality by 10.72% (MBPP) and 13.74% (HumanEval) over purely LLM-generated tests at 5 interactions.",
    406       "evidence": "Figures 6a and 6b compare TICODER vs BaselineIdeal (optimal ranking on LLM-only tests), showing the gap attributable to mutation.",
    407       "supported": "moderate"
    408     }
    409   ],
    410   "methodology_tags": [
    411     "benchmark-eval"
    412   ],
    413   "key_findings": "TICODER, an interactive test-driven code generation framework, significantly improves LLM code generation accuracy by using simulated user feedback on generated test queries to prune and rank candidate code suggestions. With a single simulated user query, TICODER improves pass@1 by 22.49% on MBPP and 24.79% on HumanEval over baseline Codex; with 5 queries these gains reach 37.71% and 53.98%. Test ranking is the single most important component—random test presentation degrades performance more than removing any other component. The evaluation relies entirely on oracle simulation (reference implementations substituting for users), with a real user study left as future work.",
    414   "red_flags": [
    415     {
    416       "flag": "Oracle simulation only, no user study",
    417       "detail": "The entire evaluation simulates user interaction via reference code implementations; no real users were tested, meaning cognitive load, error rates, and actual usability are entirely unvalidated."
    418     },
    419     {
    420       "flag": "Codex contamination on HumanEval unaddressed",
    421       "detail": "HumanEval was released alongside Codex and was likely in Codex's training data; the paper uses Codex on HumanEval without discussing this contamination risk."
    422     },
    423     {
    424       "flag": "No statistical significance testing or CIs",
    425       "detail": "All comparative results are reported as single point estimates without confidence intervals, standard errors, or significance tests, making it impossible to assess whether differences are reliable."
    426     },
    427     {
    428       "flag": "Code and raw outputs not released",
    429       "detail": "TICODER implementation and Codex outputs are promised 'in the near future' but not actually released, making reproduction impossible."
    430     },
    431     {
    432       "flag": "Default configuration selected on MBPP pass@1@1",
    433       "detail": "The default configuration was chosen based on performance on a single metric (pass@1@1) on a single benchmark (MBPP), and Table III shows this is not optimal for HumanEval."
    434     },
    435     {
    436       "flag": "No conflict of interest disclosure",
    437       "detail": "Microsoft Research authors evaluate their own tool (TICODER) with no competing interests statement, and no mention of potential patent or commercial interests."
    438     }
    439   ],
    440   "cited_papers": [
    441     {
    442       "title": "Evaluating large language models trained on code (Codex)",
    443       "relevance": "Introduces Codex and HumanEval benchmark; core baseline model and evaluation dataset used throughout the paper."
    444     },
    445     {
    446       "title": "CodeT: Code generation with generated tests",
    447       "relevance": "Primary comparable baseline; uses LLM-generated tests to rank code without user interaction, directly compared against TICODER."
    448     },
    449     {
    450       "title": "Competition-level code generation with AlphaCode",
    451       "relevance": "Related approach using test generation for code ranking; compared as related work in test-augmented code generation."
    452     },
    453     {
    454       "title": "Program synthesis with large language models (MBPP)",
    455       "relevance": "Introduces MBPP benchmark, one of two primary evaluation datasets used in the paper."
    456     },
    457     {
    458       "title": "Oracle-guided component-based program synthesis (OGIS)",
    459       "relevance": "Foundational work on oracle-guided synthesis that inspires the interactive query framework; the oracle simulation methodology is derived from OGIS."
    460     },
    461     {
    462       "title": "Interactive program synthesis by augmented examples",
    463       "relevance": "Prior PBE work on interactive synthesis with augmented examples; directly motivates the user-query approach for disambiguating intent."
    464     },
    465     {
    466       "title": "Expectation vs. experience: Evaluating the usability of code generation tools powered by LLMs",
    467       "relevance": "Provides motivation for the paper by showing users struggle to evaluate LLM-generated code without running or debugging it."
    468     },
    469     {
    470       "title": "Productivity assessment of neural code completion",
    471       "relevance": "Evidence that Copilot generates non-trivial fractions of real-world code; motivates the importance of correctness guarantees in code generation."
    472     }
    473   ],
    474   "engagement_factors": {
    475     "practical_relevance": {
    476       "score": 3,
    477       "justification": "Directly applicable to IDE code completion tools like Copilot; proposes a concrete workflow that could be integrated into existing products."
    478     },
    479     "surprise_contrarian": {
    480       "score": 1,
    481       "justification": "The idea of using tests for disambiguation is intuitive and grounded in prior PBE work; no surprising or counterintuitive findings."
    482     },
    483     "fear_safety": {
    484       "score": 0,
    485       "justification": "No AI safety or risk concerns raised; focus is on improving code correctness, not on harms."
    486     },
    487     "drama_conflict": {
    488       "score": 0,
    489       "justification": "No controversy or conflict with competing approaches; the paper is constructive and non-adversarial."
    490     },
    491     "demo_ability": {
    492       "score": 2,
    493       "justification": "TICODER is described as a working tool and the workflow is concrete enough to demonstrate, though code is not released."
    494     },
    495     "brand_recognition": {
    496       "score": 2,
    497       "justification": "Microsoft Research team with prominent co-authors; evaluates OpenAI Codex, a well-known model."
    498     }
    499   },
    500   "hn_data": {
    501     "threads": [
    502       {
    503         "hn_id": "44889206",
    504         "title": "Large Language Models Do Not Simulate Human Psychology",
    505         "points": 1,
    506         "comments": 0,
    507         "url": "https://news.ycombinator.com/item?id=44889206",
    508         "created_at": "2025-08-13T14:50:01Z"
    509       },
    510       {
    511         "hn_id": "38006205",
    512         "title": "UK NCSC and GCHQ's Thoughts on Child Safety on Commodity Platforms (2022)",
    513         "points": 1,
    514         "comments": 0,
    515         "url": "https://news.ycombinator.com/item?id=38006205",
    516         "created_at": "2023-10-24T21:54:20Z"
    517       },
    518       {
    519         "hn_id": "28289552",
    520         "title": "Transferring Manipulation from GPU Simulation to a Remote Real-World TriFinger",
    521         "points": 1,
    522         "comments": 0,
    523         "url": "https://news.ycombinator.com/item?id=28289552",
    524         "created_at": "2021-08-24T14:48:13Z"
    525       }
    526     ],
    527     "top_points": 1,
    528     "total_points": 3,
    529     "total_comments": 0
    530   }
    531 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs