scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (26346B)
      1 {
      2   "scan_version": 2,
      3   "active_modules": ["experimental_rigor", "data_leakage"],
      4   "paper": {
      5     "title": "LLM-based Test-driven Interactive Code Generation: User Study and Empirical Evaluation",
      6     "authors": ["Sarah Fakhoury", "Aaditya Naik", "Georgios Sakkas", "Saikat Chakraborty", "Shuvendu K. Lahiri"],
      7     "year": 2024,
      8     "venue": "arXiv",
      9     "arxiv_id": "2404.10100"
     10   },
     11   "methodology_tags": ["rct", "benchmark-eval"],
     12   "key_findings": "TiCoder, a test-driven interactive code generation workflow, significantly improves developers' ability to correctly evaluate AI-generated code (0.84 vs 0.40 correctness) and reduces cognitive load, with no significant time overhead. At scale on MBPP and HumanEval benchmarks, TiCoder achieves an average 45.73% absolute improvement in pass@1 across 7 LLMs within 5 user interactions, using an idealized oracle proxy for user feedback. Smaller models boosted by TiCoder can match or exceed pass@1 of larger models like GPT-4-32k.",
     13   "claims": [
     14     {
     15       "claim": "Participants using TiCoder-PASSFAIL are significantly more likely to correctly evaluate AI-generated code (mean 0.84 vs 0.40, p=0.001).",
     16       "evidence": "Table III, Section VI-A. Mixed-effects logistic regression with correctness as dependent variable, participant and task as random effects. FDR-corrected pairwise comparison.",
     17       "supported": "moderate"
     18     },
     19     {
     20       "claim": "TiCoder reduces task-induced cognitive load significantly compared to the control condition.",
     21       "evidence": "Table III, Section VI-C. NASA TLX scores: 28.00 (A2) and 29.52 (A3) vs 45.46 (A1), with significant pairwise differences for mental demand, effort, stress, and pace.",
     22       "supported": "moderate"
     23     },
     24     {
     25       "claim": "TiCoder does not introduce significant time overhead for task completion.",
     26       "evidence": "Table III, Section VI-B. Mean times: 327.7s (A1), 284.15s (A2), 253.88s (A3). No statistically significant difference.",
     27       "supported": "moderate"
     28     },
     29     {
     30       "claim": "TiCoder achieves an average absolute improvement of 45.73% in pass@1 code generation accuracy across both datasets and all LLMs within 5 user interactions.",
     31       "evidence": "Table IV, Section VII-E. Results shown for 7 models on MBPP and HumanEval. However, this uses an idealized oracle proxy, acknowledged as an upper bound.",
     32       "supported": "weak"
     33     },
     34     {
     35       "claim": "TiCoder can boost smaller model pass@1 accuracy to levels comparable to much larger models like GPT-4-32k within just one user interaction.",
     36       "evidence": "Table IV. code-davinci-002 on MBPP achieves 68.42% with TiCoder-PASSFAIL@1, exceeding GPT-4-32k baseline of 67.13%.",
     37       "supported": "moderate"
     38     }
     39   ],
     40   "checklist": {
     41     "artifacts": {
     42       "code_released": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "No repository URL, GitHub link, or code archive is mentioned anywhere in the paper."
     46       },
     47       "data_released": {
     48         "applies": true,
     49         "answer": true,
     50         "justification": "The paper uses publicly available MBPP and HumanEval benchmarks. No proprietary data was collected beyond user study responses."
     51       },
     52       "environment_specified": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "No requirements.txt, Dockerfile, or environment specification is provided. Only model names are listed."
     56       },
     57       "reproduction_instructions": {
     58         "applies": true,
     59         "answer": false,
     60         "justification": "No step-by-step reproduction instructions, README, or scripts are provided."
     61       }
     62     },
     63     "statistical_methodology": {
     64       "confidence_intervals_or_error_bars": {
     65         "applies": true,
     66         "answer": false,
     67         "justification": "Table III reports only means and p-values. No confidence intervals or error bars are provided for any results."
     68       },
     69       "significance_tests": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "Section V-E describes mixed-effects regression models with ANOVA omnibus tests and FDR correction for multiple comparisons. P-values are reported in Table III."
     73       },
     74       "effect_sizes_reported": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "Absolute improvements are reported with baseline context throughout, e.g., 'from 49.16% to 68.04%, an absolute improvement of 18.88%' (Section VII-E). User study reports means for each condition."
     78       },
     79       "sample_size_justified": {
     80         "applies": true,
     81         "answer": false,
     82         "justification": "The user study uses 15 participants with no power analysis or justification for this sample size. No discussion of whether N=15 is adequate for the within-subjects design."
     83       },
     84       "variance_reported": {
     85         "applies": true,
     86         "answer": false,
     87         "justification": "No standard deviations, variance, or spread measures are reported for either the user study results or benchmark evaluations. Table III shows only means."
     88       }
     89     },
     90     "evaluation_design": {
     91       "baselines_included": {
     92         "applies": true,
     93         "answer": true,
     94         "justification": "Both evaluations include baselines: the user study has a control condition (Assistant 1, no TiCoder), and the benchmark evaluation reports baseline pass@1 and pass@100 for all models."
     95       },
     96       "baselines_contemporary": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "Benchmark evaluation includes GPT-4-turbo, GPT-4-32k, and GPT-3.5-turbo alongside older models. These were contemporary at time of writing."
    100       },
    101       "ablation_study": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "The two TiCoder variants (PASSFAIL vs OUTPUT) serve as a partial ablation of the feedback mechanism. Table V compares execution-based pruning vs prompting with tests."
    105       },
    106       "multiple_metrics": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "User study measures correctness, time, and cognitive load (5 NASA TLX dimensions). Benchmark evaluation uses pass@1 and pass@k@m metrics."
    110       },
    111       "human_evaluation": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "The entire RQ1 is a user study with 15 human participants evaluating code suggestions, measuring correctness, time, and cognitive load."
    115       },
    116       "held_out_test_set": {
    117         "applies": true,
    118         "answer": true,
    119         "justification": "The benchmark evaluation uses hidden test sets from MBPP and HumanEval that TiCoder does not have access to: 'Our workflow does not have access to either Tp or bp' (Section IV-B)."
    120       },
    121       "per_category_breakdown": {
    122         "applies": true,
    123         "answer": true,
    124         "justification": "Results are broken down per model (7 models), per dataset (MBPP, HumanEval), per interaction count (m=1 to 5), and per task in the user study discussion."
    125       },
    126       "failure_cases_discussed": {
    127         "applies": true,
    128         "answer": true,
    129         "justification": "Section VI-A discusses specific participant mistakes in detail, and Section VIII-D discusses precondition-violating tests as failure cases. Section IX discusses limitations."
    130       },
    131       "negative_results_reported": {
    132         "applies": true,
    133         "answer": true,
    134         "justification": "The paper reports that Assistant 3 (TiCoder-OUTPUT) did not achieve statistical significance for correctness despite a higher mean, and that time differences were not significant. It also reports participants making more mistakes with Assistant 3."
    135       }
    136     },
    137     "claims_and_evidence": {
    138       "abstract_claims_supported": {
    139         "applies": true,
    140         "answer": true,
    141         "justification": "Abstract claims of 'significantly more likely to correctly evaluate AI generated code', 'significantly less task-induced cognitive load', and '45.97% improvement' are all supported by results in Tables III and IV."
    142       },
    143       "causal_claims_justified": {
    144         "applies": true,
    145         "answer": true,
    146         "justification": "Causal claims about TiCoder's effect are supported by a within-subjects controlled experiment with Latin Square design (Section V-C) controlling for order effects, task, and participant variability."
    147       },
    148       "generalization_bounded": {
    149         "applies": true,
    150         "answer": false,
    151         "justification": "The title says 'LLM-based Test-driven Interactive Code Generation' broadly, but results are only on simple Python functions from MBPP/HumanEval. The paper acknowledges this in limitations but the title and framing overgeneralize."
    152       },
    153       "alternative_explanations_discussed": {
    154         "applies": true,
    155         "answer": true,
    156         "justification": "Section IX discusses specific alternative explanations: controlled conditions may not reflect real-world usage, the oracle proxy represents an upper bound, test validation may not scale to complex tasks, and participants' evaluation strategies could explain time results."
    157       },
    158       "proxy_outcome_distinction": {
    159         "applies": true,
    160         "answer": true,
    161         "justification": "The paper explicitly acknowledges that the benchmark evaluation uses 'the reference implementation as an idealized proxy' and that 'our empirical evaluation represents an upper bound on the improvement that TiCoder can have on the benchmarks with real users' (Section VII-D). The user study directly measures what it claims."
    162       }
    163     },
    164     "setup_transparency": {
    165       "model_versions_specified": {
    166         "applies": true,
    167         "answer": false,
    168         "justification": "Models are named as 'GPT-3.5-turbo', 'GPT-4-turbo', 'GPT-4-32k' without snapshot dates or API versions. 'code-davinci-002' and 'text-davinci-003' are specific but the chat models lack version specificity."
    169       },
    170       "prompts_provided": {
    171         "applies": true,
    172         "answer": true,
    173         "justification": "Figure 2 shows the actual prompt format including prefix, description, header, prompt body, and test body with a concrete example. The prompt construction is described in enough detail (Section IV-B1) to reconstruct."
    174       },
    175       "hyperparameters_reported": {
    176         "applies": true,
    177         "answer": true,
    178         "justification": "Temperature is reported as 0.8 for all models (Section VII-C), with justification for this choice. Default API parameters are noted for user study code generation (temperature=1.0 for GPT-3.5-turbo, Section V-B2)."
    179       },
    180       "scaffolding_described": {
    181         "applies": true,
    182         "answer": true,
    183         "justification": "The TiCoder workflow is described in detail in Section IV: test generation, discriminative ranking policy (with formula), pruning logic, and code ranking strategy. Figure 1 provides a workflow diagram."
    184       },
    185       "data_preprocessing_documented": {
    186         "applies": true,
    187         "answer": true,
    188         "justification": "Section V-B describes task selection from MBPP (clustering by domain and complexity), code/test generation process (sampling 5 incorrect codes), and filtering criteria. Section VII-A describes HumanEval modification to remove examples from docstrings."
    189       }
    190     },
    191     "limitations_and_scope": {
    192       "limitations_section_present": {
    193         "applies": true,
    194         "answer": true,
    195         "justification": "Section IX 'Limitations and Threats' provides substantive discussion of generalizability, test execution overhead, and experimental constraints."
    196       },
    197       "threats_to_validity_specific": {
    198         "applies": true,
    199         "answer": true,
    200         "justification": "Section IX discusses specific threats: controlled conditions restrict code editing, tasks are simple Python functions, oracle proxy is an upper bound, participants may provide noisy input on edge cases, and test validation may not scale to complex scenarios."
    201       },
    202       "scope_boundaries_stated": {
    203         "applies": true,
    204         "answer": true,
    205         "justification": "The paper explicitly states what was not tested: real-world code settings, broader audiences, online metrics like acceptance rates, complex programs, different languages (Section IX). Also notes the oracle proxy limitation."
    206       }
    207     },
    208     "data_integrity": {
    209       "raw_data_available": {
    210         "applies": true,
    211         "answer": false,
    212         "justification": "No raw data from the user study (recordings, survey responses) or benchmark experiments (cached LLM outputs) is made available."
    213       },
    214       "data_collection_described": {
    215         "applies": true,
    216         "answer": true,
    217         "justification": "Section V describes the user study protocol in detail: recruitment, interview format, survey platform, video recordings, 45-minute sessions, task time limits. Section VII describes LLM querying procedure."
    218       },
    219       "recruitment_methods_described": {
    220         "applies": true,
    221         "answer": true,
    222         "justification": "Section V states 'We recruit participants using a mix of distribution lists and personal contacts.' Table I provides demographics: 8 participants from industry and 10 from academia (including 3 pilots)."
    223       },
    224       "data_pipeline_documented": {
    225         "applies": true,
    226         "answer": true,
    227         "justification": "The pipeline from task selection → code/test generation → user study execution → metric extraction is documented across Sections V-A through V-D. Benchmark pipeline described in Section VII."
    228       }
    229     },
    230     "conflicts_of_interest": {
    231       "funding_disclosed": {
    232         "applies": true,
    233         "answer": false,
    234         "justification": "No funding source, acknowledgments section, or grant information is mentioned in the paper."
    235       },
    236       "affiliations_disclosed": {
    237         "applies": true,
    238         "answer": true,
    239         "justification": "Author affiliations are clearly listed: Microsoft Research, University of Pennsylvania, and UC San Diego."
    240       },
    241       "funder_independent_of_outcome": {
    242         "applies": true,
    243         "answer": false,
    244         "justification": "Three of five authors are from Microsoft Research. Microsoft has a significant investment in OpenAI and GitHub Copilot, products directly related to this research. No independence of funder discussed."
    245       },
    246       "financial_interests_declared": {
    247         "applies": true,
    248         "answer": false,
    249         "justification": "No competing interests or financial interests statement is present in the paper."
    250       }
    251     },
    252     "contamination": {
    253       "training_cutoff_stated": {
    254         "applies": true,
    255         "answer": false,
    256         "justification": "No training data cutoff dates are stated for any of the 7 models used, despite evaluating on MBPP and HumanEval benchmarks."
    257       },
    258       "train_test_overlap_discussed": {
    259         "applies": true,
    260         "answer": false,
    261         "justification": "No discussion of whether MBPP or HumanEval problems appeared in any model's training data, despite these being well-known public benchmarks."
    262       },
    263       "benchmark_contamination_addressed": {
    264         "applies": true,
    265         "answer": false,
    266         "justification": "MBPP (2021) and HumanEval (2021) were published before the training cutoff of most models used (GPT-4, GPT-3.5-turbo). No contamination analysis is performed."
    267       }
    268     },
    269     "human_studies": {
    270       "pre_registered": {
    271         "applies": true,
    272         "answer": false,
    273         "justification": "No pre-registration link or mention of pre-registration is present."
    274       },
    275       "irb_or_ethics_approval": {
    276         "applies": true,
    277         "answer": true,
    278         "justification": "Section V states 'The study was IRB approved with voluntary participation and paid $15.'"
    279       },
    280       "demographics_reported": {
    281         "applies": true,
    282         "answer": true,
    283         "justification": "Table I provides detailed demographics: Python experience, Python usage frequency, AI assistant use frequency, and occupation (industry/academia) for all 18 participants."
    284       },
    285       "inclusion_exclusion_criteria": {
    286         "applies": true,
    287         "answer": false,
    288         "justification": "No explicit inclusion or exclusion criteria are stated. Participants were recruited via 'distribution lists and personal contacts' with no described screening process."
    289       },
    290       "randomization_described": {
    291         "applies": true,
    292         "answer": true,
    293         "justification": "Section V-C describes a Latin Square Design with randomized order of assistants across tasks to balance order effects."
    294       },
    295       "blinding_described": {
    296         "applies": true,
    297         "answer": false,
    298         "justification": "No blinding is described. Participants were given 'general instructions around how to interact with each AI assistant, the differences between them' (Section V-C), indicating they knew which condition they were in."
    299       },
    300       "attrition_reported": {
    301         "applies": true,
    302         "answer": true,
    303         "justification": "Section V states 3 of 18 participants were used as pilots, and the remaining 15 were used in the final experiment. No dropout from the final 15 is reported, implying full completion."
    304       }
    305     },
    306     "cost_and_practicality": {
    307       "inference_cost_reported": {
    308         "applies": true,
    309         "answer": false,
    310         "justification": "The paper generates 100 code and 50 test suggestions per problem across multiple models and datasets but reports no API costs, token counts, or inference latency."
    311       },
    312       "compute_budget_stated": {
    313         "applies": true,
    314         "answer": false,
    315         "justification": "No total computational budget, GPU hours, or API spend is reported despite substantial LLM usage across 7 models and 2 datasets."
    316       }
    317     },
    318     "experimental_rigor": {
    319       "seed_sensitivity_reported": {
    320         "applies": true,
    321         "answer": false,
    322         "justification": "Results are from a single cache of 100 code and 50 test suggestions per model: 'we only query each model once to generate an initial set' (Section VII-C). No multi-seed analysis."
    323       },
    324       "number_of_runs_stated": {
    325         "applies": true,
    326         "answer": true,
    327         "justification": "Section VII-C explicitly states models were queried once to generate a cache: 'we only query each model once to generate an initial set of 100 code and 50 test suggestions.'"
    328       },
    329       "hyperparameter_search_budget": {
    330         "applies": true,
    331         "answer": false,
    332         "justification": "The paper states 'We experimented with different temperature configurations' but does not report how many configurations were tried or the search budget."
    333       },
    334       "best_config_selection_justified": {
    335         "applies": true,
    336         "answer": true,
    337         "justification": "Temperature selection is justified: 'we settle on a temperature of 0.8, as it maximizes the number of examples for which at least one correct code is produced within 100 suggestions' (Section VII-C)."
    338       },
    339       "multiple_comparison_correction": {
    340         "applies": true,
    341         "answer": true,
    342         "justification": "Section V-E states 'To correct for multiple comparisons and conduct False Discovery Rate (FDR) correction' using Benjamini-Hochberg."
    343       },
    344       "self_comparison_bias_addressed": {
    345         "applies": true,
    346         "answer": false,
    347         "justification": "Authors propose and evaluate their own TiCoder system without acknowledging self-comparison bias or having independent evaluation."
    348       },
    349       "compute_budget_vs_performance": {
    350         "applies": true,
    351         "answer": false,
    352         "justification": "TiCoder requires generating 100 code + 50 test suggestions per problem plus execution overhead, but no compute comparison with the baseline (single generation) is provided."
    353       },
    354       "benchmark_construct_validity": {
    355         "applies": true,
    356         "answer": false,
    357         "justification": "MBPP and HumanEval are used without discussing whether simple single-function Python problems are representative of real code generation needs. The limitations section mentions this generalizability concern but does not analyze construct validity."
    358       },
    359       "scaffold_confound_addressed": {
    360         "applies": false,
    361         "answer": false,
    362         "justification": "TiCoder IS the scaffold being tested; the paper is evaluating the scaffold's effect, not comparing models through different scaffolds."
    363       }
    364     },
    365     "data_leakage": {
    366       "temporal_leakage_addressed": {
    367         "applies": true,
    368         "answer": false,
    369         "justification": "MBPP and HumanEval were published in 2021, before training cutoffs of GPT-4 and GPT-3.5-turbo. No temporal leakage discussion."
    370       },
    371       "feature_leakage_addressed": {
    372         "applies": true,
    373         "answer": false,
    374         "justification": "No discussion of whether docstrings or function signatures in MBPP/HumanEval leak information about expected solutions."
    375       },
    376       "non_independence_addressed": {
    377         "applies": true,
    378         "answer": false,
    379         "justification": "No discussion of independence between benchmark problems or potential structural similarities."
    380       },
    381       "leakage_detection_method": {
    382         "applies": true,
    383         "answer": false,
    384         "justification": "No leakage detection or prevention method is applied."
    385       }
    386     }
    387   },
    388   "red_flags": [
    389     {
    390       "flag": "Very small user study sample",
    391       "detail": "N=15 participants with 3 conditions and 3 tasks in a within-subjects design. No power analysis. Some cells have only 4-5 participants per task-treatment combination, making individual results fragile."
    392     },
    393     {
    394       "flag": "Idealized oracle proxy inflates benchmark results",
    395       "detail": "The headline 45.73% improvement uses a perfect oracle (reference implementation) to simulate user feedback. The paper acknowledges this is an upper bound, but the abstract presents it without this caveat."
    396     },
    397     {
    398       "flag": "Contamination risk unaddressed",
    399       "detail": "MBPP and HumanEval are widely used benchmarks published in 2021. Models like GPT-4 likely saw these problems during training, which could inflate both baseline and TiCoder results unpredictably."
    400     },
    401     {
    402       "flag": "Microsoft conflict of interest",
    403       "detail": "Three authors are from Microsoft Research. Microsoft has major investments in OpenAI (whose models are evaluated) and GitHub Copilot (a Microsoft product in a directly related application area). No competing interests statement is present."
    404     },
    405     {
    406       "flag": "No code or data release",
    407       "detail": "Despite proposing a specific workflow with implementation details, no code repository or experimental artifacts are released for reproduction."
    408     }
    409   ],
    410   "cited_papers": [
    411     {
    412       "title": "Evaluating large language models trained on code",
    413       "authors": ["Mark Chen", "Jerry Tworek"],
    414       "year": 2021,
    415       "arxiv_id": "2107.03374",
    416       "relevance": "Introduces HumanEval benchmark and Codex, foundational for code generation evaluation."
    417     },
    418     {
    419       "title": "Program synthesis with large language models",
    420       "authors": ["Jacob Austin", "Augustus Odena"],
    421       "year": 2021,
    422       "arxiv_id": "2108.07732",
    423       "relevance": "Introduces MBPP benchmark used in this paper's evaluation."
    424     },
    425     {
    426       "title": "CodeT: Code generation with generated tests",
    427       "authors": ["Bei Chen", "Fengji Zhang"],
    428       "year": 2022,
    429       "arxiv_id": "2207.10397",
    430       "relevance": "Uses LLM-generated tests to rerank code suggestions, directly related approach to TiCoder."
    431     },
    432     {
    433       "title": "Competition-level code generation with AlphaCode",
    434       "authors": ["Yujia Li", "David Choi"],
    435       "year": 2022,
    436       "arxiv_id": "2203.07814",
    437       "relevance": "Proposes test-based grouping of code suggestions for competitive programming."
    438     },
    439     {
    440       "title": "CodaMosa: Escaping coverage plateaus in test generation with pre-trained large language models",
    441       "authors": ["Caroline Lemieux", "Jeevana Priya Inala", "Shuvendu K. Lahiri"],
    442       "year": 2023,
    443       "relevance": "LLM-augmented test generation for improving code coverage."
    444     },
    445     {
    446       "title": "Understanding the usability of AI programming assistants",
    447       "authors": ["Jenny T. Liang", "Chenyang Yang", "Brad A. Myers"],
    448       "year": 2023,
    449       "relevance": "Identifies usability issues with AI programming assistants including lack of feedback mechanisms."
    450     },
    451     {
    452       "title": "Taking flight with Copilot: Early insights and opportunities of AI-powered pair-programming tools",
    453       "authors": ["Christian Bird", "Denae Ford", "Thomas Zimmermann"],
    454       "year": 2022,
    455       "relevance": "Shows developer roles shifting toward more code review with AI tools."
    456     },
    457     {
    458       "title": "Reading between the lines: Modeling user behavior and costs in AI-assisted programming",
    459       "authors": ["Hussein Mozannar", "Gagan Bansal", "Adam Fourney", "Eric Horvitz"],
    460       "year": 2022,
    461       "relevance": "Models developer interaction patterns with Copilot, finding significant verification time."
    462     },
    463     {
    464       "title": "Grounded Copilot: How programmers interact with code-generating models",
    465       "authors": ["Shraddha Barke", "Michael B. James", "Nadia Polikarpova"],
    466       "year": 2023,
    467       "relevance": "Studies how programmers interact with Copilot, identifies disambiguation as key need."
    468     },
    469     {
    470       "title": "Is GitHub's Copilot as bad as humans at introducing vulnerabilities in code?",
    471       "authors": ["Owura Asare", "Meiyappan Nagappan", "N. Asokan"],
    472       "year": 2022,
    473       "relevance": "Evaluates security of Copilot-generated code, relevant to AI code quality assessment."
    474     },
    475     {
    476       "title": "Do users write more insecure code with AI assistants?",
    477       "authors": ["Neil Perry", "Megha Srivastava", "Deepak Kumar", "Dan Boneh"],
    478       "year": 2022,
    479       "relevance": "User study on security implications of AI-assisted coding."
    480     },
    481     {
    482       "title": "Adaptive test generation using a large language model",
    483       "authors": ["Max Schäfer", "Sarah Nadi", "Aryaz Eghbali", "Frank Tip"],
    484       "year": 2023,
    485       "relevance": "LLM-based test generation approach relevant to TiCoder's test component."
    486     }
    487   ]
    488 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs