scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (19950B)
      1 {
      2   "paper": {
      3     "title": "ClarifyGPT: Empowering LLM-based Code Generation with Intention Clarification",
      4     "authors": ["Fangwen Mu", "Lin Shi", "Song Wang", "Zhuohao Yu", "Binquan Zhang", "ChenXue Wang", "Shichao Liu", "Qing Wang"],
      5     "year": 2023,
      6     "venue": "arXiv preprint",
      7     "arxiv_id": "2310.10996"
      8   },
      9   "checklist": {
     10     "artifacts": {
     11       "code_released": {
     12         "applies": true,
     13         "answer": true,
     14         "justification": "GitHub repository provided: https://github.com/ClarifyGPT/ClarifyGPT (reference [1])."
     15       },
     16       "data_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "The paper uses publicly available benchmarks (HumanEval, MBPP-sanitized, HumanEval-ET, MBPP-ET) and states 'publicly accessible dataset and source code' in contributions."
     20       },
     21       "environment_specified": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "No requirements.txt, Dockerfile, or detailed environment setup section found. Only API model names are mentioned."
     25       },
     26       "reproduction_instructions": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No step-by-step reproduction instructions found in the paper. A GitHub link is provided, but the paper itself has no section on reproducing the results."
     30       }
     31     },
     32     "statistical_methodology": {
     33       "confidence_intervals_or_error_bars": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "Results in Tables 2-4 report only point estimates (e.g., 80.80%) with no confidence intervals or error bars, despite running experiments three times."
     37       },
     38       "significance_tests": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper claims ClarifyGPT outperforms baselines but provides no statistical significance tests. Comparisons are based solely on point estimate differences."
     42       },
     43       "effect_sizes_reported": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "Relative improvements are reported with baseline context, e.g., 'improves the average performance of GPT-4 across four benchmarks from 68.02% to 75.75%' (Section 5.2)."
     47       },
     48       "sample_size_justified": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "No justification for why 10 participants were recruited for the human evaluation, nor why 3 runs were chosen for averaging."
     52       },
     53       "variance_reported": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "The paper states 'we run each approach three times and report the average results' (Section 4.4) but does not report standard deviation or any spread measure across runs."
     57       }
     58     },
     59     "evaluation_design": {
     60       "baselines_included": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "Three baselines compared: Default LLM, Chain-of-Thought (CoT), and GPT-Engineer (Section 4.5)."
     64       },
     65       "baselines_contemporary": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "CoT (2022) and GPT-Engineer (2023) are contemporary baselines. GPT-Engineer is the most directly related work."
     69       },
     70       "ablation_study": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "RQ3 varies the number of demonstrations (zero to three-shot), functioning as an ablation of the prompt design component (Table 4)."
     74       },
     75       "multiple_metrics": {
     76         "applies": true,
     77         "answer": false,
     78         "justification": "Only Pass@1 is used as the evaluation metric. No other metrics (e.g., Pass@k for k>1, code quality measures) are reported."
     79       },
     80       "human_evaluation": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "RQ1 involves 10 human participants answering clarifying questions and evaluating code generation on MBPP-sanitized and MBPP-ET (Section 5.1)."
     84       },
     85       "held_out_test_set": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "Evaluation uses standard benchmark test suites (HumanEval, MBPP-sanitized, etc.) which are separate from the demonstration examples selected from the first three problems."
     89       },
     90       "per_category_breakdown": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Results are broken down per benchmark (HumanEval, HumanEval-ET, MBPP-sanitized, MBPP-ET) in Tables 2-4."
     94       },
     95       "failure_cases_discussed": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Section 6.1 case study shows both success and failure cases. Section 6.2 Limitations discusses where ClarifyGPT fails (complex inputs, code without return values)."
     99       },
    100       "negative_results_reported": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "The zero-shot setting shows marginal improvement (0.0-2.8%), reported honestly in Table 4. The paper also notes ClarifyGPT (Simulated Feedback) underperforms Human Feedback."
    104       }
    105     },
    106     "claims_and_evidence": {
    107       "abstract_claims_supported": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "Abstract claims (GPT-4 from 70.96% to 80.80% on MBPP-sanitized; average improvements of 11.52% and 15.07%) match Tables 2 and 3."
    111       },
    112       "causal_claims_justified": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Causal claims ('ClarifyGPT improves code generation') are supported by controlled comparisons where ClarifyGPT is the only variable changed against Default, CoT, and GPT-Engineer baselines."
    116       },
    117       "generalization_bounded": {
    118         "applies": true,
    119         "answer": false,
    120         "justification": "The abstract claims ClarifyGPT 'can effectively facilitate the practical application of LLMs in real-world development environments' but tests only on Python function-level benchmarks with short requirements. Section 6.2 acknowledges limitations but the abstract/conclusion overgeneralize."
    121       },
    122       "alternative_explanations_discussed": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "The threats to validity section (6.3) discusses data leakage and simulation fidelity but does not consider alternative explanations for the observed improvements (e.g., additional tokens/context providing more information regardless of clarification quality)."
    126       }
    127     },
    128     "setup_transparency": {
    129       "model_versions_specified": {
    130         "applies": true,
    131         "answer": true,
    132         "justification": "Section 4.2 specifies 'gpt-3.5-turbo' for ChatGPT and 'gpt-4-turbo' for GPT-4, though no snapshot dates are given. These are API model names, not versioned snapshots."
    133       },
    134       "prompts_provided": {
    135         "applies": true,
    136         "answer": true,
    137         "justification": "Figure 3 shows the actual prompt templates used for seed input initialization, question generation, user simulation, and enhanced code generation, including instruction text and demonstration structure."
    138       },
    139       "hyperparameters_reported": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "Section 4.6 reports top_p=0.95, frequency_penalty=0, max_tokens=800/300, temperature=0 (or 0.8 for sampling)."
    143       },
    144       "scaffolding_described": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "The four-stage pipeline (test input generation, code consistency check, reasoning-based question generation, enhanced code generation) is described in detail in Section 3 with Figure 1."
    148       },
    149       "data_preprocessing_documented": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Section 4.6 describes prompt construction (selecting first three problems as demonstration seeds). Section 4.3 describes benchmark statistics. The process from benchmark to evaluation is documented."
    153       }
    154     },
    155     "limitations_and_scope": {
    156       "limitations_section_present": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "Section 6.2 'Benefits and Limitations' and Section 6.3 'Threats to Validity' provide substantive discussion."
    160       },
    161       "threats_to_validity_specific": {
    162         "applies": true,
    163         "answer": true,
    164         "justification": "Section 6.3 discusses specific threats: data leakage from training on public benchmarks, simulation fidelity concerns, and generalizability across only two LLMs and four datasets."
    165       },
    166       "scope_boundaries_stated": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "Section 6.2 Limitations explicitly states ClarifyGPT is not suitable for code with complex inputs (images, files) or code without return values, and requires instruction-tuned LLMs."
    170       }
    171     },
    172     "data_integrity": {
    173       "raw_data_available": {
    174         "applies": true,
    175         "answer": false,
    176         "justification": "Human evaluation responses from the 10 participants are not made available. Only aggregate Pass@1 results are reported."
    177       },
    178       "data_collection_described": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "Section 5.1 describes the human evaluation data collection: 140 ambiguous problems identified, questionnaires with three elements, each problem assessed by three participants."
    182       },
    183       "recruitment_methods_described": {
    184         "applies": true,
    185         "answer": true,
    186         "justification": "Section 5.1 describes participants: 'ten participants, including three Ph.D. students, two Master's students, two senior researchers, and three industry developers' with Python experience details."
    187       },
    188       "data_pipeline_documented": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "The pipeline from requirement input through ambiguity detection (140 of 427 identified as ambiguous), question generation (avg 2.85 questions per problem), human response collection, to code generation is documented."
    192       }
    193     },
    194     "conflicts_of_interest": {
    195       "funding_disclosed": {
    196         "applies": true,
    197         "answer": false,
    198         "justification": "No funding or acknowledgments section found in the paper."
    199       },
    200       "affiliations_disclosed": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "Author affiliations are clearly listed, including one author from Huawei Central Software Institute."
    204       },
    205       "funder_independent_of_outcome": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "No funding information is disclosed, so independence cannot be assessed. One author is from Huawei, which has commercial interest in code generation tools."
    209       },
    210       "financial_interests_declared": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No competing interests statement found in the paper."
    214       }
    215     },
    216     "contamination": {
    217       "training_cutoff_stated": {
    218         "applies": true,
    219         "answer": false,
    220         "justification": "The paper does not state training data cutoff dates for ChatGPT (gpt-3.5-turbo) or GPT-4 (gpt-4-turbo), despite evaluating on public benchmarks."
    221       },
    222       "train_test_overlap_discussed": {
    223         "applies": true,
    224         "answer": true,
    225         "justification": "The first threat in Section 6.3 discusses data leakage: 'Since these LLMs are trained on open-source code repositories, it is possible that some public benchmarks were included in their training data.'"
    226       },
    227       "benchmark_contamination_addressed": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "While Section 6.3 acknowledges the risk, the mitigation is weak — stating that benchmarks are 'manually crafted' and 'widely employed' does not address whether they appeared in training data. HumanEval (2021) and MBPP (2021) predate GPT-4's training."
    231       }
    232     },
    233     "human_studies": {
    234       "pre_registered": {
    235         "applies": true,
    236         "answer": false,
    237         "justification": "No pre-registration mentioned for the human evaluation study."
    238       },
    239       "irb_or_ethics_approval": {
    240         "applies": true,
    241         "answer": false,
    242         "justification": "No IRB or ethics board approval mentioned despite involving 10 human participants."
    243       },
    244       "demographics_reported": {
    245         "applies": true,
    246         "answer": true,
    247         "justification": "Section 5.1 reports participant roles (PhD students, Master's students, senior researchers, industry developers) and Python experience levels (at least 3 years, 6 with 5+ years)."
    248       },
    249       "inclusion_exclusion_criteria": {
    250         "applies": true,
    251         "answer": false,
    252         "justification": "No explicit inclusion/exclusion criteria stated. The paper mentions that participants 'have at least three years of experience in Python development', but it is unclear whether this was a selection criterion."
    253       },
    254       "randomization_described": {
    255         "applies": true,
    256         "answer": false,
    257         "justification": "The assignment of 42 problems per participant is mentioned but no randomization procedure is described for problem assignment."
    258       },
    259       "blinding_described": {
    260         "applies": true,
    261         "answer": false,
    262         "justification": "No blinding described. Participants answering clarifying questions likely knew they were evaluating ClarifyGPT."
    263       },
    264       "attrition_reported": {
    265         "applies": true,
    266         "answer": false,
    267         "justification": "No mention of whether all 10 participants completed all assigned problems or if there was any dropout."
    268       }
    269     },
    270     "cost_and_practicality": {
    271       "inference_cost_reported": {
    272         "applies": true,
    273         "answer": false,
    274         "justification": "No API costs, token counts, or latency reported despite ClarifyGPT requiring multiple LLM calls per problem (sampling n code solutions, generating test inputs, generating questions, simulating answers, final generation)."
    275       },
    276       "compute_budget_stated": {
    277         "applies": true,
    278         "answer": false,
    279         "justification": "No total API spend or computational budget reported."
    280       }
    281     }
    282   },
    283   "claims": [
    284     {
    285       "claim": "ClarifyGPT elevates GPT-4 Pass@1 on MBPP-sanitized from 70.96% to 80.80% with human feedback.",
    286       "evidence": "Table 2 in Section 5.1 shows this result from human evaluation with 10 participants.",
    287       "supported": "moderate"
    288     },
    289     {
    290       "claim": "ClarifyGPT (Simulated Feedback) improves GPT-4 average Pass@1 across four benchmarks from 68.02% to 75.75%.",
    291       "evidence": "Table 3 in Section 5.2 shows results across HumanEval, HumanEval-ET, MBPP-sanitized, MBPP-ET.",
    292       "supported": "moderate"
    293     },
    294     {
    295       "claim": "ClarifyGPT (Simulated Feedback) improves ChatGPT average Pass@1 from 58.55% to 67.22%.",
    296       "evidence": "Table 3 in Section 5.2.",
    297       "supported": "moderate"
    298     },
    299     {
    300       "claim": "ClarifyGPT demonstrates robustness to the number of demonstrations, consistently outperforming Default from zero-shot to three-shot.",
    301       "evidence": "Table 4 in Section 5.3 shows consistent improvements, though zero-shot gains are marginal (0.0-2.8%).",
    302       "supported": "moderate"
    303     },
    304     {
    305       "claim": "The user simulation method produces high-fidelity simulated feedback close to real user responses.",
    306       "evidence": "Comparison of ClarifyGPT (Human Feedback) vs (Simulated Feedback) in Table 3 shows similar but not identical results. Only tested on MBPP-sanitized/MBPP-ET overlap.",
    307       "supported": "weak"
    308     }
    309   ],
    310   "methodology_tags": ["benchmark-eval", "case-study"],
    311   "key_findings": "ClarifyGPT improves LLM code generation by detecting ambiguous requirements via code consistency checking and generating targeted clarifying questions. With human feedback, it raises GPT-4 Pass@1 on MBPP-sanitized from 70.96% to 80.80% (a 13.87% relative improvement). With simulated feedback, it achieves average relative improvements of 11.52% (GPT-4) and 15.07% (ChatGPT) across four benchmarks. The framework outperforms GPT-Engineer, which asks questions indiscriminately for all requirements.",
    312   "red_flags": [
    313     {
    314       "flag": "No statistical tests or variance reporting",
    315       "detail": "Despite running experiments three times, no standard deviations, confidence intervals, or significance tests are reported. It is impossible to assess whether observed differences are statistically meaningful."
    316     },
    317     {
    318       "flag": "Simulated feedback uses ground-truth test cases",
    319       "detail": "The user simulation method provides ground-truth test cases to the LLM to generate simulated answers. This gives the simulation access to information real users would not have, potentially inflating performance in the automated evaluation."
    320     },
    321     {
    322       "flag": "No inference cost analysis",
    323       "detail": "ClarifyGPT requires multiple LLM calls per problem (sampling n code solutions, generating test inputs, generating questions, simulating answers, final generation) but reports no cost or latency comparison with baselines."
    324     },
    325     {
    326       "flag": "Small human evaluation sample",
    327       "detail": "Only 10 participants with no power analysis, no IRB approval, and no randomization or blinding described. Results may not generalize."
    328     }
    329   ],
    330   "cited_papers": [
    331     {
    332       "title": "Evaluating Large Language Models Trained on Code",
    333       "authors": ["Mark Chen"],
    334       "year": 2021,
    335       "arxiv_id": "2107.03374",
    336       "relevance": "Introduces HumanEval benchmark and Codex, foundational for LLM code generation evaluation."
    337     },
    338     {
    339       "title": "Self-collaboration Code Generation via ChatGPT",
    340       "authors": ["Yihong Dong", "Xue Jiang", "Zhi Jin", "Ge Li"],
    341       "year": 2023,
    342       "arxiv_id": "2304.07590",
    343       "relevance": "Multi-agent collaboration approach for LLM code generation."
    344     },
    345     {
    346       "title": "Is Your Code Generated by ChatGPT Really Correct? Rigorous Evaluation of Large Language Models for Code Generation",
    347       "authors": ["Jiawei Liu", "Chunqiu Steven Xia", "Yuyao Wang", "Lingming Zhang"],
    348       "year": 2023,
    349       "arxiv_id": "2305.01210",
    350       "relevance": "Introduces extended test suites (HumanEval-ET, MBPP-ET) and type-aware mutation used by ClarifyGPT."
    351     },
    352     {
    353       "title": "CodeT: Code Generation with Generated Tests",
    354       "authors": ["Bei Chen"],
    355       "year": 2022,
    356       "arxiv_id": "2207.10397",
    357       "relevance": "Uses generated tests to improve code generation quality, related post-processing approach."
    358     },
    359     {
    360       "title": "CODAMOSA: Escaping coverage plateaus in test generation with pre-trained large language models",
    361       "authors": ["Caroline Lemieux", "Jeevana Priya Inala", "Shuvendu K Lahiri", "Siddhartha Sen"],
    362       "year": 2023,
    363       "relevance": "LLM-augmented test generation for software engineering."
    364     },
    365     {
    366       "title": "Python Code Generation by Asking Clarification Questions",
    367       "authors": ["Haau-Sing Li", "Mohsen Mesgar", "André F. T. Martins", "Iryna Gurevych"],
    368       "year": 2023,
    369       "relevance": "Most directly related prior work on clarification questions for code generation."
    370     },
    371     {
    372       "title": "Interactive Code Generation via Test-Driven User-Intent Formalization",
    373       "authors": ["Shuvendu K. Lahiri"],
    374       "year": 2022,
    375       "arxiv_id": "2208.05950",
    376       "relevance": "Interactive code generation approach using test-driven intent formalization."
    377     },
    378     {
    379       "title": "Chain-of-Thought Prompting Elicits Reasoning in Large Language Models",
    380       "authors": ["Jason Wei"],
    381       "year": 2022,
    382       "relevance": "Foundational prompting technique used as both inspiration and baseline in this work."
    383     },
    384     {
    385       "title": "Program Synthesis with Large Language Models",
    386       "authors": ["Jacob Austin"],
    387       "year": 2021,
    388       "arxiv_id": "2108.07732",
    389       "relevance": "Introduces MBPP benchmark used in evaluation."
    390     }
    391   ]
    392 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs