scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (29306B)
      1 {
      2   "paper": {
      3     "title": "An Agent-based Evaluation Framework for Complex Code Generation",
      4     "authors": [
      5       "Xinchen Wang",
      6       "Ruida Hu",
      7       "Pengfei Gao",
      8       "Chao Peng",
      9       "Cuiyun Gao"
     10     ],
     11     "year": 2025,
     12     "venue": "arXiv preprint",
     13     "arxiv_id": "2504.13472"
     14   },
     15   "checklist": {
     16     "artifacts": {
     17       "code_released": {
     18         "applies": true,
     19         "answer": true,
     20         "justification": "The abstract states: 'The resources of CodeVisionary are available at https://github.com/Eshe0922/CodeVisionary.' A working GitHub URL is provided."
     21       },
     22       "data_released": {
     23         "applies": true,
     24         "answer": false,
     25         "justification": "The benchmark of 363 samples was constructed by the authors from CodeArena filtered to 'hard' tasks with human annotations. No link to download this constructed benchmark is provided in the paper; only the GitHub link for the framework code is mentioned."
     26       },
     27       "environment_specified": {
     28         "applies": true,
     29         "answer": false,
     30         "justification": "The paper mentions using Docker containers for code execution within CodeVisionary, but does not provide a requirements file, Dockerfile for the framework itself, or a detailed environment setup section listing library versions needed to run the evaluation framework."
     31       },
     32       "reproduction_instructions": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "The paper references the GitHub repository but provides no step-by-step instructions for reproducing the main experimental results in the paper text itself. It is unclear from the paper whether the repository contains such instructions."
     36       }
     37     },
     38     "statistical_methodology": {
     39       "confidence_intervals_or_error_bars": {
     40         "applies": true,
     41         "answer": false,
     42         "justification": "All main results in Table II and Table III are reported as single point estimates (Pearson, Spearman, Kendall-Tau coefficients) with no confidence intervals, error bars, or uncertainty quantification."
     43       },
     44       "significance_tests": {
     45         "applies": true,
     46         "answer": false,
     47         "justification": "The paper claims CodeVisionary outperforms baselines based on comparing numeric coefficients (e.g., 0.301 vs. 0.084 average rp) but performs no statistical significance tests to validate these differences."
     48       },
     49       "effect_sizes_reported": {
     50         "applies": true,
     51         "answer": true,
     52         "justification": "The paper reports both raw correlation coefficients and absolute improvements (e.g., 'outperforming the best baseline with average improvements of 0.217, 0.163, and 0.141 in Pearson, Spearman, and Kendall-Tau coefficients'), providing sufficient context to assess magnitude."
     53       },
     54       "sample_size_justified": {
     55         "applies": true,
     56         "answer": false,
     57         "justification": "The benchmark uses 363 samples (121 'hard' tasks x 3 LLM responses) obtained by filtering CodeArena's 397 tasks, but there is no power analysis or justification for why this sample size is sufficient for the correlation-based claims made."
     58       },
     59       "variance_reported": {
     60         "applies": true,
     61         "answer": false,
     62         "justification": "The paper states 'we perform multiple trials and take the average as the experimental results' (Section V-D) but does not report standard deviation, IQR, or any spread measure across these trials. Only average values appear in the result tables."
     63       }
     64     },
     65     "evaluation_design": {
     66       "baselines_included": {
     67         "applies": true,
     68         "answer": true,
     69         "justification": "Three baselines are compared: VANILLA (direct LLM prompting), ICE-Score, and CODEJUDGE, representing one naive approach and two state-of-the-art LLM-based code evaluation approaches."
     70       },
     71       "baselines_contemporary": {
     72         "applies": true,
     73         "answer": true,
     74         "justification": "ICE-Score (EACL 2024) and CODEJUDGE (EMNLP 2024) are both recent contemporary baselines. The authors state they 'try our best to reproduce them from publicly available source code and papers.'"
     75       },
     76       "ablation_study": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "Section IV-B (RQ2) presents ablation studies: removing the RMCD stage (w/o RMCD), removing the FSAS stage (w/o FSAS), and removing individual information types (w/o RT, w/o UI/UX, w/o ST) from the RMCD stage, with results in Table III."
     80       },
     81       "multiple_metrics": {
     82         "applies": true,
     83         "answer": true,
     84         "justification": "Three correlation metrics are used: Pearson (rp), Spearman (rs), and Kendall-Tau (tau) coefficients, all measuring correlation with human-annotated ground truth scores."
     85       },
     86       "human_evaluation": {
     87         "applies": true,
     88         "answer": false,
     89         "justification": "The human expert annotations create ground truth labels for the benchmark. They evaluate the LLM-generated code quality, NOT CodeVisionary's outputs. Per the schema: 'The humans must be evaluating what the system produced -- manual classification of the benchmark or dataset itself does not count.' No humans evaluated CodeVisionary's evaluation outputs (scores, reports). Evaluation of the system is entirely automated via correlation metrics."
     90       },
     91       "held_out_test_set": {
     92         "applies": true,
     93         "answer": false,
     94         "justification": "RQ3 explores hyperparameter choices (number of judges 2-5, rounds 2-5) on the same 363-sample test set that final results are reported on. The schema states 'If unclear whether the reported numbers are on data used for any selection decisions, NO.' The same dataset is used for both hyperparameter selection and evaluation."
     95       },
     96       "per_category_breakdown": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "Figure 4 provides per-coding-scenario breakdowns (Development and Programming, Tools Environments and Practices, UI/UX, Emerging Technologies, Algorithm Design) and per-programming-language breakdowns (Python, JavaScript, Java, C#, C)."
    100       },
    101       "failure_cases_discussed": {
    102         "applies": true,
    103         "answer": false,
    104         "justification": "The paper discusses failure cases of the BASELINES (ICE-Score) in Section I to motivate the work. However, it does not discuss where CodeVisionary itself fails or breaks down. The case study (Figure 5) shows a success case. The schema requires 'discussion of where the approach breaks down.'"
    105       },
    106       "negative_results_reported": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "Section V-C tests CodeVisionary on the less complex CoNaLa benchmark and reports that it does not outperform ICE-Score on rp (0.644 vs 0.655), which is an honest negative result. The ablation in Table III also shows performance degradation when components are removed."
    110       }
    111     },
    112     "claims_and_evidence": {
    113       "abstract_claims_supported": {
    114         "applies": true,
    115         "answer": true,
    116         "justification": "The abstract claims 'average improvements of 0.217, 0.163, and 0.141 in Pearson, Spearman, and Kendall-Tau coefficients' and Table II confirms CodeVisionary achieves avg 0.301/0.272/0.241 vs. the best baseline value on each metric, 0.084/0.109/0.100 (CODEJUDGE for rp; ICE-Score for rs and tau). The claim is supported."
    117       },
    118       "causal_claims_justified": {
    119         "applies": true,
    120         "answer": false,
    121         "justification": "The paper makes causal claims such as 'The RMCD stage boosts rp, rs, and tau by 27.0%, 32.0%, and 32.4%' based on ablation studies, but the ablation design removes entire stages rather than isolating individual variables precisely. Additionally, the same dataset is used for hyperparameter selection (RQ3) and final evaluation, creating potential circularity."
    122       },
    123       "generalization_bounded": {
    124         "applies": true,
    125         "answer": false,
    126         "justification": "The paper frames contributions broadly ('the first agent-based evaluation framework for complex code generation') but tests only on one benchmark (CodeArena hard tasks, 363 samples) and one simpler dataset (CoNaLa). The paper acknowledges in the threats section that 'our benchmark may not cover all coding scenarios and programming languages' but the title and abstract do not bound the claims accordingly."
    127       },
    128       "alternative_explanations_discussed": {
    129         "applies": true,
    130         "answer": false,
    131         "justification": "The threats section (Section V-D) mentions only two threats: benchmark coverage limitation and LLM randomness. The paper does not discuss alternative explanations for the improvements, such as whether the advantage stems primarily from GPT-4o having access to more computation (via up to 40 interactions per instance) rather than the architectural design per se."
    132       }
    133     },
    134     "setup_transparency": {
    135       "model_versions_specified": {
    136         "applies": true,
    137         "answer": false,
    138         "justification": "The paper says 'CodeVisionary and the baseline methods are provided access to GPT-4o' (Section III-D), but uses only the marketing name 'GPT-4o' without a specific API version or snapshot date. The references for GPT-3.5-turbo, Claude-3.5-Sonnet, and GPT-4o point to product announcement pages rather than specific model versions."
    139       },
    140       "prompts_provided": {
    141         "applies": true,
    142         "answer": false,
    143         "justification": "The paper describes prompts in natural language (e.g., the 'thought'/'action' paradigm, evaluation criteria) and provides one example of clarity criteria, but the full prompts sent to GPT-4o are not included in the paper text. The paper refers readers to the repository for 'remaining aspects and criteria' but actual prompt text is not reproduced."
    144       },
    145       "hyperparameters_reported": {
    146         "applies": true,
    147         "answer": true,
    148         "justification": "Section III-D reports: temperature=0.2 for RMCD stage, temperature=0.7 for FSAS stage, max interactions=40 for RMCD, number of judges=3, max negotiation rounds=4. These key hyperparameters are specified."
    149       },
    150       "scaffolding_described": {
    151         "applies": true,
    152         "answer": true,
    153         "justification": "The paper extensively describes the agentic scaffolding: Docker container setup, tool list (Dynamic Execution, Static Linter, Unit Tests, Screenshot, Interaction, Web Browsing, General Semantic, Bash Command), the thought/action/observation loop, the Execute/Analyze state alternation, and the multi-judge negotiation protocol with formal definitions."
    154       },
    155       "data_preprocessing_documented": {
    156         "applies": true,
    157         "answer": true,
    158         "justification": "Section III-A documents the benchmark construction pipeline: (1) filter CodeArena to 'hard' tasks, (2) exclude platform-specific tasks (MATLAB, Verilog), (3) generate responses using 3 LLMs, (4) manual scoring by two experts with Kappa>80% and third-expert adjudication. The filtering criteria are stated explicitly."
    159       }
    160     },
    161     "limitations_and_scope": {
    162       "limitations_section_present": {
    163         "applies": true,
    164         "answer": true,
    165         "justification": "Section V-D is titled 'Threats and Limitations' and provides dedicated discussion of threats to validity."
    166       },
    167       "threats_to_validity_specific": {
    168         "applies": true,
    169         "answer": false,
    170         "justification": "Section V-D mentions only two threats: (1) benchmark coverage may not generalize and (2) LLM randomness. Both are generic disclaimers. The threats do not discuss specific concerns like confounding from compute asymmetry between CodeVisionary and baselines, or the fact that the same data is used for both hyperparameter selection and evaluation."
    171       },
    172       "scope_boundaries_stated": {
    173         "applies": true,
    174         "answer": false,
    175         "justification": "The limitations section notes the benchmark 'may not cover all coding scenarios and programming languages' but does not explicitly state what the results do NOT show (e.g., does not state results are limited to GPT-4o as evaluator, or to 'hard' complexity tasks, or to the specific benchmark construction methodology)."
    176       }
    177     },
    178     "data_integrity": {
    179       "raw_data_available": {
    180         "applies": true,
    181         "answer": false,
    182         "justification": "The paper does not provide a download link for the 363-sample benchmark with human annotations. Only the framework code repository is referenced. The human-annotated benchmark data needed to reproduce the correlation results is not publicly available based on the paper."
    183       },
    184       "data_collection_described": {
    185         "applies": true,
    186         "answer": true,
    187         "justification": "Section III-A describes the data collection: filtering CodeArena to hard tasks, excluding platform-specific tasks, generating responses via GPT-3.5-turbo/Claude-3.5-Sonnet/GPT-4o, and human annotation procedure with inter-rater reliability (Kappa>80%)."
    188       },
    189       "recruitment_methods_described": {
    190         "applies": true,
    191         "answer": false,
    192         "justification": "The paper states expert annotators have 'over five years of expertise in the relevant programming languages' but does not describe how these experts were recruited, selected, or compensated, or whether they could introduce selection bias."
    193       },
    194       "data_pipeline_documented": {
    195         "applies": true,
    196         "answer": true,
    197         "justification": "The data pipeline from CodeArena (397 tasks) to final benchmark (363 samples with 121 tasks x 3 LLM responses) is documented: filtering hard tasks, excluding certain platform-specific ones, generating responses, manual scoring. The counts are consistent (121 tasks x 3 = 363 samples)."
    198       }
    199     },
    200     "conflicts_of_interest": {
    201       "funding_disclosed": {
    202         "applies": true,
    203         "answer": true,
    204         "justification": "The Acknowledgment section discloses funding: National Natural Science Foundation of China (No. 62472126, 62276075), Natural Science Foundation of Guangdong Province (No. 2023A1515011959), and Shenzhen-Hong Kong Jointly Funded Project (No. SGDX20230116091246007)."
    205       },
    206       "affiliations_disclosed": {
    207         "applies": true,
    208         "answer": true,
    209         "justification": "Author affiliations are listed on the first page: Xinchen Wang and Ruida Hu are from Harbin Institute of Technology, Shenzhen; Pengfei Gao and Chao Peng are from ByteDance. A footnote notes 'Work done during an internship at ByteDance.' The industry affiliation is disclosed."
    210       },
    211       "funder_independent_of_outcome": {
    212         "applies": true,
    213         "answer": true,
    214         "justification": "Funders are the National Natural Science Foundation of China and a Shenzhen-Hong Kong government project — public research funding with no financial interest in CodeVisionary's performance. ByteDance is an industrial affiliate of some authors but is not listed as a funder."
    215       },
    216       "financial_interests_declared": {
    217         "applies": true,
    218         "answer": false,
    219         "justification": "There is no competing interests statement or declaration that authors hold (or do not hold) patents or equity related to the work. The absence of such a statement means this criterion is not met."
    220       }
    221     },
    222     "contamination": {
    223       "training_cutoff_stated": {
    224         "applies": true,
    225         "answer": false,
    226         "justification": "The paper uses GPT-4o as the evaluator model but does not state GPT-4o's training data cutoff date. It also evaluates responses generated by GPT-3.5-turbo, Claude-3.5-Sonnet, and GPT-4o without stating any model's training cutoff."
    227       },
    228       "train_test_overlap_discussed": {
    229         "applies": true,
    230         "answer": false,
    231         "justification": "The benchmark is derived from CodeArena, which was published in 2024, and its tasks could have been in the training data of the models used. The paper does not discuss whether the code generation tasks could have appeared in model training data."
    232       },
    233       "benchmark_contamination_addressed": {
    234         "applies": true,
    235         "answer": false,
    236         "justification": "CodeArena was published in late 2024 (arXiv:2412.05210) and GPT-4o's training cutoff is unknown from the paper. The paper does not discuss contamination risk for either the benchmark tasks or the generated responses being evaluated."
    237       }
    238     },
    239     "human_studies": {
    240       "pre_registered": {
    241         "applies": false,
    242         "answer": false,
    243         "justification": "The expert annotators are providing ground truth labels for a benchmark -- they are professional raters, not research participants in a human subjects study. Pre-registration applies to studies OF human participants, not studies that use human annotators as instruments. This is a benchmark evaluation paper."
    244       },
    245       "irb_or_ethics_approval": {
    246         "applies": false,
    247         "answer": false,
    248         "justification": "Expert annotators creating benchmark labels are not human research participants. IRB review is for research ON human subjects, not for employing human annotators in a data pipeline. This is a benchmark evaluation paper."
    249       },
    250       "demographics_reported": {
    251         "applies": false,
    252         "answer": false,
    253         "justification": "The annotators are expert raters creating ground truth labels, not participants in a human study. Demographic reporting requirements apply to studies of human participants."
    254       },
    255       "inclusion_exclusion_criteria": {
    256         "applies": false,
    257         "answer": false,
    258         "justification": "Same reasoning -- expert annotators are instruments for benchmark construction, not human research participants. Inclusion/exclusion criteria requirements are for participant recruitment in human subjects research."
    259       },
    260       "randomization_described": {
    261         "applies": false,
    262         "answer": false,
    263         "justification": "The human annotation study is an observational scoring study, not an experimental study with treatment/control assignment. Randomization of annotators to conditions does not apply."
    264       },
    265       "blinding_described": {
    266         "applies": false,
    267         "answer": false,
    268         "justification": "No human subjects study is conducted. While the paper does note that experts 'remain unaware of the identity of the LLM,' this is good annotation practice, not blinding in a human subjects study context."
    269       },
    270       "attrition_reported": {
    271         "applies": false,
    272         "answer": false,
    273         "justification": "There is no longitudinal study with dropout risk. The annotation was a bounded task and attrition is not applicable."
    274       }
    275     },
    276     "cost_and_practicality": {
    277       "inference_cost_reported": {
    278         "applies": true,
    279         "answer": false,
    280         "justification": "CodeVisionary uses GPT-4o with up to 40 interactions per evaluation instance in the RMCD stage plus additional calls in the FSAS stage. Table IV reports an average of 6.12 actions per instance, but no API cost, token count, or monetary cost is reported."
    281       },
    282       "compute_budget_stated": {
    283         "applies": true,
    284         "answer": false,
    285         "justification": "No total computational budget (GPU hours, API spend, wall-clock time) is stated for running the 363-sample evaluation. Given the 40-interaction limit per instance with GPT-4o, this is a non-trivial cost that is not quantified."
    286       }
    287     }
    288   },
    289   "claims": [
    290     {
    291       "claim": "CodeVisionary achieves the best performance among three baselines for evaluating complex code generation, outperforming the best baseline with average improvements of 0.217, 0.163, and 0.141 in Pearson, Spearman, and Kendall-Tau coefficients, respectively.",
    292       "evidence": "Table II shows CodeVisionary avg rp=0.301, rs=0.272, tau=0.241 versus the best baseline value on each metric (CODEJUDGE rp=0.084; ICE-SCORE rs=0.109, tau=0.100), giving differences of 0.217, 0.163, and 0.141. Section IV-A discusses results.",
    293       "supported": "strong"
    294     },
    295     {
    296       "claim": "The requirement-guided multi-dimensional context distillation (RMCD) stage improves CodeVisionary performance by an average of 27.0%, 32.0%, and 32.4% across rp, rs, and tau respectively.",
    297       "evidence": "Table III ablation results: w/o RMCD achieves avg rp=0.237, rs=0.206, tau=0.182 vs. full system rp=0.301, rs=0.272, tau=0.241. Section IV-B1.",
    298       "supported": "moderate"
    299     },
    300     {
    301       "claim": "The fine-grained scoring and summarization (FSAS) stage improves CodeVisionary performance by 21.4%, 21.4%, and 19.9% on average across rp, rs, and tau.",
    302       "evidence": "Table III: w/o FSAS achieves avg rp=0.248, rs=0.224, tau=0.201 vs. full system. Section IV-B2.",
    303       "supported": "moderate"
    304     },
    305     {
    306       "claim": "Runtime information (dynamic execution and unit tests) provides the most contribution, improving rp, rs, and tau by 46.8%, 40.9%, and 41.8% respectively.",
    307       "evidence": "Table III: w/o RT achieves avg rp=0.205, rs=0.193, tau=0.170 vs. full system. Section IV-B3.",
    308       "supported": "moderate"
    309     },
    310     {
    311       "claim": "Current state-of-the-art LLM-based evaluation approaches (ICE-Score, CODEJUDGE) exhibit minimal difference compared to a vanilla baseline for complex code evaluation.",
    312       "evidence": "Table II: VANILLA avg rp/rs/tau = 0.042/0.091/0.085; ICE-SCORE = 0.039/0.109/0.100; CODEJUDGE = 0.084/0.094/0.089. Section IV-A.",
    313       "supported": "strong"
    314     },
    315     {
    316       "claim": "CodeVisionary maintains effectiveness on less complex tasks, achieving competitive performance on CoNaLa.",
    317       "evidence": "Table V: CodeVisionary rp=0.644, rs=0.637, tau=0.572 vs. ICE-SCORE rp=0.655, rs=0.596, tau=0.534. CodeVisionary is better on rs and tau but slightly worse on rp. Section V-C.",
    318       "supported": "moderate"
    319     }
    320   ],
    321   "methodology_tags": [
    322     "benchmark-eval",
    323     "case-study"
    324   ],
    325   "key_findings": "CodeVisionary, a two-stage agent-based framework, substantially outperforms existing LLM-based code evaluation methods (ICE-Score, CODEJUDGE) on complex code generation tasks, with an average Pearson correlation improvement of 0.217 over the best baseline. The framework's advantage stems from combining multi-dimensional context collection (runtime execution, static linting, web browsing, UI/UX screenshots) with a multi-judge negotiation protocol. Notably, existing state-of-the-art LLM-based evaluation methods differ only marginally from a vanilla prompting baseline on complex code tasks, highlighting the difficulty of this evaluation setting. Performance on a simpler benchmark (CoNaLa) is competitive but does not uniformly exceed ICE-Score, suggesting the approach is most valuable for complex scenarios.",
    326   "red_flags": [
    327     {
    328       "flag": "No model version specificity",
    329       "detail": "The paper uses 'GPT-4o' throughout without specifying API version or snapshot date. GPT-4o behavior changes across API versions, making exact reproduction impossible. References for the model point only to announcement blog posts."
    330     },
    331     {
    332       "flag": "Compute asymmetry between method and baselines",
    333       "detail": "CodeVisionary uses up to 40 interactions per evaluation instance with GPT-4o, plus additional FSAS calls, while baselines use a single or a few LLM calls. The improvements may largely reflect more compute spent per evaluation rather than the architectural novelty. No cost or token count is reported to assess this."
    334     },
    335     {
    336       "flag": "Same dataset for hyperparameter selection and evaluation",
    337       "detail": "RQ3 explores hyperparameter choices (number of judges 2-5, number of rounds 2-5) on the same 363-sample test set, then reports final results on that same set. This conflates hyperparameter optimization and evaluation, potentially inflating reported performance."
    338     },
    339     {
    340       "flag": "No variance reported across multiple trials",
    341       "detail": "The paper states multiple trials are averaged but reports no standard deviation or confidence intervals. The magnitude of the improvements is large, but without variance estimates, the reliability of the differences cannot be assessed."
    342     },
    343     {
    344       "flag": "Benchmark annotation data not released",
    345       "detail": "The human-annotated benchmark of 363 samples (the ground truth for all correlation computations) is not publicly available, making it impossible to independently verify the reported correlation results."
    346     },
    347     {
    348       "flag": "Industry co-authorship without COI declaration",
    349       "detail": "Two authors (Pengfei Gao, Chao Peng) are from ByteDance, and two student authors did internships at ByteDance. CodeVisionary is presented as a tool that could be commercially deployed. No competing interests statement is provided."
    350     }
    351   ],
    352   "cited_papers": [
    353     {
    354       "title": "ICE-Score: Instructing Large Language Models to Evaluate Code",
    355       "authors": [
    356         "T. Y. Zhuo"
    357       ],
    358       "year": 2024,
    359       "relevance": "Direct baseline for the paper's code evaluation framework; a representative LLM-based code evaluation approach."
    360     },
    361     {
    362       "title": "CodeJudge: Evaluating Code Generation with Large Language Models",
    363       "authors": [
    364         "W. Tong",
    365         "T. Zhang"
    366       ],
    367       "year": 2024,
    368       "relevance": "Direct baseline for the paper; uses 'slow thinking' approach for semantic correctness evaluation of generated code."
    369     },
    370     {
    371       "title": "Evaluating and Aligning CodeLLMs on Human Preference",
    372       "authors": [
    373         "J. Yang",
    374         "J. Yang",
    375         "K. Jin",
    376         "Y. Miao",
    377         "L. Zhang",
    378         "L. Yang",
    379         "Z. Cui",
    380         "Y. Zhang",
    381         "B. Hui",
    382         "J. Lin"
    383       ],
    384       "year": 2024,
    385       "arxiv_id": "2412.05210",
    386       "relevance": "Source of the CodeArena benchmark used to construct CodeVisionary's evaluation dataset."
    387     },
    388     {
    389       "title": "Out of the BLEU: how should we assess quality of the code generation models?",
    390       "authors": [
    391         "M. Evtikhiev",
    392         "E. Bogomolov",
    393         "Y. Sokolov",
    394         "T. Bryksin"
    395       ],
    396       "year": 2023,
    397       "relevance": "Human evaluation study of code generation quality; provides the human annotation framework used for scoring in this paper."
    398     },
    399     {
    400       "title": "Evaluating large language models trained on code",
    401       "authors": [
    402         "M. Chen",
    403         "J. Tworek",
    404         "H. Jun"
    405       ],
    406       "year": 2021,
    407       "arxiv_id": "2107.03374",
    408       "relevance": "Introduces HumanEval and pass@k metric, foundational benchmark evaluation methodology for code generation."
    409     },
    410     {
    411       "title": "A survey on evaluating large language models in code generation tasks",
    412       "authors": [
    413         "L. Chen",
    414         "Q. Guo",
    415         "H. Jia"
    416       ],
    417       "year": 2024,
    418       "arxiv_id": "2408.16498",
    419       "relevance": "Survey of code generation evaluation approaches, providing context for the paper's contributions."
    420     },
    421     {
    422       "title": "Judging LLM-as-a-Judge with MT-Bench and Chatbot Arena",
    423       "authors": [
    424         "L. Zheng",
    425         "W.-L. Chiang",
    426         "Y. Sheng"
    427       ],
    428       "year": 2023,
    429       "relevance": "Foundational paper on using LLMs as judges for evaluation, relevant to the LLM-as-evaluator paradigm."
    430     },
    431     {
    432       "title": "The rise and potential of large language model based agents: A survey",
    433       "authors": [
    434         "Z. Xi",
    435         "W. Chen",
    436         "X. Guo"
    437       ],
    438       "year": 2025,
    439       "relevance": "Comprehensive survey on LLM-based agents, providing theoretical grounding for the agent-based approach used."
    440     },
    441     {
    442       "title": "Large language model-based agents for software engineering: A survey",
    443       "authors": [
    444         "J. Liu",
    445         "K. Wang",
    446         "Y. Chen"
    447       ],
    448       "year": 2024,
    449       "arxiv_id": "2409.02977",
    450       "relevance": "Survey of LLM agents applied to software engineering tasks, relevant to the paper's positioning."
    451     },
    452     {
    453       "title": "ChatDev: Communicative Agents for Software Development",
    454       "authors": [
    455         "C. Qian",
    456         "W. Liu",
    457         "H. Liu"
    458       ],
    459       "year": 2024,
    460       "relevance": "Multi-agent framework for software development; related to the multi-agent paradigm used in CodeVisionary."
    461     },
    462     {
    463       "title": "Can LLMs replace human evaluators? An empirical study of LLM-as-a-judge in software engineering",
    464       "authors": [
    465         "R. Wang",
    466         "J. Guo",
    467         "C. Gao"
    468       ],
    469       "year": 2025,
    470       "relevance": "Empirical study of LLM-as-judge for software engineering evaluation, directly relevant to this paper's core topic."
    471     },
    472     {
    473       "title": "AgentCoder: Multi-agent-based code generation with iterative testing and optimisation",
    474       "authors": [
    475         "D. Huang",
    476         "Q. Bu",
    477         "J. M. Zhang"
    478       ],
    479       "year": 2023,
    480       "arxiv_id": "2312.13010",
    481       "relevance": "Multi-agent code generation system; relevant as a precursor to agent-based approaches in software engineering."
    482     },
    483     {
    484       "title": "CodePlan: Repository-level coding using LLMs and planning",
    485       "authors": [
    486         "R. Bairi",
    487         "A. Sonwane",
    488         "A. Kanade"
    489       ],
    490       "year": 2024,
    491       "relevance": "Repository-level code generation requiring complex multi-step reasoning, motivating evaluation of complex code scenarios."
    492     }
    493   ]
    494 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs