scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (30102B)
      1 {
      2   "paper": {
      3     "title": "HumanEvalComm: Benchmarking the Communication Competence of Code Generation for LLMs and LLM Agent",
      4     "authors": ["Jie JW Wu", "Fatemeh H. Fard"],
      5     "year": 2024,
      6     "venue": "ACM Transactions on Software Engineering and Methodology",
      7     "arxiv_id": "2406.00215",
      8     "doi": "10.1145/3715109"
      9   },
     10   "scan_version": 2,
     11   "active_modules": ["experimental_rigor", "data_leakage"],
     12   "checklist": {
     13     "artifacts": {
     14       "code_released": {
     15         "applies": true,
     16         "answer": true,
     17         "justification": "The paper provides a public GitHub repository: 'Our benchmark and replication package are made public at https://github.com/jie-jw-wu/human-eval-comm' (Section 1, Section 9)."
     18       },
     19       "data_released": {
     20         "applies": true,
     21         "answer": true,
     22         "justification": "The HumanEvalComm benchmark dataset is publicly released at the same GitHub repository. The paper states 'Our benchmark and full code are publicly available.'"
     23       },
     24       "environment_specified": {
     25         "applies": true,
     26         "answer": false,
     27         "justification": "The paper describes hardware (Intel i7-6700K, Tesla V100-SXM2-16GB) and Python 3.12, and lists HuggingFace model names, but does not provide a requirements.txt, Dockerfile, or detailed dependency list sufficient to recreate the environment."
     28       },
     29       "reproduction_instructions": {
     30         "applies": true,
     31         "answer": false,
     32         "justification": "The paper calls its release a 'replication package' but does not contain step-by-step reproduction instructions within the paper itself. No 'Reproducing Results' section or specific commands are provided."
     33       }
     34     },
     35     "statistical_methodology": {
     36       "confidence_intervals_or_error_bars": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "Main results in Tables 3-5 report point estimates without confidence intervals or error bars. RQ4 (Tables 7-8) reports variance from 5 runs, but the primary results lack uncertainty quantification."
     40       },
     41       "significance_tests": {
     42         "applies": true,
     43         "answer": true,
     44         "justification": "Independent two-sample t-tests with p-values are reported throughout Tables 4-5 at α = 0.01, 0.05, 0.1 significance levels (Section 3.5)."
     45       },
     46       "effect_sizes_reported": {
     47         "applies": true,
     48         "answer": true,
     49         "justification": "Absolute differences are reported with baseline context throughout (e.g., 'Pass@1 drop by 35%∼52%', 'increases Communication Rate by an absolute 58%'). Tables show both HumanEval and HumanEvalComm values, enabling effect size assessment."
     50       },
     51       "sample_size_justified": {
     52         "applies": true,
     53         "answer": true,
     54         "justification": "For the manual evaluation (RQ3), the paper explicitly uses a sample size calculator: 'we set Confidence Level as 95%, Margin of Error as 5%, and Population Proportion as 50%' to determine 60 samples per model (Section 4.3)."
     55       },
     56       "variance_reported": {
     57         "applies": true,
     58         "answer": false,
     59         "justification": "Main results (Tables 3-5) are reported as single-run point estimates without variance or standard deviation across runs. Only the RQ4 investigation (Tables 7-8) reports variance from 5 repeated runs."
     60       }
     61     },
     62     "evaluation_design": {
     63       "baselines_included": {
     64         "applies": true,
     65         "answer": true,
     66         "justification": "Five Code LLMs (ChatGPT, CodeLlama, CodeQwen1.5 Chat, DeepSeek Coder, DeepSeek Chat) serve as baselines for evaluating Okanagan. AgentCoder is also tested as an external LLM agent baseline (Section 5.4)."
     67       },
     68       "baselines_contemporary": {
     69         "applies": true,
     70         "answer": true,
     71         "justification": "Models evaluated include CodeLlama, DeepSeek Coder, CodeQwen1.5 Chat, and ChatGPT 3.5, all released in 2023-2024 and contemporary to the study. AgentCoder (2023) is also included."
     72       },
     73       "ablation_study": {
     74         "applies": true,
     75         "answer": false,
     76         "justification": "Okanagan has a multi-round structure with specific design choices (3 rounds, reflection pattern), but no systematic ablation removes individual components to measure their contribution. Switching base models (ChatGPT vs DeepSeek Coder) is tested but is not a proper ablation."
     77       },
     78       "multiple_metrics": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "Four evaluation metrics are used: Communication Rate, Good Question Rate, Pass@1, and Test Pass Rate (Section 2.2)."
     82       },
     83       "human_evaluation": {
     84         "applies": true,
     85         "answer": true,
     86         "justification": "RQ3 (Section 4.3) involves 6 graduate students manually evaluating question quality and answer quality from models' responses, with inter-rater reliability measured via Cohen's Kappa (Table 6)."
     87       },
     88       "held_out_test_set": {
     89         "applies": true,
     90         "answer": false,
     91         "justification": "The full HumanEvalComm benchmark (762 problems) is used for both prompt development and evaluation. The authors note they 'optimized the prompt for the LLM-based evaluator several times and checked the results manually' on the same data used for reporting."
     92       },
     93       "per_category_breakdown": {
     94         "applies": true,
     95         "answer": true,
     96         "justification": "Detailed breakdowns by clarification category (1a, 1c, 1p, 2ac, 2cp, 2ap) are provided in Tables 4 and 5, with per-model results for each category."
     97       },
     98       "failure_cases_discussed": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Section 5.1 provides example cases including a failure case (Table 12, CodeLlama on problem 45 where inconsistency was not detected). Okanagan's tendency to ask unnecessary questions is discussed as a limitation."
    102       },
    103       "negative_results_reported": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "Several negative results are reported: Okanagan achieves lower Pass@1 on original HumanEval than ChatGPT alone (Section 4.2), AgentCoder had 0% communication rate (Section 5.4), and Okanagan with DeepSeek Coder as base underperforms vs ChatGPT base (Section 4.2)."
    107       }
    108     },
    109     "claims_and_evidence": {
    110       "abstract_claims_supported": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Abstract claims (>60% code generation, 35-52% Pass@1 drop, 58% Communication Rate increase, 38% Good Question Rate increase for Okanagan) are supported by Table 3 data. However, there is a discrepancy with body text numbers (Section 1 states '59% and 5%' instead of '58% and 38%')."
    114       },
    115       "causal_claims_justified": {
    116         "applies": true,
    117         "answer": true,
    118         "justification": "The claim that Okanagan 'boosts' performance uses causal language. The study design (same base model ChatGPT, same data, only difference is agent structure) provides a controlled comparison adequate for this causal claim. Testing with an alternative base model (DeepSeek Coder) adds robustness."
    119       },
    120       "generalization_bounded": {
    121         "applies": true,
    122         "answer": true,
    123         "justification": "Section 7 (External Validity) explicitly states: 'we cannot make a sound claim regarding the communication capability of the models on another dataset.' The paper is tested on Python problems only and the scope is appropriately bounded."
    124       },
    125       "alternative_explanations_discussed": {
    126         "applies": true,
    127         "answer": true,
    128         "justification": "Section 5.3 discusses how the LLM-based evaluator's False Recovery Rate could inflate pass rates for some models. Section 4.1 offers hypotheses for differences between models (generative nature, training data composition). Section 7 addresses internal validity threats."
    129       },
    130       "proxy_outcome_distinction": {
    131         "applies": true,
    132         "answer": true,
    133         "justification": "The paper formally defines 'communication capability' (Section 1) and directly measures it via Communication Rate and Good Question Rate. Pass@1 and Test Pass Rate directly measure code correctness. The metrics match the granularity of claims without overreaching into broader framing."
    134       }
    135     },
    136     "setup_transparency": {
    137       "model_versions_specified": {
    138         "applies": true,
    139         "answer": true,
    140         "justification": "Specific model versions are provided: 'gpt-3.5-turbo-0125' for ChatGPT, and HuggingFace model names 'deepseek-coder-6.7b-instruct', 'deepseek-llm-7b-chat', 'CodeQwen1.5-7B-Chat', 'CodeLlama-13b-Instruct-hf' (Section 3.5)."
    141       },
    142       "prompts_provided": {
    143         "applies": true,
    144         "answer": true,
    145         "justification": "Full prompt text is provided in Tables 14-16 (Appendix), the LLM-based evaluator prompt in Section 3.2, and prompt variants in Table 15. Actual prompt templates with placeholder variables are shown."
    146       },
    147       "hyperparameters_reported": {
    148         "applies": true,
    149         "answer": true,
    150         "justification": "Temperature=1.0, n=1 for ChatGPT API, max_new_tokens=512 for open-source models, default values for other parameters are reported (Section 3.5, Section 7)."
    151       },
    152       "scaffolding_described": {
    153         "applies": true,
    154         "answer": true,
    155         "justification": "Okanagan's 3-round structure is described in detail (Section 3.4): Round 1 generates code, Round 2 asks questions, Round 3 regenerates with reflection. Figure 3 provides visual illustration. Parameters (number of agents, rounds, actions, thinking pattern) are enumerated."
    156       },
    157       "data_preprocessing_documented": {
    158         "applies": true,
    159         "answer": true,
    160         "justification": "Section 2 documents the benchmark construction process in detail: taxonomy of clarification types, guidelines for modification, annotator qualifications, discussion process, time estimates (100 hours initial + 30 hours review), and statistics (Table 1)."
    161       }
    162     },
    163     "limitations_and_scope": {
    164       "limitations_section_present": {
    165         "applies": true,
    166         "answer": true,
    167         "justification": "Section 5.5 'Limitations' provides a dedicated subsection with four specific limitations. Section 7 'Threats to Validity' provides additional discussion organized by construct, internal, and external validity."
    168       },
    169       "threats_to_validity_specific": {
    170         "applies": true,
    171         "answer": true,
    172         "justification": "Specific threats are discussed: subjectivity in manual modifications, different engineers approaching problems differently, LLM-based evaluator tending to mark more 'Good' questions than human annotators, False Recovery Rates for specific models (30-40%), and Okanagan asking unnecessary questions."
    173       },
    174       "scope_boundaries_stated": {
    175         "applies": true,
    176         "answer": true,
    177         "justification": "Section 7 External Validity explicitly states: 'we cannot make a sound claim regarding the communication capability of the models on another dataset.' The paper notes it only tests Python problems and a limited set of models."
    178       }
    179     },
    180     "data_integrity": {
    181       "raw_data_available": {
    182         "applies": true,
    183         "answer": true,
    184         "justification": "The benchmark, code, and replication package are publicly available at the GitHub repository. The paper states: 'we release our complete code and dataset' (Section 7)."
    185       },
    186       "data_collection_described": {
    187         "applies": true,
    188         "answer": true,
    189         "justification": "Section 2 describes the benchmark construction in detail: source (HumanEval), modification taxonomy (Ambiguity, Inconsistency, Incompleteness), annotator process (two software engineers, disagreement resolution), and time estimates."
    190       },
    191       "recruitment_methods_described": {
    192         "applies": true,
    193         "answer": true,
    194         "justification": "For manual evaluation: 'We recruited six graduate students at the University of British Columbia' from 'the software engineering lab in the computer science department, all having at least two years of professional software development background' (Section 4.3). For benchmark creation: annotator qualifications are described."
    195       },
    196       "data_pipeline_documented": {
    197         "applies": true,
    198         "answer": true,
    199         "justification": "The pipeline from HumanEval to HumanEvalComm is documented with counts (Table 1: 164 per single type, varying for combinations, 762 total). The evaluation pipeline is shown in Figure 2 with clear steps. Manual evaluation uses hash-function sampling of 60 problems per model."
    200       }
    201     },
    202     "conflicts_of_interest": {
    203       "funding_disclosed": {
    204         "applies": true,
    205         "answer": true,
    206         "justification": "Section 10: 'This research is supported by a grant from Natural Sciences and Engineering Research Council of Canada RGPIN-2019-05175.'"
    207       },
    208       "affiliations_disclosed": {
    209         "applies": true,
    210         "answer": true,
    211         "justification": "Authors are affiliated with University of British Columbia, Kelowna, Canada. They evaluate models from external companies (OpenAI, Meta, DeepSeek, Qwen) with no affiliation to those companies."
    212       },
    213       "funder_independent_of_outcome": {
    214         "applies": true,
    215         "answer": true,
    216         "justification": "NSERC is the Canadian government research funding agency with no financial interest in the evaluation outcome of any specific LLM."
    217       },
    218       "financial_interests_declared": {
    219         "applies": true,
    220         "answer": false,
    221         "justification": "No competing interests or financial interests statement is present in the paper."
    222       }
    223     },
    224     "contamination": {
    225       "training_cutoff_stated": {
    226         "applies": true,
    227         "answer": false,
    228         "justification": "No training data cutoff dates are stated for any of the evaluated models (ChatGPT, CodeLlama, DeepSeek Coder, DeepSeek Chat, CodeQwen1.5)."
    229       },
    230       "train_test_overlap_discussed": {
    231         "applies": true,
    232         "answer": false,
    233         "justification": "HumanEvalComm is based on HumanEval (published 2021). All evaluated models were trained after 2021 and may have seen the original problems. While the modifications are new, the base problems and test cases overlap. This is not discussed."
    234       },
    235       "benchmark_contamination_addressed": {
    236         "applies": true,
    237         "answer": false,
    238         "justification": "HumanEval has been widely available online since 2021. The paper does not discuss the contamination risk from models having seen the original HumanEval problems during training, even though modifications partially mitigate this."
    239       }
    240     },
    241     "human_studies": {
    242       "pre_registered": {
    243         "applies": true,
    244         "answer": false,
    245         "justification": "No pre-registration is mentioned for the manual evaluation study with 6 graduate students."
    246       },
    247       "irb_or_ethics_approval": {
    248         "applies": true,
    249         "answer": false,
    250         "justification": "No IRB or ethics board approval is mentioned for the manual evaluation involving 6 graduate students."
    251       },
    252       "demographics_reported": {
    253         "applies": true,
    254         "answer": true,
    255         "justification": "The 6 students are described as 'from the software engineering lab in the computer science department, all having at least two years of professional software development background' (Section 4.3)."
    256       },
    257       "inclusion_exclusion_criteria": {
    258         "applies": true,
    259         "answer": true,
    260         "justification": "Participants are from 'the software engineering lab' with 'at least two years of professional software development background' — these serve as implicit inclusion criteria (Section 4.3)."
    261       },
    262       "randomization_described": {
    263         "applies": false,
    264         "answer": false,
    265         "justification": "This is a manual evaluation task, not an experimental study with treatment conditions. Randomization of participants to conditions does not apply."
    266       },
    267       "blinding_described": {
    268         "applies": true,
    269         "answer": true,
    270         "justification": "'To avoid introducing bias, we hide the actual names of the 6 models and used model 1 to model 6 instead of actual model names' (Section 4.3)."
    271       },
    272       "attrition_reported": {
    273         "applies": true,
    274         "answer": false,
    275         "justification": "No information on whether all 6 students completed all their assigned evaluations or if any dropped out."
    276       }
    277     },
    278     "cost_and_practicality": {
    279       "inference_cost_reported": {
    280         "applies": true,
    281         "answer": true,
    282         "justification": "API costs are reported: 'the total cost in using ChatGPT 3.5 is less than 15 USD' and 'the cost for running AgentCoder alone in HumanEvalComm is about 36 USD.' Per-token pricing is stated (Section 3.5)."
    283       },
    284       "compute_budget_stated": {
    285         "applies": true,
    286         "answer": true,
    287         "justification": "Hardware is specified (Intel i7-6700K, 32GB RAM for API models; Xeon Gold 6130, 44GB RAM, 4x Tesla V100 for open-source models). Total API cost <15 USD. Runtime 'less than half an hour' for Okanagan/ChatGPT (Section 3.5)."
    288       }
    289     },
    290     "experimental_rigor": {
    291       "seed_sensitivity_reported": {
    292         "applies": true,
    293         "answer": false,
    294         "justification": "Main results (Tables 3-5) are single-run results. Only the RQ4 investigation (Tables 7-8) repeats experiments 5 times. The primary evaluation does not report seed sensitivity."
    295       },
    296       "number_of_runs_stated": {
    297         "applies": true,
    298         "answer": false,
    299         "justification": "For the main results (RQ1, RQ2), the number of runs is not explicitly stated. RQ4 states '5 times' for the sensitivity analysis, but main experiments lack this information."
    300       },
    301       "hyperparameter_search_budget": {
    302         "applies": true,
    303         "answer": false,
    304         "justification": "No hyperparameter search is described. Default parameters are used throughout, but the choice of defaults is not justified with a search budget."
    305       },
    306       "best_config_selection_justified": {
    307         "applies": true,
    308         "answer": true,
    309         "justification": "The paper uses default/standard configurations and reports results for all tested configurations. RQ4 (Section 4.4) tests prompt variants and hyperparameter variations, presenting all results (Tables 7-9) rather than cherry-picking."
    310       },
    311       "multiple_comparison_correction": {
    312         "applies": true,
    313         "answer": false,
    314         "justification": "Many t-tests are performed across 6 clarification categories × 6 models × 2 metrics, but no correction for multiple comparisons (Bonferroni, Holm, etc.) is applied."
    315       },
    316       "self_comparison_bias_addressed": {
    317         "applies": true,
    318         "answer": false,
    319         "justification": "The authors propose and evaluate Okanagan against baselines without acknowledging the bias of evaluating their own system. No independent evaluation or discussion of author-evaluation bias is present."
    320       },
    321       "compute_budget_vs_performance": {
    322         "applies": true,
    323         "answer": false,
    324         "justification": "Okanagan uses 3 LLM calls per problem vs 1-2 for base models, but performance is not compared at matched compute budgets. The 3x compute overhead is not analyzed against the performance gain."
    325       },
    326       "benchmark_construct_validity": {
    327         "applies": true,
    328         "answer": true,
    329         "justification": "Section 5.2 discusses why automated modification fails (doesn't guarantee triggering clarifying questions). RQ3 validates the evaluation metrics against human judgment. Section 2 justifies the taxonomy of clarification types with RE literature."
    330       },
    331       "scaffold_confound_addressed": {
    332         "applies": true,
    333         "answer": true,
    334         "justification": "The paper tests Okanagan with two different base models (ChatGPT and DeepSeek Coder) to separate scaffold effects from model effects (Section 4.2), though the analysis reveals prompt compatibility issues."
    335       }
    336     },
    337     "data_leakage": {
    338       "temporal_leakage_addressed": {
    339         "applies": true,
    340         "answer": false,
    341         "justification": "HumanEval was published in 2021 and is widely available online. All evaluated models were trained after 2021 and may have memorized solutions. The modifications mitigate but do not eliminate this risk, and it is not discussed."
    342       },
    343       "feature_leakage_addressed": {
    344         "applies": true,
    345         "answer": false,
    346         "justification": "No discussion of whether the evaluation setup (providing function signatures, test cases) leaks information about the correct solution beyond what would be available in realistic usage."
    347       },
    348       "non_independence_addressed": {
    349         "applies": true,
    350         "answer": false,
    351         "justification": "No discussion of independence between the 762 modified problems, which are derived from only 164 original HumanEval problems — performance on modifications of the same problem may be correlated."
    352       },
    353       "leakage_detection_method": {
    354         "applies": true,
    355         "answer": false,
    356         "justification": "No concrete leakage detection or prevention method is used. No canary strings, membership inference, or decontamination analysis is performed."
    357       }
    358     }
    359   },
    360   "claims": [
    361     {
    362       "claim": "More than 60% of responses from Code LLMs still generate code rather than ask questions when problem descriptions are modified to be ambiguous, inconsistent, or incomplete.",
    363       "evidence": "Table 3 shows communication rates: ChatGPT 14.21%, CodeLlama 10.16%, CodeQwen1.5 4.82%, DeepSeek Coder 30.76%, DeepSeek Chat 37.93% — all below 40%, meaning >60% generate code (Section 4.1).",
    364       "supported": "strong"
    365     },
    366     {
    367       "claim": "Pass@1 of most Code LLMs drops by 35%–52% and Test Pass Rate drops by 17%–35% on HumanEvalComm compared to original HumanEval, with statistical significance.",
    368       "evidence": "Tables 3-5 show relative drops with p-values. E.g., ChatGPT Pass@1: 65.58% → 31.34%; CodeQwen1.5: 76.83% → 47.61%. Statistical significance at p<0.01 for most comparisons (Section 4.1).",
    369       "supported": "strong"
    370     },
    371     {
    372       "claim": "Okanagan (ChatGPT base) increases Communication Rate by 58% and Good Question Rate by 38% absolute over ChatGPT alone.",
    373       "evidence": "Table 3: Okanagan Communication Rate 72.73% vs ChatGPT 14.21% (diff ≈58.5 percentage points). Good Question Rate 52.24% vs 13.43% (diff ≈38.8 percentage points). Both agree with the abstract's 58% and 38% figures up to rounding.",
    374       "supported": "strong"
    375     },
    376     {
    377       "claim": "Okanagan boosts Pass@1 by 8% and Test Pass Rate by 7% absolute over ChatGPT on HumanEvalComm.",
    378       "evidence": "Table 3: Okanagan Pass@1 39.62% vs ChatGPT 31.34% (diff ≈8.3 percentage points). Test Pass Rate 56.98% vs 49.39% (diff ≈7.6 percentage points). However, the LLM-based evaluator has a 0.94% False Recovery Rate for ChatGPT but 30-40% for other models (Table 6), introducing a potential confound.",
    379       "supported": "moderate"
    380     },
    381     {
    382       "claim": "Incompleteness category results in higher communication rates but lower pass rates than Ambiguity and Inconsistency categories.",
    383       "evidence": "Table 4: For most models, 1p (Incompleteness) has higher communication rates (e.g., ChatGPT: 31.68% for 1p vs 5.84% for 1a vs 5.84% for 1c) and lower Pass@1 (e.g., ChatGPT: 27.95% for 1p vs 33.77% for 1a vs 53.25% for 1c).",
    384       "supported": "strong"
    385     },
    386     {
    387       "claim": "The LLM-based evaluator agrees with human judgment to within 10% on Communication Rate but shows larger discrepancies on Good Question Rate.",
    388       "evidence": "Figure 5 shows Communication Rate differences <10% between automated and manual evaluation. Good Question Rate shows larger discrepancies for CodeLlama, CodeQwen1.5, DeepSeek Coder (Section 4.3).",
    389       "supported": "moderate"
    390     }
    391   ],
    392   "methodology_tags": ["benchmark-eval"],
    393   "key_findings": "Code LLMs overwhelmingly generate code (>60%) rather than ask clarifying questions even when problem descriptions are deliberately made ambiguous, inconsistent, or incomplete, with Pass@1 dropping 35-52%. The proposed LLM agent approach Okanagan increases communication rate from 14% to 73% and modestly improves code generation metrics (+8% Pass@1) by asking clarifying questions before coding. However, Okanagan also asks unnecessary questions on well-specified problems, and the LLM-based evaluator used for assessment shows non-trivial biases (30-40% False Recovery Rate for some models).",
    394   "red_flags": [
    395     {
    396       "flag": "Internal number inconsistency",
    397       "detail": "The abstract reports Okanagan increases Good Question Rate by 38%, but the body text in Section 1 states '5%' for the same metric. This is a significant discrepancy that is never reconciled."
    398     },
    399     {
    400       "flag": "Circular evaluation with GPT-3.5",
    401       "detail": "GPT-3.5 serves triple duty: it is the model evaluated in the ChatGPT experiments, the base model for Okanagan, and the LLM-based evaluator that rates question quality and generates answers. This creates a potential self-evaluation bias where the evaluator may be more generous toward outputs produced by the same underlying model."
    402     },
    403     {
    404       "flag": "High False Recovery Rate inflates some baselines",
    405       "detail": "Table 6 shows 30-40% False Recovery Rate for CodeLlama, CodeQwen1.5, DeepSeek Coder, and DeepSeek Chat — the evaluator provides correct answers even when these models asked no relevant questions, artificially inflating their pass rates relative to Okanagan (0% False Recovery Rate)."
    406     },
    407     {
    408       "flag": "No contamination analysis despite HumanEval-based benchmark",
    409       "detail": "HumanEvalComm is built on HumanEval (published 2021). All evaluated models were trained after 2021 and likely saw original HumanEval solutions. The modifications reduce but do not eliminate contamination risk, and this is never discussed."
    410     }
    411   ],
    412   "cited_papers": [
    413     {
    414       "title": "Evaluating large language models trained on code",
    415       "authors": ["Mark Chen", "Jerry Tworek"],
    416       "year": 2021,
    417       "arxiv_id": "2107.03374",
    418       "relevance": "Introduces HumanEval benchmark used as the basis for HumanEvalComm; foundational code generation evaluation work."
    419     },
    420     {
    421       "title": "SWE-bench: Can Language Models Resolve Real-world Github Issues?",
    422       "authors": ["Carlos E Jimenez", "John Yang"],
    423       "year": 2024,
    424       "relevance": "Major real-world code generation benchmark; cited as example of benchmarks that don't evaluate communication skills."
    425     },
    426     {
    427       "title": "SWE-Agent: Agent-computer interfaces enable automated software engineering",
    428       "authors": ["John Yang", "Carlos E Jimenez"],
    429       "year": 2024,
    430       "arxiv_id": "2405.15793",
    431       "relevance": "Prominent LLM agent for software engineering tasks, relevant to agentic code generation evaluation."
    432     },
    433     {
    434       "title": "Code llama: Open foundation models for code",
    435       "authors": ["Baptiste Roziere", "Jonas Gehring"],
    436       "year": 2023,
    437       "arxiv_id": "2308.12950",
    438       "relevance": "One of the evaluated Code LLMs; key open-source code generation model."
    439     },
    440     {
    441       "title": "DeepSeek-Coder: When the Large Language Model Meets Programming",
    442       "authors": ["Daya Guo", "Qihao Zhu"],
    443       "year": 2024,
    444       "arxiv_id": "2401.14196",
    445       "relevance": "One of the evaluated Code LLMs; achieved top communication rates among base models."
    446     },
    447     {
    448       "title": "AgentCoder: Multi-agent-based code generation with iterative testing and optimisation",
    449       "authors": ["Dong Huang", "Qingwen Bu"],
    450       "year": 2023,
    451       "arxiv_id": "2312.13010",
    452       "relevance": "LLM agent baseline for code generation that was tested on HumanEvalComm but showed 0% communication rate."
    453     },
    454     {
    455       "title": "LLM is Like a Box of Chocolates: the Non-determinism of ChatGPT in Code Generation",
    456       "authors": ["Shuyin Ouyang", "Jie M Zhang"],
    457       "year": 2023,
    458       "arxiv_id": "2308.02828",
    459       "relevance": "Study of ChatGPT non-determinism in code generation; provided code used in this paper's evaluation."
    460     },
    461     {
    462       "title": "Beyond accuracy: Evaluating self-consistency of code large language models with identitychain",
    463       "authors": ["Marcus J Min", "Yangruibo Ding"],
    464       "year": 2023,
    465       "arxiv_id": "2310.14053",
    466       "relevance": "Evaluates self-consistency of Code LLMs beyond accuracy; provided code used in this paper's evaluation."
    467     },
    468     {
    469       "title": "Reflexion: Language agents with verbal reinforcement learning",
    470       "authors": ["Noah Shinn", "Federico Cassano"],
    471       "year": 2023,
    472       "arxiv_id": "2303.11366",
    473       "relevance": "Framework for LLM self-improvement through verbal reinforcement; related agent approach for code generation."
    474     },
    475     {
    476       "title": "ReAct: Synergizing reasoning and acting in language models",
    477       "authors": ["Shunyu Yao", "Jeffrey Zhao"],
    478       "year": 2023,
    479       "arxiv_id": "2210.03629",
    480       "relevance": "Foundational work on integrating reasoning and acting in LLMs; underpins many agentic code generation approaches."
    481     },
    482     {
    483       "title": "LLM-based Test-driven Interactive Code Generation: User Study and Empirical Evaluation",
    484       "authors": ["Sarah Fakhoury", "Aaditya Naik"],
    485       "year": 2024,
    486       "arxiv_id": "2404.10100",
    487       "relevance": "Proposes interactive test-driven code generation with LLMs, closely related to communication-enhanced code generation."
    488     },
    489     {
    490       "title": "Rethinking Software Engineering in the Foundation Model Era: From Task-Driven AI Copilots to Goal-Driven AI Pair Programmers",
    491       "authors": ["Ahmed E Hassan", "Gustavo A Oliva"],
    492       "year": 2024,
    493       "arxiv_id": "2404.10225",
    494       "relevance": "Discusses the shift from task-driven to goal-driven AI programming assistants; frames the communication capability gap this paper addresses."
    495     }
    496   ]
    497 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs