ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (23943B)


      1 {
      2   "paper": {
      3     "title": "Automatic Generation of Benchmarks and Reliable LLM Judgment for Code Tasks",
      4     "authors": ["Eitan Farchi", "Shmulik Froimovich", "Rami Katan", "Orna Raz"],
      5     "year": 2024,
      6     "venue": "arXiv preprint",
      7     "arxiv_id": "2410.21071"
      8   },
      9   "checklist": {
     10     "artifacts": {
     11       "code_released": {
     12         "applies": true,
     13         "answer": false,
     14         "justification": "No repository URL, GitHub link, or code archive is provided anywhere in the paper. The benchmark generation engine and LaaJ implementations are described but not released."
     15       },
     16       "data_released": {
     17         "applies": true,
     18         "answer": false,
     19         "justification": "No dataset download link or data archive is provided. The generated benchmark data (program descriptions, code, summaries, LaaJ scores) is described but not released."
     20       },
     21       "environment_specified": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "No environment specification, dependency list, or software versions are mentioned. The paper describes the methodology conceptually but provides no technical setup details."
     25       },
     26       "reproduction_instructions": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No reproduction instructions, README, or runnable scripts are provided. The methodology is described at a high level but the paper does not provide enough specifics to replicate the benchmark generation or LaaJ evaluation pipeline."
     30       }
     31     },
     32     "statistical_methodology": {
     33       "confidence_intervals_or_error_bars": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "No confidence intervals or error bars are reported. The paper reports point estimates of accuracy (e.g., 99.54%, 99.85%) with no uncertainty quantification."
     37       },
     38       "significance_tests": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper compares two LaaJ models (LaaJ1 vs LaaJ2) and claims one is better than the other based on raw accuracy numbers (99.54% vs 99.85%) without any statistical significance test."
     42       },
     43       "effect_sizes_reported": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "No effect sizes are reported. The paper reports raw accuracy percentages and score distributions but no standardized effect sizes or baseline-contextualized improvements."
     47       },
     48       "sample_size_justified": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "The sample sizes (40 program ideas from 4 seeds, 232 valid pairs, 464 total pairs) are not justified. No power analysis or rationale for why these numbers are sufficient is provided. The paper mentions sampling 20-30 points for eyeballing based on CLT (Section 7.2) but this is for manual validation, not for the main experimental results."
     52       },
     53       "variance_reported": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "No variance, standard deviation, or spread measures are reported across runs. The experiment appears to be a single run with no repetition."
     57       }
     58     },
     59     "evaluation_design": {
     60       "baselines_included": {
     61         "applies": true,
     62         "answer": false,
     63         "justification": "No baseline comparisons are included. The paper does not compare its LaaJ approach against existing metrics (BLEU, ROUGE-L, CodeBLEU, CodeBERTScore, ICE-Score) despite mentioning them in Section 1.1. The two LaaJ variants are compared against each other but not against any external baseline."
     64       },
     65       "baselines_contemporary": {
     66         "applies": true,
     67         "answer": false,
     68         "justification": "No baselines are included at all, so contemporaneity cannot be assessed. The paper mentions BLEU, ROUGE-L, CodeBLEU, CodeBERTScore, and ICE-Score in the related work but never compares against them."
     69       },
     70       "ablation_study": {
     71         "applies": true,
     72         "answer": false,
     73         "justification": "No ablation study is conducted. The system has multiple components (seed generation, code generation, summary generation, similarity scale, threshold selection) but none are individually ablated to understand their contribution."
     74       },
     75       "multiple_metrics": {
     76         "applies": true,
     77         "answer": false,
     78         "justification": "Only accuracy (correct identification of true/false pairs) is reported as an evaluation metric. No other metrics such as precision, recall, F1, or agreement measures (e.g., Cohen's kappa) are used."
     79       },
     80       "human_evaluation": {
     81         "applies": true,
     82         "answer": false,
     83         "justification": "No human evaluation of the LaaJ outputs is conducted. The paper discusses the need for human judgment as the gold standard (Section 7.2, 'direct eyeballing of the data is continually needed') but does not actually perform any human evaluation in the reported experiments. The correctness labels are derived from the self-consistency property (same seed = similar, different seed = dissimilar) rather than human judgment."
     84       },
     85       "held_out_test_set": {
     86         "applies": true,
     87         "answer": false,
     88         "justification": "No explicit separation between development and test data is described. The 464 pairs appear to be used for both developing and evaluating the LaaJ models, with no held-out test set mentioned."
     89       },
     90       "per_category_breakdown": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Tables 1 and 2 provide score distributions for the two LaaJ models. Section 6.1 reports separate results for the cluster-based experiments (sorting algorithms and graph traversal) vs. the non-cluster experiments, showing per-category breakdowns of accuracy."
     94       },
     95       "failure_cases_discussed": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Section 6.1 and Table 2 discuss failure cases where scores were incorrectly labeled ('The scores highlighted in red represent those that were incorrectly labeled as true'). Section 7 ('Pitfalls and how we address them') discusses several failure modes including overfitting, LaaJ being only a proxy, and challenges with realistic programs."
     99       },
    100       "negative_results_reported": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "The paper reports that some programs were corrupted due to formatting errors (leaving only 232 valid pairs from the original expected number), and that cluster-based comparisons reduced accuracy (from 100% to 98.44%/99.48%). Section 7 discusses multiple pitfalls and challenges."
    104       }
    105     },
    106     "claims_and_evidence": {
    107       "abstract_claims_supported": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "The abstract claims are methodological ('we introduce a methodology to generate and evaluate LaaJ implementations, utilizing an automatically generated benchmark') and are supported by the described approach and Section 6 experiments. The abstract does not make unsupported quantitative claims."
    111       },
    112       "causal_claims_justified": {
    113         "applies": true,
    114         "answer": false,
    115         "justification": "The paper makes implicit causal claims about its approach enabling 'high quality code task solutions' (abstract) and that the self-consistency methodology creates reliable benchmarks. However, no controlled experiment or rigorous evaluation demonstrates that the approach actually causes better solutions compared to alternatives. The self-consistency check itself is presented as sufficient validation without external verification."
    116       },
    117       "generalization_bounded": {
    118         "applies": true,
    119         "answer": false,
    120         "justification": "The paper claims the approach 'is general and will be applicable to any context of LLM tasks in which loops in G can be utilized' (Section 1.2) but experiments only cover code summarization comparisons across three programming languages with small, simple programs. The title and abstract suggest generality ('Code Tasks') while results are limited to summary comparison for simple algorithmic programs."
    121       },
    122       "alternative_explanations_discussed": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "No alternative explanations are discussed for the high accuracy results. The 100% accuracy on non-cluster pairs could be due to the trivial nature of distinguishing summaries of completely different programs. The paper does not consider whether the self-consistency property creates artificially easy evaluation tasks."
    126       }
    127     },
    128     "setup_transparency": {
    129       "model_versions_specified": {
    130         "applies": true,
    131         "answer": false,
    132         "justification": "No specific model names or versions are mentioned anywhere in the paper. The paper refers only to generic 'LLMs', 'LLM_t' (the tested LLM), and 'LLM_s' (the strong LLM) without identifying what models were used. Section 6 says 'We tried the experiment with different LLMs' without naming any."
    133       },
    134       "prompts_provided": {
    135         "applies": true,
    136         "answer": false,
    137         "justification": "While the paper provides example seed concepts and program idea expansions (Section 3.1-3.2), the actual prompts used for code generation, summary generation, and LaaJ evaluation are not provided. The similarity scale descriptions (Sections 2 and 4) are provided but the full prompt text sent to the LLM judges is not shown."
    138       },
    139       "hyperparameters_reported": {
    140         "applies": true,
    141         "answer": false,
    142         "justification": "No hyperparameters (temperature, top-p, max tokens, etc.) are reported for any of the LLM calls. The threshold of 4 on the 1-7 similarity scale is mentioned but no LLM inference parameters are provided."
    143       },
    144       "scaffolding_described": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "Section 4.1 describes the LaaJ architecture with three layers: Prompt Optimization (dynamic few-shot learning, automated prompt extraction), Optimized Inferencing (batch processing, token allocation, error recovery), and Postprocessing (formatting, analytical postprocessing, error handling). The multi-agent pipeline is described conceptually with the graph G framework."
    148       },
    149       "data_preprocessing_documented": {
    150         "applies": true,
    151         "answer": false,
    152         "justification": "The paper mentions that 'due to formatting errors, some programs were corrupted, leaving us with 232 valid pairs' (Section 6) but does not describe the filtering criteria used to identify and remove corrupted programs, or how many were removed from the original count."
    153       }
    154     },
    155     "limitations_and_scope": {
    156       "limitations_section_present": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "Section 7 ('Pitfalls and how we address them') serves as a limitations section, discussing overfitting (7.1), LaaJ being only a proxy for human judgment (7.2), challenges with realistic programs (7.3), and coverage (7.4)."
    160       },
    161       "threats_to_validity_specific": {
    162         "applies": true,
    163         "answer": true,
    164         "justification": "Section 7 discusses specific threats: overfitting to examples shared with the development team (7.1), the gap between LaaJ proxy and human judgment (7.2), simple programs not representing real-world complexity (7.3), and representativeness of generated data (7.4). These are specific to this study rather than generic boilerplate."
    165       },
    166       "scope_boundaries_stated": {
    167         "applies": true,
    168         "answer": false,
    169         "justification": "The paper does not explicitly state what the results do not show. While Section 7.3 acknowledges that simple programs may not represent realistic programs, the paper does not explicitly bound its claims to the tested languages, program sizes, or task types. The generalization claim in Section 1.2 ('the approach is general') is unbounded."
    170       }
    171     },
    172     "data_integrity": {
    173       "raw_data_available": {
    174         "applies": true,
    175         "answer": false,
    176         "justification": "Raw data (generated programs, summaries, LaaJ scores, pair labels) is not available. Only aggregate results (accuracy percentages and score distributions in Tables 1 and 2) are presented."
    177       },
    178       "data_collection_described": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "The data generation procedure is described in detail in Sections 3.1-3.4: starting from seed concepts, expanding to program ideas, generating code in multiple languages, creating summaries, and formulating comparison pairs. Section 6 describes the specific experiment with 4 seeds, 40 program ideas, and the resulting pairs."
    182       },
    183       "recruitment_methods_described": {
    184         "applies": false,
    185         "answer": false,
    186         "justification": "No human participants are involved. The data is entirely LLM-generated from seed concepts."
    187       },
    188       "data_pipeline_documented": {
    189         "applies": true,
    190         "answer": false,
    191         "justification": "The conceptual pipeline is described (seeds -> ideas -> code -> summaries -> pairs) but critical details are missing: the original number of programs before corruption filtering, the criteria for identifying corrupted programs, and how the 232 valid pairs were selected from the original generation. The jump from 40 program ideas to 232 valid pairs (more would be expected) is not fully explained."
    192       }
    193     },
    194     "conflicts_of_interest": {
    195       "funding_disclosed": {
    196         "applies": true,
    197         "answer": false,
    198         "justification": "No funding source is disclosed anywhere in the paper. All four authors have IBM email addresses, suggesting this is IBM-funded work, but no explicit funding statement is provided."
    199       },
    200       "affiliations_disclosed": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "Author affiliations are disclosed via IBM email addresses (farchi@il.ibm.com, shmulik.froimovich@ibm.com, rami.katan@il.ibm.com, ornar@il.ibm.com). All four authors are at IBM."
    204       },
    205       "funder_independent_of_outcome": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "All authors are IBM employees. IBM has a commercial interest in LLM code generation solutions. The paper describes a testing methodology for IBM's code generation tools, making IBM a non-independent funder with a stake in demonstrating the approach works. This conflict is not acknowledged."
    209       },
    210       "financial_interests_declared": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No competing interests statement or financial disclosures are provided in the paper."
    214       }
    215     },
    216     "contamination": {
    217       "training_cutoff_stated": {
    218         "applies": true,
    219         "answer": false,
    220         "justification": "No model training cutoff dates are stated. The paper does not even name the models used, let alone their training data cutoffs. Since the LaaJ models evaluate summaries that could overlap with training data, this is relevant."
    221       },
    222       "train_test_overlap_discussed": {
    223         "applies": true,
    224         "answer": false,
    225         "justification": "No discussion of potential train/test overlap. The generated benchmark consists of common algorithmic problems (sorting, graph traversal, matrix multiplication) that are abundantly present in LLM training data, which could inflate the quality of generated code and summaries. This is not addressed."
    226       },
    227       "benchmark_contamination_addressed": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "The paper does not discuss whether the generated benchmark programs (common CS algorithms) may have been seen during model training. While the paper argues fresh benchmarks can be generated to avoid overfitting (Section 7.1), this is about overfitting to specific examples, not about pre-training data contamination."
    231       }
    232     },
    233     "human_studies": {
    234       "pre_registered": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "No human participants are involved in the study."
    238       },
    239       "irb_or_ethics_approval": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "No human participants are involved in the study."
    243       },
    244       "demographics_reported": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants are involved in the study."
    248       },
    249       "inclusion_exclusion_criteria": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants are involved in the study."
    253       },
    254       "randomization_described": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants are involved in the study."
    258       },
    259       "blinding_described": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants are involved in the study."
    263       },
    264       "attrition_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants are involved in the study."
    268       }
    269     },
    270     "cost_and_practicality": {
    271       "inference_cost_reported": {
    272         "applies": true,
    273         "answer": false,
    274         "justification": "No inference costs, token counts, or wall-clock times are reported despite the approach involving multiple LLM calls per benchmark instance (code generation, summary generation, LaaJ evaluation). Section 7.1 mentions 'the cost, mainly of LLM inference time, of generating a fresh sample' but does not quantify it."
    275       },
    276       "compute_budget_stated": {
    277         "applies": true,
    278         "answer": false,
    279         "justification": "No computational budget, GPU hours, or API costs are reported for any of the experiments."
    280       }
    281     }
    282   },
    283   "claims": [
    284     {
    285       "claim": "The LaaJ approach achieves 100% accuracy in identifying true and false summary pairs for non-cluster program ideas.",
    286       "evidence": "Section 6 reports that both LaaJ models achieved 100% accuracy on 464 pairs (232 true, 232 false), and 100% accuracy on symmetry tests (LaaJ(a,b) = LaaJ(b,a)).",
    287       "supported": "weak"
    288     },
    289     {
    290       "claim": "For cluster-based program ideas (related but not identical programs), the combined benchmark achieves 99.54% accuracy for LaaJ1 and 99.85% for LaaJ2.",
    291       "evidence": "Section 6.1 reports these accuracy numbers when combining cluster and non-cluster results across 96 pairwise comparisons per cluster for sorting algorithms and graph traversal algorithms.",
    292       "supported": "moderate"
    293     },
    294     {
    295       "claim": "The self-consistency approach enables generating reliable labeled data at scale for LaaJ validation.",
    296       "evidence": "The paper describes the graph-based methodology (Section 3.3) and demonstrates it with one experiment (Section 6). However, the scale is modest (40 program ideas, 464 pairs) and no comparison against human-labeled data is provided.",
    297       "supported": "weak"
    298     },
    299     {
    300       "claim": "The approach is general and applicable to any context of LLM tasks in which loops in G can be utilized to obtain correctness claims.",
    301       "evidence": "Section 1.2 states this claim. Only code summarization across languages is demonstrated; no other LLM task domains are tested.",
    302       "supported": "unsupported"
    303     }
    304   ],
    305   "methodology_tags": ["benchmark-eval"],
    306   "key_findings": "The paper proposes a graph-based methodology for automatically generating benchmarks to evaluate LLM code-related tasks, using self-consistency properties (cycles in the generation graph) to create labeled data without human annotation. In experiments with 40 program ideas across COBOL, PL/1, Java, C++, and Python, two LaaJ models achieved 99.54% and 99.85% accuracy in distinguishing same-seed vs. different-seed code summaries. The approach is interesting conceptually but the evaluation is limited to one task (summary comparison) with simple algorithmic programs, no human-validated ground truth, and no comparison against existing metrics.",
    307   "red_flags": [
    308     {
    309       "flag": "No model identification",
    310       "detail": "The paper never names any LLM used in experiments. Neither the 'strong' LLM (LLM_s) nor the 'tested' LLM (LLM_t) is identified. This makes the results impossible to reproduce or contextualize."
    311     },
    312     {
    313       "flag": "No external baselines",
    314       "detail": "Despite discussing BLEU, ROUGE-L, CodeBLEU, CodeBERTScore, and ICE-Score in the introduction, none of these are compared against. The two LaaJ variants are only compared against each other."
    315     },
    316     {
    317       "flag": "Self-referential evaluation",
    318       "detail": "The benchmark labels are derived from the self-consistency property (same seed = similar summaries) rather than from human judgment. The paper acknowledges human judgment is needed (Section 7.2) but does not perform any. This means the evaluation measures consistency with a structural assumption, not actual quality."
    319     },
    320     {
    321       "flag": "Trivially easy evaluation task",
    322       "detail": "Distinguishing summaries of completely different programs (e.g., sorting vs. financial applications) is likely trivial for any LLM judge. The 100% accuracy on non-cluster pairs may not demonstrate meaningful capability. The more interesting cluster results (98-99%) receive less attention."
    323     },
    324     {
    325       "flag": "IBM conflict of interest not acknowledged",
    326       "detail": "All four authors are IBM employees working on LLM code generation solutions. The paper describes a testing methodology for such solutions without acknowledging the potential conflict of interest."
    327     },
    328     {
    329       "flag": "Extremely small experiment scale",
    330       "detail": "Only 40 program ideas from 4 seed concepts are used. With formatting errors reducing the valid pairs to 232, the evaluation is very small for the generality of the claims made."
    331     }
    332   ],
    333   "cited_papers": [
    334     {
    335       "title": "Evaluating large language models trained on code",
    336       "authors": ["Mark Chen", "Jerry Tworek", "Heewoo Jun"],
    337       "year": 2021,
    338       "arxiv_id": "2107.03374",
    339       "relevance": "Introduces HumanEval benchmark for code generation evaluation, a foundational benchmark in LLM code assessment."
    340     },
    341     {
    342       "title": "CrossCodeEval: A Diverse and Multilingual Benchmark for Cross-File Code Completion",
    343       "authors": ["Yangruibo Ding", "Zijian Wang", "Wasi Uddin Ahmad"],
    344       "year": 2023,
    345       "relevance": "Multilingual code completion benchmark relevant to evaluating LLM code generation across programming languages."
    346     },
    347     {
    348       "title": "SWE-bench: Can Language Models Resolve Real-World GitHub Issues?",
    349       "authors": ["Carlos E. Jimenez", "John Yang", "Alexander Wettig"],
    350       "year": 2024,
    351       "relevance": "Major benchmark for evaluating LLM capability on real-world software engineering tasks."
    352     },
    353     {
    354       "title": "CodeBLEU: A Method for Automatic Evaluation of Code Synthesis",
    355       "authors": ["Shuo Ren", "Daya Guo", "Shuai Lu"],
    356       "year": 2020,
    357       "relevance": "Code evaluation metric incorporating code structure, relevant to automated assessment of code generation quality."
    358     },
    359     {
    360       "title": "CodeBERTScore: Evaluating Code Generation with Pretrained Models of Code",
    361       "authors": ["Shuyan Zhou", "Uri Alon", "Sumit Agarwal", "Graham Neubig"],
    362       "year": 2023,
    363       "relevance": "Semantic-embedding-based code evaluation metric relevant to LLM code generation assessment methodology."
    364     },
    365     {
    366       "title": "ICE-Score: Instructing Large Language Models to Evaluate Code",
    367       "authors": ["Terry Yue Zhuo"],
    368       "year": 2024,
    369       "relevance": "LLM-as-judge approach for code evaluation, directly comparable to the LaaJ methodology proposed in this paper."
    370     }
    371   ]
    372 }

Impressum · Datenschutz