scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (29561B)
      1 {
      2   "paper": {
      3     "title": "Identifying Inaccurate Descriptions in LLM-generated Code Comments via Test Execution",
      4     "authors": ["Sungmin Kang", "Louis Milliken", "Shin Yoo"],
      5     "year": 2024,
      6     "venue": "arXiv",
      7     "arxiv_id": "2406.14836",
      8     "doi": "10.48550/arXiv.2406.14836"
      9   },
     10   "scan_version": 2,
     11   "active_modules": ["experimental_rigor", "data_leakage"],
     12   "methodology_tags": ["benchmark-eval", "qualitative"],
     13   "key_findings": "Even the best-performing LLM (GPT-4) generates factually inaccurate code comments in roughly 20% of cases. Nine existing code-comment consistency detection techniques showed no statistically significant relationship with comment factual accuracy. The proposed 'document testing' concept — generating tests from comments via LLMs and observing pass/fail — yields a robust statistical relationship with comment accuracy (p < 10⁻⁹, ROC-AUC 0.67), with failing tests being more predictive of inaccuracies than passing tests.",
     14   "checklist": {
     15     "artifacts": {
     16       "code_released": {
     17         "applies": true,
     18         "answer": false,
     19         "justification": "No repository URL, GitHub link, or code archive is provided in the paper. Supplementary material is mentioned for prompts, but no code release is mentioned."
     20       },
     21       "data_released": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "The paper uses public Defects4J methods, but the 540 labeled comments, accuracy labels, and generated test results are not released. No dataset download link is provided."
     25       },
     26       "environment_specified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No environment specifications, requirements files, or dependency versions are described. Only the model version gpt-3.5-turbo-0125 and the Defects4J command line interface are mentioned."
     30       },
     31       "reproduction_instructions": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "No step-by-step reproduction instructions are provided. The pipeline is described conceptually (Figure 3) but without runnable scripts or commands."
     35       }
     36     },
     37     "statistical_methodology": {
     38       "confidence_intervals_or_error_bars": {
     39         "applies": true,
     40         "answer": true,
     41         "justification": "95% confidence intervals are reported throughout: Figure 4a (pass rate by accuracy), Figure 5 (ROC-AUC and AP), Figure 6 (binned inaccuracy rate), Figure 7 (ablation), and Figure 8 (w sensitivity)."
     42       },
     43       "significance_tests": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "Welch's t-test and Point-Biserial correlation are used systematically. Table 1 reports p-values for all 9 baselines. Section 6.1 reports p=0.002 for pass rate difference and p < 10⁻⁹ for the correctness estimator."
     47       },
     48       "effect_sizes_reported": {
     49         "applies": true,
     50         "answer": true,
     51         "justification": "ROC-AUC (0.67) and Average Precision are reported as effect size measures. Pass rate differences (~0.55 vs ~0.35) with CIs are shown. Practical thresholds are given: 'rejecting comments with score < 0.8 would remove 46% of inaccurate comments while retaining 72% of correct ones.'"
     52       },
     53       "sample_size_justified": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "180 methods were sampled from Defects4J but no justification is given for this specific number. No power analysis is discussed. The 141 unambiguously labeled comments used for evaluation are a subset determined by labeling outcome, not design."
     57       },
     58       "variance_reported": {
     59         "applies": true,
     60         "answer": true,
     61         "justification": "Section 5.2 states 'we repeated experiments five times to verify that our approach works consistently.' Results are reported as means with 95% confidence intervals across the five runs (Figures 5, 7, 8)."
     62       }
     63     },
     64     "evaluation_design": {
     65       "baselines_included": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "Nine baselines are evaluated: four code-comment consistency techniques (DocChecker, Deep-JIT, GPT-3-NoCoT, GPT-3-CoT), four similarity measures (BLEU, SentenceBERT, CodeT5, CodeBERT), and one LLM inaccuracy detector (CID). Results in Table 1 and Figure 5."
     69       },
     70       "baselines_contemporary": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Baselines include DocChecker (2024), Li and Shin/GPT-3-NoCoT (2024), CID (2024), Deep-JIT (2021), CodeT5 (2021), CodeBERT (2020). The most relevant consistency detection baselines are contemporary."
     74       },
     75       "ablation_study": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "RQ2 (Section 6.2, Figure 7) is an explicit prompt ablation study, removing components incrementally: comment/signature only → + class name → + constructor → + example tests → + two-stage prompting. Also compares EvoSuite vs human tests."
     79       },
     80       "multiple_metrics": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Multiple metrics are used: ROC-AUC, Average Precision (AP), Welch's t-test p-values, Point-Biserial correlation, and test pass rate. Section 5.2 justifies the choice of each metric."
     84       },
     85       "human_evaluation": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "RQ4 (Section 6.4) provides a qualitative evaluation where the authors manually analyze the system's outputs — examining specific test cases that succeeded (Figure 9) and failed (Figures 10, 11) to assess strengths and limitations."
     89       },
     90       "held_out_test_set": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "All 141 labeled comments are used for both evaluation (RQ1-2 with w=100) and parameter analysis (RQ3 exploring w on the same data). No train/dev/test split is used. The w=100 parameter choice for RQ1-2 is not justified from a separate validation set."
     94       },
     95       "per_category_breakdown": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Results are broken down by LLM (Figure 1: StarCoder, GPT-3, GPT-4), by accuracy type (Figure 4a: accurate, inaccurate, behaviorally inaccurate), and by error taxonomy (Figure 2: four error categories with counts)."
     99       },
    100       "failure_cases_discussed": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "RQ4 explicitly discusses failure cases: execution environment failures (Figure 10, test fails due to missing CSV file) and LLM-hallucinated properties (Figure 11, test based on fabricated property not in the comment)."
    104       },
    105       "negative_results_reported": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "Table 1 shows all 9 baseline techniques fail to detect inaccurate comments (a major negative result). Section 6.4 reports failure modes of the proposed approach. CodeBERT's spurious ROC-AUC is investigated and debunked (Section 6.1)."
    109       }
    110     },
    111     "claims_and_evidence": {
    112       "abstract_claims_supported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Abstract claims: (1) ~1/5 GPT-4 comments inaccurate → supported by Figure 1. (2) Nine techniques have no statistically significant relationship → supported by Table 1. (3) Document testing has robust statistical relationship → supported by Section 6.1 (p < 10⁻⁹, ROC-AUC 0.67)."
    116       },
    117       "causal_claims_justified": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "The paper's main claims are correlational ('statistical relationship'). Causal claims in the ablation study (RQ2, 'each element helps improve') are justified through controlled single-variable manipulation in Figure 7."
    121       },
    122       "generalization_bounded": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "The title says 'LLM-generated Code Comments' broadly, but results are only on Java methods from Defects4J with GPT-3.5-turbo. Section 7.1 acknowledges the limitation ('further research is required to tell whether these principles would work for other languages and projects'), but the title and abstract frame the contribution more broadly than the evidence supports."
    126       },
    127       "alternative_explanations_discussed": {
    128         "applies": true,
    129         "answer": true,
    130         "justification": "Section 7.1 discusses LLM randomness (mitigated by 5 runs) and training data contamination risk (argued not to invalidate results). Section 6.1 investigates CodeBERT's spurious performance. Section 6.4 discusses failure modes where the core assumption breaks down."
    131       },
    132       "proxy_outcome_distinction": {
    133         "applies": true,
    134         "answer": true,
    135         "justification": "The paper explicitly formalizes the proxy relationship: test pass rate as a proxy for comment accuracy, with a full Bayesian derivation (Section 4.1, Equations 1-13). Limitations of this proxy are discussed in Sections 6.4 and 7 (hallucinated properties, execution environment issues)."
    136       }
    137     },
    138     "setup_transparency": {
    139       "model_versions_specified": {
    140         "applies": true,
    141         "answer": false,
    142         "justification": "GPT-3 is versioned as 'gpt-3.5-turbo-0125' (Section 5.2). However, GPT-4 is referenced only as 'GPT-4 [19]' without a version or snapshot date, and StarCoder is referenced only by name without a specific version. Both are used in the accuracy evaluation (Section 2.1, Figure 1)."
    143       },
    144       "prompts_provided": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "The paper states 'The specific prompt is provided in our supplementary material' (Section 2.1) and 'The full prompt is provided in the supplementary material' (Section 4.2.2). Prompts are provided, though in supplementary material rather than the main paper."
    148       },
    149       "hyperparameters_reported": {
    150         "applies": true,
    151         "answer": false,
    152         "justification": "No LLM hyperparameters (temperature, top-p, max tokens) are reported for any model used in the experiments. Only the model name/version and number of runs (5) are stated."
    153       },
    154       "scaffolding_described": {
    155         "applies": false,
    156         "answer": false,
    157         "justification": "No agentic scaffolding is used. The pipeline (Figure 3) is a sequential information retrieval → two-stage prompting → test execution workflow, not an agentic system with tools, memory, or feedback loops."
    158       },
    159       "data_preprocessing_documented": {
    160         "applies": true,
    161         "answer": true,
    162         "justification": "Section 2.1 documents the sampling procedure: 180 public methods from Defects4J fixed files, selected for longest comments, generating 540 comments across 3 LLMs. The labeling procedure is described, with two annotators and 87% agreement. Section 5.2 notes 141 unambiguous labels used."
    163       }
    164     },
    165     "limitations_and_scope": {
    166       "limitations_section_present": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "Section 7 'Discussion' includes Section 7.1 'Threats to Validity' with substantive discussion of internal and external threats. Section 7.2 'Future Work' also discusses current limitations."
    170       },
    171       "threats_to_validity_specific": {
    172         "applies": true,
    173         "answer": true,
    174         "justification": "Section 7.1 discusses specific threats: LLM output randomness mitigated by 5 runs, GPT training data potentially containing subject code (specific to their choice of Defects4J), results limited to Java code from Defects4J benchmark."
    175       },
    176       "scope_boundaries_stated": {
    177         "applies": true,
    178         "answer": true,
    179         "justification": "Section 7.1 explicitly states: 'the experimental results we present were done on Java code from the widely-used Defects4J benchmark; further research is required to tell whether these principles would work for other languages and projects.' Section 7.2 identifies specific expansions needed."
    180       }
    181     },
    182     "data_integrity": {
    183       "raw_data_available": {
    184         "applies": true,
    185         "answer": false,
    186         "justification": "The 540 labeled comments, accuracy labels, generated tests, and test execution results are not released. No data download link or archive is provided."
    187       },
    188       "data_collection_described": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "Section 2.1 describes data collection in detail: sampling criteria (public methods from Defects4J fixed files, longest comments), three LLMs used, labeling procedure (first author labels, second author independently labels subset, 87% agreement, discussion to resolve disagreements)."
    192       },
    193       "recruitment_methods_described": {
    194         "applies": false,
    195         "answer": false,
    196         "justification": "No human participants. The study uses code from the standard public Defects4J benchmark. Author labeling is part of the methodology, not human subject recruitment."
    197       },
    198       "data_pipeline_documented": {
    199         "applies": true,
    200         "answer": true,
    201         "justification": "The pipeline is documented: 180 methods from Defects4J → 540 comments (3 LLMs × 180) → manual labeling → 141 unambiguous GPT-3 labels for experiments. The document testing pipeline (Figure 3) documents information retrieval → prompting → test execution stages."
    202       }
    203     },
    204     "conflicts_of_interest": {
    205       "funding_disclosed": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "No funding source, acknowledgments section, or grant information is mentioned in the paper."
    209       },
    210       "affiliations_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "All authors are from KAIST (Daejeon, South Korea), clearly stated in the author block. They evaluate third-party products (GPT, StarCoder) with no affiliation to OpenAI or BigCode."
    214       },
    215       "funder_independent_of_outcome": {
    216         "applies": true,
    217         "answer": false,
    218         "justification": "Funding is not disclosed, so independence cannot be assessed. KAIST authors evaluating third-party tools suggests no obvious conflict, but the absence of any funding disclosure means this cannot be confirmed."
    219       },
    220       "financial_interests_declared": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No competing interests or financial interests statement is present in the paper."
    224       }
    225     },
    226     "contamination": {
    227       "training_cutoff_stated": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "No explicit training data cutoff dates are stated for GPT-3.5-turbo, GPT-4, or StarCoder. Section 7.1 acknowledges 'the training data of the GPT family of LLMs is unknown' but does not state any cutoff dates."
    231       },
    232       "train_test_overlap_discussed": {
    233         "applies": true,
    234         "answer": true,
    235         "justification": "Section 7.1 directly discusses this: 'As the training data of the GPT family of LLMs is unknown, there is also the risk that the LLM had learned the subject code. However, this did not translate into the LLM correctly predicting which comments were accurate on its own; only with document testing could we find a reliable predictor.'"
    236       },
    237       "benchmark_contamination_addressed": {
    238         "applies": true,
    239         "answer": true,
    240         "justification": "Defects4J has been publicly available since 2014, well before GPT model training. Section 7.1 acknowledges the contamination risk and argues it does not invalidate the document testing results, since the LLM alone cannot predict comment accuracy without testing."
    241       }
    242     },
    243     "human_studies": {
    244       "pre_registered": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants. The study evaluates LLM-generated comments on code from Defects4J. Author labeling is methodological, not a human subjects study."
    248       },
    249       "irb_or_ethics_approval": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants. The study analyzes code and LLM-generated comments."
    253       },
    254       "demographics_reported": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in the study."
    258       },
    259       "inclusion_exclusion_criteria": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in the study."
    263       },
    264       "randomization_described": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in the study."
    268       },
    269       "blinding_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants in the study."
    273       },
    274       "attrition_reported": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants in the study."
    278       }
    279     },
    280     "cost_and_practicality": {
    281       "inference_cost_reported": {
    282         "applies": true,
    283         "answer": false,
    284         "justification": "No API costs, token counts, or wall-clock time are reported despite the approach requiring multiple LLM calls per comment (property extraction + test generation) plus test compilation and execution."
    285       },
    286       "compute_budget_stated": {
    287         "applies": true,
    288         "answer": false,
    289         "justification": "No total computational budget is stated. The approach requires 5 runs × 141 comments × multiple LLM calls each, plus EvoSuite runs for one condition, but none of this is quantified."
    290       }
    291     },
    292     "experimental_rigor": {
    293       "seed_sensitivity_reported": {
    294         "applies": true,
    295         "answer": true,
    296         "justification": "Section 5.2: 'we repeated experiments five times to verify that our approach works consistently.' Results are reported with 95% CIs across runs, demonstrating sensitivity to LLM randomness (analogous to seed sensitivity)."
    297       },
    298       "number_of_runs_stated": {
    299         "applies": true,
    300         "answer": true,
    301         "justification": "Section 5.2 explicitly states: 'we repeated experiments five times.' The top three ablation experiments in Figure 7 were performed once (noted in caption)."
    302       },
    303       "hyperparameter_search_budget": {
    304         "applies": true,
    305         "answer": false,
    306         "justification": "RQ3 explores the w parameter across 201 values using a logarithmic schedule (Section 6.3), but the selection of w=100 for RQ1 and RQ2 is not justified through a principled search on a separate validation set. No search budget for other design choices is stated."
    307       },
    308       "best_config_selection_justified": {
    309         "applies": true,
    310         "answer": false,
    311         "justification": "w=100 is used in RQ1 and RQ2 before its sensitivity is explored in RQ3. While RQ3 confirms w>10 works well, the specific choice of w=100 is not justified through validation set selection or principled methodology."
    312       },
    313       "multiple_comparison_correction": {
    314         "applies": true,
    315         "answer": false,
    316         "justification": "Table 1 reports p-values for 9 baseline comparisons, and Section 6.1 reports additional significance tests. No correction for multiple comparisons (Bonferroni, Holm, etc.) is applied."
    317       },
    318       "self_comparison_bias_addressed": {
    319         "applies": true,
    320         "answer": false,
    321         "justification": "The authors propose and evaluate their own document testing technique against baselines. While they retrained Deep-JIT to confirm similar accuracy to the original paper, they do not discuss the general bias of evaluating their own system."
    322       },
    323       "compute_budget_vs_performance": {
    324         "applies": true,
    325         "answer": false,
    326         "justification": "Document testing requires multiple LLM calls plus test compilation and execution per comment, which is substantially more compute than any of the baselines (which require a single forward pass). This compute difference is not discussed or controlled for."
    327       },
    328       "benchmark_construct_validity": {
    329         "applies": true,
    330         "answer": true,
    331         "justification": "Section 2.1 extensively argues that their factual accuracy definition is more meaningful than BLEU or Likert-scale evaluations. They justify their labeling criteria with concrete examples (Section 3) and distinguish it from subjective 'accuracy' in prior work."
    332       },
    333       "scaffold_confound_addressed": {
    334         "applies": false,
    335         "answer": false,
    336         "justification": "No scaffolding comparison is involved. The approach uses a fixed pipeline; no model-to-model comparison through different scaffolds occurs."
    337       }
    338     },
    339     "data_leakage": {
    340       "temporal_leakage_addressed": {
    341         "applies": true,
    342         "answer": false,
    343         "justification": "Defects4J code has been publicly available since 2014, predating all LLM training. While Section 7.1 discusses the general contamination risk, no temporal analysis is performed to assess whether specific methods or their tests appeared in training data."
    344       },
    345       "feature_leakage_addressed": {
    346         "applies": true,
    347         "answer": true,
    348         "justification": "Section 4.2.1 explicitly withholds the method body from test generation prompts: 'only the signature is provided because the aim is to evaluate the comment; if the code itself is exposed, the LLM may be more influenced by the code than the comment.' This is a deliberate leakage prevention measure."
    349       },
    350       "non_independence_addressed": {
    351         "applies": true,
    352         "answer": false,
    353         "justification": "Methods are sampled from multiple Defects4J projects, but no analysis is performed of whether methods from the same project share characteristics that could inflate results. Non-independence of examples from the same class or project is not discussed."
    354       },
    355       "leakage_detection_method": {
    356         "applies": true,
    357         "answer": false,
    358         "justification": "No concrete leakage detection method (canary strings, membership inference, n-gram overlap, decontamination pipeline) is applied. The code withholding in Section 4.2.1 is a design choice for the pipeline, not a data contamination detection method."
    359       }
    360     }
    361   },
    362   "claims": [
    363     {
    364       "claim": "Even the best-performing LLM (GPT-4) generates factually inaccurate code comments in roughly one fifth of cases.",
    365       "evidence": "Manual inspection of 540 LLM-generated comments across StarCoder, GPT-3, and GPT-4. Figure 1 shows GPT-4 has ~20% inaccurate comments, GPT-3 ~33%, StarCoder ~50%. Inter-rater agreement at 87% (Section 2.1).",
    366       "supported": "strong"
    367     },
    368     {
    369       "claim": "Nine existing techniques (code-comment consistency detectors, similarity metrics, LLM self-inspection) show no statistically significant relationship with comment factual accuracy.",
    370       "evidence": "Table 1 reports Welch's t-test and Point-Biserial correlation p-values for all 9 baselines on 141 unambiguous GPT-3 labels. All p-values exceed 0.05 (Section 2.2).",
    371       "supported": "strong"
    372     },
    373     {
    374       "claim": "Document testing has a robust statistical relationship with comment accuracy, unlike all baselines.",
    375       "evidence": "Section 6.1: test pass rate difference between accurate and inaccurate comments (p=0.002), correctness estimator p < 10⁻⁹ (Welch's t-test), p < 10⁻¹¹ (Point-Biserial). ROC-AUC 0.67. Figure 5 shows document testing outperforms all baselines on both ROC-AUC and AP with non-overlapping 95% CIs.",
    376       "supported": "strong"
    377     },
    378     {
    379       "claim": "Each component of the helper information (class name, constructors, example tests, two-stage prompting) improves document testing performance.",
    380       "evidence": "RQ2 (Section 6.2, Figure 7) ablation study shows incremental improvement in both executable test proportion and ROC-AUC as components are added.",
    381       "supported": "moderate"
    382     },
    383     {
    384       "claim": "Failing tests are more predictive of inaccurate comments than passing tests (optimal w > 10).",
    385       "evidence": "RQ3 (Section 6.3, Figure 8) shows ROC-AUC peaks when w > 10, meaning failing tests are weighted more heavily. Explained by labeling procedure: one inaccurate statement makes the whole comment inaccurate.",
    386       "supported": "moderate"
    387     }
    388   ],
    389   "red_flags": [
    390     {
    391       "flag": "Small sample size for main evaluation",
    392       "detail": "Only 141 unambiguously labeled GPT-3 comments are used for the main evaluation (RQ1-4). The 540 comments span 3 LLMs, but only GPT-3's unambiguous subset is evaluated with document testing. The StarCoder validation is mentioned but not systematically reported."
    393     },
    394     {
    395       "flag": "Single primary annotator",
    396       "detail": "The first author labeled all 540 comments. The second author independently labeled only a 'subset' (proportion unspecified). At 87% agreement, 13% of labels are potentially contested. The final discussed labels are used, but the process relies heavily on one annotator."
    397     },
    398     {
    399       "flag": "Modest discriminative ability",
    400       "detail": "ROC-AUC of 0.67 is statistically significant and better than baselines (all near 0.5), but indicates limited practical discriminative ability. The paper frames this result as strong, but it is only modestly above the 0.5 that a random classifier achieves."
    401     },
    402     {
    403       "flag": "Parameter selection on evaluation data",
    404       "detail": "w=100 is used in RQ1-2 before RQ3 explores the parameter space on the same 141 examples. This circular use of the evaluation data could inflate reported performance."
    405     }
    406   ],
    407   "cited_papers": [
    408     {
    409       "title": "StarCoder: may the source be with you!",
    410       "authors": ["Raymond Li", "Loubna Ben Allal", "Yangtian Zi"],
    411       "year": 2023,
    412       "arxiv_id": "2305.06161",
    413       "relevance": "Open-source code LLM evaluated for comment generation accuracy, directly relevant to LLM code generation capabilities."
    414     },
    415     {
    416       "title": "GPT-4 Technical Report",
    417       "authors": ["OpenAI"],
    418       "year": 2023,
    419       "arxiv_id": "2303.08774",
    420       "relevance": "State-of-the-art LLM evaluated for comment generation; relevant to LLM capability assessment."
    421     },
    422     {
    423       "title": "Large language models for software engineering: Survey and open problems",
    424       "authors": ["Angela Fan", "Beliz Gokkaya", "Mark Harman"],
    425       "year": 2023,
    426       "arxiv_id": "2310.03533",
    427       "relevance": "Survey of LLMs for SE tasks including code generation and documentation."
    428     },
    429     {
    430       "title": "Large language models are few-shot summarizers: Multi-intent comment generation via in-context learning",
    431       "authors": ["Mingyang Geng", "Shangwen Wang", "Dezun Dong"],
    432       "year": 2024,
    433       "relevance": "LLM-based code comment generation achieving state-of-the-art; relevant to automated documentation quality."
    434     },
    435     {
    436       "title": "CodeBERT: A pre-trained model for programming and natural languages",
    437       "authors": ["Zhangyin Feng", "Daya Guo", "Duyu Tang"],
    438       "year": 2020,
    439       "arxiv_id": "2002.08155",
    440       "relevance": "Pre-trained code model used as baseline for code-comment consistency, relevant to code understanding capabilities."
    441     },
    442     {
    443       "title": "CodeT5: Identifier-aware unified pre-trained encoder-decoder models for code understanding and generation",
    444       "authors": ["Yue Wang", "Weishi Wang", "Shafiq Joty"],
    445       "year": 2021,
    446       "arxiv_id": "2109.00859",
    447       "relevance": "Pre-trained code model used as baseline; relevant to code generation and understanding."
    448     },
    449     {
    450       "title": "Large language models are few-shot testers: Exploring LLM-based general bug reproduction",
    451       "authors": ["Sungmin Kang", "Juyeon Yoon", "Shin Yoo"],
    452       "year": 2023,
    453       "relevance": "LLM-based test generation for bug reproduction; foundational to the document testing pipeline."
    454     },
    455     {
    456       "title": "Self-Consistency Improves Chain of Thought Reasoning in Language Models",
    457       "authors": ["Xuezhi Wang", "Jason Wei", "Dale Schuurmans"],
    458       "year": 2023,
    459       "relevance": "Self-consistency technique for estimating LLM correctness; alternative approach to hallucination detection."
    460     },
    461     {
    462       "title": "Enhancing Trust in LLM-Generated Code Summaries with Calibrated Confidence Scores",
    463       "authors": ["Yuvraj Virk", "Premkumar Devanbu", "Toufique Ahmed"],
    464       "year": 2024,
    465       "arxiv_id": "2404.19318",
    466       "relevance": "Directly related work on trusting LLM-generated code documentation using token probabilities."
    467     },
    468     {
    469       "title": "Chain of Thought Prompting Elicits Reasoning in Large Language Models",
    470       "authors": ["Jason Wei", "Xuezhi Wang", "Dale Schuurmans"],
    471       "year": 2022,
    472       "arxiv_id": "2201.11903",
    473       "relevance": "Chain-of-thought prompting technique used as a baseline variant (GPT-3-CoT) in experiments."
    474     },
    475     {
    476       "title": "Whole Test Suite Generation",
    477       "authors": ["Gordon Fraser", "Andrea Arcuri"],
    478       "year": 2013,
    479       "relevance": "EvoSuite automated test generation tool used as an alternative to human tests in the document testing pipeline."
    480     },
    481     {
    482       "title": "Mutation-based consistency testing for evaluating the code understanding capability of LLMs",
    483       "authors": ["Ziyu Li", "Donghwan Shin"],
    484       "year": 2024,
    485       "arxiv_id": "2401.05940",
    486       "relevance": "LLM code understanding evaluation via mutation-based consistency; baseline technique in this study."
    487     }
    488   ]
    489 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs