scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (29691B)
      1 {
      2   "paper": {
      3     "title": "T3: Multi-level Tree-based Automatic Program Repair with Large Language Models",
      4     "authors": ["Quanming Liu", "Xupeng Bu", "Zhichao Yan", "Ru Li"],
      5     "year": 2025,
      6     "venue": "IEEE International Joint Conference on Neural Network (IJCNN)",
      7     "arxiv_id": "2506.21211",
      8     "doi": "10.1109/IJCNN64981.2025.11228000"
      9   },
     10   "scan_version": 3,
     11   "active_modules": ["experimental_rigor", "data_leakage"],
     12   "methodology_tags": ["benchmark-eval"],
     13   "key_findings": "T3 proposes a multi-level tree-based reasoning framework for automatic program repair that decomposes the repair process into sample retrieval, cause analysis, repair plan generation, and patch generation. On the MODIT dataset, T3 achieves 46.70% (B2Fs) and 28.20% (B2Fm) repair rates with GPT-3.5-turbo, outperforming the CoT baseline by 11.20 and 9.90 percentage points respectively, and 48.20%/32.10% with GPT-4o-mini. Ablation shows both cause analysis and repair planning contribute, and lower SC accuracy on complex tasks suggests the method explores diverse reasoning paths.",
     14   "checklist": {
     15     "artifacts": {
     16       "code_released": {
     17         "applies": true,
     18         "answer": false,
     19         "justification": "No source code repository URL is provided anywhere in the paper. No GitHub link, Zenodo archive, or supplementary material reference."
     20       },
     21       "data_released": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "The paper uses the publicly available MODIT dataset [31] including B2Fs and B2Fm subsets [32], which are established public benchmarks of bug fix commits from GitHub."
     25       },
     26       "environment_specified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No environment specifications are provided. While the paper specifies model versions (gpt-3.5-turbo-0125, gpt-4o-mini-2024-07-18) and mentions the BM25 algorithm, there are no requirements.txt, Docker files, or library version details."
     30       },
     31       "reproduction_instructions": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "No step-by-step reproduction instructions are provided. The method is described algorithmically but there are no runnable scripts, README, or commands to replicate the experiments."
     35       }
     36     },
     37     "statistical_methodology": {
     38       "confidence_intervals_or_error_bars": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "All results in Tables I, II, and III are reported as single point estimates (e.g., '46.70%') with no confidence intervals, error bars, or ± notation."
     42       },
     43       "significance_tests": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "No statistical significance tests are performed. Claims like 'T3 achieves a repair rate of 46.70%, outperforming the CoT baseline by 11.20%' are based solely on comparing raw percentages without any p-values or tests."
     47       },
     48       "effect_sizes_reported": {
     49         "applies": true,
     50         "answer": true,
     51         "justification": "The paper reports percentage improvements with baseline context throughout, e.g., '11.20% improvement over CoT' with raw figures (46.70% vs 35.50% on B2Fs, 28.20% vs 18.30% on B2Fm), providing enough context to assess magnitude."
     52       },
     53       "sample_size_justified": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "No justification for why the MODIT dataset sizes are sufficient for the claims made, and no power analysis is discussed. The number of programs in each dataset is not even explicitly stated."
     57       },
     58       "variance_reported": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "Despite generating 30 samples per program with temperature 0.7 (Section IV.B), no standard deviation, variance, or spread measures across runs are reported. All tables show single aggregate numbers."
     62       }
     63     },
     64     "evaluation_design": {
     65       "baselines_included": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "Table I compares T3 against seven baselines: NatGen, S-C, S-C+BM25, CoT, Tree-of-Thought, Plan-and-Solve, and Analogical-Reasoning across two models."
     69       },
     70       "baselines_contemporary": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Baselines include methods from 2022-2024: NatGen (2022), Self-Consistency+BM25 (2023), ChatRepair-style CoT variants. These are reasonably contemporary for the APR domain."
     74       },
     75       "ablation_study": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Table III presents ablation results: 'w/o plan' (37.60%/19.90%), 'w/o cause' (37.30%/21.00%), and full T3 (46.70%/28.20%), demonstrating the contribution of each component."
     79       },
     80       "multiple_metrics": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Two metrics are used: repair rate (Table I, proportion of successfully repaired programs) and SC accuracy (Table II, consistency of patch generation)."
     84       },
     85       "human_evaluation": {
     86         "applies": true,
     87         "answer": false,
     88         "justification": "No human evaluation of repair quality is conducted. Evaluation is entirely automated, based on whether generated patches match ground truth. A single qualitative case study (Table IV) is shown but involves no systematic human evaluation."
     89       },
     90       "held_out_test_set": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "The paper does not explicitly describe train/test splits or state that results are on a held-out test set. It mentions using the MODIT dataset's B2Fs and B2Fm subsets but does not clarify whether any data was used for tuning decisions."
     94       },
     95       "per_category_breakdown": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Results are reported separately for B2Fs (smaller code sequences) and B2Fm datasets in all tables, allowing readers to assess performance across task complexity levels."
     99       },
    100       "failure_cases_discussed": {
    101         "applies": true,
    102         "answer": false,
    103         "justification": "The case study (Table IV, Section V.E) only shows where competing methods fail and T3 succeeds. No systematic analysis of T3's failure cases or where the approach breaks down is provided."
    104       },
    105       "negative_results_reported": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "Section V.B honestly reports that T3's SC accuracy on B2Fm (44.68%) is 'inferior relative to other methodologies,' noting a 'compromise in consistency during patch generation' for complex tasks. This is a genuine negative finding."
    109       }
    110     },
    111     "claims_and_evidence": {
    112       "abstract_claims_supported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "The abstract claims '11.20% and 9.90% improvement compared to methods based on CoT reasoning,' which matches Table I: B2Fs 46.70% vs 35.50% CoT (=11.20pp), B2Fm 28.20% vs 18.30% (=9.90pp) on GPT-3.5-turbo."
    116       },
    117       "causal_claims_justified": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "The paper makes causal claims ('T3 improves repair accuracy') supported by controlled ablation studies (Table III) that isolate the contribution of cause analysis and repair planning components through single-variable removal."
    121       },
    122       "generalization_bounded": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "The title claims 'Automatic Program Repair with Large Language Models' broadly, but experiments use only the MODIT dataset (Java bug fixes from GitHub) with only two OpenAI models. No discussion of generalization boundaries to other languages, bug types, or models."
    126       },
    127       "alternative_explanations_discussed": {
    128         "applies": true,
    129         "answer": false,
    130         "justification": "No alternative explanations are considered for the observed improvements. The paper does not discuss whether improvements could stem from increased compute (more reasoning trees = more API calls), prompt engineering advantages, or dataset-specific properties."
    131       },
    132       "proxy_outcome_distinction": {
    133         "applies": true,
    134         "answer": true,
    135         "justification": "The paper measures repair rate (exact match of generated patches to correct fixes) and frames results in terms of repair rate. The claims match the granularity of measurements — no broader claims about 'software quality' or 'developer productivity' are made."
    136       }
    137     },
    138     "setup_transparency": {
    139       "model_versions_specified": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "Footnote 1 in Section IV.B explicitly states: 'we use the version gpt-3.5-turbo-0125 and gpt-4o-mini-2024-07-18,' providing exact API snapshot versions."
    143       },
    144       "prompts_provided": {
    145         "applies": true,
    146         "answer": false,
    147         "justification": "No actual prompt text is provided. The paper describes the prompting process conceptually (e.g., 'carefully designed prompt template') but never shows the actual prompts used in experiments."
    148       },
    149       "hyperparameters_reported": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Section IV.B reports: temperature = 0.7, 30 generated samples per program. The BM25 retrieval parameter k is described. However, key T3-specific parameters (number of reasoning trees M, top-n values) are not explicitly stated."
    153       },
    154       "scaffolding_described": {
    155         "applies": true,
    156         "answer": true,
    157         "justification": "The four-stage framework is described in detail (Section III): BM25 sample retrieval, Forest of Thinking cause analysis with cross-tree voting, repair plan generation with frequency-based selection, and CoT patch generation. Workflow diagram provided in Fig. 2."
    158       },
    159       "data_preprocessing_documented": {
    160         "applies": true,
    161         "answer": false,
    162         "justification": "No description of how the MODIT dataset was preprocessed for the experiments. The paper jumps from dataset description ('bug fix commits collected from GitHub') directly to experimental results without documenting any intermediate processing steps."
    163       }
    164     },
    165     "limitations_and_scope": {
    166       "limitations_section_present": {
    167         "applies": true,
    168         "answer": false,
    169         "justification": "There is no limitations section, threats-to-validity section, or any dedicated discussion of the method's limitations. The conclusion does not mention limitations."
    170       },
    171       "threats_to_validity_specific": {
    172         "applies": true,
    173         "answer": false,
    174         "justification": "No threats to validity are discussed anywhere in the paper, neither specific nor generic."
    175       },
    176       "scope_boundaries_stated": {
    177         "applies": true,
    178         "answer": false,
    179         "justification": "No explicit scope boundaries are stated. The paper does not discuss what the results do not show, what settings are excluded, or what claims the authors are not making."
    180       }
    181     },
    182     "data_integrity": {
    183       "raw_data_available": {
    184         "applies": true,
    185         "answer": false,
    186         "justification": "No raw experimental data (generated patches, per-program results, model outputs) is made available. Only aggregated percentages in tables are reported."
    187       },
    188       "data_collection_described": {
    189         "applies": true,
    190         "answer": false,
    191         "justification": "The MODIT dataset description is minimal: 'includes two subsets: B2Fs (with smaller code sequences) and B2Fm' with 'bug fix commits collected from GitHub, along with detailed commit logs.' No time period, inclusion/exclusion criteria, or dataset size is stated."
    192       },
    193       "recruitment_methods_described": {
    194         "applies": false,
    195         "answer": false,
    196         "justification": "No human participants in this study. Data source is the MODIT standard benchmark dataset."
    197       },
    198       "data_pipeline_documented": {
    199         "applies": true,
    200         "answer": false,
    201         "justification": "No documentation of the data pipeline from raw dataset to final analysis. The paper does not describe how programs were selected from MODIT, how BM25 retrieval corpora were constructed, or how correctness of patches was verified."
    202       }
    203     },
    204     "conflicts_of_interest": {
    205       "funding_disclosed": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "No funding information is disclosed anywhere in the paper. No acknowledgments section mentioning grants or sponsors, despite using commercial API services that cost money."
    209       },
    210       "affiliations_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "All four authors list their affiliation as School of Computer and Information Technology, Shanxi University, Taiyuan, China. Email addresses are provided."
    214       },
    215       "funder_independent_of_outcome": {
    216         "applies": true,
    217         "answer": false,
    218         "justification": "Since no funding source is disclosed, independence of the funder cannot be assessed. The paper evaluates OpenAI models, and it is unknown whether OpenAI provided API credits or support."
    219       },
    220       "financial_interests_declared": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No competing interests or financial interests statement is included in the paper."
    224       }
    225     },
    226     "contamination": {
    227       "training_cutoff_stated": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "No training data cutoff dates are stated for GPT-3.5-turbo-0125 or GPT-4o-mini-2024-07-18, despite evaluating these models on a publicly available benchmark."
    231       },
    232       "train_test_overlap_discussed": {
    233         "applies": true,
    234         "answer": false,
    235         "justification": "No discussion of whether the MODIT dataset (GitHub bug fix commits) could have appeared in the GPT models' training data, which is highly plausible given GitHub is a major training data source."
    236       },
    237       "benchmark_contamination_addressed": {
    238         "applies": true,
    239         "answer": false,
    240         "justification": "The B2Fs/B2Fm bug-fix data distributed with MODIT dates back to 2019 (Tufano et al.) and consists of public GitHub commits, making it very likely to be in the GPT-3.5-turbo and GPT-4o-mini training data. This contamination risk is not discussed."
    241       }
    242     },
    243     "human_studies": {
    244       "pre_registered": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants in this study. It is a benchmark evaluation of automated program repair methods."
    248       },
    249       "irb_or_ethics_approval": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in this study."
    253       },
    254       "demographics_reported": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this study."
    258       },
    259       "inclusion_exclusion_criteria": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "randomization_described": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       },
    269       "blinding_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants in this study."
    273       },
    274       "attrition_reported": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants in this study."
    278       }
    279     },
    280     "cost_and_practicality": {
    281       "inference_cost_reported": {
    282         "applies": true,
    283         "answer": false,
    284         "justification": "No inference cost, latency, or token consumption is reported. The method requires multiple LLM calls per program (multiple reasoning trees, each with multiple CoT expansions, plus voting), making cost a significant practical concern that is completely unaddressed."
    285       },
    286       "compute_budget_stated": {
    287         "applies": true,
    288         "answer": false,
    289         "justification": "No total computational budget is stated. With 30 samples per program across multiple methods and two models, plus the multi-tree reasoning structure, the total API spend is unknown."
    290       }
    291     },
    292     "experimental_rigor": {
    293       "seed_sensitivity_reported": {
    294         "applies": true,
    295         "answer": false,
    296         "justification": "No seed sensitivity analysis. While temperature 0.7 introduces stochasticity and 30 samples are generated, results are not reported across multiple independent runs with different seeds."
    297       },
    298       "number_of_runs_stated": {
    299         "applies": true,
    300         "answer": true,
    301         "justification": "Section IV.B states: 'each program in every experimental group is tested with 30 generated samples.' The number of samples per program is explicitly stated."
    302       },
    303       "hyperparameter_search_budget": {
    304         "applies": true,
    305         "answer": false,
    306         "justification": "No hyperparameter search is described. Key parameters like the number of reasoning trees M, top-n selection threshold, and BM25 k value appear chosen without justification or search."
    307       },
    308       "best_config_selection_justified": {
    309         "applies": true,
    310         "answer": false,
    311         "justification": "No justification for how the final configuration was selected. The values of M (number of trees), n (top-n selection), and k (BM25 examples) are not even explicitly reported, let alone justified through systematic selection."
    312       },
    313       "multiple_comparison_correction": {
    314         "applies": true,
    315         "answer": false,
    316         "justification": "Multiple comparisons are made across methods, models, and datasets without any statistical testing, let alone correction for multiple comparisons."
    317       },
    318       "self_comparison_bias_addressed": {
    319         "applies": true,
    320         "answer": false,
    321         "justification": "No acknowledgment of author-evaluation bias. The authors implement all baseline prompting methods (CoT, Tree-of-Thought, Plan-and-Solve, Analogical-Reasoning) themselves, which risks systematically disadvantaging baselines."
    322       },
    323       "compute_budget_vs_performance": {
    324         "applies": true,
    325         "answer": false,
    326         "justification": "No comparison at matched compute budgets. T3 uses multiple reasoning trees with multiple CoT expansions each, consuming substantially more API calls than single-chain baselines like CoT. This compute advantage is not discussed."
    327       },
    328       "benchmark_construct_validity": {
    329         "applies": true,
    330         "answer": false,
    331         "justification": "No discussion of whether the MODIT dataset's exact-match repair metric adequately measures program repair capability. The paper does not consider whether B2Fs/B2Fm are representative of real-world bugs."
    332       },
    333       "scaffold_confound_addressed": {
    334         "applies": true,
    335         "answer": true,
    336         "justification": "The scaffolding difference IS the independent variable being studied. The paper compares different reasoning frameworks (CoT, ToT, Plan-and-Solve, T3) applied to the same models, with the scaffold/prompting strategy as the controlled treatment."
    337       }
    338     },
    339     "data_leakage": {
    340       "temporal_leakage_addressed": {
    341         "applies": true,
    342         "answer": false,
    343         "justification": "No discussion of temporal leakage. The MODIT dataset contains GitHub commits that predate the evaluated GPT models' training data, so the models may have seen both the buggy and fixed versions during training."
    344       },
    345       "feature_leakage_addressed": {
    346         "applies": true,
    347         "answer": false,
    348         "justification": "No discussion of whether the BM25-retrieved similar code examples or the error location information provided to the model constitutes feature leakage beyond what would be available in practice."
    349       },
    350       "non_independence_addressed": {
    351         "applies": true,
    352         "answer": false,
    353         "justification": "No discussion of whether the MODIT dataset examples are independent (e.g., multiple commits from the same repository, similar bug patterns that could inflate results)."
    354       },
    355       "leakage_detection_method": {
    356         "applies": true,
    357         "answer": false,
    358         "justification": "No concrete leakage detection or prevention method is applied. No canary strings, membership inference, n-gram overlap analysis, or temporal splits are used."
    359       }
    360     }
    361   },
    362   "claims": [
    363     {
    364       "claim": "T3 improves repair rate by 11.20% on B2Fs and 9.90% on B2Fm compared to CoT baseline using GPT-3.5-turbo.",
    365       "evidence": "Table I: T3 achieves 46.70% (B2Fs) and 28.20% (B2Fm) vs CoT at 35.50% and 18.30% respectively.",
    366       "supported": "moderate"
    367     },
    368     {
    369       "claim": "T3 maintains robust performance across different model architectures, achieving 48.20% on B2Fs and 32.10% on B2Fm with GPT-4o-mini.",
    370       "evidence": "Table I shows T3 results on both GPT-3.5-turbo and GPT-4o-mini, with consistent top performance on both.",
    371       "supported": "moderate"
    372     },
    373     {
    374       "claim": "Both cause analysis and repair plan generation components contribute to T3's performance.",
    375       "evidence": "Table III ablation: w/o plan = 37.60%/19.90%, w/o cause = 37.30%/21.00%, full T3 = 46.70%/28.20%. Removing either component degrades performance substantially.",
    376       "supported": "moderate"
    377     },
    378     {
    379       "claim": "T3 employs diverse reasoning pathways for complex tasks, trading consistency for repair efficacy.",
    380       "evidence": "Table II shows T3 SC accuracy of 44.68% on B2Fm (lowest among methods) while Table I shows highest repair rate (28.20%). Authors interpret this as evidence of path diversity.",
    381       "supported": "weak"
    382     },
    383     {
    384       "claim": "T3 achieves stable performance with as few as 3 example shots, outperforming other methods in low-sample conditions.",
    385       "evidence": "Described in Section V.D and Fig. 3, but the figure's underlying data is not fully reported in the text. The paper claims T3 stabilizes at 3 shots while Tree-of-Thought needs 5.",
    386       "supported": "weak"
    387     }
    388   ],
    389   "red_flags": [
    390     {
    391       "flag": "No error bars or statistical tests",
    392       "detail": "All claims of improvement (e.g., 11.20%, 9.90%) are based on comparing raw percentages without any significance testing, confidence intervals, or variance reporting, despite generating 30 stochastic samples per program."
    393     },
    394     {
    395       "flag": "Severe contamination risk unaddressed",
    396       "detail": "The MODIT dataset (GitHub commits from 2019 or earlier) is almost certainly in the training data of GPT-3.5-turbo and GPT-4o-mini. The models may have memorized the correct fixes, making the benchmark results unreliable. This is not discussed at all."
    397     },
    398     {
    399       "flag": "No code released",
    400       "detail": "Neither source code nor prompts are provided, making independent replication impossible. The exact prompts used for each method are critical to the results but are not shared."
    401     },
    402     {
    403       "flag": "Unfair compute comparison",
    404       "detail": "T3 uses multiple reasoning trees (M trees × N expansions each) for both cause analysis and repair planning, consuming far more API calls than single-chain baselines. No cost normalization or compute-matched comparison is provided."
    405     },
    406     {
    407       "flag": "No limitations section",
    408       "detail": "The paper contains no limitations, threats to validity, or scope boundaries discussion. This is a significant omission for a methods paper making comparative claims."
    409     },
    410     {
    411       "flag": "Key parameters unreported",
    412       "detail": "The number of reasoning trees M and top-n selection threshold — core design parameters of T3 — are never explicitly stated as concrete values used in experiments, only described abstractly."
    413     },
    414     {
    415       "flag": "Single case study as qualitative evidence",
    416       "detail": "Table IV shows one cherry-picked example where T3 succeeds and all competitors fail or partially fail. No systematic qualitative analysis or failure cases of T3 are shown."
    417     }
    418   ],
    419   "cited_papers": [
    420     {
    421       "title": "RAP-Gen: Retrieval-augmented Patch Generation with CodeT5 for Automatic Program Repair",
    422       "authors": ["W. Wang", "Y. Wang", "S. Joty", "S. C. Hoi"],
    423       "year": 2023,
    424       "relevance": "Retrieval-augmented APR using CodeT5, directly relevant as a fine-tuning-based LLM repair method."
    425     },
    426     {
    427       "title": "Less Training, More Repairing Please: Revisiting Automated Program Repair via Zero-shot Learning",
    428       "authors": ["C. S. Xia", "L. Zhang"],
    429       "year": 2022,
    430       "relevance": "AlphaRepair — zero-shot cloze-style APR using LLMs, a key baseline approach for LLM-based program repair."
    431     },
    432     {
    433       "title": "GAMMA: Revisiting Template-based Automated Program Repair via Mask Prediction",
    434       "authors": ["Q. Zhang", "C. Fang", "T. Zhang", "B. Yu", "W. Sun", "Z. Chen"],
    435       "year": 2023,
    436       "relevance": "Template-based APR using mask prediction with LLMs, representing the cloze-style repair paradigm."
    437     },
    438     {
    439       "title": "Automated Program Repair via Conversation: Fixing 162 out of 337 Bugs for $0.42 Each Using ChatGPT",
    440       "authors": ["C. S. Xia", "L. Zhang"],
    441       "year": 2024,
    442       "relevance": "ChatRepair — conversation-based APR using ChatGPT with iterative feedback, a major LLM-based repair approach."
    443     },
    444     {
    445       "title": "CodeT5: Identifier-aware Unified Pre-trained Encoder-Decoder Models for Code Understanding and Generation",
    446       "authors": ["Y. Wang", "W. Wang", "S. Joty", "S. C. Hoi"],
    447       "year": 2021,
    448       "relevance": "Foundation code LLM used in multiple APR systems, relevant to understanding LLM capabilities for code tasks."
    449     },
    450     {
    451       "title": "CodeBERT: A Pre-trained Model for Programming and Natural Languages",
    452       "authors": ["Z. Feng", "D. Guo", "D. Tang", "N. Duan"],
    453       "year": 2020,
    454       "relevance": "Pre-trained code representation model used as foundation for multiple code understanding and repair approaches."
    455     },
    456     {
    457       "title": "Chain-of-thought Prompting Elicits Reasoning in Large Language Models",
    458       "authors": ["J. Wei", "X. Wang", "D. Schuurmans"],
    459       "year": 2022,
    460       "relevance": "Foundational CoT prompting technique that the paper builds upon and compares against for APR tasks."
    461     },
    462     {
    463       "title": "Self-consistency Improves Chain of Thought Reasoning in Language Models",
    464       "authors": ["X. Wang", "J. Wei", "D. Schuurmans", "Q. V. Le"],
    465       "year": 2023,
    466       "relevance": "Self-consistency mechanism used in T3's voting/selection strategy for error causes and repair plans."
    467     },
    468     {
    469       "title": "Better Patching Using LLM Prompting, via Self-Consistency",
    470       "authors": ["T. Ahmed", "P. Devanbu"],
    471       "year": 2023,
    472       "relevance": "Directly relevant prior work applying self-consistency to LLM-based patching, source of the MODIT dataset and S-C baselines used in this paper."
    473     },
    474     {
    475       "title": "Evaluating Large Language Models Trained on Code",
    476       "authors": ["M. Chen", "J. Tworek", "H. Jun"],
    477       "year": 2021,
    478       "arxiv_id": "2107.03374",
    479       "relevance": "Codex paper — foundational evaluation of LLMs for code generation, establishing key evaluation paradigms."
    480     },
    481     {
    482       "title": "A Systematic Literature Review on Large Language Models for Automated Program Repair",
    483       "authors": ["Q. Zhang", "C. Fang", "Y. Xie"],
    484       "year": 2024,
    485       "arxiv_id": "2405.01466",
    486       "relevance": "Comprehensive survey of LLM-based APR methods, directly relevant to the survey scope."
    487     },
    488     {
    489       "title": "DEAR: A Novel Deep Learning-based Approach for Automated Program Repair",
    490       "authors": ["Y. Li", "S. Wang", "T. N. Nguyen"],
    491       "year": 2022,
    492       "relevance": "Deep learning-based APR approach representing the learning-based repair paradigm."
    493     },
    494     {
    495       "title": "NatGen: Generative Pre-training by 'Naturalizing' Source Code",
    496       "authors": ["S. Chakraborty", "T. Ahmed", "Y. Ding", "P. T. Devanbu", "B. Ray"],
    497       "year": 2022,
    498       "relevance": "Generative pre-training approach for code used as a baseline in the paper's experiments."
    499     },
    500     {
    501       "title": "Plan-and-Solve Prompting: Improving Zero-shot Chain-of-Thought Reasoning by Large Language Models",
    502       "authors": ["L. Wang", "W. Xu", "Y. Lan"],
    503       "year": 2023,
    504       "relevance": "Advanced CoT variant used as both a baseline and component in T3's reasoning framework."
    505     }
    506   ],
    507   "engagement_factors": {
    508     "practical_relevance": {
    509       "score": 1,
    510       "justification": "The multi-tree reasoning approach is interesting but no code is released, making it not immediately usable by practitioners."
    511     },
    512     "surprise_contrarian": {
    513       "score": 0,
    514       "justification": "Confirms the expected finding that structured multi-path reasoning improves LLM performance on complex tasks."
    515     },
    516     "fear_safety": {
    517       "score": 0,
    518       "justification": "No safety, security, or AI risk concerns raised; the paper is about improving bug-fixing accuracy."
    519     },
    520     "drama_conflict": {
    521       "score": 0,
    522       "justification": "No controversy or provocative claims; straightforward methods paper."
    523     },
    524     "demo_ability": {
    525       "score": 0,
    526       "justification": "No code, demo, or tool released; results cannot be reproduced or tried."
    527     },
    528     "brand_recognition": {
    529       "score": 0,
    530       "justification": "Authors are from Shanxi University with no major brand recognition in the LLM/APR community."
    531     }
    532   }
    533 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs