scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (29699B)
      1 {
      2   "paper": {
      3     "title": "ThinkRepair: Self-Directed Automated Program Repair",
      4     "authors": [
      5       "Xin Yin",
      6       "Chao Ni",
      7       "Shaohua Wang",
      8       "Zhenhao Li",
      9       "Limin Zeng",
     10       "Xiaohu Yang"
     11     ],
     12     "year": 2024,
     13     "venue": "ISSTA 2024 (International Symposium on Software Testing and Analysis)",
     14     "arxiv_id": "2407.20898",
     15     "doi": "10.1145/3650212.3680359"
     16   },
     17   "checklist": {
     18     "artifacts": {
     19       "code_released": {
     20         "applies": true,
     21         "answer": true,
     22         "justification": "Section 9 states 'The replication of this paper is publicly available' with a GitHub URL (https://github.com/vinci-grape/ThinkRepair)."
     23       },
     24       "data_released": {
     25         "applies": true,
     26         "answer": true,
     27         "justification": "The evaluation uses public benchmarks Defects4J and QuixBugs. The RWB dataset collection procedure is described, and the replication package is released."
     28       },
     29       "environment_specified": {
     30         "applies": true,
     31         "answer": false,
     32         "justification": "Section 4.3 lists hardware (Intel Xeon Gold 6226R, 192GB RAM, NVIDIA RTX 3090, Ubuntu 20.04.1) and framework names (Python, PyTorch, Hugging Face), but no specific library version numbers, requirements.txt, or environment specification file is provided."
     33       },
     34       "reproduction_instructions": {
     35         "applies": true,
     36         "answer": false,
     37         "justification": "The paper provides a replication package URL but contains no step-by-step reproduction instructions, README description, or commands to run within the paper itself."
     38       }
     39     },
     40     "statistical_methodology": {
     41       "confidence_intervals_or_error_bars": {
     42         "applies": true,
     43         "answer": false,
     44         "justification": "All results are reported as point estimates (number of bugs fixed). No confidence intervals, error bars, or uncertainty measures are provided in any table or figure."
     45       },
     46       "significance_tests": {
     47         "applies": true,
     48         "answer": false,
     49         "justification": "The paper claims ThinkRepair outperforms baselines based solely on comparing raw bug counts (e.g., 98 vs 67 vs 76). No statistical significance tests (t-tests, Wilcoxon, etc.) are performed."
     50       },
     51       "effect_sizes_reported": {
     52         "applies": true,
     53         "answer": true,
     54         "justification": "Percentage improvements with baseline context are consistently reported, e.g., '27%~344.4% improvement' (abstract), '46.3% improvement over KNOD' (Section 5.2), '88.5% improvement over BaseChatGPT' (Section 5.1)."
     55       },
     56       "sample_size_justified": {
     57         "applies": true,
     58         "answer": false,
     59         "justification": "No justification is given for the sample sizes. The paper follows prior work in using Defects4J and QuixBugs without discussing whether the bug counts are sufficient for the claims made."
     60       },
     61       "variance_reported": {
     62         "applies": true,
     63         "answer": false,
     64         "justification": "No variance or standard deviation is reported across experimental runs. Despite using stochastic sampling (temperature=1) with 25 attempts, only single aggregate counts are reported with no spread measures."
     65       }
     66     },
     67     "evaluation_design": {
     68       "baselines_included": {
     69         "applies": true,
     70         "answer": true,
     71         "justification": "Extensive baseline comparison with 12 SOTA APR approaches: 8 NMT-based (KNOD, TENURE, SelfAPR, RewardRepair, CURE, DeepDebug, CoCoNuT, DLFix) and 4 LLM-based (ChatRepair, Codex, GPT-NeoX, AlphaRepair), as listed in Table 2."
     72       },
     73       "baselines_contemporary": {
     74         "applies": true,
     75         "answer": true,
     76         "justification": "Baselines include recent work: ChatRepair (2023), KNOD (ICSE 2023), TENURE (ICSE 2023), SelfAPR (ASE 2022), representing the state of the art at the time of submission."
     77       },
     78       "ablation_study": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "RQ3 (Section 5.3) provides a thorough ablation study: ThinkRepair-v1 (without interaction feedback), ThinkRepair-v2 (without CoT few-shot learning), varying interaction numbers (1-7), and four few-shot selection strategies."
     82       },
     83       "multiple_metrics": {
     84         "applies": true,
     85         "answer": true,
     86         "justification": "Section 4.2 states two evaluation metrics: 'number of correct patches' (semantically equivalent to actual fix) and 'number of plausible patches' (passes all tests but may not be semantically equivalent)."
     87       },
     88       "human_evaluation": {
     89         "applies": true,
     90         "answer": true,
     91         "justification": "Section 4.2 states 'we also manually check and identify the plausible patches that are semantically equivalent to the actual fixes,' following previous work. This is manual evaluation of the system's outputs."
     92       },
     93       "held_out_test_set": {
     94         "applies": true,
     95         "answer": true,
     96         "justification": "Defects4J V2.0 (228 bugs) is used for knowledge collection and V1.2 (255 bugs) for fixing, and vice versa. The paper explicitly states these are 'two completely independent versions, with no overlapping bugs between them.'"
     97       },
     98       "per_category_breakdown": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Tables 3, 5, 6, and 10 provide per-project breakdowns (Chart, Closure, Lang, Math, Mockito, Time). Table 4 breaks down by scenario (single function, single hunk, single line)."
    102       },
    103       "failure_cases_discussed": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "Section 5.1 discusses the Mockito-12 failure case where ThinkRepair 'incorrectly revises the non-buggy line' due to 'over-inferring.' The Venn diagram (Fig. 5) shows 4 bugs only BaseChatGPT can fix."
    107       },
    108       "negative_results_reported": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "The paper reports ThinkRepair fails on 4 bugs that BaseChatGPT can fix, discusses over-inferring as a limitation, and the ablation shows diminishing returns with more interactions (Fig. 9)."
    112       }
    113     },
    114     "claims_and_evidence": {
    115       "abstract_claims_supported": {
    116         "applies": true,
    117         "answer": true,
    118         "justification": "Abstract claims are supported: '98 bugs on Defects4J V1.2 with 27%~344.4% improvement' matches Tables 4 and 6. 'Fixes 12~65 more bugs on V2.0' matches Tables 4 and 7. QuixBugs claims match Table 4."
    119       },
    120       "causal_claims_justified": {
    121         "applies": true,
    122         "answer": true,
    123         "justification": "Causal claims (CoT few-shot learning and interaction feedback improve performance) are justified by controlled ablation studies in RQ3 (Table 8), where single components are removed to measure their contribution."
    124       },
    125       "generalization_bounded": {
    126         "applies": true,
    127         "answer": true,
    128         "justification": "The abstract specifies evaluations on 'Defects4J and QuixBugs' by name. Section 6.2 notes 'The effectiveness observed in ThinkRepair's performance may not be applicable across different repair datasets.' Claims are largely bounded to tested settings."
    129       },
    130       "alternative_explanations_discussed": {
    131         "applies": true,
    132         "answer": true,
    133         "justification": "Section 6.1 substantively discusses data leakage as an alternative explanation, showing only 24.5% of patches lexically match ground truth and evaluating on post-cutoff RWB bugs. Section 6.2 discusses internal and external validity threats."
    134       },
    135       "proxy_outcome_distinction": {
    136         "applies": true,
    137         "answer": true,
    138         "justification": "The paper measures 'number of correct patches' (verified against ground truth and manually checked) and claims bug-fixing ability. The measurement directly matches the claimed capability with no significant proxy gap."
    139       }
    140     },
    141     "setup_transparency": {
    142       "model_versions_specified": {
    143         "applies": true,
    144         "answer": false,
    145         "justification": "The paper specifies 'gpt-3.5-turbo' without a snapshot date or API version. CodeLlama 13B, DeepSeek-Coder 7B, and StarCoder 16B have sizes but no specific version identifiers. Model behavior changes across versions of gpt-3.5-turbo."
    146       },
    147       "prompts_provided": {
    148         "applies": true,
    149         "answer": true,
    150         "justification": "Full prompt structure is provided in Figures 3 and 4: role designation ('You are an Automated Program Repair tool'), task description, buggy function format, CoT indicator ('Let's think step by step'), and feedback template with exact text."
    151       },
    152       "hyperparameters_reported": {
    153         "applies": true,
    154         "answer": true,
    155         "justification": "Section 4.3 reports: temperature=1, maximum repair attempts=25, maximum interaction number=5, two few-shot examples, 5-hour timeout per bug."
    156       },
    157       "scaffolding_described": {
    158         "applies": true,
    159         "answer": true,
    160         "justification": "The two-phase framework is described in detail: Algorithm 1 gives pseudocode, Figures 2-4 illustrate the workflow including knowledge pool collection, few-shot selection with clustering, iterative interaction with test feedback, and verification."
    161       },
    162       "data_preprocessing_documented": {
    163         "applies": true,
    164         "answer": true,
    165         "justification": "Table 1 shows filtering from total bugs to single-function bugs with counts. Section 6.1.2 describes RWB collection steps with specific counts (113→44 for V1.0, 61→29 for V2.0). Dataset splitting between V1.2 and V2.0 is documented."
    166       }
    167     },
    168     "limitations_and_scope": {
    169       "limitations_section_present": {
    170         "applies": true,
    171         "answer": true,
    172         "justification": "Section 6.2 'Threats to Validity' provides a dedicated subsection discussing internal validity (manual validation, data leakage) and external validity (dataset generalizability)."
    173       },
    174       "threats_to_validity_specific": {
    175         "applies": true,
    176         "answer": true,
    177         "justification": "Section 6.2 identifies specific threats: manual patch validation inconsistency risk (mitigated by thorough examination), data leakage from pre-training data (addressed quantitatively in Section 6.1), and dataset-specific generalizability."
    178       },
    179       "scope_boundaries_stated": {
    180         "applies": true,
    181         "answer": false,
    182         "justification": "The paper notes results 'may not be applicable across different repair datasets' but does not enumerate specific untested settings (e.g., multi-function bugs, languages beyond Java/Python, real-world IDE integration, larger codebases)."
    183       }
    184     },
    185     "data_integrity": {
    186       "raw_data_available": {
    187         "applies": true,
    188         "answer": true,
    189         "justification": "Defects4J and QuixBugs are publicly available standard benchmarks. The replication package is released at https://github.com/vinci-grape/ThinkRepair, enabling independent verification."
    190       },
    191       "data_collection_described": {
    192         "applies": true,
    193         "answer": true,
    194         "justification": "Section 4.1 describes Defects4J (391 bugs in 6 projects for V1.2, 438 in 9 for V2.0) and QuixBugs (40 each for Java/Python). Section 6.1.2 describes RWB collection from commit histories of specific projects with date-based cutoffs."
    195       },
    196       "recruitment_methods_described": {
    197         "applies": false,
    198         "answer": false,
    199         "justification": "No human participants. Data sources are standard public benchmarks (Defects4J, QuixBugs) and a systematically collected dataset (RWB) from open-source projects."
    200       },
    201       "data_pipeline_documented": {
    202         "applies": true,
    203         "answer": true,
    204         "justification": "Table 1 documents filtering stages (total bugs → single function → single hunk → single line). Section 6.1.2 documents RWB pipeline with exact counts at each step (113→44 and 61→29 after two filtering steps)."
    205       }
    206     },
    207     "conflicts_of_interest": {
    208       "funding_disclosed": {
    209         "applies": true,
    210         "answer": true,
    211         "justification": "Acknowledgements section lists funding: NSFC (Grant No.62202419), Fundamental Research Funds for Central Universities, Zhejiang Provincial NSF, Ningbo NSF, and State Street Zhejiang University Technology Center."
    212       },
    213       "affiliations_disclosed": {
    214         "applies": true,
    215         "answer": true,
    216         "justification": "Author affiliations are clearly listed: Zhejiang University, Central University of Finance and Economics, and Concordia University. No conflicts with evaluated model providers (OpenAI, Meta, etc.)."
    217       },
    218       "funder_independent_of_outcome": {
    219         "applies": true,
    220         "answer": true,
    221         "justification": "Funders are Chinese government science foundations and a university technology center. None have a direct financial interest in ThinkRepair's performance or in the evaluated LLMs."
    222       },
    223       "financial_interests_declared": {
    224         "applies": true,
    225         "answer": false,
    226         "justification": "No competing interests or financial interests statement is included in the paper. Absence of disclosure is not the same as absence of conflict."
    227       }
    228     },
    229     "contamination": {
    230       "training_cutoff_stated": {
    231         "applies": true,
    232         "answer": true,
    233         "justification": "Section 6.1.2 states 'the pre-training data for ChatGPT was collected before September 2021' and 'the pre-training data for DeepSeek was collected from GitHub before February 2023.'"
    234       },
    235       "train_test_overlap_discussed": {
    236         "applies": true,
    237         "answer": true,
    238         "justification": "Section 6.1 extensively addresses train/test overlap: lexical matching analysis (24.5% of patches match ground truth), and temporal splits via the RWB dataset with post-cutoff bugs."
    239       },
    240       "benchmark_contamination_addressed": {
    241         "applies": true,
    242         "answer": true,
    243         "justification": "Defects4J (2014) and QuixBugs (2017) predate all model training cutoffs. Section 6.1 addresses this by creating RWB datasets with bugs after the models' training cutoffs (RWB V1.0 after Oct 2021, RWB V2.0 after Mar 2023)."
    244       }
    245     },
    246     "human_studies": {
    247       "pre_registered": {
    248         "applies": false,
    249         "answer": false,
    250         "justification": "No human participants in this study. The evaluation is entirely automated benchmark evaluation."
    251       },
    252       "irb_or_ethics_approval": {
    253         "applies": false,
    254         "answer": false,
    255         "justification": "No human participants. The study evaluates automated program repair on code benchmarks."
    256       },
    257       "demographics_reported": {
    258         "applies": false,
    259         "answer": false,
    260         "justification": "No human participants in this study."
    261       },
    262       "inclusion_exclusion_criteria": {
    263         "applies": false,
    264         "answer": false,
    265         "justification": "No human participants in this study."
    266       },
    267       "randomization_described": {
    268         "applies": false,
    269         "answer": false,
    270         "justification": "No human participants in this study."
    271       },
    272       "blinding_described": {
    273         "applies": false,
    274         "answer": false,
    275         "justification": "No human participants in this study."
    276       },
    277       "attrition_reported": {
    278         "applies": false,
    279         "answer": false,
    280         "justification": "No human participants in this study."
    281       }
    282     },
    283     "cost_and_practicality": {
    284       "inference_cost_reported": {
    285         "applies": true,
    286         "answer": true,
    287         "justification": "Section 4.3 reports 'the total time required is on average lower than 20 minutes' per bug with a 5-hour timeout, and Table 2 shows ThinkRepair generates ≤125 patches per bug. However, no monetary API costs or token counts are provided."
    288       },
    289       "compute_budget_stated": {
    290         "applies": true,
    291         "answer": false,
    292         "justification": "Hardware is described (Intel Xeon Gold 6226R, 192GB RAM, RTX 3090) but no total compute budget is quantified — no total GPU hours, total API spend, or aggregate wall-clock time for the full evaluation."
    293       }
    294     },
    295     "experimental_rigor": {
    296       "seed_sensitivity_reported": {
    297         "applies": true,
    298         "answer": false,
    299         "justification": "Despite using stochastic sampling (temperature=1), results are not reported across multiple random seeds. Only aggregate bug-fix counts are presented with no sensitivity analysis."
    300       },
    301       "number_of_runs_stated": {
    302         "applies": true,
    303         "answer": true,
    304         "justification": "Section 4.3 explicitly states 'The maximum number of repair attempts is set to 25' per bug with up to 5 interactions per session. These constitute the number of runs."
    305       },
    306       "hyperparameter_search_budget": {
    307         "applies": true,
    308         "answer": false,
    309         "justification": "RQ3 explores interaction numbers (1-7) and four selection strategies, but no systematic hyperparameter search budget is reported — no total configurations tried or compute spent on search."
    310       },
    311       "best_config_selection_justified": {
    312         "applies": true,
    313         "answer": true,
    314         "justification": "Section 5.3 justifies selecting 5 interactions as 'a better balance between effectiveness and cost' (Fig. 9 shows diminishing returns) and Contrastive-based Selection as the best performing strategy (Table 9)."
    315       },
    316       "multiple_comparison_correction": {
    317         "applies": true,
    318         "answer": false,
    319         "justification": "The paper makes numerous pairwise comparisons across 12+ baselines, 4 LLM backends, 4 selection strategies, and multiple datasets without any statistical tests, let alone multiple comparison corrections."
    320       },
    321       "self_comparison_bias_addressed": {
    322         "applies": true,
    323         "answer": false,
    324         "justification": "The authors implement their own BaseChatGPT, BaseCodeLlama, BaseDeepSeek, and BaseStarCoder baselines. While they use original paper results for external baselines, they do not acknowledge the bias of evaluating their own system against their own baseline implementations."
    325       },
    326       "compute_budget_vs_performance": {
    327         "applies": true,
    328         "answer": true,
    329         "justification": "Table 2 compares patch generation budgets: ThinkRepair ≤125 vs. CURE/CoCoNuT 10,000 and AlphaRepair 5,000. The paper explicitly notes 'ThinkRepair generates far fewer patches than the NMT-based approaches.'"
    330       },
    331       "benchmark_construct_validity": {
    332         "applies": true,
    333         "answer": false,
    334         "justification": "The paper does not discuss whether Defects4J and QuixBugs benchmarks actually measure real-world bug-fixing capability, or whether single-function single-hunk fixes are representative of real debugging scenarios."
    335       },
    336       "scaffold_confound_addressed": {
    337         "applies": true,
    338         "answer": false,
    339         "justification": "When comparing ThinkRepair against baselines (ChatRepair, AlphaRepair, etc.), each method uses different scaffolding (interaction strategies, prompting approaches, training paradigms). The scaffold confound is not discussed — improvements may stem from the scaffold rather than the CoT approach."
    340       }
    341     },
    342     "data_leakage": {
    343       "temporal_leakage_addressed": {
    344         "applies": true,
    345         "answer": true,
    346         "justification": "Section 6.1.2 creates RWB V1.0 (bugs after Oct 2021) and RWB V2.0 (bugs after Mar 2023) specifically to test on post-training-cutoff data, directly addressing temporal leakage."
    347       },
    348       "feature_leakage_addressed": {
    349         "applies": true,
    350         "answer": false,
    351         "justification": "The paper provides fault location information (perfect vs. method-level) but does not discuss whether this setup leaks information that would not be available in real usage scenarios. The 'perfect fault information' setting is standard in APR but not examined for leakage."
    352       },
    353       "non_independence_addressed": {
    354         "applies": true,
    355         "answer": true,
    356         "justification": "The paper explicitly states 'Defects4J V1.2 and Defects4J V2.0 are two completely independent versions, with no overlapping bugs between them.' Collection and fixing phases use separate dataset versions."
    357       },
    358       "leakage_detection_method": {
    359         "applies": true,
    360         "answer": true,
    361         "justification": "Section 6.1.1 uses lexical matching between generated patches and ground truth (24.5% match rate) as a detection method. Section 6.1.2 uses temporal splits via the RWB dataset as a concrete prevention method."
    362       }
    363     }
    364   },
    365   "scan_version": 3,
    366   "active_modules": ["experimental_rigor", "data_leakage"],
    367   "claims": [
    368     {
    369       "claim": "ThinkRepair fixes 98 bugs on Defects4J V1.2 with perfect fault information, improving baselines by 27%~344.4%.",
    370       "evidence": "Table 4 and Table 6 show 98 correct fixes. Comparisons: KNOD 67 (46.3% improvement), AlphaRepair 67 (46.3%), ChatRepair 76 (28.9%), GPT-NeoX 18 (444.4%). Section 5.1-5.2.",
    371       "supported": "moderate"
    372     },
    373     {
    374       "claim": "ThinkRepair fixes 107 bugs on Defects4J V2.0, 12~65 more bugs than SOTA APRs.",
    375       "evidence": "Table 4 and Table 7 show 107 fixes on V2.0, compared to KNOD 47, TENURE 43, RewardRepair 44. Section 5.2.",
    376       "supported": "moderate"
    377     },
    378     {
    379       "claim": "ThinkRepair exclusively fixes 32 bugs on Defects4J V1.2 that no other studied approach can fix.",
    380       "evidence": "Venn diagram in Figure 7 shows 32 unique bugs. Section 5.2 discusses the complementary nature of approaches.",
    381       "supported": "moderate"
    382     },
    383     {
    384       "claim": "Both CoT few-shot learning and interaction feedback components substantially contribute to ThinkRepair's performance.",
    385       "evidence": "Table 8 ablation: Zero-Shot=34, Few-Shot=36, ThinkRepair-v1 (CoT only)=57, ThinkRepair-v2 (feedback only)=62, combined=80. Section 5.3.",
    386       "supported": "strong"
    387     },
    388     {
    389       "claim": "Contrastive-based Selection is the best few-shot selection strategy for ThinkRepair.",
    390       "evidence": "Table 9: CSelect=249, SSelect=235, ISelect=225, RSelect=222 total bugs across all datasets. Section 5.3.",
    391       "supported": "moderate"
    392     },
    393     {
    394       "claim": "Data leakage does not significantly affect ThinkRepair's performance.",
    395       "evidence": "Section 6.1: Only 24.5% of correct patches lexically match ground truth. After excluding matching patches, ThinkRepair fixes 29 unique bugs no other method can. RWB evaluation: 19/44 (V1.0) and 10/29 (V2.0) bugs fixed.",
    396       "supported": "moderate"
    397     }
    398   ],
    399   "methodology_tags": ["benchmark-eval"],
    400   "key_findings": "ThinkRepair proposes a two-phase LLM-based APR approach using automated Chain-of-Thought collection and few-shot learning with interaction feedback. It fixes 98 bugs on Defects4J V1.2 (27-344% improvement over 12 baselines) and 107 bugs on Defects4J V2.0. Ablation studies show both CoT few-shot learning and interaction feedback independently contribute, with the combination achieving the best performance. The approach generalizes across multiple LLM backends (ChatGPT, CodeLlama, DeepSeek-Coder, StarCoder) and addresses data leakage concerns through evaluation on post-training-cutoff real-world bugs.",
    401   "red_flags": [
    402     {
    403       "flag": "No statistical significance testing",
    404       "detail": "All comparisons are based on raw bug counts without any statistical tests. Claims like '27%~344.4% improvement' and 'significantly better' are not backed by significance tests, despite stochastic sampling (temperature=1) meaning results could vary across runs."
    405     },
    406     {
    407       "flag": "No variance or error bars across runs",
    408       "detail": "Despite using stochastic LLM sampling with temperature=1 and 25 attempts per bug, no variance measures are reported. Different random runs could yield different bug-fix counts, but this variability is completely unreported."
    409     },
    410     {
    411       "flag": "Inconsistent experimental settings across baselines",
    412       "detail": "ThinkRepair uses only single-function experiments and derives single-hunk/single-line results from them, while baselines (ChatRepair, Codex, GPT-NeoX) ran separate experiments for each setting. The paper acknowledges this but still compares across all settings in Table 4."
    413     },
    414     {
    415       "flag": "Missing many baseline results",
    416       "detail": "Tables 4, 5, and 7 contain many '-' entries indicating missing baseline results. Per-project comparisons in Table 5 are incomplete for Codex, GPT-NeoX, and ChatRepair, making thorough comparison difficult."
    417     }
    418   ],
    419   "cited_papers": [
    420     {
    421       "title": "Less training, more repairing please: revisiting automated program repair via zero-shot learning",
    422       "authors": ["Chunqiu Steven Xia", "Lingming Zhang"],
    423       "year": 2022,
    424       "relevance": "AlphaRepair: first cloze-style LLM-based APR tool, key baseline demonstrating LLM superiority over NMT-based APR."
    425     },
    426     {
    427       "title": "Keep the Conversation Going: Fixing 162 out of 337 bugs for $0.42 each using ChatGPT",
    428       "authors": ["Chunqiu Steven Xia", "Lingming Zhang"],
    429       "year": 2023,
    430       "arxiv_id": "2304.00385",
    431       "relevance": "ChatRepair: uses ChatGPT conversational capability with iterative test feedback for APR, closest baseline to ThinkRepair."
    432     },
    433     {
    434       "title": "Automated program repair in the era of large pre-trained language models",
    435       "authors": ["Chunqiu Steven Xia", "Yuxiang Wei", "Lingming Zhang"],
    436       "year": 2023,
    437       "relevance": "Extensive study of LLM-based APR using various LLMs (Codex, GPT-NeoX, etc.) demonstrating superiority of LLM-based approaches."
    438     },
    439     {
    440       "title": "Chain of thought prompting elicits reasoning in large language models",
    441       "authors": ["Jason Wei", "Xuezhi Wang", "Dale Schuurmans"],
    442       "year": 2022,
    443       "arxiv_id": "2201.11903",
    444       "relevance": "Foundational CoT prompting technique that ThinkRepair builds upon for reasoning-based program repair."
    445     },
    446     {
    447       "title": "Knod: Domain knowledge distilled tree decoder for automated program repair",
    448       "authors": ["Nan Jiang", "Thibaud Lutellier", "Yiling Lou", "Lin Tan"],
    449       "year": 2023,
    450       "relevance": "Top NMT-based APR baseline using domain knowledge distillation, represents SOTA in neural program repair."
    451     },
    452     {
    453       "title": "An analysis of the automatic bug fixing performance of chatgpt",
    454       "authors": ["Dominik Sobania", "Martin Briesch", "Carol Hanna", "Justyna Petke"],
    455       "year": 2023,
    456       "arxiv_id": "2301.08653",
    457       "relevance": "Early analysis of ChatGPT's bug-fixing capabilities, relevant to understanding LLM-based APR."
    458     },
    459     {
    460       "title": "Selfapr: Self-supervised program repair with test execution diagnostics",
    461       "authors": ["He Ye", "Matias Martinez", "Xiapu Luo"],
    462       "year": 2022,
    463       "relevance": "NMT-based APR using self-supervised learning with test diagnostics, key baseline comparison."
    464     },
    465     {
    466       "title": "Cure: Code-aware neural machine translation for automatic program repair",
    467       "authors": ["Nan Jiang", "Thibaud Lutellier", "Lin Tan"],
    468       "year": 2021,
    469       "relevance": "Code-aware NMT-based APR approach, key baseline in the neural program repair space."
    470     },
    471     {
    472       "title": "Evaluating large language models trained on code",
    473       "authors": ["Mark Chen", "Jerry Tworek"],
    474       "year": 2021,
    475       "arxiv_id": "2107.03374",
    476       "relevance": "Codex evaluation paper, foundational work on LLM code generation capabilities."
    477     },
    478     {
    479       "title": "Code llama: Open foundation models for code",
    480       "authors": ["Baptiste Rozière", "Jonas Gehring"],
    481       "year": 2023,
    482       "arxiv_id": "2308.12950",
    483       "relevance": "CodeLlama model used as one of ThinkRepair's backend LLMs, open-source code generation model."
    484     },
    485     {
    486       "title": "Neural program repair with execution-based backpropagation",
    487       "authors": ["He Ye", "Matias Martinez", "Martin Monperrus"],
    488       "year": 2022,
    489       "relevance": "RewardRepair: NMT-based APR with execution feedback, key baseline using test execution signals."
    490     },
    491     {
    492       "title": "Patch generation with language models: Feasibility and scaling behavior",
    493       "authors": ["Sophia D Kolak", "Ruben Martins", "Claire Le Goues"],
    494       "year": 2022,
    495       "relevance": "Early work on using Codex for patch generation, studying feasibility and scaling of LLM-based APR."
    496     }
    497   ],
    498   "engagement_factors": {
    499     "practical_relevance": {
    500       "score": 2,
    501       "justification": "Replication package released; practitioners could adapt the CoT-based prompting approach for their own bug-fixing workflows with various LLMs."
    502     },
    503     "surprise_contrarian": {
    504       "score": 1,
    505       "justification": "Confirms expected findings that CoT prompting and few-shot learning help LLMs; the magnitude of improvement is noteworthy but not paradigm-shifting."
    506     },
    507     "fear_safety": {
    508       "score": 0,
    509       "justification": "No safety or security implications; this is an automated bug-fixing tool."
    510     },
    511     "drama_conflict": {
    512       "score": 0,
    513       "justification": "No controversy; straightforward empirical comparison against existing methods."
    514     },
    515     "demo_ability": {
    516       "score": 2,
    517       "justification": "GitHub replication package available; requires setting up Defects4J and LLM access but is reproducible."
    518     },
    519     "brand_recognition": {
    520       "score": 1,
    521       "justification": "Uses well-known ChatGPT/CodeLlama but the research group (Zhejiang University) is not a high-profile AI lab."
    522     }
    523   }
    524 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs