scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (28695B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "The Impact of Fine-Tuning Large Language Models on Automated Program Repair",
      6     "authors": [
      7       "Roman Macháček",
      8       "Anastasiia Grishina",
      9       "Max Hort",
     10       "Leon Moonen"
     11     ],
     12     "year": 2025,
     13     "venue": "IEEE International Conference on Software Maintenance and Evolution",
     14     "arxiv_id": "2507.19909",
     15     "doi": "10.1109/ICSME64153.2025.00042"
     16   },
     17   "checklist": {
     18     "claims_and_evidence": {
     19       "abstract_claims_supported": {
     20         "applies": true,
     21         "answer": true,
     22         "justification": "The abstract claims that full fine-tuning decreases benchmarking performance and PEFT achieves better results — both are directly supported by Tables III and V, which show degradation for DeepSeekCoder/StarCoder under full FT and improvements under LoRA.",
     23         "source": "haiku"
     24       },
     25       "causal_claims_justified": {
     26         "applies": true,
     27         "answer": true,
     28         "justification": "The paper makes causal claims about fine-tuning affecting APR performance using a controlled experimental design that compares the same models across three conditions (no FT, full FT, PEFT) on identical benchmarks, which is adequate for this type of causal inference.",
     29         "source": "haiku"
     30       },
     31       "generalization_bounded": {
     32         "applies": true,
     33         "answer": true,
     34         "justification": "Conclusions are bounded to the six selected Java-focused LLMs and three APR benchmarks; the threats-to-validity section explicitly notes that QuixBugs/HumanEval-Java contain simple bugs not representative of complex real-world bugs.",
     35         "source": "haiku"
     36       },
     37       "alternative_explanations_discussed": {
     38         "applies": true,
     39         "answer": true,
     40         "justification": "The paper discusses multiple alternative explanations for performance degradation under full fine-tuning: data distribution mismatch between CLM and benchmark datasets, overfitting, and model size constraints.",
     41         "source": "haiku"
     42       },
     43       "proxy_outcome_distinction": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "The paper explicitly distinguishes between plausibility (passes all tests) and correctness, stating 'Plausibility shows whether a patch passes all available tests but is not a guarantee of its correctness,' and uses this as motivation for also reporting CodeBLEU and exact match.",
     47         "source": "haiku"
     48       }
     49     },
     50     "limitations_and_scope": {
     51       "limitations_section_present": {
     52         "applies": true,
     53         "answer": true,
     54         "justification": "Section IV-E 'Threats to Validity' is a dedicated section covering internal and external validity concerns including benchmark representativeness, versioning issues, data distribution mismatch, data leakage, and plausibility vs. correctness.",
     55         "source": "haiku"
     56       },
     57       "threats_to_validity_specific": {
     58         "applies": true,
     59         "answer": true,
     60         "justification": "Threats are specific: e.g., 'benchmarks like HumanEval-Java and QuixBugs were created from simple projects and consist of bugs that are not representative of complex real-world bugs,' and specific mention of Java versioning causing result differences from Jiang et al.",
     61         "source": "haiku"
     62       },
     63       "scope_boundaries_stated": {
     64         "applies": true,
     65         "answer": true,
     66         "justification": "The paper is explicit that results apply to Java programs, the six selected LLMs, and three specific benchmarks; the threats section acknowledges that larger and more complex datasets would be needed to represent real-world bugs.",
     67         "source": "haiku"
     68       }
     69     },
     70     "conflicts_of_interest": {
     71       "funding_disclosed": {
     72         "applies": true,
     73         "answer": true,
     74         "justification": "Acknowledgments explicitly disclose funding from the Research Council of Norway (secureIT #288787) and European Union Horizon Europe Marie Skłodowska-Curie Actions (#101151798).",
     75         "source": "haiku"
     76       },
     77       "affiliations_disclosed": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "All four authors' affiliations are disclosed (University of Bern, Simula Research Laboratory) — academic institutions with no commercial interest in the evaluated models.",
     81         "source": "haiku"
     82       },
     83       "funder_independent_of_outcome": {
     84         "applies": true,
     85         "answer": true,
     86         "justification": "Funders (Research Council of Norway, EU) are public/governmental bodies independent of the commercial LLMs (CodeLlama, DeepSeekCoder, StarCoder) being evaluated.",
     87         "source": "haiku"
     88       },
     89       "financial_interests_declared": {
     90         "applies": true,
     91         "answer": false,
     92         "justification": "No competing interests statement or declaration of financial interests (patents, equity, consulting) is present in the paper; only funding acknowledgments are provided.",
     93         "source": "haiku"
     94       }
     95     },
     96     "scope_and_framing": {
     97       "key_terms_defined": {
     98         "applies": true,
     99         "answer": true,
    100         "justification": "APR, LLMs, PEFT, LoRA, and IA3 are all defined and explained in Section II with mathematical formulations; the paper also defines evaluation outcomes (plausible, timeout, uncompilable, wrong, unknown).",
    101         "source": "haiku"
    102       },
    103       "intended_contribution_clear": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "Section I lists five explicit contributions (baseline establishment, full FT assessment, PEFT effects, LoRA hyperparameter analysis, replication package) with bullet points.",
    107         "source": "haiku"
    108       },
    109       "engagement_with_prior_work": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "The paper explicitly builds on Jiang et al. [8], numerically compares results with Li et al. [58] where models overlap, and situates its novel contribution (PEFT for APR with different fine-tuning data) relative to concurrent work by Huang et al. [59].",
    113         "source": "haiku"
    114       }
    115     }
    116   },
    117   "type_checklist": {
    118     "empirical": {
    119       "artifacts": {
    120         "code_released": {
    121           "applies": true,
    122           "answer": true,
    123           "justification": "A replication package with code and results is publicly available at https://doi.org/10.5281/zenodo.16359186, explicitly cited twice in the paper.",
    124           "source": "haiku"
    125         },
    126         "data_released": {
    127           "applies": true,
    128           "answer": true,
    129           "justification": "All three benchmarks (Defects4J, QuixBugs, HumanEval-Java) are publicly available; the CLM fine-tuning dataset is from a public GitHub repo (lin-tan/clm).",
    130           "source": "haiku"
    131         },
    132         "environment_specified": {
    133           "applies": true,
    134           "answer": false,
    135           "justification": "The paper mentions using Hugging Face and A100/V100 GPUs but provides no requirements.txt, Dockerfile, or specific library version pinning in the paper text; environment reproducibility relies on the replication package, which cannot be verified here.",
    136           "source": "haiku"
    137         },
    138         "reproduction_instructions": {
    139           "applies": true,
    140           "answer": true,
    141           "justification": "The Zenodo replication package explicitly contains 'code and results,' and the paper describes preprocessing formats in Listing 1 plus hyperparameter defaults; the replication package is sufficient to enable reproduction.",
    142           "source": "haiku"
    143         }
    144       },
    145       "statistical_methodology": {
    146         "confidence_intervals_or_error_bars": {
    147           "applies": true,
    148           "answer": false,
    149           "justification": "All results are presented as raw counts of plausible patches with no confidence intervals, error bars, or statistical uncertainty measures across the 10 generated patches or runs.",
    150           "source": "haiku"
    151         },
    152         "significance_tests": {
    153           "applies": true,
    154           "answer": false,
    155           "justification": "No statistical significance tests are applied to any of the comparative claims despite multiple model-by-benchmark comparisons; differences are described informally as 'improvements' or 'deterioration.'",
    156           "source": "haiku"
    157         },
    158         "effect_sizes_reported": {
    159           "applies": true,
    160           "answer": true,
    161           "justification": "Percentage improvements with baseline context are reported in RQ3 summary (e.g., 'performance gains of 172%, 225%, 153% on QuixBugs, HumanEval-Java and Defects4J benchmarks' for CodeGen-2B with LoRA).",
    162           "source": "haiku"
    163         },
    164         "sample_size_justified": {
    165           "applies": true,
    166           "answer": false,
    167           "justification": "The benchmark sizes (40, 163, and 219 programs) are inherited from prior work without any power analysis or justification for their adequacy to detect performance differences.",
    168           "source": "haiku"
    169         },
    170         "variance_reported": {
    171           "applies": true,
    172           "answer": false,
    173           "justification": "Results are counts of plausible patches with no variance, standard deviation, or spread across multiple experimental runs; stochasticity of LLM inference is acknowledged but not quantified.",
    174           "source": "haiku"
    175         }
    176       },
    177       "evaluation_design": {
    178         "baselines_included": {
    179           "applies": true,
    180           "answer": true,
    181           "justification": "All tables compare against base (no fine-tuning) models, and Table V directly compares base vs. FMFT vs. LoRA vs. IA3 for the same models.",
    182           "source": "haiku"
    183         },
    184         "baselines_contemporary": {
    185           "applies": true,
    186           "answer": true,
    187           "justification": "Baselines include contemporary models (DeepSeekCoder v1, StarCoder, CodeLlama-2, all from 2023-2024) selected specifically to improve on Jiang et al.'s prior work.",
    188           "source": "haiku"
    189         },
    190         "ablation_study": {
    191           "applies": true,
    192           "answer": true,
    193           "justification": "RQ4 systematically varies LoRA rank and scaling factor across 8 values each (1,2,4,8,16,32,64), functioning as a hyperparameter ablation; the four experimental conditions (no FT, full FT, LoRA, IA3) also constitute an ablation.",
    194           "source": "haiku"
    195         },
    196         "multiple_metrics": {
    197           "applies": true,
    198           "answer": true,
    199           "justification": "The paper uses plausible patch count (test-based), CodeBLEU, exact match, and training/validation loss as complementary metrics.",
    200           "source": "haiku"
    201         },
    202         "human_evaluation": {
    203           "applies": false,
    204           "answer": false,
    205           "justification": "No human evaluation of patch quality is conducted; the authors explicitly chose plausibility over manual correctness verification to avoid subjectivity issues.",
    206           "source": "haiku"
    207         },
    208         "held_out_test_set": {
    209           "applies": true,
    210           "answer": true,
    211           "justification": "The three APR benchmarks serve as held-out test sets, completely separate from the CLM fine-tuning dataset; Defects4J-related patches were explicitly removed from CLM to prevent leakage.",
    212           "source": "haiku"
    213         },
    214         "per_category_breakdown": {
    215           "applies": true,
    216           "answer": true,
    217           "justification": "All result tables provide per-benchmark (QuixBugs, HumanEval-Java, Defects4J) and per-model breakdowns, enabling granular comparison across conditions.",
    218           "source": "haiku"
    219         },
    220         "failure_cases_discussed": {
    221           "applies": true,
    222           "answer": true,
    223           "justification": "The paper discusses failure modes explicitly: full fine-tuning causes performance degradation for stronger models (DeepSeekCoder, StarCoder) due to data distribution mismatch, and CodeT5 models underperform despite PEFT.",
    224           "source": "haiku"
    225         },
    226         "negative_results_reported": {
    227           "applies": true,
    228           "answer": true,
    229           "justification": "Negative results are a central finding: full fine-tuning moves DeepSeekCoder-1.3b from 33/94/72 to 15/64/80 on QB/HE/D4J (a degradation on QuixBugs and HumanEval-Java, although the Defects4J count rises from 72 to 80), and IA3 underperforms LoRA in 21/24 cases.",
    230           "source": "haiku"
    231         }
    232       },
    233       "setup_transparency": {
    234         "model_versions_specified": {
    235           "applies": true,
    236           "answer": true,
    237           "justification": "Specific model versions are named (CodeGen-1, CodeT5, StarCoderBase, DeepSeekCoder-Base v1, Bloom, CodeLlama-2) with parameter sizes; the paper notes 'at the time of writing, we used the latest DeepSeekCoder model available, i.e., v1.'",
    238           "source": "haiku"
    239         },
    240         "prompts_provided": {
    241           "applies": true,
    242           "answer": true,
    243           "justification": "Listing 1 provides the full prompt format for each model (Bloom, CodeGEN, CodeLlama2, CodeT5, DeepSeekCoder, StarCoder) with actual code examples and fill-in-the-middle tokens.",
    244           "source": "haiku"
    245         },
    246         "hyperparameters_reported": {
    247           "applies": true,
    248           "answer": true,
    249           "justification": "LoRA defaults (r=8, α=16), number of epochs (3), and the 8 values tested for rank and scaling factor are explicitly reported; IA3 uses Hugging Face defaults.",
    250           "source": "haiku"
    251         },
    252         "scaffolding_described": {
    253           "applies": false,
    254           "answer": false,
    255           "justification": "This is a fine-tuning study with direct model inference, not an agentic scaffolding setup; no agentic scaffolding is used.",
    256           "source": "haiku"
    257         },
    258         "data_preprocessing_documented": {
    259           "applies": true,
    260           "answer": true,
    261           "justification": "Section III-D describes preprocessing for each model with Listing 1 showing exact input formats; CLM dataset filtering criteria (single-hunk patches, Defects4J deduplication) are also documented.",
    262           "source": "haiku"
    263         }
    264       },
    265       "data_integrity": {
    266         "raw_data_available": {
    267           "applies": true,
    268           "answer": true,
    269           "justification": "The Zenodo replication package explicitly contains 'code and results,' making raw experimental results available for independent verification.",
    270           "source": "haiku"
    271         },
    272         "data_collection_described": {
    273           "applies": true,
    274           "answer": true,
    275           "justification": "Section III-A describes all datasets: Defects4J 2.0.1 (835 active bugs), QuixBugs (40 Java programs), HumanEval-Java (163 bugs), and CLM (143,666 instances from 1,083,185 GitHub commits, March 2011-March 2018).",
    276           "source": "haiku"
    277         },
    278         "recruitment_methods_described": {
    279           "applies": false,
    280           "answer": false,
    281           "justification": "Standard benchmarks are used; no human participant recruitment is involved.",
    282           "source": "haiku"
    283         },
    284         "data_pipeline_documented": {
    285           "applies": true,
    286           "answer": true,
    287           "justification": "The preprocessing pipeline from raw datasets to model inputs is documented in Section III-D and Listing 1, including CLM filtering steps (single-hunk filtering, Defects4J deduplication via AST comparison).",
    288           "source": "haiku"
    289         }
    290       },
    291       "contamination": {
    292         "training_cutoff_stated": {
    293           "applies": true,
    294           "answer": false,
    295           "justification": "Training data cutoffs for the evaluated LLMs are not explicitly stated; the paper acknowledges contamination risk but does not report training cutoff dates for any of the six models.",
    296           "source": "haiku"
    297         },
    298         "train_test_overlap_discussed": {
    299           "applies": true,
    300           "answer": true,
    301           "justification": "Section IV-E explicitly discusses data leakage as 'the biggest reason for concern' and notes that models are pre-trained on GitHub which may include benchmark code; HumanEval-Java was specifically Python→Java converted to reduce overlap.",
    302           "source": "haiku"
    303         },
    304         "benchmark_contamination_addressed": {
    305           "applies": true,
    306           "answer": true,
    307           "justification": "HumanEval-Java was created by translating Python to Java specifically to eliminate pre-training contamination; Defects4J-related patches were removed from CLM using AST comparison; contamination is acknowledged as unresolvable for public GitHub-trained models.",
    308           "source": "haiku"
    309         }
    310       },
    311       "human_studies": {
    312         "pre_registered": {
    313           "applies": false,
    314           "answer": false,
    315           "justification": "No human participants.",
    316           "source": "haiku"
    317         },
    318         "irb_or_ethics_approval": {
    319           "applies": false,
    320           "answer": false,
    321           "justification": "No human participants.",
    322           "source": "haiku"
    323         },
    324         "demographics_reported": {
    325           "applies": false,
    326           "answer": false,
    327           "justification": "No human participants.",
    328           "source": "haiku"
    329         },
    330         "inclusion_exclusion_criteria": {
    331           "applies": false,
    332           "answer": false,
    333           "justification": "No human participants.",
    334           "source": "haiku"
    335         },
    336         "randomization_described": {
    337           "applies": false,
    338           "answer": false,
    339           "justification": "No human participants.",
    340           "source": "haiku"
    341         },
    342         "blinding_described": {
    343           "applies": false,
    344           "answer": false,
    345           "justification": "No human participants.",
    346           "source": "haiku"
    347         },
    348         "attrition_reported": {
    349           "applies": false,
    350           "answer": false,
    351           "justification": "No human participants.",
    352           "source": "haiku"
    353         }
    354       },
    355       "cost_and_practicality": {
    356         "inference_cost_reported": {
    357           "applies": true,
    358           "answer": false,
    359           "justification": "GPU hardware (A100, V100) is mentioned as a constraint for model selection, but no specific inference cost, latency, or wall-clock time for running experiments is reported.",
    360           "source": "haiku"
    361         },
    362         "compute_budget_stated": {
    363           "applies": true,
    364           "answer": false,
    365           "justification": "Only the hardware type ('1 node with A100, and V100 GPUs') is mentioned; no total GPU-hours, training time, or compute budget is quantified.",
    366           "source": "haiku"
    367         }
    368       }
    369     }
    370   },
    371   "claims": [
    372     {
    373       "claim": "Full fine-tuning decreases APR performance for stronger pre-trained models (DeepSeekCoder, StarCoder) due to data distribution mismatch and overfitting.",
    374       "evidence": "Table III shows DeepSeekCoder-1.3b going from 33/94/72 (base) to 15/64/80 (FMFT epoch 3) on QB/HE/D4J and StarCoder-1b going from 22/69/62 to 13/49/71; both drop on QB and HE, while their D4J counts rise (72 to 80 and 62 to 71).",
    375       "supported": "strong"
    376     },
    377     {
    378       "claim": "Parameter-efficient fine-tuning (LoRA) improves APR performance over both base models and full fine-tuning for most configurations.",
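    379       "evidence": "Table V shows CodeGen-2B with LoRA achieves 19/81/98 vs. base 13/44/20 and FMFT 11/36/64; described as '172%, 225%, 153% performance gains.' These percentages correspond to LoRA relative to FMFT (19/11, 81/36, 98/64), not relative to the base model.",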
    380       "supported": "strong"
    381     },
    382     {
    383       "claim": "LoRA outperforms IA3 in 21 out of 24 cases, contrary to Liu et al.'s claim that IA3 is generally superior.",
    384       "evidence": "Table V comparison across CodeGen, CodeT5, and DeepSeekCoder models shows LoRA higher in 21/24 benchmark-model pairs; discrepancy with Li et al. [58] acknowledged.",
    385       "supported": "strong"
    386     },
    387     {
    388       "claim": "LoRA hyperparameters (rank and scaling factor) have negligible impact on APR performance.",
    389       "evidence": "Figures 3-6 show CodeBLEU varies only 0.6-0.64 across all tested rank/scaling values (1,2,4,8,16,32,64); exact match shows slightly more variation but remains low throughout.",
    390       "supported": "moderate"
    391     },
    392     {
    393       "claim": "Larger models generally achieve more plausible patches without fine-tuning.",
    394       "evidence": "Table I shows CodeGen-2B > CodeGen-350M in 5/6 cases, StarCoder-3b ≥ StarCoder-1b in 5/6 cases; holds in 34/48 cases overall.",
    395       "supported": "moderate"
    396     },
    397     {
    398       "claim": "PEFT methods use less than 1% of original trainable parameters while achieving competitive or superior performance.",
    399       "evidence": "Table VI shows LoRA uses 0.06-0.49% and IA3 uses 0.02-0.07% of total parameters; Table V shows these achieve better results than full fine-tuning in many cases.",
    400       "supported": "strong"
    401     }
    402   ],
    403   "methodology_tags": [
    404     "benchmark-eval"
    405   ],
    406   "key_findings": "Full fine-tuning improves weaker baseline models (CodeT5, Bloom) but degrades stronger pre-trained models (DeepSeekCoder, StarCoder) due to data distribution mismatch between the CLM fine-tuning dataset and APR benchmarks. Parameter-efficient fine-tuning with LoRA consistently outperforms full fine-tuning and uses under 0.5% of trainable parameters, making it the recommended approach for APR. Contrary to prior work claiming IA3 superiority, LoRA outperforms IA3 in 21/24 experimental configurations. LoRA hyperparameters (rank, scaling factor) have negligible effect on final performance, supporting use of default values.",
    407   "red_flags": [
    408     {
    409       "flag": "No statistical significance testing",
    410       "detail": "All comparative claims (PEFT better than full FT, LoRA better than IA3) are supported only by raw counts with no significance tests despite multiple pairwise comparisons across 3 benchmarks × 6+ models."
    411     },
    412     {
    413       "flag": "No variance across runs",
    414       "detail": "Results report counts of plausible patches from a single run with 10 patches each; LLM inference stochasticity is acknowledged but not quantified through repeated experiments."
    415     },
    416     {
    417       "flag": "Small benchmark scale limits power",
    418       "detail": "QuixBugs has only 40 programs — small differences (e.g., 2-3 additional plausible patches) are treated as meaningful findings without any power analysis."
    419     },
    420     {
    421       "flag": "Training cutoffs not stated for evaluated models",
    422       "detail": "Contamination concern is acknowledged but training data cutoffs for DeepSeekCoder, StarCoder, CodeLlama-2, etc. are not retrieved or stated, leaving the contamination risk unquantified."
    423     },
    424     {
    425       "flag": "Plausibility ≠ correctness",
    426       "detail": "Plausible patches (passing all provided tests) are used as the primary success metric, but test suites in these benchmarks are incomplete; some plausible patches may be incorrect fixes that happen to satisfy tests."
    427     }
    428   ],
    429   "cited_papers": [
    430     {
    431       "title": "Impact of Code Language Models on Automated Program Repair",
    432       "relevance": "Direct predecessor work by Jiang et al. that this paper builds upon, sharing the framework, fine-tuning dataset (CLM), and two of the six LLMs (CodeGen, CodeT5)"
    433     },
    434     {
    435       "title": "Exploring Parameter-Efficient Fine-Tuning of Large Language Model on Automated Program Repair",
    436       "relevance": "Closest related work (Li et al.) that also investigates PEFT for APR but uses instruction-tuning datasets; numerical comparisons made where models overlap"
    437     },
    438     {
    439       "title": "Comprehensive Fine-Tuning Large Language Models of Code for Automated Program Repair",
    440       "relevance": "Concurrent study (Huang et al.) on fine-tuning LLMs for APR, situates this paper's contribution relative to parallel work"
    441     },
    442     {
    443       "title": "LoRA: Low-Rank Adaptation of Large Language Models",
    444       "relevance": "Foundational technique paper for LoRA PEFT method, one of the two PEFT approaches evaluated"
    445     },
    446     {
    447       "title": "Few-Shot Parameter-Efficient Fine-Tuning Is Better and Cheaper than In-Context Learning",
    448       "relevance": "Foundational paper for IA3 PEFT method, the second PEFT approach evaluated; claimed IA3 outperforms LoRA, a claim this paper partially refutes"
    449     },
    450     {
    451       "title": "Defects4J: A Database of Existing Faults to Enable Controlled Testing Studies for Java Programs",
    452       "relevance": "Primary complex benchmark used for APR evaluation throughout the study"
    453     },
    454     {
    455       "title": "RepairLLaMA: Efficient Representations and Fine-Tuned Adapters for Program Repair",
    456       "relevance": "Related PEFT-for-APR work that fine-tuned CodeLlama-7b with LoRA, cited for context on PEFT in APR"
    457     },
    458     {
    459       "title": "A Syntax-Guided Edit Decoder for Neural Program Repair",
    460       "relevance": "Source of the CLM fine-tuning dataset (143,666 bug-fix pairs from GitHub) used in all fine-tuning experiments"
    461     }
    462   ],
    463   "engagement_factors": {
    464     "practical_relevance": {
    465       "score": 2,
    466       "justification": "Provides directly actionable guidance: use LoRA over full fine-tuning and IA3 for APR, with specific default hyperparameters validated across 6 models and 3 benchmarks."
    467     },
    468     "surprise_contrarian": {
    469       "score": 2,
    470       "justification": "Counter-intuitive finding that full fine-tuning hurts better-performing models, and that LoRA beats IA3 in 21/24 cases contrary to IA3's claimed superiority in the original IA3 paper."
    471     },
    472     "fear_safety": {
    473       "score": 0,
    474       "justification": "No AI safety or risk concerns — this is a technical performance comparison in a software engineering research context."
    475     },
    476     "drama_conflict": {
    477       "score": 1,
    478       "justification": "Mild contradiction with Li et al.'s IA3 vs. LoRA findings creates some academic tension, but the discrepancy is discussed constructively rather than as a controversy."
    479     },
    480     "demo_ability": {
    481       "score": 1,
    482       "justification": "Replication package exists at Zenodo with code, but running experiments requires substantial GPU resources (A100/V100) making casual reproduction difficult."
    483     },
    484     "brand_recognition": {
    485       "score": 0,
    486       "justification": "Simula Research Laboratory and University of Bern are respected institutions but not high-profile AI labs; no famous products or models evaluated are from these institutions."
    487     }
    488   },
    489   "hn_data": {
    490     "threads": [
    491       {
    492         "hn_id": "41613513",
    493         "title": "AI Companions Reduce Loneliness",
    494         "points": 51,
    495         "comments": 81,
    496         "url": "https://news.ycombinator.com/item?id=41613513"
    497       },
    498       {
    499         "hn_id": "43246743",
    500         "title": "Order Doesn’t Matter, But Reasoning Does",
    501         "points": 14,
    502         "comments": 16,
    503         "url": "https://news.ycombinator.com/item?id=43246743"
    504       },
    505       {
    506         "hn_id": "41116325",
    507         "title": "Substantial Risk of Atlantic Circulation Tipping Under Moderate Climate Change",
    508         "points": 5,
    509         "comments": 0,
    510         "url": "https://news.ycombinator.com/item?id=41116325"
    511       },
    512       {
    513         "hn_id": "44542845",
    514         "title": "Simulated impact on LSST data of Starlink v1.5 and V2 satellites",
    515         "points": 2,
    516         "comments": 0,
    517         "url": "https://news.ycombinator.com/item?id=44542845"
    518       },
    519       {
    520         "hn_id": "44438536",
    521         "title": "CoVE: Compressed Vocabulary Expansion Makes Better LLM-Based Recommender Systems",
    522         "points": 2,
    523         "comments": 0,
    524         "url": "https://news.ycombinator.com/item?id=44438536"
    525       },
    526       {
    527         "hn_id": "44240945",
    528         "title": "Is (Selective) Round-to-Nearest Quantization All You Need?",
    529         "points": 2,
    530         "comments": 0,
    531         "url": "https://news.ycombinator.com/item?id=44240945"
    532       },
    533       {
    534         "hn_id": "43265110",
    535         "title": "Training LLMs with Order-Centric Augmentation",
    536         "points": 2,
    537         "comments": 0,
    538         "url": "https://news.ycombinator.com/item?id=43265110"
    539       },
    540       {
    541         "hn_id": "27997501",
    542         "title": "So you want to analyze Scheme programs with Datalog?",
    543         "points": 2,
    544         "comments": 0,
    545         "url": "https://news.ycombinator.com/item?id=27997501"
    546       },
    547       {
    548         "hn_id": "42116437",
    549         "title": "A Survey of Explainable AI in Financial Forecasting",
    550         "points": 1,
    551         "comments": 1,
    552         "url": "https://news.ycombinator.com/item?id=42116437"
    553       },
    554       {
    555         "hn_id": "44465492",
    556         "title": "Few-Shot Learning for Industrial Time Series: Screw-Fastening Process Monitoring",
    557         "points": 1,
    558         "comments": 0,
    559         "url": "https://news.ycombinator.com/item?id=44465492"
    560       }
    561     ],
    562     "top_points": 51,
    563     "total_points": 82,
    564     "total_comments": 98
    565   }
    566 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs