scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (28640B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Towards Effectively Leveraging Execution Traces for Program Repair with Code LLMs",
      6     "authors": [
      7       "Mirazul Haque",
      8       "Petr Babkin",
      9       "Farima Farmahinifarahani",
     10       "Manuela Veloso"
     11     ],
     12     "year": 2025,
     13     "venue": "Proceedings of the 4th International Workshop on Knowledge-Augmented Methods for Natural Language Processing",
     14     "arxiv_id": "2505.04441",
     15     "doi": "10.18653/v1/2025.knowledgenlp-1.17"
     16   },
     17   "checklist": {
     18     "claims_and_evidence": {
     19       "abstract_claims_supported": {
     20         "applies": true,
     21         "answer": true,
     22         "justification": "All abstract claims are supported by paper content: limited improvements demonstrated in Table 1 (2/6 configs), complexity relationship shown in Fig 2, LLM-optimized traces in Table 2, superior to finetuning in Section 5.1.",
     23         "source": "haiku"
     24       },
     25       "causal_claims_justified": {
     26         "applies": true,
     27         "answer": true,
     28         "justification": "Section 4 provides ablation studies comparing trace formats (collated vs OPT vs routing), and RQ2 uses observational correlation analysis (Fig 2). Study design with multiple prompt conditions and controlled comparisons supports causal framing.",
     29         "source": "haiku"
     30       },
     31       "generalization_bounded": {
     32         "applies": true,
     33         "answer": true,
     34         "justification": "Results explicitly scoped to three datasets (Refactory, RunBugRun, HumanEval-Java) and two models (GPT-3.5/4). Paper acknowledges 'their effectiveness varies with the dataset and LLM used' and notes future work needed on other models.",
     35         "source": "haiku"
     36       },
     37       "alternative_explanations_discussed": {
     38         "applies": true,
     39         "answer": true,
     40         "justification": "Multiple alternative explanations offered: GPT-3.5 failure attributed to 'qualitative generational gap' in emergent abilities; collated trace failure to lack of training exposure, attention dilution in loops, and truncation issues; finetuning underperformance to limited training data.",
     41         "source": "haiku"
     42       },
     43       "proxy_outcome_distinction": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "Metrics precisely defined: CFA = percentage of fixes passing all tests, CPA = percentage of programs with at least one correct fix. Claims of 'effectiveness' map directly to these automated test-passing metrics without conflation.",
     47         "source": "haiku"
     48       }
     49     },
     50     "limitations_and_scope": {
     51       "limitations_section_present": {
     52         "applies": true,
     53         "answer": false,
     54         "justification": "No dedicated Limitations or Threats-to-Validity section exists. Some constraints scattered in text (e.g., 'limited training data' for finetuning, scope to two models), but not systematically presented.",
     55         "source": "haiku"
     56       },
     57       "threats_to_validity_specific": {
     58         "applies": true,
     59         "answer": true,
     60         "justification": "Specific threats discussed: dataset selection rationale ('realistic datasets...require significant manual effort'), I/O wrapper for RunBugRun, truncation rates (5-10% of prompts), confidence elicitation bias. However, contamination risk (datasets in GPT training) not addressed.",
     61         "source": "haiku"
     62       },
     63       "scope_boundaries_stated": {
     64         "applies": true,
     65         "answer": true,
     66         "justification": "Scope bounded to two commercial LLMs, three datasets, and APR task. Paper states 'scope for including more...open source models...we leave this to be explored in future work.' Boundaries are mostly implicit in the experimental setup rather than stated in an explicit summary.",
     67         "source": "haiku"
     68       }
     69     },
     70     "conflicts_of_interest": {
     71       "funding_disclosed": {
     72         "applies": true,
     73         "answer": true,
     74         "justification": "Disclaimer states 'This paper was prepared for informational purposes by the Artificial Intelligence Research group of JPMorgan Chase & Co.' JPMorgan affiliation and funding clearly disclosed.",
     75         "source": "haiku"
     76       },
     77       "affiliations_disclosed": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "All authors listed with J. P. Morgan AI Research affiliation and specific office locations (New York, Palo Alto).",
     81         "source": "haiku"
     82       },
     83       "funder_independent_of_outcome": {
     84         "applies": true,
     85         "answer": true,
     86         "justification": "JPMorgan (funder) is not the vendor of evaluated models (OpenAI). No conflict where funder profits from positive results. However, JPMorgan internally uses GPT models, so independence not absolute.",
     87         "source": "haiku"
     88       },
     89       "financial_interests_declared": {
     90         "applies": true,
     91         "answer": false,
     92         "justification": "No explicit 'Competing Interests' statement or financial interests declaration. Disclaimer disclaims liability but does not declare competing interests (patents, equity, consulting).",
     93         "source": "haiku"
     94       }
     95     },
     96     "scope_and_framing": {
     97       "key_terms_defined": {
     98         "applies": true,
     99         "answer": true,
    100         "justification": "APR defined in introduction. Execution traces informally described ('capturing every change to the function's variables'). CFA/CPA formally defined in metrics section. Some terms (static vs dynamic analysis) assumed familiar but contextually clear.",
    101         "source": "haiku"
    102       },
    103       "intended_contribution_clear": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "Three research questions explicitly framed: RQ1 (do traces help?), RQ2 (how does complexity affect effectiveness?), RQ3 (can format be optimized?). Contribution is empirical evaluation of execution traces in LLM-based APR prompting.",
    107         "source": "haiku"
    108       },
    109       "engagement_with_prior_work": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "Section 2 engages with SelfAPR, TRACED, TraceFixer, Self-Debug. Paper explicitly states prior work focuses on finetuning/pretraining, whereas this work evaluates prompting with traces. Clear positioning of novel contribution.",
    113         "source": "haiku"
    114       }
    115     }
    116   },
    117   "type_checklist": {
    118     "empirical": {
    119       "artifacts": {
    120         "code_released": {
    121           "applies": true,
    122           "answer": false,
    123           "justification": "No mention of source code release, GitHub repository, or supplementary code. Paper describes methodology but code unavailable for reproduction.",
    124           "source": "haiku"
    125         },
    126         "data_released": {
    127           "applies": true,
    128           "answer": true,
    129           "justification": "Uses three existing public datasets: Refactory, RunBugRun (from CodeNet), and HumanEval-Java. These are standard benchmarks used unmodified, meeting the criterion.",
    130           "source": "haiku"
    131         },
    132         "environment_specified": {
    133           "applies": true,
    134           "answer": false,
    135           "justification": "Mentions PySnooper library, OpenAI models, and deepseek-coder, but no requirements.txt, Dockerfile, or systematic environment specification. 'Training settings...suggested by deepseek-coder developers' references external docs, not included.",
    136           "source": "haiku"
    137         },
    138         "reproduction_instructions": {
    139           "applies": true,
    140           "answer": false,
    141           "justification": "Methodology described in prose but no step-by-step reproduction instructions. Someone attempting to replicate would need to infer many details and cannot access code.",
    142           "source": "haiku"
    143         }
    144       },
    145       "statistical_methodology": {
    146         "confidence_intervals_or_error_bars": {
    147           "applies": true,
    148           "answer": false,
    149           "justification": "Tables 1 and 2 report only point estimates (e.g., 0.421, 0.525) with no error bars, confidence intervals, or uncertainty measures across runs.",
    150           "source": "haiku"
    151         },
    152         "significance_tests": {
    153           "applies": true,
    154           "answer": false,
    155           "justification": "Comparative claims made (e.g., 'Error Prompt...outperform...trace-based prompts...by multiple percentage points') but no statistical significance tests (t-test, chi-square, etc.) reported to support comparisons.",
    156           "source": "haiku"
    157         },
    158         "effect_sizes_reported": {
    159           "applies": true,
    160           "answer": true,
    161           "justification": "Percentages and raw metrics (CFA, CPA) reported with baseline context for comparison (e.g., 0.525 vs 0.509). Though formal effect size statistics (Cohen's d) not provided, magnitude is quantified.",
    162           "source": "haiku"
    163         },
    164         "sample_size_justified": {
    165           "applies": true,
    166           "answer": false,
    167           "justification": "Sample sizes stated (Refactory ~2000, RunBugRun 1000, HumanEval-Java unspecified) but not justified. No power analysis or rationale for sufficiency. Finetuning acknowledges 'limited training data.'",
    168           "source": "haiku"
    169         },
    170         "variance_reported": {
    171           "applies": true,
    172           "answer": false,
    173           "justification": "Tables show single point estimates per configuration. No standard deviation, variance, or results across multiple runs/seeds reported.",
    174           "source": "haiku"
    175         }
    176       },
    177       "evaluation_design": {
    178         "baselines_included": {
    179           "applies": true,
    180           "answer": true,
    181           "justification": "Compares against Self-Debug baseline (Chen et al. 2023), Error Prompt baseline, and fine-tuned model baselines. Multiple baselines enable comparative claims.",
    182           "source": "haiku"
    183         },
    184         "baselines_contemporary": {
    185           "applies": true,
    186           "answer": true,
    187           "justification": "Self-Debug from 2023 is recent. GPT-3.5 Turbo and GPT-4 are state-of-the-art models at time of submission. Baselines are not outdated.",
    188           "source": "haiku"
    189         },
    190         "ablation_study": {
    191           "applies": true,
    192           "answer": true,
    193           "justification": "Section 4 systematically ablates trace representation: collated format, LLM-optimized (OPT), confidence-based routing, and trace-length routing. Results shown in Table 2.",
    194           "source": "haiku"
    195         },
    196         "multiple_metrics": {
    197           "applies": true,
    198           "answer": true,
    199           "justification": "Two metrics used: CFA (Correct Fix Accuracy) and CPA (Correct Program Accuracy). Both reported in all tables.",
    200           "source": "haiku"
    201         },
    202         "human_evaluation": {
    203           "applies": false,
    204           "answer": false,
    205           "justification": "No human evaluation of fix quality. Evaluation is purely automated (test passing). Probing studies (Section 5.2) include manual review of trace diffs but not system output evaluation.",
    206           "source": "haiku"
    207         },
    208         "held_out_test_set": {
    209           "applies": true,
    210           "answer": true,
    211           "justification": "Main experiments use established benchmarks with standard train/test splits. Finetuning uses 80/20 split: '80% of the problems are randomly selected for training, and the rest are reserved for testing.'",
    212           "source": "haiku"
    213         },
    214         "per_category_breakdown": {
    215           "applies": true,
    216           "answer": true,
    217           "justification": "Results broken down by dataset (Refactory, HumanEval-Java, RunBugRun) and model (GPT-3.5, GPT-4) in Tables 1-2. No breakdown by problem difficulty or algorithmic type.",
    218           "source": "haiku"
    219         },
    220         "failure_cases_discussed": {
    221           "applies": true,
    222           "answer": true,
    223           "justification": "RQ2 analysis (Figure 2) shows where traces fail—longer traces correlate with incorrect fixes. Section 5.2 probing studies manually review failures: 'within loops, the LLM tends to either miss or add extra variable modifications.'",
    224           "source": "haiku"
    225         },
    226         "negative_results_reported": {
    227           "applies": true,
    228           "answer": true,
    229           "justification": "Main finding is negative: 'Trace prompts do not consistently outperform Error Prompts.' Explicitly reports only 2 of 6 dataset/model configurations benefit. Includes null and negative results.",
    230           "source": "haiku"
    231         }
    232       },
    233       "setup_transparency": {
    234         "model_versions_specified": {
    235           "applies": true,
    236           "answer": true,
    237           "justification": "Models named: 'GPT-3.5 Turbo' and 'GPT-4.' For open-source, 'deepseek-coder-1.3b-instruct' cited with GitHub link. The paper gives no datestamped snapshot identifiers for the OpenAI models, but the model versions are identified.",
    238           "source": "haiku"
    239         },
    240         "prompts_provided": {
    241           "applies": true,
    242           "answer": true,
    243           "justification": "Figure 1 shows example prompt structure (buggy program, failing test, execution trace). Methodology described: 'We follow the instruction template for complete function generation used by Xia et al. (2023), expanding it.' Full templates not in appendix but structure clear.",
    244           "source": "haiku"
    245         },
    246         "hyperparameters_reported": {
    247           "applies": true,
    248           "answer": false,
    249           "justification": "No temperature, top-p, max_tokens, or sampling parameters reported for OpenAI models. Finetuning training parameters referenced as 'suggested by deepseek-coder developers' but not specified in paper.",
    250           "source": "haiku"
    251         },
    252         "scaffolding_described": {
    253           "applies": true,
    254           "answer": true,
    255           "justification": "Trace generation via PySnooper decorator, postprocessing steps (remove timestamps, strip formatting), truncation at 200 lines, and prompt format variations (error-only, trace, collated, OPT) all described in detail.",
    256           "source": "haiku"
    257         },
    258         "data_preprocessing_documented": {
    259           "applies": true,
    260           "answer": true,
    261           "justification": "Preprocessing documented: PySnooper setup, 'removal of timestamps and stripping of terminal formatting command sequences,' RunBugRun I/O wrapper handling, truncation logic. Pipeline scattered across sections but adequately covered.",
    262           "source": "haiku"
    263         }
    264       },
    265       "data_integrity": {
    266         "raw_data_available": {
    267           "applies": true,
    268           "answer": true,
    269           "justification": "Uses established public datasets (Refactory, RunBugRun, HumanEval-Java). Raw data available from original sources, enabling independent verification.",
    270           "source": "haiku"
    271         },
    272         "data_collection_described": {
    273           "applies": true,
    274           "answer": true,
    275           "justification": "For existing datasets, sources cited. Paper describes selection criteria: 'dataset size, program diversity, unit test availability, dataset origin.' Sampling for RunBugRun: '1000 Python bugs for evaluation.'",
    276           "source": "haiku"
    277         },
    278         "recruitment_methods_described": {
    279           "applies": false,
    280           "answer": false,
    281           "justification": "No human participants or recruitment. Standard benchmark datasets used.",
    282           "source": "haiku"
    283         },
    284         "data_pipeline_documented": {
    285           "applies": true,
    286           "answer": true,
    287           "justification": "Pipeline from dataset selection → trace generation (PySnooper) → postprocessing → prompt formatting described across Sections 3.1-3.2. Not consolidated in one place but adequately documented.",
    288           "source": "haiku"
    289         }
    290       },
    291       "contamination": {
    292         "training_cutoff_stated": {
    293           "applies": true,
    294           "answer": false,
    295           "justification": "No training data cutoff dates stated for GPT-3.5 or GPT-4. Critical omission when evaluating on benchmarks like CodeNet that may have been in pretraining.",
    296           "source": "haiku"
    297         },
    298         "train_test_overlap_discussed": {
    299           "applies": true,
    300           "answer": false,
    301           "justification": "No discussion of whether Refactory, RunBugRun, or HumanEval-Java datasets appeared in GPT-3.5/4 training. Risk of benchmark contamination not addressed.",
    302           "source": "haiku"
    303         },
    304         "benchmark_contamination_addressed": {
    305           "applies": true,
    306           "answer": false,
    307           "justification": "Standard benchmarks used but contamination risk not discussed. This is significant for proprietary models with undisclosed training data.",
    308           "source": "haiku"
    309         }
    310       },
    311       "human_studies": {
    312         "pre_registered": {
    313           "applies": false,
    314           "answer": false,
    315           "justification": "No human participants. Automatic benchmarking only.",
    316           "source": "haiku"
    317         },
    318         "irb_or_ethics_approval": {
    319           "applies": false,
    320           "answer": false,
    321           "justification": "No human subjects study.",
    322           "source": "haiku"
    323         },
    324         "demographics_reported": {
    325           "applies": false,
    326           "answer": false,
    327           "justification": "No human participants.",
    328           "source": "haiku"
    329         },
    330         "inclusion_exclusion_criteria": {
    331           "applies": false,
    332           "answer": false,
    333           "justification": "No human participants.",
    334           "source": "haiku"
    335         },
    336         "randomization_described": {
    337           "applies": false,
    338           "answer": false,
    339           "justification": "No human subjects experiment.",
    340           "source": "haiku"
    341         },
    342         "blinding_described": {
    343           "applies": false,
    344           "answer": false,
    345           "justification": "No human subjects.",
    346           "source": "haiku"
    347         },
    348         "attrition_reported": {
    349           "applies": false,
    350           "answer": false,
    351           "justification": "No human participants.",
    352           "source": "haiku"
    353         }
    354       },
    355       "cost_and_practicality": {
    356         "inference_cost_reported": {
    357           "applies": true,
    358           "answer": false,
    359           "justification": "No API costs, pricing, or latency reported. OPT approach requires two-stage inference (optimize trace then repair) but computational cost not analyzed.",
    360           "source": "haiku"
    361         },
    362         "compute_budget_stated": {
    363           "applies": true,
    364           "answer": false,
    365           "justification": "No total computational budget, number of API calls, token usage, or cost analysis provided. This is significant for work relying on commercial LLM APIs.",
    366           "source": "haiku"
    367         }
    368       }
    369     }
    370   },
    371   "claims": [
    372     {
    373       "claim": "Execution traces do not consistently improve LLM-based program repair performance",
    374       "evidence": "Table 1 shows trace-based prompts outperform error-only prompts in only 2 of 6 dataset/model configurations (HumanEval-Java and RunBugRun with GPT-4). Performance degrades on Refactory with both models.",
    375       "supported": "strong"
    376     },
    377     {
    378       "claim": "Longer execution traces reduce the effectiveness of trace-based APR prompts",
    379       "evidence": "Figure 2 distributions show median trace length and variable modifications significantly higher for failing fixes than correct ones in HumanEval-Java and RunBugRun. Refactory shows opposite pattern but with lower absolute trace complexity.",
    380       "supported": "strong"
    381     },
    382     {
    383       "claim": "LLM-optimized (condensed) execution traces provide the most consistent performance gains",
    384       "evidence": "Table 2 shows OPT traces are top-3 performing on all 6 configurations, with best CFA on 3/6 and consistently high CPA across datasets.",
    385       "supported": "strong"
    386     },
    387     {
    388       "claim": "Prompting-based approaches outperform fine-tuning on small datasets",
    389       "evidence": "Figure 4 shows all prompting techniques (Error, Trace, OPT) outperform fine-tuned deepseek-coder-1.3b across CFA and CPA metrics on RunBugRun and HumanEval-Java.",
    390       "supported": "moderate"
    391     },
    392     {
    393       "claim": "GPT-4 benefits from execution traces while GPT-3.5 does not",
    394       "evidence": "Table 1 shows GPT-4 improves on 2/3 datasets with traces; GPT-3.5 shows no consistent benefit and occasionally degrades. Paper attributes this to 'qualitative generational gap.'",
    395       "supported": "strong"
    396     },
    397     {
    398       "claim": "LLMs have limited but non-trivial ability to align programs with execution traces",
    399       "evidence": "Table 3 probing studies: trace collation accuracy 88% on Refactory reference programs but drops to 45% on diverse Geeks-for-geeks dataset. Trace prediction from scratch reaches max 50% on reference data, 15% on diverse data.",
    400       "supported": "strong"
    401     }
    402   ],
    403   "methodology_tags": [
    404     "benchmark-eval",
    405     "observational"
    406   ],
    407   "key_findings": "Execution traces do not reliably improve LLM-based program repair—they help in only 2 of 6 dataset/model configurations tested. Trace complexity is the critical factor: longer traces and more variable modifications correlate with worse repair performance. LLM-optimized (condensed) traces provide the most consistent improvements across configurations. GPT-4 benefits from execution traces while GPT-3.5 largely does not, suggesting model capacity matters. Prompting-based approaches outperform fine-tuning on small datasets, and probing studies reveal LLMs have limited but non-trivial ability to work with execution traces—collation reaches 45-88% accuracy depending on dataset diversity.",
    408   "red_flags": [
    409     {
    410       "flag": "No statistical significance testing",
    411       "detail": "Claims of 'outperformance' lack p-values, t-tests, or confidence intervals. Differences between 0.52 and 0.53 reported as meaningful but statistical significance unknown."
    412     },
    413     {
    414       "flag": "No variance or confidence bounds",
    415       "detail": "Single point estimates per configuration with no error bars, standard deviation, or results across multiple runs. Stability of findings unclear."
    416     },
    417     {
    418       "flag": "Contamination risk not addressed",
    419       "detail": "No discussion of whether Refactory, RunBugRun, or HumanEval-Java appear in GPT-3.5/4 training data. Dataset contamination would invalidate results."
    420     },
    421     {
    422       "flag": "Training data cutoff not stated",
    423       "detail": "No cutoff dates provided for GPT-3.5 or GPT-4 training. Cannot assess benchmark contamination without this information."
    424     },
    425     {
    426       "flag": "Inconsistent results across datasets unexplained",
    427       "detail": "Refactory shows no benefit from traces while other datasets do. Pattern not deeply investigated—suggests task-dependent effects not well understood."
    428     },
    429     {
    430       "flag": "Limited to two commercial LLMs",
    431       "detail": "Results may not generalize to open-source models (acknowledged) or future proprietary models. Scope limitation acknowledged but not addressed."
    432     },
    433     {
    434       "flag": "Weak fine-tuning baseline",
    435       "detail": "Only 1.3B parameter model with ~500 training samples. Paper acknowledges 'limited training data.' Fine-tuning comparison not a fair test of that approach."
    436     },
    437     {
    438       "flag": "Inference hyperparameters not reported",
    439       "detail": "Temperature, top-p, max_tokens not specified for LLM calls. Reproducibility compromised; results may be sensitive to these settings."
    440     },
    441     {
    442       "flag": "No code release",
    443       "detail": "Methodology described but source code unavailable. Reproduction requires reimplementation and API access."
    444     },
    445     {
    446       "flag": "Cost not analyzed",
    447       "detail": "OPT approach requires two-stage inference (optimize then repair) but computational cost and API expense not discussed. Practicality undermined."
    448     },
    449     {
    450       "flag": "Prompt truncation impact not analyzed",
    451       "detail": "~10% of RunBugRun prompts truncated at 200 lines. Effect on results and conclusions not quantified."
    452     }
    453   ],
    454   "cited_papers": [
    455     {
    456       "title": "SelfAPR: Self-supervised program repair with test execution diagnostics",
    457       "authors": "Ye et al.",
    458       "year": 2022,
    459       "relevance": "Prior work on using execution diagnostics for APR; baseline for comparison."
    460     },
    461     {
    462       "title": "Teaching large language models to self-debug",
    463       "authors": "Chen et al.",
    464       "year": 2023,
    465       "relevance": "Self-Debug baseline approach using chain-of-thought for code generation; traces generated by LLM rather than actual execution."
    466     },
    467     {
    468       "title": "TRACED: Execution-aware pre-training for source code",
    469       "authors": "Ding et al.",
    470       "year": 2023,
    471       "relevance": "Execution traces incorporated during pre-training rather than prompting; related approach to trace-augmented code understanding."
    472     },
    473     {
    474       "title": "TraceFixer: Execution trace-driven program repair",
    475       "authors": "Bouzenia et al.",
    476       "year": 2023,
    477       "relevance": "Fine-tuned CodeT5 with execution traces for APR; fine-tuning baseline comparison and inspiration."
    478     },
    479     {
    480       "title": "Automated program repair in the era of large pre-trained language models",
    481       "authors": "Xia et al.",
    482       "year": 2023,
    483       "relevance": "LLM-based APR foundational work; prompt template baseline used in this paper."
    484     },
    485     {
    486       "title": "Impact of code language models on automated program repair",
    487       "authors": "Jiang et al.",
    488       "year": 2023,
    489       "relevance": "Benchmarking code LLMs on APR; introduces HumanEval-Java dataset used in paper."
    490     },
    491     {
    492       "title": "CodeNet: A large-scale AI for code dataset",
    493       "authors": "Puri et al.",
    494       "year": 2021,
    495       "relevance": "Large code benchmark dataset; RunBugRun derived from CodeNet."
    496     }
    497   ],
    498   "engagement_factors": {
    499     "practical_relevance": {
    500       "score": 2,
    501       "justification": "OPT traces not consistently better than baselines in real-world configurations; requires additional optimization API calls, reducing practical value for practitioners."
    502     },
    503     "surprise_contrarian": {
    504       "score": 2,
    505       "justification": "Moderately contrarian to the intuition that execution traces obviously help debugging. The main finding contradicts that intuition, though the paper frames the result as unsurprising given the limitations of prior work."
    506     },
    507     "fear_safety": {
    508       "score": 0,
    509       "justification": "Standard software engineering work on program repair. No AI safety, security vulnerabilities, or risk concerns raised."
    510     },
    511     "drama_conflict": {
    512       "score": 1,
    513       "justification": "Straightforward empirical study. No controversy, disagreement with other work, or conflict angle. Results are mixed but not dramatic."
    514     },
    515     "demo_ability": {
    516       "score": 1,
    517       "justification": "Code not released. Requires OpenAI API access (GPT-3.5/4) which is paid and not freely available. Difficult for readers to immediately reproduce or try."
    518     },
    519     "brand_recognition": {
    520       "score": 2,
    521       "justification": "JPMorgan AI Research is reputable. OpenAI models (GPT-3.5/4) famous. But workshop venue (not main conference track) and niche APR task limit reach."
    522     }
    523   },
    524   "hn_data": {
    525     "threads": [
    526       {
    527         "hn_id": "43120088",
    528         "title": "Show HN: We have just released our first Debloating tool for Containers",
    529         "points": 5,
    530         "comments": 4,
    531         "url": "https://news.ycombinator.com/item?id=43120088"
    532       },
    533       {
    534         "hn_id": "42657501",
    535         "title": "The GAN is dead; long live the GAN - A Modern GAN Baseline",
    536         "points": 3,
    537         "comments": 1,
    538         "url": "https://news.ycombinator.com/item?id=42657501"
    539       },
    540       {
    541         "hn_id": "44439235",
    542         "title": "Wider or Deeper? Scaling LLM Inference-Time Compute with Adaptive Tree Search",
    543         "points": 3,
    544         "comments": 0,
    545         "url": "https://news.ycombinator.com/item?id=44439235"
    546       },
    547       {
    548         "hn_id": "44312317",
    549         "title": "Self-Supervised Contrastive Learning Approximates Supervised CL",
    550         "points": 3,
    551         "comments": 0,
    552         "url": "https://news.ycombinator.com/item?id=44312317"
    553       },
    554       {
    555         "hn_id": "44363141",
    556         "title": "Revisiting the Othello World Model Hypothesis",
    557         "points": 1,
    558         "comments": 0,
    559         "url": "https://news.ycombinator.com/item?id=44363141"
    560       },
    561       {
    562         "hn_id": "9586780",
    563         "title": "Untangling the roles of parasites in food webs with generative network models",
    564         "points": 1,
    565         "comments": 0,
    566         "url": "https://news.ycombinator.com/item?id=9586780"
    567       }
    568     ],
    569     "top_points": 5,
    570     "total_points": 16,
    571     "total_comments": 5
    572   }
    573 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs