scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (28417B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Lost in Translation: A Study of Bugs Introduced by Large Language Models while Translating Code",
      6     "authors": [
      7       "Rangeet Pan",
      8       "Ali Reza Ibrahimzada",
      9       "Rahul Krishna",
     10       "Divya Sankar",
     11       "Lambert Pouguem Wassi",
     12       "Michele Merler",
     13       "Boris Sobolev",
     14       "Raju Pavuluri",
     15       "Saurabh Sinha",
     16       "Reyhaneh Jabbarvand"
     17     ],
     18     "year": 2024,
     19     "venue": "ICSE 2024",
     20     "arxiv_id": "2308.03109",
     21     "doi": "10.1145/3597503.3639226"
     22   },
     23   "checklist": {
     24     "claims_and_evidence": {
     25       "abstract_claims_supported": {
     26         "applies": true,
     27         "answer": true,
     28         "justification": "All major abstract claims (1,700 samples, 2.1%–47.3% success range, 15 bug categories, 5.5% avg improvement from prompt crafting) are supported by Tables 2 and 4 and Figure 6.",
     29         "source": "haiku"
     30       },
     31       "causal_claims_justified": {
     32         "applies": true,
     33         "answer": true,
     34         "justification": "The claim that iterative prompt crafting improves translation is tested empirically via controlled before/after comparison (vanilla vs. iter1/iter2) on 4 models, providing adequate support for a causal effect claim in this software engineering context.",
     35         "source": "haiku"
     36       },
     37       "generalization_bounded": {
     38         "applies": true,
     39         "answer": true,
     40         "justification": "Conclusions are generally scoped to the 5 studied PLs, 7 LLMs, and 3 benchmarks plus 2 real-world projects; the threats section explicitly limits external validity to this experimental setting.",
     41         "source": "haiku"
     42       },
     43       "alternative_explanations_discussed": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "The paper attributes failure patterns to LLM limitations without discussing alternative explanations such as prompt sensitivity, benchmark difficulty bias, or the possibility that different prompting strategies (not bug types) drive the observed failure distribution.",
     47         "source": "haiku"
     48       },
     49       "proxy_outcome_distinction": {
     50         "applies": true,
     51         "answer": true,
     52         "justification": "The paper explicitly justifies using test execution (compile + runtime + test pass) over static metrics like CodeBLEU, noting static metrics can be misleading and do not validate functional correctness.",
     53         "source": "haiku"
     54       }
     55     },
     56     "limitations_and_scope": {
     57       "limitations_section_present": {
     58         "applies": true,
     59         "answer": true,
     60         "justification": "Section 8 'Threats to Validity' covers external, internal, and construct validity threats in a dedicated section.",
     61         "source": "haiku"
     62       },
     63       "threats_to_validity_specific": {
     64         "applies": true,
     65         "answer": true,
     66         "justification": "Specific threats are named: single-run translation (non-determinism not controlled), absence of formal inter-rater reliability metric, and CodeNet's single test case per sample that may allow buggy translations to pass.",
     67         "source": "haiku"
     68       },
     69       "scope_boundaries_stated": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "The study is explicitly scoped to 5 PLs, 7 specific LLMs, and 3 benchmarks plus 2 real-world projects; the threats section acknowledges results may not generalize beyond these settings.",
     73         "source": "haiku"
     74       }
     75     },
     76     "conflicts_of_interest": {
     77       "funding_disclosed": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "Acknowledgments state: 'This work is supported by IBM-Illinois Discovery Accelerator Institute and NSF CCF 22-38045 CAR grants.'",
     81         "source": "haiku"
     82       },
     83       "affiliations_disclosed": {
     84         "applies": true,
     85         "answer": true,
     86         "justification": "Author affiliations (IBM Research, Yorktown Heights; University of Illinois Urbana-Champaign) are clearly stated on the paper.",
     87         "source": "haiku"
     88       },
     89       "funder_independent_of_outcome": {
     90         "applies": true,
     91         "answer": true,
     92         "justification": "IBM funds the work but does not produce any of the evaluated LLMs (GPT-4, StarCoder, CodeGeeX, etc.); NSF is independent; no funder has a direct stake in which model ranks best.",
     93         "source": "haiku"
     94       },
     95       "financial_interests_declared": {
     96         "applies": true,
     97         "answer": false,
     98         "justification": "There is no competing interests statement; only funding sources are disclosed, with no declaration of patents, equity, or consulting relationships.",
     99         "source": "haiku"
    100       }
    101     },
    102     "scope_and_framing": {
    103       "key_terms_defined": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "Code translation is defined in the introduction as 'transforming a piece of code from one programming language to another, while preserving the original functionality'; translation success is precisely defined (compiles + passes runtime + passes tests).",
    107         "source": "haiku"
    108       },
    109       "intended_contribution_clear": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "Four explicit contributions are listed at the end of Section 1: comprehensive evaluation, bug taxonomy, prompt crafting heuristics, and public artifacts.",
    113         "source": "haiku"
    114       },
    115       "engagement_with_prior_work": {
    116         "applies": true,
    117         "answer": true,
    118         "justification": "Section 9 situates the work relative to transpiler-based and learning-based translation approaches and bug/repair studies, explicitly noting this is the first study to catalog LLM code-translation bugs at scale.",
    119         "source": "haiku"
    120       }
    121     }
    122   },
    123   "type_checklist": {
    124     "empirical": {
    125       "artifacts": {
    126         "code_released": {
    127           "applies": true,
    128           "answer": true,
    129           "justification": "Reference [7] links to a public GitHub repo (https://github.com/Intelligent-CAT-Lab/PLTranslationEmpirical) containing manual labeling and automation scripts.",
    130           "source": "haiku"
    131         },
    132         "data_released": {
    133           "applies": true,
    134           "answer": true,
    135           "justification": "The artifact package includes 1,748 manually labeled bugs, translated code, and test cases; three of the five source datasets (CodeNet, Avatar, EvalPlus) are established public benchmarks.",
    136           "source": "haiku"
    137         },
    138         "environment_specified": {
    139           "applies": true,
    140           "answer": false,
    141           "justification": "Compiler and runtime versions are listed (Python 3.10, g++ 11, GCC Clang 14.0, Java 11, Go 1.20) but no requirements.txt, Dockerfile, or equivalent environment specification file is provided.",
    142           "source": "haiku"
    143         },
    144         "reproduction_instructions": {
    145           "applies": true,
    146           "answer": false,
    147           "justification": "The paper refers readers to the artifact website for replication materials but provides no step-by-step reproduction instructions in the paper itself.",
    148           "source": "haiku"
    149         }
    150       },
    151       "statistical_methodology": {
    152         "confidence_intervals_or_error_bars": {
    153           "applies": true,
    154           "answer": false,
    155           "justification": "No confidence intervals or error bars are reported for any of the main results in Tables 2–5 or Figures 2, 6, or 7.",
    156           "source": "haiku"
    157         },
    158         "significance_tests": {
    159           "applies": true,
    160           "answer": false,
    161           "justification": "Comparative claims (GPT-4 vs. StarCoder, LLM vs. non-LLM approaches, vanilla vs. iterative prompting) are made without any statistical significance tests.",
    162           "source": "haiku"
    163         },
    164         "effect_sizes_reported": {
    165           "applies": true,
    166           "answer": true,
    167           "justification": "Percentage improvements are reported with baseline context: iterative prompting improves GPT-4 by 12.33% and average by 5.5% over vanilla prompting baseline.",
    168           "source": "haiku"
    169         },
    170         "sample_size_justified": {
    171           "applies": true,
    172           "answer": false,
    173           "justification": "The 1,700 code samples are drawn from available benchmark datasets without any power analysis or formal sample size justification.",
    174           "source": "haiku"
    175         },
    176         "variance_reported": {
    177           "applies": true,
    178           "answer": false,
    179           "justification": "The paper explicitly notes that each translation task was performed only once; no variance, standard deviation, or confidence intervals are reported across runs.",
    180           "source": "haiku"
    181         }
    182       },
    183       "evaluation_design": {
    184         "baselines_included": {
    185           "applies": true,
    186           "answer": true,
    187           "justification": "Vanilla prompting serves as the baseline for prompt-crafting experiments; non-LLM transpilers (CxGo, C2Rust, JavaToCSharp) serve as baselines for the LLM comparison.",
    188           "source": "haiku"
    189         },
    190         "baselines_contemporary": {
    191           "applies": true,
    192           "answer": true,
    193           "justification": "All non-LLM baselines (CxGo, C2Rust, JavaToCSharp) are contemporary tools, and LLM baselines span the top open and closed-source models available at the time of the study (2023).",
    194           "source": "haiku"
    195         },
    196         "ablation_study": {
    197           "applies": true,
    198           "answer": false,
    199           "justification": "No formal ablation is performed; the prompt crafting components (stack trace, error log, test input/output) are added together without isolating the contribution of each element.",
    200           "source": "haiku"
    201         },
    202         "multiple_metrics": {
    203           "applies": true,
    204           "answer": true,
    205           "justification": "Evaluation uses success rate, error type distribution (compilation/runtime/functional/non-terminating), and bug category prevalence across translation pairs.",
    206           "source": "haiku"
    207         },
    208         "human_evaluation": {
    209           "applies": true,
    210           "answer": true,
    211           "justification": "Eight human labelers analyzed 1,748 unsuccessful GPT-4 translations over 630 person-hours to construct the bug taxonomy, with pairwise labeling and discrepancy resolution.",
    212           "source": "haiku"
    213         },
    214         "held_out_test_set": {
    215           "applies": true,
    216           "answer": true,
    217           "justification": "All benchmarks (CodeNet, Avatar, EvalPlus, Commons CLI, Click) come with pre-existing test suites that are used to evaluate translated code correctness.",
    218           "source": "haiku"
    219         },
    220         "per_category_breakdown": {
    221           "applies": true,
    222           "answer": true,
    223           "justification": "Tables 2, 3, and 4 provide detailed breakdowns by source/target language pair, LLM, and bug category; Figure 7 breaks down error evolution by type across iterations.",
    224           "source": "haiku"
    225         },
    226         "failure_cases_discussed": {
    227           "applies": true,
    228           "answer": true,
    229           "justification": "Section 4 is entirely devoted to failure analysis, with 15 bug categories illustrated by concrete code examples of incorrect translations.",
    230           "source": "haiku"
    231         },
    232         "negative_results_reported": {
    233           "applies": true,
    234           "answer": true,
    235           "justification": "The main finding is negative: average success rate is only 11.94%; all models achieve 0% on Click and most achieve 0% on Commons CLI; the iterative approach still leaves large room for improvement.",
    236           "source": "haiku"
    237         }
    238       },
    239       "setup_transparency": {
    240         "model_versions_specified": {
    241           "applies": true,
    242           "answer": false,
    243           "justification": "Open-source models are identified by name and release date, but GPT-4 is not specified by snapshot (e.g., gpt-4-0314); Table 1 gives release months but not API version identifiers.",
    244           "source": "haiku"
    245         },
    246         "prompts_provided": {
    247           "applies": true,
    248           "answer": true,
    249           "justification": "Figure 1 shows all three vanilla prompting templates used per model family, and Figure 5 shows the full iterative prompt-crafting template with all contextual components labeled.",
    250           "source": "haiku"
    251         },
    252         "hyperparameters_reported": {
    253           "applies": true,
    254           "answer": false,
    255           "justification": "No generation hyperparameters (temperature, top-p, max tokens, etc.) are reported for any of the seven LLMs.",
    256           "source": "haiku"
    257         },
    258         "scaffolding_described": {
    259           "applies": false,
    260           "answer": false,
    261           "justification": "No multi-step agentic scaffolding is used; the evaluation is direct prompting with one or two iterations.",
    262           "source": "haiku"
    263         },
    264         "data_preprocessing_documented": {
    265           "applies": true,
    266           "answer": true,
    267           "justification": "Section 2 documents preprocessing steps: real-world project files were split by class/file, comments were removed, EvalPlus Python tests were manually translated to JUnit, and CodeNet/Avatar test format differences are explained.",
    268           "source": "haiku"
    269         }
    270       },
    271       "data_integrity": {
    272         "raw_data_available": {
    273           "applies": true,
    274           "answer": true,
    275           "justification": "The artifact package at the GitHub link includes 43K+ translated code samples, 1,748 manually labeled bugs, and 1,365 bug-fix pairs for independent verification.",
    276           "source": "haiku"
    277         },
    278         "data_collection_described": {
    279           "applies": true,
    280           "answer": true,
    281           "justification": "Section 2 describes dataset selection criteria (PL popularity via TIOBE, programming paradigm diversity, dataset availability) and how each benchmark's test format differs.",
    282           "source": "haiku"
    283         },
    284         "recruitment_methods_described": {
    285           "applies": false,
    286           "answer": false,
    287           "justification": "No external participant recruitment; the eight human labelers are the paper's own authors and collaborators (researchers and software engineers within the team).",
    288           "source": "haiku"
    289         },
    290         "data_pipeline_documented": {
    291           "applies": true,
    292           "answer": true,
    293           "justification": "The pipeline from code sample selection → prompting → translation → test execution → bug labeling is documented across Sections 2–4, including the two-phase taxonomy construction methodology.",
    294           "source": "haiku"
    295         }
    296       },
    297       "contamination": {
    298         "training_cutoff_stated": {
    299           "applies": true,
    300           "answer": false,
    301           "justification": "Training data cutoffs are not stated for any of the seven LLMs; only release dates are given in Table 1.",
    302           "source": "haiku"
    303         },
    304         "train_test_overlap_discussed": {
    305           "applies": true,
    306           "answer": false,
    307           "justification": "The paper does not discuss whether benchmark code samples (from CodeNet, Avatar, EvalPlus) may have appeared in the LLMs' pretraining corpora.",
    308           "source": "haiku"
    309         },
    310         "benchmark_contamination_addressed": {
    311           "applies": true,
    312           "answer": false,
    313           "justification": "No discussion of whether CodeNet, Avatar, or EvalPlus samples were available before any model's training cutoff; contamination is not mentioned as a threat.",
    314           "source": "haiku"
    315         }
    316       },
    317       "human_studies": {
    318         "pre_registered": {
    319           "applies": false,
    320           "answer": false,
    321           "justification": "No human participants in the research-subject sense; the human labelers are the paper's own team members.",
    322           "source": "haiku"
    323         },
    324         "irb_or_ethics_approval": {
    325           "applies": false,
    326           "answer": false,
    327           "justification": "No human subjects research requiring IRB approval.",
    328           "source": "haiku"
    329         },
    330         "demographics_reported": {
    331           "applies": false,
    332           "answer": false,
    333           "justification": "NA — no external human participants.",
    334           "source": "haiku"
    335         },
    336         "inclusion_exclusion_criteria": {
    337           "applies": false,
    338           "answer": false,
    339           "justification": "NA — no external human participants.",
    340           "source": "haiku"
    341         },
    342         "randomization_described": {
    343           "applies": false,
    344           "answer": false,
    345           "justification": "NA — no human subjects experiment.",
    346           "source": "haiku"
    347         },
    348         "blinding_described": {
    349           "applies": false,
    350           "answer": false,
    351           "justification": "NA — no human subjects experiment.",
    352           "source": "haiku"
    353         },
    354         "attrition_reported": {
    355           "applies": false,
    356           "answer": false,
    357           "justification": "NA — no human subjects experiment.",
    358           "source": "haiku"
    359         }
    360       },
    361       "cost_and_practicality": {
    362         "inference_cost_reported": {
    363           "applies": true,
    364           "answer": false,
    365           "justification": "Only the GPU hardware (16 A100 80GB) is mentioned; no inference time, API cost, or dollar amounts are reported for the 43,379 translations.",
    366           "source": "haiku"
    367         },
    368         "compute_budget_stated": {
    369           "applies": true,
    370           "answer": false,
    371           "justification": "The GPU cluster type is mentioned but total GPU-hours or compute budget is not stated.",
    372           "source": "haiku"
    373         }
    374       }
    375     }
    376   },
    377   "claims": [
    378     {
    379       "claim": "State-of-the-art LLMs achieve only 2.1%–47.3% correct code translation across benchmarks, with an average of 11.94% (median 5.3%)",
    380       "evidence": "Table 2 reports per-model, per-dataset success rates across 43,379 translations; GPT-4 is best at 47.3%, TB-Vicuna worst at 2.1%",
    381       "supported": "strong"
    382     },
    383     {
    384       "claim": "LLMs almost completely fail on real-world projects: GPT-4 achieves 8.1% on Commons CLI and 0% on Click; all other models achieve 0% on both",
    385       "evidence": "Table 2 last two rows; Section 4.3 explains the causes including context window limitations and complex dependencies",
    386       "supported": "strong"
    387     },
    388     {
    389       "claim": "77.8% of unsuccessful translations result in compilation errors, indicating LLMs primarily struggle with code syntax rather than semantics",
    390       "evidence": "Table 3 and Figure 2 aggregate error types across all models and language pairs",
    391       "supported": "strong"
    392     },
    393     {
    394       "claim": "Data-related bugs (33.5%) are the most prevalent category, driven by incorrect input parsing (18.1%) and incorrect data types (11.5%)",
    395       "evidence": "Table 4 reports bug prevalence percentages for all 15 categories across GPT-4's 1,748 unsuccessful translations",
    396       "supported": "strong"
    397     },
    398     {
    399       "claim": "Iterative prompt crafting with error context improves translation success by 5.5% on average, with 12.33% improvement for GPT-4",
    400       "evidence": "Figure 6 shows before/after success rates; single-run without variance makes the magnitude estimates uncertain",
    401       "supported": "moderate"
    402     },
    403     {
    404       "claim": "C2Rust outperforms GPT-4 on C-to-Rust translation (95% vs 61%) but GPT-4 outperforms CxGo on C-to-Go (72.5% vs 62.3%)",
    405       "evidence": "Table 5 comparison; C2Rust generates unsafe code (Figure 4) while GPT-4 generates idiomatic Rust",
    406       "supported": "strong"
    407     },
    408     {
    409       "claim": "There is a strong correlation (r=0.64–0.85) between the number of tests per sample and unsuccessful translation rate",
    410       "evidence": "Reported in Section 3.1 bullet points; more rigorous test suites expose more functional failures",
    411       "supported": "moderate"
    412     }
    413   ],
    414   "methodology_tags": [
    415     "benchmark-eval",
    416     "observational",
    417     "qualitative"
    418   ],
    419   "key_findings": "LLMs are far from reliably automating code translation: the best model (GPT-4) achieves only 47.3% success on crafted benchmarks and near-zero on real-world projects, with compilation errors dominating (77.8% of failures). A systematic manual analysis of 1,748 GPT-4 failures by 8 labelers over 630 person-hours produced a 15-category bug taxonomy organized into 5 groups, with data-related bugs (33.5%) most prevalent. LLM and non-LLM transpiler approaches have complementary strengths: C2Rust achieves 95% on C-to-Rust while GPT-4 leads on C-to-Go and Java-to-C#. Iterative prompting with error feedback improves success rates by 5.5% on average but leaves substantial room for improvement.",
    420   "red_flags": [
    421     {
    422       "flag": "Single-run translations",
    423       "detail": "All 43,379 translations were performed exactly once; LLMs are non-deterministic, so results have unknown variance. Success rates could shift if the same translations were re-run."
    424     },
    425     {
    426       "flag": "No inter-rater reliability metric",
    427       "detail": "The bug taxonomy was built by 8 labelers with pairwise reconciliation, but no formal IRR metric (Cohen's kappa, Krippendorff's alpha) is reported, making reproducibility of the taxonomy uncertain."
    428     },
    429     {
    430       "flag": "GPT-4 version unspecified",
    431       "detail": "GPT-4 is identified only as 'GPT-4' with a 'Mar'23' release date; the specific API snapshot is not given, making the GPT-4 results non-reproducible as the API has been updated."
    432     },
    433     {
    434       "flag": "Benchmark contamination not addressed",
    435       "detail": "CodeNet, Avatar, and EvalPlus may have been in the pretraining corpora of the evaluated LLMs; the paper does not discuss this threat or check training cutoffs."
    436     },
    437     {
    438       "flag": "Weak test suites for CodeNet",
    439       "detail": "The authors note CodeNet has only one test case per sample, which is explicitly flagged as a construct validity threat allowing some buggy translations to be counted as successes."
    440     },
    441     {
    442       "flag": "No ablation on prompt components",
    443       "detail": "The iterative prompt crafting adds stack trace, error log, and test I/O together without isolating the contribution of each element."
    444     },
    445     {
    446       "flag": "No statistical significance tests",
    447       "detail": "All comparative claims (model rankings, LLM vs. non-LLM, before/after prompt crafting) are made without significance tests or confidence intervals despite the empirical nature."
    448     }
    449   ],
    450   "cited_papers": [
    451     {
    452       "title": "Unsupervised translation of programming languages",
    453       "relevance": "Seminal LLM-based code translation work (TransCoder) that this paper extends and evaluates against"
    454     },
    455     {
    456       "title": "Avatar: A parallel corpus for java-python program translation",
    457       "relevance": "One of the three benchmark datasets used for evaluation"
    458     },
    459     {
    460       "title": "Evaluating large language models trained on code (Codex)",
    461       "relevance": "Foundational code LLM evaluation paper; EvalPlus benchmark used in this study extends HumanEval from Codex"
    462     },
    463     {
    464       "title": "StarCoder: may the source be with you!",
    465       "relevance": "Second-best performing model in the study; code LLM baseline"
    466     },
    467     {
    468       "title": "CodeGeeX: A pre-trained model for code generation with multilingual evaluations on HumanEval-X",
    469       "relevance": "Code LLM specifically designed for code translation, included as baseline"
    470     },
    471     {
    472       "title": "Is your code generated by ChatGPT really correct? Rigorous evaluation of large language models for code generation (EvalPlus)",
    473       "relevance": "Provides the EvalPlus benchmark with rigorous test suites used in the evaluation"
    474     },
    475     {
    476       "title": "CodeNet: A Large-Scale AI for Code Dataset for Learning a Diversity of Coding Tasks",
    477       "relevance": "Primary dataset with 1,000 of the 1,700 code samples used in the study"
    478     },
    479     {
    480       "title": "Leveraging automated unit tests for unsupervised code translation",
    481       "relevance": "Prior LLM-based code translation approach that uses tests for feedback — directly related to this paper's evaluation methodology"
    482     }
    483   ],
    484   "engagement_factors": {
    485     "practical_relevance": {
    486       "score": 3,
    487       "justification": "Directly addresses a high-demand enterprise use case (code migration/modernization) with a taxonomy practitioners can use to anticipate and fix LLM translation bugs."
    488     },
    489     "surprise_contrarian": {
    490       "score": 2,
    491       "justification": "The 11.94% average success rate and near-zero performance on real-world projects challenges optimistic narratives about LLMs automating software engineering tasks."
    492     },
    493     "fear_safety": {
    494       "score": 1,
    495       "justification": "Notes that C2Rust generates universally unsafe Rust code with buffer overflow risks, but this is a secondary finding not the paper's focus."
    496     },
    497     "drama_conflict": {
    498       "score": 1,
    499       "justification": "LLM vs. non-LLM comparison creates mild tension but results are framed as complementary rather than adversarial."
    500     },
    501     "demo_ability": {
    502       "score": 2,
    503       "justification": "Artifacts are publicly available on GitHub; practitioners can run the evaluation scripts on their own code samples."
    504     },
    505     "brand_recognition": {
    506       "score": 2,
    507       "justification": "GPT-4 is the headline model; IBM Research affiliation adds credibility; ICSE is a top-tier venue."
    508     }
    509   },
    510   "hn_data": {
    511     "threads": [
    512       {
    513         "hn_id": "34337707",
    514         "title": "“A Handbook of Integer Sequences” Fifty Years Later",
    515         "points": 139,
    516         "comments": 45,
    517         "url": "https://news.ycombinator.com/item?id=34337707",
    518         "created_at": "2023-01-11T12:37:58Z"
    519       },
    520       {
    521         "hn_id": "38740280",
    522         "title": "Using sequences of life-events to predict human lives",
    523         "points": 98,
    524         "comments": 51,
    525         "url": "https://news.ycombinator.com/item?id=38740280",
    526         "created_at": "2023-12-23T00:08:58Z"
    527       },
    528       {
    529         "hn_id": "37434069",
    530         "title": "Large Language Models as Optimizers. +50% on Big Bench Hard",
    531         "points": 95,
    532         "comments": 33,
    533         "url": "https://news.ycombinator.com/item?id=37434069",
    534         "created_at": "2023-09-08T14:37:30Z"
    535       },
    536       {
    537         "hn_id": "28119944",
    538         "title": "The Challenge of Finding Security Advice for Smart Home Devices",
    539         "points": 6,
    540         "comments": 0,
    541         "url": "https://news.ycombinator.com/item?id=28119944",
    542         "created_at": "2021-08-09T17:50:13Z"
    543       },
    544       {
    545         "hn_id": "38696040",
    546         "title": "Using sequences of life-events to predict human lives",
    547         "points": 4,
    548         "comments": 1,
    549         "url": "https://news.ycombinator.com/item?id=38696040",
    550         "created_at": "2023-12-19T14:40:17Z"
    551       },
    552       {
    553         "hn_id": "35085380",
    554         "title": "Φ-So – Physical Symbolic Optimization in PyTorch",
    555         "points": 4,
    556         "comments": 0,
    557         "url": "https://news.ycombinator.com/item?id=35085380",
    558         "created_at": "2023-03-09T19:19:42Z"
    559       },
    560       {
    561         "hn_id": "34649117",
    562         "title": "Goniometers: A Powerful Acoustic Feature for Music Information Retrieval Tasks",
    563         "points": 3,
    564         "comments": 0,
    565         "url": "https://news.ycombinator.com/item?id=34649117",
    566         "created_at": "2023-02-03T23:07:39Z"
    567       },
    568       {
    569         "hn_id": "34839217",
    570         "title": "Cinematic Techniques in Narrative Visualization",
    571         "points": 1,
    572         "comments": 0,
    573         "url": "https://news.ycombinator.com/item?id=34839217",
    574         "created_at": "2023-02-17T19:31:48Z"
    575       },
    576       {
    577         "hn_id": "38600573",
    578         "title": "Can large language models democratize access to dual-use biotechnology? [pdf]",
    579         "points": 1,
    580         "comments": 0,
    581         "url": "https://news.ycombinator.com/item?id=38600573",
    582         "created_at": "2023-12-11T13:45:09Z"
    583       },
    584       {
    585         "hn_id": "37137695",
    586         "title": "Efficient Domain Adaptation of Sentence Embeddings Using Adapters",
    587         "points": 1,
    588         "comments": 0,
    589         "url": "https://news.ycombinator.com/item?id=37137695",
    590         "created_at": "2023-08-15T18:46:46Z"
    591       }
    592     ],
    593     "top_points": 139,
    594     "total_points": 352,
    595     "total_comments": 130
    596   }
    597 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs