scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (26036B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Granite Code Models: A Family of Open Foundation Models for Code Intelligence",
      6     "authors": [
      7       "Mayank Mishra",
      8       "Matt Stallone",
      9       "Gaoyuan Zhang",
     10       "Yikang Shen",
     11       "Aditya Prasad"
     12     ],
     13     "year": 2024,
     14     "venue": "arXiv.org",
     15     "arxiv_id": "2405.04324",
     16     "doi": "10.48550/arXiv.2405.04324"
     17   },
     18   "checklist": {
     19     "claims_and_evidence": {
     20       "abstract_claims_supported": {
     21         "applies": true,
     22         "answer": false,
     23         "justification": "The abstract claims models 'consistently reach state-of-the-art performance,' but tables show numerous counter-examples: CodeGemma-7B beats Granite-8B on MBPP by 10+ points (53.0% vs 42.2%) and on CRUXEval; StarCoder2-15B leads at mid-size on MultiPL-E.",
     24         "source": "haiku"
     25       },
     26       "causal_claims_justified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "The paper attributes superior explanation/fixing performance to 'data mixture and base model training decisions' but provides no ablation studies isolating the contribution of phase-2 training, FIM objective, or specific data sources.",
     30         "source": "haiku"
     31       },
     32       "generalization_bounded": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "Claims about suitability for 'enterprise software development workflows' are not bounded — all evidence comes from academic benchmarks with no real-world development task validation.",
     36         "source": "haiku"
     37       },
     38       "alternative_explanations_discussed": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "When Granite outperforms on explanation/fixing tasks, the paper attributes this to training data without considering alternatives such as evaluation format alignment with training distribution.",
     42         "source": "haiku"
     43       },
     44       "proxy_outcome_distinction": {
     45         "applies": true,
     46         "answer": false,
     47         "justification": "Pass@1 on benchmarks is equated with 'code intelligence' and enterprise productivity throughout without discussion of how academic benchmark performance relates to real developer outcomes.",
     48         "source": "haiku"
     49       }
     50     },
     51     "limitations_and_scope": {
     52       "limitations_section_present": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "No dedicated limitations or threats-to-validity section exists; the paper moves directly from evaluation results to conclusion.",
     56         "source": "haiku"
     57       },
     58       "threats_to_validity_specific": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "No threats to validity are discussed despite obvious concerns: IBM employees evaluating IBM models, and training data sourced from GitHub that likely overlaps with evaluation benchmarks.",
     62         "source": "haiku"
     63       },
     64       "scope_boundaries_stated": {
     65         "applies": true,
     66         "answer": false,
     67         "justification": "The conclusion only mentions future work directions; no explicit statements about what the benchmark results do not demonstrate (e.g., real-world productivity, safety, alignment beyond instruction following).",
     68         "source": "haiku"
     69       }
     70     },
     71     "conflicts_of_interest": {
     72       "funding_disclosed": {
     73         "applies": true,
     74         "answer": false,
     75         "justification": "No explicit funding disclosure section is present; IBM authorship is evident but institutional funding is not formally declared.",
     76         "source": "haiku"
     77       },
     78       "affiliations_disclosed": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "All authors are listed with 'IBM Research' affiliation clearly disclosed in the author block.",
     82         "source": "haiku"
     83       },
     84       "funder_independent_of_outcome": {
     85         "applies": true,
     86         "answer": false,
     87         "justification": "IBM employees evaluate IBM-developed models; the organization directly benefits commercially from positive evaluation results via watsonx Code Assistant.",
     88         "source": "haiku"
     89       },
     90       "financial_interests_declared": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "No competing interests, patent, equity, or financial interest declaration is present anywhere in the paper.",
     94         "source": "haiku"
     95       }
     96     },
     97     "scope_and_framing": {
     98       "key_terms_defined": {
     99         "applies": true,
    100         "answer": false,
    101         "justification": "Terms like 'enterprise-grade,' 'all-around code model,' and 'code intelligence' are used throughout without precise definitions; what distinguishes 'enterprise' use from general use is never clarified.",
    102         "source": "haiku"
    103       },
    104       "intended_contribution_clear": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "The contribution is explicitly stated: a family of open-source code LLMs (3B–34B params) for enterprise code tasks, released under Apache 2.0, trained on 116 programming languages.",
    108         "source": "haiku"
    109       },
    110       "engagement_with_prior_work": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "The paper actively compares against StarCoder, StarCoder2, CodeLlama, CodeGemma, and Llama-3, discussing gaps in prior work (task diversity beyond generation, enterprise license issues).",
    114         "source": "haiku"
    115       }
    116     }
    117   },
    118   "type_checklist": {
    119     "empirical": {
    120       "artifacts": {
    121         "code_released": {
    122           "applies": true,
    123           "answer": true,
    124           "justification": "Models released at https://github.com/ibm-granite/granite-code-models under Apache 2.0 license, explicitly stated in abstract and paper.",
    125           "source": "haiku"
    126         },
    127         "data_released": {
    128           "applies": true,
    129           "answer": true,
    130           "justification": "All evaluation benchmarks (HumanEvalPack, MBPP, RepoBench, CrossCodeEval, etc.) are standard publicly available datasets; the filtered training corpus is not released but evaluation data is accessible.",
    131           "source": "haiku"
    132         },
    133         "environment_specified": {
    134           "applies": true,
    135           "answer": false,
    136           "justification": "Training infrastructure (FlashAttention 2, NVIDIA Apex, Megatron-LM, BF16 precision) is mentioned but no requirements.txt, Dockerfile, or reproducible environment specification is provided.",
    137           "source": "haiku"
    138         },
    139         "reproduction_instructions": {
    140           "applies": true,
    141           "answer": false,
    142           "justification": "No step-by-step training or evaluation reproduction instructions are provided; evaluation scripts are mentioned ('same script and environment') but not shared in the paper.",
    143           "source": "haiku"
    144         }
    145       },
    146       "statistical_methodology": {
    147         "confidence_intervals_or_error_bars": {
    148           "applies": true,
    149           "answer": false,
    150           "justification": "No confidence intervals or error bars appear in any results table despite sampling multiple completions per problem (e.g., 50 for MultiPL-E).",
    151           "source": "haiku"
    152         },
    153         "significance_tests": {
    154           "applies": true,
    155           "answer": false,
    156           "justification": "No statistical significance tests are conducted; small differences like 0.1% (Granite-20B vs StarCoder2-15B on HumanEvalSynthesize) are presented as meaningful.",
    157           "source": "haiku"
    158         },
    159         "effect_sizes_reported": {
    160           "applies": true,
    161           "answer": true,
    162           "justification": "Absolute percentage improvements are consistently reported (e.g., '12 points improvement on HumanEvalPack,' '4% improvement on HumanEvalSynthesize'), providing effect size context.",
    163           "source": "haiku"
    164         },
    165         "sample_size_justified": {
    166           "applies": true,
    167           "answer": false,
    168           "justification": "Fixed benchmark sizes (e.g., HumanEval: 164 problems) are used without justification that they are sufficient for the numerous comparative claims made across 30+ model comparisons.",
    169           "source": "haiku"
    170         },
    171         "variance_reported": {
    172           "applies": true,
    173           "answer": false,
    174           "justification": "Despite sampling multiple completions per problem (40–50 samples for several benchmarks), no variance or standard deviation across sampling runs is reported.",
    175           "source": "haiku"
    176         }
    177       },
    178       "evaluation_design": {
    179         "baselines_included": {
    180           "applies": true,
    181           "answer": true,
    182           "justification": "Extensive baselines: StarCoder, StarCoder2, CodeLlama (7B/13B/34B/70B), CodeGemma, StableCode, Mistral, Llama-3, Gemma, Mixtral across all benchmarks.",
    183           "source": "haiku"
    184         },
    185         "baselines_contemporary": {
    186           "applies": true,
    187           "answer": true,
    188           "justification": "Baselines include recently released models contemporary to the May 2024 submission: StarCoder2, CodeGemma, and Llama-3 all from early 2024.",
    189           "source": "haiku"
    190         },
    191         "ablation_study": {
    192           "applies": true,
    193           "answer": false,
    194           "justification": "No ablation studies isolate the contribution of phase-2 training, FIM objective (α=0.5), depth upscaling, NeFTune noise, or specific data source contributions.",
    195           "source": "haiku"
    196         },
    197         "multiple_metrics": {
    198           "applies": true,
    199           "answer": true,
    200           "justification": "Multiple metrics used: Pass@1, exact match, edit similarity, ExcessCode, identifier F1, AST evaluation, executable evaluation, RP@1 (robustness), across 19 benchmarks.",
    201           "source": "haiku"
    202         },
    203         "human_evaluation": {
    204           "applies": false,
    205           "answer": false,
    206           "justification": "Human evaluation is not applicable; automated test execution for functional correctness is the standard and appropriate method for code generation evaluation.",
    207           "source": "haiku"
    208         },
    209         "held_out_test_set": {
    210           "applies": true,
    211           "answer": true,
    212           "justification": "Standard benchmarks (HumanEval, MBPP, RepoBench, etc.) provide held-out test sets not used during training.",
    213           "source": "haiku"
    214         },
    215         "per_category_breakdown": {
    216           "applies": true,
    217           "answer": true,
    218           "justification": "Per-language breakdowns provided for MultiPL-E (18 languages), DS-1000 (7 libraries), HumanEvalPack (6 languages × 3 tasks), and per-category for BFCL.",
    219           "source": "haiku"
    220         },
    221         "failure_cases_discussed": {
    222           "applies": true,
    223           "answer": false,
    224           "justification": "No failure cases are discussed; the only failure mention is a footnote about Llama-3-8B generating invalid Python programs, not about Granite models.",
    225           "source": "haiku"
    226         },
    227         "negative_results_reported": {
    228           "applies": true,
    229           "answer": true,
    230           "justification": "The paper explicitly acknowledges losses: 'Granite-8B-Code-Base lags behind CodeGemma-7B on all [ReCode] categories,' 'no single model which performs consistently best at 3B parameters,' and Granite-3B falls short of StarCoder2-3B on MBPP.",
    231           "source": "haiku"
    232         }
    233       },
    234       "setup_transparency": {
    235         "model_versions_specified": {
    236           "applies": true,
    237           "answer": false,
    238           "justification": "Baseline models referenced by paper name without pinned checkpoint hashes or release dates; some linked to GitHub but without specific version commits.",
    239           "source": "haiku"
    240         },
    241         "prompts_provided": {
    242           "applies": true,
    243           "answer": false,
    244           "justification": "Evaluation format (completion vs. instruction template) is mentioned but actual prompts are not shown; the paper states it follows official formats without providing them.",
    245           "source": "haiku"
    246         },
    247         "hyperparameters_reported": {
    248           "applies": true,
    249           "answer": true,
    250           "justification": "Training hyperparameters (AdamW β1=0.9, β2=0.95, learning rates, batch sizes, warmup steps) and evaluation parameters (temperature 0.2/0.8, top-p 0.95, max tokens) are thoroughly documented.",
    251           "source": "haiku"
    252         },
    253         "scaffolding_described": {
    254           "applies": false,
    255           "answer": false,
    256           "justification": "No agentic scaffolding used; this is base/instruct model evaluation on standard benchmarks.",
    257           "source": "haiku"
    258         },
    259         "data_preprocessing_documented": {
    260           "applies": true,
    261           "answer": true,
    262           "justification": "Section 2 provides detailed documentation of crawling, language filtering with explicit rules, SHA256 + MinHash/LSH deduplication (Jaccard threshold 0.7), HAP keyword filtering, and StarPII-based redaction.",
    263           "source": "haiku"
    264         }
    265       },
    266       "data_integrity": {
    267         "raw_data_available": {
    268           "applies": true,
    269           "answer": false,
    270           "justification": "The filtered training corpus is not publicly released; source datasets (GitHub Code Clean, StarCoderdata) are available but not the processed version actually used for training.",
    271           "source": "haiku"
    272         },
    273         "data_collection_described": {
    274           "applies": true,
    275           "answer": true,
    276           "justification": "Section 2 describes data sources, filtering criteria, deduplication methodology, PII redaction, malware scanning, and natural language data curation in sufficient detail.",
    277           "source": "haiku"
    278         },
    279         "recruitment_methods_described": {
    280           "applies": false,
    281           "answer": false,
    282           "justification": "No human participants; standard public benchmarks used for evaluation.",
    283           "source": "haiku"
    284         },
    285         "data_pipeline_documented": {
    286           "applies": true,
    287           "answer": true,
    288           "justification": "Full pipeline documented: source data → language filtering → quality filtering → exact deduplication (SHA256) → fuzzy deduplication (MinHash/LSH) → HAP filtering → PII redaction → malware scan → tokenization.",
    289           "source": "haiku"
    290         }
    291       },
    292       "contamination": {
    293         "training_cutoff_stated": {
    294           "applies": true,
    295           "answer": false,
    296           "justification": "No training data cutoff date is stated despite training on GitHub data that could substantially overlap with widely-used evaluation benchmarks.",
    297           "source": "haiku"
    298         },
    299         "train_test_overlap_discussed": {
    300           "applies": true,
    301           "answer": false,
    302           "justification": "No discussion of potential overlap between GitHub-sourced training data and evaluation benchmarks (HumanEval, MBPP) which are publicly available on GitHub and existed before data collection.",
    303           "source": "haiku"
    304         },
    305         "benchmark_contamination_addressed": {
    306           "applies": true,
    307           "answer": false,
    308           "justification": "HumanEval (2021) and MBPP (2021) were publicly available long before training data collection; no decontamination steps or overlap analysis are described.",
    309           "source": "haiku"
    310         }
    311       },
    312       "human_studies": {
    313         "pre_registered": {
    314           "applies": false,
    315           "answer": false,
    316           "justification": "No human participants.",
    317           "source": "haiku"
    318         },
    319         "irb_or_ethics_approval": {
    320           "applies": false,
    321           "answer": false,
    322           "justification": "No human participants.",
    323           "source": "haiku"
    324         },
    325         "demographics_reported": {
    326           "applies": false,
    327           "answer": false,
    328           "justification": "No human participants.",
    329           "source": "haiku"
    330         },
    331         "inclusion_exclusion_criteria": {
    332           "applies": false,
    333           "answer": false,
    334           "justification": "No human participants.",
    335           "source": "haiku"
    336         },
    337         "randomization_described": {
    338           "applies": false,
    339           "answer": false,
    340           "justification": "No human participants.",
    341           "source": "haiku"
    342         },
    343         "blinding_described": {
    344           "applies": false,
    345           "answer": false,
    346           "justification": "No human participants.",
    347           "source": "haiku"
    348         },
    349         "attrition_reported": {
    350           "applies": false,
    351           "answer": false,
    352           "justification": "No human participants.",
    353           "source": "haiku"
    354         }
    355       },
    356       "cost_and_practicality": {
    357         "inference_cost_reported": {
    358           "applies": true,
    359           "answer": false,
    360           "justification": "No inference latency, throughput, or cost estimates are provided despite the paper positioning models for enterprise deployment where cost-per-token matters.",
    361           "source": "haiku"
    362         },
    363         "compute_budget_stated": {
    364           "applies": true,
    365           "answer": false,
    366           "justification": "Training infrastructure is described and carbon emissions estimated (~455 tCO2eq), but total GPU-hours or FLOPs budget is not explicitly reported.",
    367           "source": "haiku"
    368         }
    369       }
    370     }
    371   },
    372   "claims": [
    373     {
    374       "claim": "Granite-8B-Code-Base outperforms CodeGemma-8B by ~12 points on the full HumanEvalPack (synthesis+explanation+fixing)",
    375       "evidence": "Figure 1 and Tables 3/10/11 show Granite-8B averages substantially higher than CodeGemma-7B on the combined HumanEvalPack tasks, particularly on explanation (26.4% vs 12.4%) and fixing (29.6% vs 10.1%)",
    376       "supported": "strong"
    377     },
    378     {
    379       "claim": "Granite Code models consistently reach state-of-the-art performance among open-source code LLMs",
    380       "evidence": "Tables show multiple counter-examples: CodeGemma-7B beats Granite-8B on MBPP (53.0% vs 42.2%) and CRUXEval; StarCoder2-15B leads Granite-20B on several MultiPL-E languages; no single claim of universal SOTA is defensible",
    381       "supported": "weak"
    382     },
    383     {
    384       "claim": "Granite-3B-Code-Instruct surpasses CodeLlama-34B-Instruct on HumanEvalSynthesize",
    385       "evidence": "Table 3 directly contradicts this: Granite-3B-Code-Instruct averages 39.6% vs CodeLlama-34B-Instruct at 41.3%; this textual claim is false per the paper's own data",
    386       "supported": "unsupported"
    387     },
    388     {
    389       "claim": "Granite-8B-Code-Base outperforms Llama-3-8B-Base by ~12 points on GSM8K and ~6 points on MATH",
    390       "evidence": "Table 15 confirms: Granite-8B at 61.9% vs Llama-3-8B at 49.8% on GSM8K; 21.4% vs 15.6% on MATH",
    391       "supported": "strong"
    392     },
    393     {
    394       "claim": "Two-phase training with code then code+language data improves reasoning capabilities",
    395       "evidence": "Asserted in paper but no ablation comparing phase-1-only vs phase-2 trained models is provided; claim is plausible but undemonstrated",
    396       "supported": "weak"
    397     },
    398     {
    399       "claim": "Depth upscaling from 20B to 34B results in minimal performance drop that quickly recovers with continued pretraining",
    400       "evidence": "Section 3 describes this qualitatively ('drop in performance is pretty small') but provides no pre/post quantitative comparison tables",
    401       "supported": "weak"
    402     }
    403   ],
    404   "methodology_tags": [
    405     "benchmark-eval"
    406   ],
    407   "key_findings": "IBM's Granite Code Models (3B–34B parameters) achieve competitive performance on code benchmarks across generation, explanation, fixing, and translation tasks in 116 languages, with the 8B model approaching 70B-class performance on explanation/fixing tasks. The models demonstrate good performance-to-size ratios versus CodeLlama variants. However, performance is highly benchmark-dependent: CodeGemma-7B beats Granite-8B on MBPP by 10+ points and on robustness benchmarks, and no single Granite model leads consistently across all evaluations. The paper contains at least one factual internal inconsistency: the text claims Granite-3B-Instruct surpasses CodeLlama-34B-Instruct, but their own Table 3 shows the opposite (39.6% vs 41.3%).",
    408   "red_flags": [
    409     {
    410       "flag": "Internal claim contradicts own table",
    411       "detail": "Section 6.1.1 states 'Granite-3B-Code-Instruct surpasses the performance of CodeLlama-34B-Instruct' but Table 3 shows 39.6% vs 41.3% average — the claim is directly contradicted by the paper's own data."
    412     },
    413     {
    414       "flag": "Self-evaluation conflict of interest",
    415       "detail": "IBM employees evaluate IBM-developed models with no independent verification; the organization directly benefits commercially from positive results via watsonx Code Assistant, yet no competing interests are declared."
    416     },
    417     {
    418       "flag": "Benchmark contamination unaddressed",
    419       "detail": "Training data is sourced from GitHub where HumanEval, MBPP, and other evaluation benchmarks have been publicly available since 2021; no decontamination analysis or training cutoff date is provided."
    420     },
    421     {
    422       "flag": "No ablation studies despite causal claims",
    423       "detail": "The paper makes causal claims attributing performance gains to 'data mixture and training decisions' but provides zero ablations for phase-2 training, FIM objective weighting, depth upscaling, or NeFTune noise."
    424     },
    425     {
    426       "flag": "No statistical rigor for comparative claims",
    427       "detail": "Differences as small as 0.1% are treated as meaningful (e.g., Section 6.1.1 on StarCoder2-15B comparison) with no confidence intervals, error bars, or significance tests across 30+ model comparisons."
    428     }
    429   ],
    430   "cited_papers": [
    431     {
    432       "title": "StarCoder: May the Source Be With You!",
    433       "relevance": "Direct predecessor; Granite reuses StarCoder's tokenizer, FIM training format, and builds on StarCoderData as a training source"
    434     },
    435     {
    436       "title": "OctoPack: Instruction Tuning Code Large Language Models (HumanEvalPack benchmark)",
    437       "relevance": "Provides the primary multi-task evaluation benchmark (synthesis, explanation, fixing) across 6 languages that is central to Granite's evaluation narrative"
    438     },
    439     {
    440       "title": "Code Llama: Open Foundation Models for Code",
    441       "relevance": "Key baseline across all benchmarks; Granite explicitly positions itself relative to CodeLlama variants at each model size"
    442     },
    443     {
    444       "title": "StarCoder 2 and The Stack v2: The Next Generation",
    445       "relevance": "Most direct contemporary competitor; competitive comparison throughout, especially on MultiPL-E and FIM tasks"
    446     },
    447     {
    448       "title": "CrossCodeEval: A Diverse and Multilingual Benchmark for Cross-File Code Completion",
    449       "relevance": "Repository-level code completion benchmark used to evaluate practical coding capability beyond isolated function generation"
    450     },
    451     {
    452       "title": "CRUXEval: A Benchmark for Code Reasoning, Understanding and Execution",
    453       "relevance": "Code reasoning and execution benchmark used to evaluate deeper code understanding beyond surface generation"
    454     },
    455     {
    456       "title": "Program Synthesis with Large Language Models (MBPP)",
    457       "relevance": "Standard Python code generation benchmark used across model comparisons"
    458     },
    459     {
    460       "title": "Evaluating Large Language Models Trained on Code (HumanEval)",
    461       "relevance": "Foundation benchmark for code generation evaluation; forms basis of HumanEvalPack"
    462     }
    463   ],
    464   "engagement_factors": {
    465     "practical_relevance": {
    466       "score": 3,
    467       "justification": "Models are immediately downloadable from HuggingFace under Apache 2.0 and cover real enterprise code tasks across 116 languages."
    468     },
    469     "surprise_contrarian": {
    470       "score": 1,
    471       "justification": "Results confirm expected pattern that domain-specific code models beat general models; the 8B explanation/fixing performance is notable but not paradigm-shifting."
    472     },
    473     "fear_safety": {
    474       "score": 0,
    475       "justification": "No AI safety, misuse, or risk concerns are raised; HAP filtering is framed as a feature."
    476     },
    477     "drama_conflict": {
    478       "score": 1,
    479       "justification": "Implicitly competitive with Google (CodeGemma), Meta (CodeLlama/Llama-3), and BigCode (StarCoder2) in the open-source code model race."
    480     },
    481     "demo_ability": {
    482       "score": 3,
    483       "justification": "Models are publicly available on HuggingFace with Apache 2.0 license; anyone can download and test immediately via standard inference libraries."
    484     },
    485     "brand_recognition": {
    486       "score": 2,
    487       "justification": "IBM is a globally recognized enterprise technology brand with established credibility, linked to the commercial watsonx Code Assistant product."
    488     }
    489   },
    490   "hn_data": {
    491     "threads": [
    492       {
    493         "hn_id": "39385811",
    494         "title": "Personality trait recognition using ECG spectrograms and deep learning",
    495         "points": 48,
    496         "comments": 40,
    497         "url": "https://news.ycombinator.com/item?id=39385811",
    498         "created_at": "2024-02-15T17:49:03Z"
    499       },
    500       {
    501         "hn_id": "31324857",
    502         "title": "Panoptic Neural Fields: A Semantic Object-Aware Neural Scene Representation",
    503         "points": 2,
    504         "comments": 0,
    505         "url": "https://news.ycombinator.com/item?id=31324857",
    506         "created_at": "2022-05-10T08:38:46Z"
    507       },
    508       {
    509         "hn_id": "42912008",
    510         "title": "HarmBench: A Standardized Evaluation Framework for Robust Refusal",
    511         "points": 1,
    512         "comments": 0,
    513         "url": "https://news.ycombinator.com/item?id=42912008",
    514         "created_at": "2025-02-02T21:26:17Z"
    515       },
    516       {
    517         "hn_id": "35991015",
    518         "title": "Penguin Huddling: A Continuum Model",
    519         "points": 1,
    520         "comments": 0,
    521         "url": "https://news.ycombinator.com/item?id=35991015",
    522         "created_at": "2023-05-18T17:08:38Z"
    523       }
    524     ],
    525     "top_points": 48,
    526     "total_points": 52,
    527     "total_comments": 40
    528   }
    529 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs