scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (26496B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "A GPT-based Code Review System for Programming Language Learning",
      6     "authors": [
      7       "Lee Dong-Kyu"
      8     ],
      9     "year": 2024,
     10     "venue": "arXiv.org",
     11     "arxiv_id": "2407.04722",
     12     "doi": "10.48550/arXiv.2407.04722"
     13   },
     14   "checklist": {
     15     "claims_and_evidence": {
     16       "abstract_claims_supported": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "Abstract claims (learner-friendly reviews, cheating prevention, error detection, reduced latency/costs, code review quality) are supported by Tables III–VI and system design descriptions.",
     20         "source": "haiku"
     21       },
     22       "causal_claims_justified": {
     23         "applies": true,
     24         "answer": false,
     25         "justification": "Improvements are confounded—prompt optimization and code validation module added together without ablation study. Comparisons lack statistical testing. Small expert sample (n=6) insufficient for causal claims about pedagogical effect.",
     26         "source": "haiku"
     27       },
     28       "generalization_bounded": {
     29         "applies": true,
     30         "answer": false,
     31         "justification": "Paper claims suitability for primary/secondary students but evaluation used only 6 educators, not actual students. Generalization to target population unsubstantiated.",
     32         "source": "haiku"
     33       },
     34       "alternative_explanations_discussed": {
     35         "applies": true,
     36         "answer": false,
     37         "justification": "Why does improved system perform better? Code validation module, prompt changes, or parameter tuning—not separated. No discussion of evaluator bias, selection effects, or confounding in results.",
     38         "source": "haiku"
     39       },
     40       "proxy_outcome_distinction": {
     41         "applies": true,
     42         "answer": false,
     43         "justification": "Paper claims pedagogical effectiveness and 'learner-friendly' quality but measures only response time, API cost, and expert Likert ratings—no mapping to actual learning outcomes.",
     44         "source": "haiku"
     45       }
     46     },
     47     "limitations_and_scope": {
     48       "limitations_section_present": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "Section V is a 3-sentence conclusion mentioning future work (verify with students, add membership system), not a dedicated threats-to-validity or limitations section.",
     52         "source": "haiku"
     53       },
     54       "threats_to_validity_specific": {
     55         "applies": true,
     56         "answer": false,
     57         "justification": "No discussion of small expert sample (n=6), lack of student evaluation, potential evaluator bias, or confounded system changes. Only generic statement to 'verify effectiveness with students.'",
     58         "source": "haiku"
     59       },
     60       "scope_boundaries_stated": {
     61         "applies": true,
     62         "answer": false,
     63         "justification": "Paper does not explicitly state what the system does NOT show: no evidence on actual student learning, long-term retention, or effectiveness vs. human tutoring. Scope boundaries not articulated.",
     64         "source": "haiku"
     65       }
     66     },
     67     "conflicts_of_interest": {
     68       "funding_disclosed": {
     69         "applies": true,
     70         "answer": false,
     71         "justification": "No funding source mentioned anywhere in paper.",
     72         "source": "haiku"
     73       },
     74       "affiliations_disclosed": {
     75         "applies": true,
     76         "answer": false,
     77         "justification": "Author affiliation (Hanyang University) is stated, but there is no disclosure of any financial interest in the system deployed at codetutor119judge.com or in a related company.",
     78         "source": "haiku"
     79       },
     80       "funder_independent_of_outcome": {
     81         "applies": false,
     82         "answer": false,
     83         "justification": "No funding disclosed.",
     84         "source": "haiku"
     85       },
     86       "financial_interests_declared": {
     87         "applies": true,
     88         "answer": false,
     89         "justification": "No competing interests statement or disclosure of financial stakes in the deployed system.",
     90         "source": "haiku"
     91       }
     92     },
     93     "scope_and_framing": {
     94       "key_terms_defined": {
     95         "applies": true,
     96         "answer": false,
     97         "justification": "Terms like 'learner-friendly,' 'personalized feedback,' and 'rigor' used throughout but not formally defined. 'AI-assist cheating' explained via examples but lacks precise definition.",
     98         "source": "haiku"
     99       },
    100       "intended_contribution_clear": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Contribution is clearly stated: a web-based GPT-4 system for code review in K-12 education, with prompt engineering to prevent cheating and provide supportive feedback.",
    104         "source": "haiku"
    105       },
    106       "engagement_with_prior_work": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "Section II discusses related work on LLMs in educational feedback (Dai et al.), code review automation (Singh, Lu et al.), prompt templates, and LLMs in CS classrooms (Lau & Guo, Kazemitabaar).",
    110         "source": "haiku"
    111       }
    112     }
    113   },
    114   "type_checklist": {
    115     "empirical": {
    116       "artifacts": {
    117         "code_released": {
    118           "applies": true,
    119           "answer": false,
    120           "justification": "System deployed at web URL but source code not released for independent verification or reproduction.",
    121           "source": "haiku"
    122         },
    123         "data_released": {
    124           "applies": true,
    125           "answer": false,
    126           "justification": "Dataset sourced from 'Company C's Online Judge System' (proprietary, anonymous) and not made publicly available. 93 test cases not released.",
    127           "source": "haiku"
    128         },
    129         "environment_specified": {
    130           "applies": true,
    131           "answer": false,
    132           "justification": "Tech stack mentioned (Next.js, Azure Static Web App, Monaco Editor) but no requirements.txt, Docker file, or specific versions. Python environment not documented.",
    133           "source": "haiku"
    134         },
    135         "reproduction_instructions": {
    136           "applies": true,
    137           "answer": false,
    138           "justification": "No step-by-step reproduction instructions provided. Cannot rebuild or extend the system from the paper alone.",
    139           "source": "haiku"
    140         }
    141       },
    142       "statistical_methodology": {
    143         "confidence_intervals_or_error_bars": {
    144           "applies": true,
    145           "answer": false,
    146           "justification": "Tables III–VI report point estimates and ranges but no confidence intervals, error bars, or variance statistics for main results.",
    147           "source": "haiku"
    148         },
    149         "significance_tests": {
    150           "applies": true,
    151           "answer": false,
    152           "justification": "RQ1–RQ3 report descriptive statistics (percentages, means) with no p-values, t-tests, or other significance testing. RQ4 survey means reported without statistical analysis.",
    153           "source": "haiku"
    154         },
    155         "effect_sizes_reported": {
    156           "applies": true,
    157           "answer": true,
    158           "justification": "Response time improvements quantified as 12–58% reduction; API cost reduction 8.53%; error detection rates reported as percentages.",
    159           "source": "haiku"
    160         },
    161         "sample_size_justified": {
    162           "applies": true,
    163           "answer": false,
    164           "justification": "RQ1–RQ3: 92–108 test codes analyzed with no power analysis or justification. RQ4: n=6 evaluators with no sample size rationale for drawing conclusions about system quality.",
    165           "source": "haiku"
    166         },
    167         "variance_reported": {
    168           "applies": true,
    169           "answer": false,
    170           "justification": "Table IV shows min/max response times but no standard deviation or spread. Survey responses (Table VI) report means without SDs or confidence intervals.",
    171           "source": "haiku"
    172         }
    173       },
    174       "evaluation_design": {
    175         "baselines_included": {
    176           "applies": true,
    177           "answer": true,
    178           "justification": "RQ1 compares against 'existing online judge system'; RQ2–RQ3 compare initial vs. improved system versions. Baselines present across RQs.",
    179           "source": "haiku"
    180         },
    181         "baselines_contemporary": {
    182           "applies": true,
    183           "answer": false,
    184           "justification": "Online judge system baseline not dated or versioned. No comparison to other modern LLM-based code review tools (e.g., GitHub Copilot, recent research systems).",
    185           "source": "haiku"
    186         },
    187         "ablation_study": {
    188           "applies": true,
    189           "answer": false,
    190           "justification": "Improvements bundle code validation module, prompt optimization, and parameter tuning together. No separate evaluation of each component's contribution to better performance.",
    191           "source": "haiku"
    192         },
    193         "multiple_metrics": {
    194           "applies": true,
    195           "answer": true,
    196           "justification": "RQ1: 4 error types; RQ2: response time; RQ3: API cost (tokens); RQ4: 5 Likert criteria (precision, usefulness, specificity, tone, learning effect).",
    197           "source": "haiku"
    198         },
    199         "human_evaluation": {
    200           "applies": true,
    201           "answer": true,
    202           "justification": "RQ4: 6 evaluators (educators) rated code review comments on 5-point Likert scale across 5 dimensions.",
    203           "source": "haiku"
    204         },
    205         "held_out_test_set": {
    206           "applies": false,
    207           "answer": false,
    208           "justification": "Not a prediction task; system evaluated on same 27 exercises used for development.",
    209           "source": "haiku"
    210         },
    211         "per_category_breakdown": {
    212           "applies": true,
    213           "answer": true,
    214           "justification": "RQ1: failure rates per error type (4 types). RQ4: results broken down by 5 evaluation criteria.",
    215           "source": "haiku"
    216         },
    217         "failure_cases_discussed": {
    218           "applies": true,
    219           "answer": false,
    220           "justification": "Usability test identified issues that were fixed, but paper does not discuss remaining failure cases or edge cases in the improved system.",
    221           "source": "haiku"
    222         },
    223         "negative_results_reported": {
    224           "applies": true,
    225           "answer": false,
    226           "justification": "All results positive (improvements in error detection, latency, cost, quality). Survey shows some lower ratings (Fig. 14 has responses of 3/5) but not highlighted or discussed as limitations.",
    227           "source": "haiku"
    228         }
    229       },
    230       "setup_transparency": {
    231         "model_versions_specified": {
    232           "applies": true,
    233           "answer": false,
    234           "justification": "Only 'GPT-4' named; no API version, snapshot date, or model ID (e.g., gpt-4-0613). Cannot reproduce exact behavior.",
    235           "source": "haiku"
    236         },
    237         "prompts_provided": {
    238           "applies": true,
    239           "answer": false,
    240           "justification": "Paper describes prompt structure (role-setting, review necessity, comment generation) and iterations but does not provide actual final prompts used in deployed system.",
    241           "source": "haiku"
    242         },
    243         "hyperparameters_reported": {
    244           "applies": true,
    245           "answer": false,
    246           "justification": "Text mentions 'temperature and topP parameters were adjusted' and 'maxTokens value was optimized' but specific values not reported.",
    247           "source": "haiku"
    248         },
    249         "scaffolding_described": {
    250           "applies": true,
    251           "answer": true,
    252           "justification": "System flow documented (Figure 10), code editor features described, integration of code correctness and review modules explained. Scaffold is clear.",
    253           "source": "haiku"
    254         },
    255         "data_preprocessing_documented": {
    256           "applies": true,
    257           "answer": true,
    258           "justification": "Dataset collection describes raw data from 2021 onwards, static filtering (remove duplicates, delete comments), and composition into 93 test cases.",
    259           "source": "haiku"
    260         }
    261       },
    262       "data_integrity": {
    263         "raw_data_available": {
    264           "applies": true,
    265           "answer": false,
    266           "justification": "Data from proprietary 'Company C' online judge system, not publicly available. Test dataset (93 codes) not released.",
    267           "source": "haiku"
    268         },
    269         "data_collection_described": {
    270           "applies": true,
    271           "answer": false,
    272           "justification": "Describes starting point (2021) and filtering steps, but lacks detail on selection criteria, coverage, and representativeness of the 93 test cases.",
    273           "source": "haiku"
    274         },
    275         "recruitment_methods_described": {
    276           "applies": true,
    277           "answer": false,
    278           "justification": "Usability test and survey recruited educators ('at least 2 years experience') but no details on how many approached, dropout rates, or selection bias.",
    279           "source": "haiku"
    280         },
    281         "data_pipeline_documented": {
    282           "applies": true,
    283           "answer": true,
    284           "justification": "Figure 2 shows research methodology flow (collection → prompt design → system build → usability test → improvement → evaluation). Pipeline is documented.",
    285           "source": "haiku"
    286         }
    287       },
    288       "contamination": {
    289         "training_cutoff_stated": {
    290           "applies": true,
    291           "answer": false,
    292           "justification": "Paper does not mention GPT-4's training data cutoff or when it was trained. No discussion of model version knowledge dates.",
    293           "source": "haiku"
    294         },
    295         "train_test_overlap_discussed": {
    296           "applies": true,
    297           "answer": false,
    298           "justification": "Exercises from online judge system could be in GPT-4's training data (system is publicly accessible). Risk not discussed.",
    299           "source": "haiku"
    300         },
    301         "benchmark_contamination_addressed": {
    302           "applies": true,
    303           "answer": false,
    304           "justification": "Exercises not a closed academic benchmark, but potential overlap with GPT-4 training data not addressed or acknowledged.",
    305           "source": "haiku"
    306         }
    307       },
    308       "human_studies": {
    309         "pre_registered": {
    310           "applies": false,
    311           "answer": false,
    312           "justification": "Not a formal human subjects study; usability test and survey conducted post-hoc without pre-registration.",
    313           "source": "haiku"
    314         },
    315         "irb_or_ethics_approval": {
    316           "applies": false,
    317           "answer": false,
    318           "justification": "No IRB approval mentioned. Participants were educators (not vulnerable subjects), but ethical oversight not reported.",
    319           "source": "haiku"
    320         },
    321         "demographics_reported": {
    322           "applies": true,
    323           "answer": false,
    324           "justification": "Only inclusion criterion given: '2+ years programming education experience.' No age, gender, institution, or other demographics reported for the 6 evaluators.",
    325           "source": "haiku"
    326         },
    327         "inclusion_exclusion_criteria": {
    328           "applies": true,
    329           "answer": false,
    330           "justification": "Minimal criteria: ≥2 years teaching experience. No exclusion criteria or justification for selection.",
    331           "source": "haiku"
    332         },
    333         "randomization_described": {
    334           "applies": false,
    335           "answer": false,
    336           "justification": "Not applicable; no experimental randomization of evaluators or conditions.",
    337           "source": "haiku"
    338         },
    339         "blinding_described": {
    340           "applies": false,
    341           "answer": false,
    342           "justification": "Not applicable; no blinding in a tool evaluation study.",
    343           "source": "haiku"
    344         },
    345         "attrition_reported": {
    346           "applies": false,
    347           "answer": false,
    348           "justification": "Not applicable; no longitudinal follow-up or dropout reported.",
    349           "source": "haiku"
    350         }
    351       },
    352       "cost_and_practicality": {
    353         "inference_cost_reported": {
    354           "applies": true,
    355           "answer": true,
    356           "justification": "RQ3 directly addresses API cost. Table V reports input/output tokens and cost per call (USD) based on OpenAI's pricing.",
    357           "source": "haiku"
    358         },
    359         "compute_budget_stated": {
    360           "applies": true,
    361           "answer": false,
    362           "justification": "Cost per call reported but total computational budget (across all evaluation runs, deployment cost, etc.) not stated.",
    363           "source": "haiku"
    364         }
    365       }
    366     }
    367   },
    368   "claims": [
    369     {
    370       "claim": "Improved system identifies error types (hard coding, unnecessary code, etc.) more effectively than online judge system",
    371       "evidence": "Table III shows failure rates per error type: hard coding 21.3%, unnecessary code 17.59%, requirement not met 15.74%, computation error 7.41%",
    372       "supported": "moderate"
    373     },
    374     {
    375       "claim": "Improved system reduces response time by 12–58% vs. initial system",
    376       "evidence": "Table IV reports response times for 92 error codes across 27 questions; improvement percentage calculated",
    377       "supported": "strong"
    378     },
    379     {
    380       "claim": "Improved system reduces API call costs by up to 8.53%",
    381       "evidence": "Table V quantifies input/output token counts and cost per call in USD based on OpenAI pricing",
    382       "supported": "strong"
    383     },
    384     {
    385       "claim": "Code review comment quality is maintained despite reduced latency and costs",
    386       "evidence": "Table VI survey of 6 evaluators on 5-point Likert scale; most ratings are 4–5 (high satisfaction)",
    387       "supported": "moderate"
    388     },
    389     {
    390       "claim": "System minimizes AI-assist cheating through design (code reviews only on submitted code, no direct solutions)",
    391       "evidence": "System architecture described; feature prevents direct code generation by requiring submission first",
    392       "supported": "weak"
    393     },
    394     {
    395       "claim": "System is suitable for primary and secondary school students",
    396       "evidence": "Expert evaluators affirmed suitability in survey; no evaluation with actual students",
    397       "supported": "weak"
    398     },
    399     {
    400       "claim": "Prompt engineering improves code review quality and reduces hallucination",
    401       "evidence": "Text mentions 5 prompt iterations and reduction of hallucination by adding solution/example sub-prompts; no quantitative data provided",
    402       "supported": "weak"
    403     }
    404   ],
    405   "methodology_tags": [
    406     "empirical",
    407     "case-study"
    408   ],
    409   "key_findings": "A GPT-4-based code review system deployed on the web successfully provides educational feedback to programming students, with an improved version reducing response time by 12–58% and API costs by up to 8.53% compared to the initial system. The paper reports better error-type detection than the existing online judge system (Table III failure rates: hard coding 21.3%, unnecessary code 17.59%), though without significance testing, and expert evaluators rated code review quality highly on precision, usefulness, and supportive tone. However, the system was evaluated only with educators (n=6), not actual students, leaving pedagogical effectiveness unvalidated.",
    410   "red_flags": [
    411     {
    412       "flag": "No evaluation with target population",
    413       "detail": "Claims suitability for K-12 students but testing used only 6 educators. Generalizability to actual students unsupported."
    414     },
    415     {
    416       "flag": "Confounded improvements",
    417       "detail": "Code validation module, prompt optimization, and parameter tuning deployed together without ablation. Cannot isolate drivers of improvement."
    418     },
    419     {
    420       "flag": "Unvalidated safety claim",
    421       "detail": "Claims cheating prevention via system design (code reviews only on submitted code) but never verifies this is effective or sufficient."
    422     },
    423     {
    424       "flag": "Small sample sizes without justification",
    425       "detail": "Usability test (n=3), survey (n=6). No power analysis or justification for sample adequacy."
    426     },
    427     {
    428       "flag": "No statistical significance testing",
    429       "detail": "Comparisons (initial vs. improved, error detection) reported as descriptive statistics without p-values or confidence intervals."
    430     },
    431     {
    432       "flag": "Data and code not reproducible",
    433       "detail": "Dataset from proprietary system, code not released. Cannot independently reproduce or extend work."
    434     },
    435     {
    436       "flag": "Incomplete model specification",
    437       "detail": "Only 'GPT-4' named; no API version, snapshot date, or model ID provided."
    438     },
    439     {
    440       "flag": "Prompts not disclosed",
    441       "detail": "Prompt structure described but actual final prompts not provided. Cannot replicate prompt engineering approach."
    442     },
    443     {
    444       "flag": "Training data contamination not addressed",
    445       "detail": "Exercises from public online judge system could be in GPT-4 training data. Risk not discussed."
    446     },
    447     {
    448       "flag": "Limited technical novelty",
    449       "detail": "Appears to be application of existing LLM techniques (few-shot prompting, role-setting) to a new domain without significant methodological contribution."
    450     }
    451   ],
    452   "cited_papers": [
    453     {
    454       "title": "Can Large Language Models Provide Feedback to Students? A Case Study on ChatGPT",
    455       "relevance": "Establishes precedent for using LLMs to generate educational feedback on student work"
    456     },
    457     {
    458       "title": "Automated feedback generation for introductory programming assignments",
    459       "relevance": "Defines feedback structure (error location, problematic expression, fix) needed for novice learners"
    460     },
    461     {
    462       "title": "LLaMA-Reviewer: Advancing Code Review Automation with Large Language Models through Parameter-Efficient Fine-Tuning",
    463       "relevance": "Proposes code review pipeline (review necessity, comment generation, refinement) used in this system"
    464     },
    465     {
    466       "title": "A Critical Review of Large Language Model on Software Engineering: An Example from ChatGPT and Automated Program Repair",
    467       "relevance": "Documents data leakage risk in LLM evaluation and demonstrates ChatGPT's code repair abilities"
    468     },
    469     {
    470       "title": "CodeAid: Evaluating a Classroom Deployment of an LLM-based Programming Assistant that Balances Student and Educator Needs",
    471       "relevance": "Discusses prompt principles for learner-friendly feedback (examples, structure, tone, accuracy)"
    472     },
    473     {
    474       "title": "From 'Ban It Till We Understand It' to 'Resistance is Futile': How University Programming Instructors Plan to Adapt as More Students Use AI Code Generation and Explanation Tools",
    475       "relevance": "Captures instructor concerns about AI-assist cheating and divided opinions on LLM adoption in CS education"
    476     }
    477   ],
    478   "engagement_factors": {
    479     "practical_relevance": {
    480       "score": 2,
    481       "justification": "System deployed and accessible at codetutor119judge.com for educators, but no evidence of actual student adoption or learning impact."
    482     },
    483     "surprise_contrarian": {
    484       "score": 0,
    485       "justification": "Using GPT-4 for code review and prompt engineering for educational feedback are well-established approaches; no novel or contrarian insights."
    486     },
    487     "fear_safety": {
    488       "score": 1,
    489       "justification": "Addresses AI-assist cheating risk through system design but provides only superficial treatment without deep safety analysis or validation."
    490     },
    491     "drama_conflict": {
    492       "score": 0,
    493       "justification": "No conflict, controversy, or dramatic narrative; straightforward tool-building paper."
    494     },
    495     "demo_ability": {
    496       "score": 2,
    497       "justification": "Live system deployed and accessible via web URL, but requires account/login and not freely available for casual exploration."
    498     },
    499     "brand_recognition": {
    500       "score": 1,
    501       "justification": "Author from Hanyang University (moderate tier); system leverages GPT-4 brand but brings limited independent credibility."
    502     }
    503   },
    504   "hn_data": {
    505     "threads": [
    506       {
    507         "hn_id": "39660780",
    508         "title": "How far are we from intelligent visual deductive reasoning?",
    509         "points": 124,
    510         "comments": 118,
    511         "url": "https://news.ycombinator.com/item?id=39660780"
    512       },
    513       {
    514         "hn_id": "37197734",
    515         "title": "Large Language Models As General Pattern Machines",
    516         "points": 84,
    517         "comments": 35,
    518         "url": "https://news.ycombinator.com/item?id=37197734"
    519       },
    520       {
    521         "hn_id": "39363613",
    522         "title": "A 1.9 solar mass neutron star candidate in a 2-year orbit",
    523         "points": 77,
    524         "comments": 25,
    525         "url": "https://news.ycombinator.com/item?id=39363613"
    526       },
    527       {
    528         "hn_id": "39297479",
    529         "title": "Direct Language Model Alignment from Online AI Feedback",
    530         "points": 61,
    531         "comments": 4,
    532         "url": "https://news.ycombinator.com/item?id=39297479"
    533       },
    534       {
    535         "hn_id": "40559259",
    536         "title": "Is Complexity an Illusion?",
    537         "points": 2,
    538         "comments": 1,
    539         "url": "https://news.ycombinator.com/item?id=40559259"
    540       },
    541       {
    542         "hn_id": "36783913",
    543         "title": "AnimateDiff: Animate Your Diffusion Models Without Tuning",
    544         "points": 2,
    545         "comments": 0,
    546         "url": "https://news.ycombinator.com/item?id=36783913"
    547       },
    548       {
    549         "hn_id": "36679216",
    550         "title": "Large Language Models can complete complex non linguistic patterns in context",
    551         "points": 2,
    552         "comments": 0,
    553         "url": "https://news.ycombinator.com/item?id=36679216"
    554       },
    555       {
    556         "hn_id": "41358552",
    557         "title": "A Review of Pseudo-Labeling for Computer Vision",
    558         "points": 1,
    559         "comments": 0,
    560         "url": "https://news.ycombinator.com/item?id=41358552"
    561       },
    562       {
    563         "hn_id": "40308637",
    564         "title": "LLMs Can Patch Up Missing Relevance Judgments in Evaluation",
    565         "points": 1,
    566         "comments": 0,
    567         "url": "https://news.ycombinator.com/item?id=40308637"
    568       },
    569       {
    570         "hn_id": "39047436",
    571         "title": "Identifying Fabricated Networks Within Authorship-for-Sale Enterprises",
    572         "points": 1,
    573         "comments": 0,
    574         "url": "https://news.ycombinator.com/item?id=39047436"
    575       }
    576     ],
    577     "top_points": 124,
    578     "total_points": 355,
    579     "total_comments": 183
    580   }
    581 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs