scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (28037B)
      1 {
      2   "paper": {
      3     "title": "Semantic Compression for Memory Retention in LLM Test Generation",
      4     "authors": ["Gan Wang", "Hiroaki Hashiura"],
      5     "year": 2026,
      6     "venue": "IEICE Transactions on Information and Systems (Letter, Special Section on Knowledge-Based Software Engineering)",
      7     "doi": "10.1587/transinf.2025KBL0001"
      8   },
      9   "scan_version": 3,
     10   "active_modules": ["experimental_rigor", "data_leakage"],
     11   "methodology_tags": ["benchmark-eval", "case-study"],
     12   "key_findings": "Semantic compression of code and test-execution artifacts achieves an average 16% compression ratio (comparable to natural-language compression). Supplying the two most recent compressed memory items to GPT-4o during iterative test generation improves maximum code coverage across all eight tested {fmt} library modules compared to a no-memory baseline. However, compilation success rates remain very low (24% average with the two most recent memory items, 6% with the entire history), and the evaluation is limited to a single C++ library.",
     13   "checklist": {
     14     "artifacts": {
     15       "code_released": {
     16         "applies": true,
     17         "answer": false,
     18         "justification": "No repository URL, code archive, or download link is provided anywhere in the paper. The Python tool described in Section 3 is not released."
     19       },
     20       "data_released": {
     21         "applies": true,
     22         "answer": false,
     23         "justification": "None of the generated tests, compressed data, coverage measurements, or other experimental outputs are released. The {fmt} library used as input is public, but no experimental artifacts are shared."
     24       },
     25       "environment_specified": {
     26         "applies": true,
     27         "answer": false,
     28         "justification": "The paper mentions the tool is 'written in Python' targeting C++ with gcov/lcov for coverage, but provides no Python version, library versions, requirements file, or environment setup details."
     29       },
     30       "reproduction_instructions": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "No step-by-step instructions, README, or reproduction guide is provided. A reader would have to reconstruct the entire pipeline from the prose description."
     34       }
     35     },
     36     "statistical_methodology": {
     37       "confidence_intervals_or_error_bars": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "Figure 3 shows maximum coverage per module as bar charts with no error bars. Table 2 shows averages with no confidence intervals or uncertainty measures. Only point estimates are reported throughout."
     41       },
     42       "significance_tests": {
     43         "applies": true,
     44         "answer": true,
     45         "justification": "A Wilcoxon signed-rank test is reported for the compilation success rate comparison between two-most-recent memories and entire history (p=0.015625, α=0.05). However, no significance test is applied to the main coverage claims."
     46       },
     47       "effect_sizes_reported": {
     48         "applies": true,
     49         "answer": false,
     50         "justification": "Raw percentages are reported (compression ratios, coverage values, compilation success rates) but no formal effect sizes such as Cohen's d or relative improvement with baseline context. The coverage comparison is shown only as a bar chart with no numerical effect quantification."
     51       },
     52       "sample_size_justified": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "20 test-generation sets per module (160 total) are used with no justification for why 20 was chosen. No power analysis or discussion of whether this sample size is adequate for the claims."
     56       },
     57       "variance_reported": {
     58         "applies": true,
     59         "answer": false,
     60         "justification": "Only maximum coverage (Figure 3) and averages (Table 2) are reported across the 20 runs per module. No standard deviations, IQR, or any spread measure is provided for the coverage or compression results."
     61       }
     62     },
     63     "evaluation_design": {
     64       "baselines_included": {
     65         "applies": true,
     66         "answer": true,
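     67         "justification": "The paper compares test generation without memory retention (baseline) vs. with memory retention (proposed method), as shown in Figure 3. Table 1 covers a separate comparison: compilation success with the two most recent memories vs. the entire history."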
     68       },
     69       "baselines_contemporary": {
     70         "applies": true,
     71         "answer": false,
     72         "justification": "The only baseline is the authors' own no-memory version. No comparison is made against any existing test generation tools, methods, or prior work (e.g., the multi-agent approach of Garlapati et al. [7] or the prompt-chain approach of Yin et al. [8] discussed in Section 5)."
     73       },
     74       "ablation_study": {
     75         "applies": true,
     76         "answer": false,
     77         "justification": "The system has multiple components (semantic compression, memory selection strategy, test generation prompts) but no systematic ablation removing individual components. The comparison of two-recent vs. all memories (Table 1) tests one design choice but is not a component ablation."
     78       },
     79       "multiple_metrics": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "Three metrics are reported: code coverage (Figure 3), compression ratio (Table 2), and compilation success rate (Table 1)."
     83       },
     84       "human_evaluation": {
     85         "applies": true,
     86         "answer": false,
     87         "justification": "Evaluation is entirely automated via gcov/lcov coverage measurement. No human review of generated test quality, readability, or correctness is performed."
     88       },
     89       "held_out_test_set": {
     90         "applies": true,
     91         "answer": false,
     92         "justification": "All eight {fmt} modules are used for both development/tuning (e.g., deciding to use two most recent memories) and final evaluation. No modules are held out for independent testing."
     93       },
     94       "per_category_breakdown": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "Results are broken down per module in Table 1 (compilation success), Figure 3 (coverage), and Table 2 (compression ratios) for all eight {fmt} modules."
     98       },
     99       "failure_cases_discussed": {
    100         "applies": true,
    101         "answer": false,
    102         "justification": "Despite very low compilation success rates (4% for 'compile' module, 24% average), the paper does not analyze why compilations fail, what errors occur, or where the approach breaks down."
    103       },
    104       "negative_results_reported": {
    105         "applies": true,
    106         "answer": false,
    107         "justification": "All results are presented positively. The extremely low compilation success rates (average 24%) are not discussed as a significant limitation or negative finding. No failed approaches or configurations are reported."
    108       }
    109     },
    110     "claims_and_evidence": {
    111       "abstract_claims_supported": {
    112         "applies": true,
    113         "answer": false,
    114         "justification": "The abstract claims 'higher consistency and coverage.' Coverage improvement is shown in Figure 3, but 'consistency' is never measured with any metric — it is asserted without evidence. The abstract also omits the very low compilation success rates."
    115       },
    116       "causal_claims_justified": {
    117         "applies": true,
    118         "answer": false,
    119         "justification": "The paper claims semantic compression 'improves coverage' (causal language). However, the comparison is confounded: 10 independent baseline generations are compared against 20 iterative memory-retention generations. The improvement could be due to iterative refinement or simply more attempts rather than the semantic compression itself. No controlled experiment isolates the effect of compression."
    120       },
    121       "generalization_bounded": {
    122         "applies": true,
    123         "answer": false,
    124         "justification": "The paper tests only on one C++ library ({fmt}) with one model (GPT-4o), yet the title refers broadly to 'LLM Test Generation.' Section 5 explicitly states 'we believe the findings generalize to other models' without evidence. No scope boundaries are stated."
    125       },
    126       "alternative_explanations_discussed": {
    127         "applies": true,
    128         "answer": false,
    129         "justification": "The Threats to Validity section (Section 6) discusses stochastic compression fidelity and coverage counting methodology, but does not consider alternative explanations for the main results: that improvement could stem from iterative refinement, additional generation attempts, or GPT-4o's prior knowledge of {fmt}."
    130       },
    131       "proxy_outcome_distinction": {
    132         "applies": true,
    133         "answer": false,
    134         "justification": "Line coverage (gcov/lcov) is used as the sole proxy for test quality/effectiveness, but the paper does not acknowledge that coverage is a limited proxy — it does not capture assertion quality, mutation score, fault detection, or test maintainability."
    135       }
    136     },
    137     "setup_transparency": {
    138       "model_versions_specified": {
    139         "applies": true,
    140         "answer": false,
    141         "justification": "The paper states 'gpt-4o' (Section 3) without any version identifier, snapshot date, or API version. Per schema rules, marketing names without a snapshot date do not count as specified versions."
    142       },
    143       "prompts_provided": {
    144         "applies": true,
    145         "answer": true,
    146         "justification": "Both the semantic compression prompt and the test-generation prompt are provided in full in Section 3, including the actual text with placeholders clearly marked for variable inputs."
    147       },
    148       "hyperparameters_reported": {
    149         "applies": true,
    150         "answer": false,
    151         "justification": "No LLM hyperparameters (temperature, top-p, max tokens, etc.) are reported for either the compression or generation calls to GPT-4o."
    152       },
    153       "scaffolding_described": {
    154         "applies": true,
    155         "answer": true,
    156         "justification": "Section 3 describes the pipeline: Python tool generates C++ tests, executes them, compresses results via LLM, stores in database, and references compressed data in subsequent generations. The memory retention mechanism and data flow are explained."
    157       },
    158       "data_preprocessing_documented": {
    159         "applies": true,
    160         "answer": false,
    161         "justification": "The paper mentions that 'the necessary information had been extracted with an LLM' before compression (Table 2 note) but does not document this extraction step. Coverage counting modifications (removing blank lines, commented code, preprocessor lines) are described in Section 6 but framed as a validity threat rather than documented preprocessing."
    162       }
    163     },
    164     "limitations_and_scope": {
    165       "limitations_section_present": {
    166         "applies": true,
    167         "answer": true,
    168         "justification": "Section 6 'Threats to Validity' is a dedicated section discussing internal and external validity concerns."
    169       },
    170       "threats_to_validity_specific": {
    171         "applies": true,
    172         "answer": true,
    173         "justification": "Section 6 discusses specific threats: (1) stochastic LLM compression making exact reproduction impossible (internal), and (2) the specific way gcov/lcov counts coverage lines after removing blanks, comments, and preprocessor-discarded lines (external). These are specific to this study."
    174       },
    175       "scope_boundaries_stated": {
    176         "applies": true,
    177         "answer": false,
    178         "justification": "The paper does not explicitly state what the results do NOT show. Instead, Section 5 claims generalizability to other models ('we believe the findings generalize to other models') and Section 7 claims applicability to 'other valuable artifacts.' No explicit scope boundaries are set."
    179       }
    180     },
    181     "data_integrity": {
    182       "raw_data_available": {
    183         "applies": true,
    184         "answer": false,
    185         "justification": "No raw data (generated tests, coverage logs, compression outputs, execution results) is made available for independent verification."
    186       },
    187       "data_collection_described": {
    188         "applies": true,
    189         "answer": true,
    190         "justification": "Section 4 describes: using {fmt} library (with URL and access date 2025-08-11), API Reference as specification, 8 modules, 20 independent sets per module, gcov/lcov for coverage. The experimental procedure is outlined."
    191       },
    192       "recruitment_methods_described": {
    193         "applies": false,
    194         "answer": false,
    195         "justification": "No human participants. The data source is the {fmt} open-source library."
    196       },
    197       "data_pipeline_documented": {
    198         "applies": true,
    199         "answer": false,
    200         "justification": "The pipeline from generation to final results is incompletely documented. It is unclear how the 160 generations map to the reported maximum coverage values, how failed compilations (76% of attempts) are handled, or how the 'average character statistics' in Table 2 are computed across the 20 runs."
    201       }
    202     },
    203     "conflicts_of_interest": {
    204       "funding_disclosed": {
    205         "applies": true,
    206         "answer": true,
    207         "justification": "Acknowledgment section states: 'This work was supported by JSPS KAKENHI Grant Numbers 24K15214.'"
    208       },
    209       "affiliations_disclosed": {
    210         "applies": true,
    211         "answer": true,
    212         "justification": "Both authors are affiliated with Nippon Institute of Technology, Saitama, Japan, as stated in the paper header. No conflict with evaluated products."
    213       },
    214       "funder_independent_of_outcome": {
    215         "applies": true,
    216         "answer": true,
    217         "justification": "JSPS KAKENHI is a Japanese government research funding program with no financial stake in the research outcomes."
    218       },
    219       "financial_interests_declared": {
    220         "applies": true,
    221         "answer": false,
    222         "justification": "No competing interests or financial interests statement is included in the paper."
    223       }
    224     },
    225     "contamination": {
    226       "training_cutoff_stated": {
    227         "applies": true,
    228         "answer": false,
    229         "justification": "The paper uses GPT-4o but does not state the model's training data cutoff date, making it impossible to assess whether {fmt} library code or tests were in the training data."
    230       },
    231       "train_test_overlap_discussed": {
    232         "applies": true,
    233         "answer": false,
    234         "justification": "{fmt} is a popular, widely-used C++ library on GitHub. GPT-4o has very likely seen {fmt} source code and possibly its existing test suite during training. This potential overlap is not discussed."
    235       },
    236       "benchmark_contamination_addressed": {
    237         "applies": true,
    238         "answer": false,
    239         "justification": "The {fmt} library and its extensive test suite have been publicly available on GitHub since well before GPT-4o's training. The model may already know effective test patterns for this specific library. This contamination risk is not addressed."
    240       }
    241     },
    242     "human_studies": {
    243       "pre_registered": {
    244         "applies": false,
    245         "answer": false,
    246         "justification": "No human participants in this study."
    247       },
    248       "irb_or_ethics_approval": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "No human participants in this study."
    252       },
    253       "demographics_reported": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants in this study."
    257       },
    258       "inclusion_exclusion_criteria": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants in this study."
    262       },
    263       "randomization_described": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants in this study."
    267       },
    268       "blinding_described": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants in this study."
    272       },
    273       "attrition_reported": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human participants in this study."
    277       }
    278     },
    279     "cost_and_practicality": {
    280       "inference_cost_reported": {
    281         "applies": true,
    282         "answer": false,
    283         "justification": "The method calls GPT-4o for both compression and generation across 160+ experiment runs, but no API costs, token counts, or latency figures are reported."
    284       },
    285       "compute_budget_stated": {
    286         "applies": true,
    287         "answer": false,
    288         "justification": "No total computational budget, API spend, or hardware details are stated despite extensive API usage."
    289       }
    290     },
    291     "experimental_rigor": {
    292       "seed_sensitivity_reported": {
    293         "applies": true,
    294         "answer": false,
    295         "justification": "20 independent runs per module are performed, but no variance, standard deviation, or sensitivity analysis across runs is reported. Only maximum coverage is shown."
    296       },
    297       "number_of_runs_stated": {
    298         "applies": true,
    299         "answer": true,
    300         "justification": "Section 4 states: 'For each of eight modules we run 20 independent test-generation sets, maintaining memory within each set, resulting in 160 test generations in total.' Ten baseline generations are also stated."
    301       },
    302       "hyperparameter_search_budget": {
    303         "applies": true,
    304         "answer": false,
    305         "justification": "The choice of two most recent memories (vs. all) was made based on Table 1, but no systematic hyperparameter search is described or budgeted. Prompt wording and memory count were not systematically tuned."
    306       },
    307       "best_config_selection_justified": {
    308         "applies": true,
    309         "answer": false,
    310         "justification": "The choice to use two most recent memories was justified by Table 1's compilation success comparison, but this comparison was done on the same data used for final evaluation — no held-out validation."
    311       },
    312       "multiple_comparison_correction": {
    313         "applies": true,
    314         "answer": false,
    315         "justification": "Coverage comparisons are made across 8 modules with no statistical testing at all. The single Wilcoxon test for compilation rate does not address the multiple module-level coverage comparisons."
    316       },
    317       "self_comparison_bias_addressed": {
    318         "applies": true,
    319         "answer": false,
    320         "justification": "The authors evaluate their own system against their own baseline with no acknowledgment of self-evaluation bias. No independent evaluation is conducted."
    321       },
    322       "compute_budget_vs_performance": {
    323         "applies": true,
    324         "answer": false,
    325         "justification": "The memory-retention method requires additional GPT-4o API calls for compression on top of generation calls, using substantially more compute than the baseline. This compute difference is not discussed or controlled for."
    326       },
    327       "benchmark_construct_validity": {
    328         "applies": true,
    329         "answer": false,
    330         "justification": "Line coverage via gcov/lcov is the only test-quality metric (compression ratio and compilation success rate are also reported but do not measure test quality), and the paper does not discuss whether line coverage is a valid measure of test quality or effectiveness. No comparison is made with mutation testing, fault detection, or other test quality metrics."
    331       },
    332       "scaffold_confound_addressed": {
    333         "applies": false,
    334         "answer": false,
    335         "justification": "The memory retention pipeline IS the intervention being tested, not a confound. No model comparisons are made across different scaffolds."
    336       }
    337     },
    338     "data_leakage": {
    339       "temporal_leakage_addressed": {
    340         "applies": true,
    341         "answer": false,
    342         "justification": "The {fmt} library and its test suite existed long before GPT-4o's training cutoff. GPT-4o may have learned test patterns for this specific library. This temporal leakage is not discussed."
    343       },
    344       "feature_leakage_addressed": {
    345         "applies": true,
    346         "answer": false,
    347         "justification": "The source code and specification are provided as intended inputs, but GPT-4o's pre-existing knowledge of {fmt} could provide additional information beyond what the prompts supply. This is not discussed."
    348       },
    349       "non_independence_addressed": {
    350         "applies": true,
    351         "answer": false,
    352         "justification": "All eight modules come from the same library ({fmt}), share coding conventions, and may have structural similarities. Non-independence of test subjects is not discussed."
    353       },
    354       "leakage_detection_method": {
    355         "applies": true,
    356         "answer": false,
    357         "justification": "No leakage detection method (canary strings, membership inference, n-gram overlap with known {fmt} tests) is applied."
    358       }
    359     }
    360   },
    361   "claims": [
    362     {
    363       "claim": "Semantic compression achieves an average compression ratio of approximately 16% for code, logs, and test information, comparable to natural-language compression (15-20%).",
    364       "evidence": "Table 2 shows per-module compression ratios ranging from 14.41% (args) to 18.86% (compile), with an overall average of 16.39%. Reference [5] is cited for the 15-20% natural language baseline.",
    365       "supported": "moderate"
    366     },
    367     {
    368       "claim": "Memory retention with semantically compressed information improves coverage across all modules compared to generation without memory.",
    369       "evidence": "Figure 3 shows maximum coverage bars for 'with memory' exceeding 'without memory' for all eight {fmt} modules. However, no statistical test is performed on coverage differences, no variance is shown, and the comparison is confounded by different numbers of generation attempts (10 baseline vs. 20 memory).",
    370       "supported": "weak"
    371     },
    372     {
    373       "claim": "Using the two most recent memory items significantly improves compilation success rate compared to using entire history (p=0.015625).",
    374       "evidence": "Table 1 shows per-module compilation success rates: 24% average for two most recent vs. 6% for entire history. A Wilcoxon signed-rank test yields p=0.015625 (α=0.05).",
    375       "supported": "moderate"
    376     },
    377     {
    378       "claim": "The findings generalize to other LLM models beyond GPT-4o.",
    379       "evidence": "Section 5 states 'Although our experiments employed GPT–4o, we believe the findings generalize to other models' but provides no evidence for this claim.",
    380       "supported": "unsupported"
    381     }
    382   ],
    383   "red_flags": [
    384     {
    385       "flag": "Confounded baseline comparison",
    386       "detail": "10 independent baseline generations are compared against 20 iterative memory-retention generations. The improvement could be due to having more attempts or iterative refinement rather than semantic compression. The comparison does not isolate the effect of compression."
    387     },
    388     {
    389       "flag": "Very low compilation success rates",
    390       "detail": "Only 24% of generated tests compile successfully with the proposed method (6% with full history). This means 76% of outputs are unusable, yet this is not discussed as a major limitation. The 'compile' module achieves only 4% compilation success."
    391     },
    392     {
    393       "flag": "Single-library evaluation",
    394       "detail": "All experiments use only the {fmt} C++ library. Results may not generalize to other libraries, languages, or codebases, yet the paper claims broad applicability and asserts in Section 5, without evidence, that the findings generalize to other models."
    395     },
    396     {
    397       "flag": "Contamination risk from popular library",
    398       "detail": "{fmt} is a widely-used, popular C++ library on GitHub with an extensive existing test suite. GPT-4o has very likely seen both the library's source code and tests during training, potentially inflating coverage results."
    399     },
    400     {
    401       "flag": "No variance or uncertainty on main results",
    402       "detail": "Coverage results are reported as maximums across 20 runs with no error bars, standard deviations, or confidence intervals. The reader cannot assess result stability or whether differences are meaningful."
    403     },
    404     {
    405       "flag": "Missing cost reporting",
    406       "detail": "The method requires multiple GPT-4o API calls per generation (compression + generation) across 160+ experiments with no cost, token count, or latency data reported."
    407     }
    408   ],
    409   "cited_papers": [
    410     {
    411       "title": "Software testing with large language models: Survey, landscape, and vision",
    412       "authors": ["J. Wang", "Y. Huang", "C. Chen", "Z. Liu", "S. Wang", "Q. Wang"],
    413       "year": 2024,
    414       "relevance": "Comprehensive survey of LLM-based software testing techniques, directly relevant to the survey's scope on AI-assisted development."
    415     },
    416     {
    417       "title": "An empirical evaluation of using large language models for automated unit test generation",
    418       "authors": ["M. Schäfer", "S. Nadi", "A. Eghbali", "F. Tip"],
    419       "year": 2024,
    420       "relevance": "Empirical evaluation of LLM unit test generation, covering coverage and quality issues central to the survey's focus on AI code generation evaluation."
    421     },
    422     {
    423       "title": "Evaluating and improving ChatGPT for unit test generation",
    424       "authors": ["Z. Yuan", "M. Liu", "S. Ding", "K. Wang", "Y. Chen", "X. Peng", "Y. Lou"],
    425       "year": 2024,
    426       "relevance": "Evaluates ChatGPT for test generation with improvement strategies, directly relevant to LLM capability assessment in software engineering."
    427     },
    428     {
    429       "title": "Code-aware prompting: A study of coverage-guided test generation in regression setting using LLM",
    430       "authors": ["G. Ryan", "S. Jain", "M. Shang", "S. Wang", "X. Ma", "M.K. Ramanathan", "B. Ray"],
    431       "year": 2024,
    432       "relevance": "Studies coverage-guided LLM test generation with code-aware prompting, addressing the same consistency and coverage challenges as the current paper."
    433     },
    434     {
    435       "title": "Extending context window of large language models via semantic compression",
    436       "authors": ["W. Fei", "X. Niu", "P. Zhou", "L. Hou", "B. Bai", "L. Deng", "W. Han"],
    437       "year": 2024,
    438       "relevance": "Foundational work on semantic compression for LLMs that this paper builds upon, relevant to understanding LLM context management techniques."
    439     },
    440     {
    441       "title": "AI-powered multi-agent framework for automated unit test case generation: Enhancing software quality through LLMs",
    442       "authors": ["A. Garlapati", "M.N.V. Satya Sai Muni Parmesh", "Savitha", "J. S"],
    443       "year": 2024,
    444       "relevance": "Multi-agent framework for test generation achieving high coverage scores, relevant to the survey's scope on agentic AI approaches to software engineering."
    445     },
    446     {
    447       "title": "Leveraging pre-trained large language models (LLMs) for on-premises comprehensive automated test case generation: An empirical study",
    448       "authors": ["H. Yin", "H. Mohammed", "S. Boyapati"],
    449       "year": 2024,
    450       "relevance": "Empirical study of on-premises LLM test generation using prompt-chain and fine-tuning methods, relevant to practical LLM deployment for software testing."
    451     }
    452   ],
    453   "engagement_factors": {
    454     "practical_relevance": {
    455       "score": 1,
    456       "justification": "The idea of compressing prior context for iterative LLM test generation has practical appeal, but no tool is released and results are limited to one C++ library with low compilation rates."
    457     },
    458     "surprise_contrarian": {
    459       "score": 0,
    460       "justification": "Confirms the expected finding that providing more context to LLMs improves output quality; no conventional wisdom is challenged."
    461     },
    462     "fear_safety": {
    463       "score": 0,
    464       "justification": "No AI safety, security, or risk concerns raised."
    465     },
    466     "drama_conflict": {
    467       "score": 0,
    468       "justification": "No controversy or conflict with existing claims or institutions."
    469     },
    470     "demo_ability": {
    471       "score": 0,
    472       "justification": "No code, tool, or demo is released; the approach cannot be tried."
    473     },
    474     "brand_recognition": {
    475       "score": 1,
    476       "justification": "Uses GPT-4o (a recognized model name) but the authors and institution are not widely known in the AI community."
    477     }
    478   }
    479 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs