scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (31784B)
      1 {
      2   "paper": {
      3     "title": "A Retrieval-Augmented Generation Approach to Extracting Algorithmic Logic from Neural Networks",
      4     "authors": [
      5       "Waleed Khalid",
      6       "Dmitry Ignatov",
      7       "Radu Timofte"
      8     ],
      9     "year": 2025,
     10     "venue": "arXiv.org",
     11     "arxiv_id": "2512.04329",
     12     "doi": "10.48550/arXiv.2512.04329"
     13   },
     14   "scan_version": 3,
     15   "active_modules": ["experimental_rigor", "data_leakage"],
     16   "methodology_tags": ["benchmark-eval", "case-study"],
     17   "key_findings": "NN-RAG extracts dependency-closed, executable PyTorch modules from heterogeneous repositories via scope-aware dependency resolution and three-stage validation (AST parse, bytecode compilation, sandboxed execution). Applied to 19 repositories, the pipeline targeted 1,289 blocks and validated 941 (73.0% pass rate), with 82% qualifying as structurally unique. Multi-level deduplication shows NN-RAG supplies 72.46% of all unique architectures in the LEMUR dataset. The framework also surfaced a CIFAR-10 model achieving 92.81% accuracy, the best in LEMUR at time of writing.",
     18   "checklist": {
     19     "artifacts": {
     20       "code_released": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "GitHub repositories are provided: NN-RAG at https://github.com/ABrain-One/nn-rag (ref [10]) and NN-DUP at https://github.com/ABrain-One/nn-dup (ref [9]). Working URLs are cited in the references."
     24       },
     25       "data_released": {
     26         "applies": true,
     27         "answer": true,
     28         "justification": "The LEMUR dataset [19] is publicly available. The paper states 'We release the nn-dup configuration and logs alongside the code to make this tally reproducible and auditable.' The extracted blocks can be regenerated from the open-source tool against public repositories."
     29       },
     30       "environment_specified": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "The paper mentions dependencies (LibCST, SQLite, concurrent.futures) and CLI usage but provides no requirements.txt, Dockerfile, conda environment file, or dedicated environment setup section with library versions."
     34       },
     35       "reproduction_instructions": {
     36         "applies": true,
     37         "answer": false,
     38         "justification": "CLI examples are provided in Listing 1 (e.g., 'python3 -m ab.rag --block BertLayer') and a programmatic example in Listing 2, but there are no step-by-step instructions for reproducing the main experimental results (1,289 → 941 validated blocks, CIFAR-10 accuracy). The repository configuration JSON and indexing procedure are described but not as a runnable reproduction protocol."
     39       }
     40     },
     41     "statistical_methodology": {
     42       "confidence_intervals_or_error_bars": {
     43         "applies": true,
     44         "answer": true,
     45         "justification": "Section 6 states: 'Using a Wilson binomial interval at 95% confidence, the pass rate lies in 70.5%–75.4%.' This is a proper confidence interval for the validation pass rate."
     46       },
     47       "significance_tests": {
     48         "applies": true,
     49         "answer": false,
     50         "justification": "No statistical significance tests are reported. The CIFAR-10 model comparison (Fig 7) and extraction success comparisons are presented as raw numbers without any significance testing."
     51       },
     52       "effect_sizes_reported": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "Percentage values are reported (73.0% pass rate, 72.46% of unique set from NN-RAG, 92.81% CIFAR-10 accuracy) but no formal effect sizes (Cohen's d, relative improvement with context) are provided for key comparisons."
     56       },
     57       "sample_size_justified": {
     58         "applies": true,
     59         "answer": false,
     60         "justification": "The choice of 19 repositories and 1,289 blocks is not justified with power analysis or sample size reasoning. The paper says repositories were 'carefully curated' but does not explain why 19 is sufficient or how the configuration was determined."
     61       },
     62       "variance_reported": {
     63         "applies": true,
     64         "answer": false,
     65         "justification": "No standard deviations, variance, or spread measures are reported for any results. Extraction time is reported as '~2.5 s per block' (a point estimate). CIFAR-10 accuracy is a single number (92.81%) with no variance across runs."
     66       }
     67     },
     68     "evaluation_design": {
     69       "baselines_included": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "The CIFAR-10 evaluation compares against the top 10 models in the LEMUR dataset (Fig 7). The deduplication analysis compares NN-RAG extractions against pre-existing LEMUR content (Table 4: 72.46% NN-RAG vs 27.54% other)."
     73       },
     74       "baselines_contemporary": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "The LEMUR dataset (ref [19], 2025) is contemporary. The comparison is against other models in the same current dataset snapshot."
     78       },
     79       "ablation_study": {
     80         "applies": true,
     81         "answer": false,
     82         "justification": "The paper describes performing ablations in Section 6: 'we ablate each factor in isolation (remove SE; replace anti-aliased downsampling with stride-2 convolutions; disable stochastic depth; and swap the augmentation recipe)' but does not present numerical ablation results anywhere in the paper text. The methodology is described without showing the actual accuracy deltas."
     83       },
     84       "multiple_metrics": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "Section 5 reports extraction success rate, validation pass rate, dependency resolution accuracy, code quality score, average extraction time, and cache hit rate. The deduplication analysis adds uniqueness metrics (Table 4). CIFAR-10 uses accuracy."
     88       },
     89       "human_evaluation": {
     90         "applies": true,
     91         "answer": true,
     92         "justification": "Section 5 states: 'Manual inspection of 50 random blocks showed: (i) formatting preserved in 98%, (ii) dependency completeness in 94%, and (iii) PEP 8 import organization.' This constitutes human evaluation of the system's outputs."
     93       },
     94       "held_out_test_set": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "The CIFAR-10 result is reported on the 'standard CIFAR-10 test split' (Fig 7 caption), which is a standard held-out test set."
     98       },
     99       "per_category_breakdown": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "Table 2 breaks down target blocks by category (attention ~180, convolutional ~220, transformer ~150, etc.). Fig 2 shows per-repository extraction counts. Table 4 provides deduplication statistics by type. Failure causes are categorized (C++/CUDA ops 25%, circular dependencies 20%, etc.)."
    103       },
    104       "failure_cases_discussed": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Section 5 categorizes the 348 failures: 'external C++/CUDA ops (~87; 25%), complex/circular dependencies (~70; 20%), dynamic metaprogramming (~52; 15%), and repo-specific utilities/configuration (~139; 40%).' Section 6 further discusses failure mechanisms related to dynamic imports."
    108       },
    109       "negative_results_reported": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "The 27% failure rate is prominently reported. Specific failure categories and their causes are documented. The paper acknowledges 'native operators that require toolchains and GPU runtimes are out of scope, and dynamic factories remain partially opaque to static closure.'"
    113       }
    114     },
    115     "claims_and_evidence": {
    116       "abstract_claims_supported": {
    117         "applies": true,
    118         "answer": true,
    119         "justification": "Abstract claims of 1,289 candidates, 941 validated (73.0%), 80%+ structurally unique, and ~72% of LEMUR unique architectures are all supported by Table 3, Table 4, and Section 5 results."
    120       },
    121       "causal_claims_justified": {
    122         "applies": true,
    123         "answer": false,
    124         "justification": "The paper claims 'recombining underused practices can yield a robust deep model' and that specific ingredients (SE attention, anti-aliased downsampling, stochastic depth, augmentation recipe) drive the CIFAR-10 improvement. Ablation methodology is described but actual numerical results are not presented, making these causal claims inadequately supported."
    125       },
    126       "generalization_bounded": {
    127         "applies": true,
    128         "answer": true,
    129         "justification": "Section 5 states: 'our evaluation therefore prioritizes vision blocks and tasks, using a vision-centric repository configuration while keeping all mechanisms of NN-RAG unchanged.' The scope is bounded to 19 PyTorch repositories and vision-centric evaluation. The abstract focuses on 'vision code.'"
    130       },
    131       "alternative_explanations_discussed": {
    132         "applies": true,
    133         "answer": false,
    134         "justification": "No substantive discussion of alternative explanations for the results. For example, the 72.46% share of unique architectures in LEMUR could be influenced by how the deduplication thresholds (τ=0.90, κ=0.95) were chosen, but this is deferred to a supplement. The CIFAR-10 result could reflect training recipe rather than architectural novelty, but ablation results are not shown."
    135       },
    136       "proxy_outcome_distinction": {
    137         "applies": true,
    138         "answer": false,
    139         "justification": "The paper measures executability (AST parse, compile, sandboxed run) as a proxy for 'reusability' but does not discuss the gap. A module that compiles and runs is not necessarily reusable in practice—it may require additional context, documentation, or API compatibility. The paper frames validated blocks as 'a vetted palette for ablations and compositional design' without acknowledging the proxy gap."
    140       }
    141     },
    142     "setup_transparency": {
    143       "model_versions_specified": {
    144         "applies": true,
    145         "answer": false,
    146         "justification": "Listing 2 references 'ab.gpt.TuneNNGen_8B' suggesting an 8B-parameter language model for synthesis, but the exact model, version, and provider are not specified. The main extraction pipeline does not use LLMs, but the synthesis integration lacks version details."
    147       },
    148       "prompts_provided": {
    149         "applies": false,
    150         "answer": false,
    151         "justification": "The core NN-RAG extraction pipeline is a static analysis and code generation system, not LLM-based prompting. The optional NN-GPT synthesis uses prompts but is not the main contribution evaluated."
    152       },
    153       "hyperparameters_reported": {
    154         "applies": true,
    155         "answer": false,
    156         "justification": "Deduplication thresholds are reported (τ=0.90, κ=0.95). Listing 2 shows lr=0.01, momentum=0.9 for the CIFAR-10 example. However, the full training recipe for the winning CIFAR-10 model (epochs, batch size, weight decay, augmentation parameters) is not comprehensively reported—only described qualitatively (RandAugment, mixup/CutMix)."
    157       },
    158       "scaffolding_described": {
    159         "applies": false,
    160         "answer": false,
    161         "justification": "No agentic scaffolding is used. NN-RAG is a deterministic code analysis and extraction pipeline, not an agent-based system."
    162       },
    163       "data_preprocessing_documented": {
    164         "applies": true,
    165         "answer": true,
    166         "justification": "The 7-phase extraction pipeline is documented in detail (Section 4.3, Figure 3): block discovery → repository cloning → LibCST parsing → symbol discovery → dependency resolution → code generation → validation. Deduplication steps are documented in Section 6 with counts at each stage (Table 4: exact 104,804, lexical 8,939, structural 320)."
    167       }
    168     },
    169     "limitations_and_scope": {
    170       "limitations_section_present": {
    171         "applies": true,
    172         "answer": true,
    173         "justification": "Section 6 contains a dedicated limitations paragraph: 'Limitations reflect deliberate sandbox constraints: native operators that require toolchains and GPU runtimes are out of scope, and dynamic factories remain partially opaque to static closure.' Section 7 expands on these limitations with planned mitigations."
    174       },
    175       "threats_to_validity_specific": {
    176         "applies": true,
    177         "answer": true,
    178         "justification": "Specific threats are identified: native C++/CUDA operators cannot be validated in the sandbox, dynamic import mechanisms (string-to-class registries, plugin loaders) are partially opaque to static analysis, and the repository selection is vision-centric. These are specific to this system rather than generic disclaimers."
    179       },
    180       "scope_boundaries_stated": {
    181         "applies": true,
    182         "answer": true,
    183         "justification": "The paper states scope boundaries: limited to PyTorch codebases, vision-centric repository selection, code-only clones (no weights), and sandbox constraints excluding native operators. Section 5 states the evaluation 'prioritizes vision blocks and tasks.'"
    184       }
    185     },
    186     "data_integrity": {
    187       "raw_data_available": {
    188         "applies": true,
    189         "answer": false,
    190         "justification": "The extracted block files are intentionally not redistributed due to licensing ('Our extractor never redistributes third-party source files'). While the tool is released to regenerate extractions, the actual raw extracted blocks from the evaluation are not directly available for independent verification."
    191       },
    192       "data_collection_described": {
    193         "applies": true,
    194         "answer": true,
    195         "justification": "The data collection process is well-documented: 19 specific repositories are listed (Table 1), the 7-phase extraction pipeline is described in detail (Section 4.3), and the automated block discovery criteria are specified (inherits nn.Module, implements forward(), non-abstract)."
    196       },
    197       "recruitment_methods_described": {
    198         "applies": true,
    199         "answer": false,
    200         "justification": "The 19 repositories are described as 'carefully curated' representing 'state of the art in computer vision, natural language processing, and graph neural networks,' but the selection criteria and curation process are not fully documented. No explanation for why these specific 19 were chosen over alternatives."
    201       },
    202       "data_pipeline_documented": {
    203         "applies": true,
    204         "answer": true,
    205         "justification": "The full pipeline is documented: 1,289 targets discovered → 1,289 extracted (100%) → 941 validated (73.0%) → 348 failures categorized by cause. The deduplication pipeline is also documented (Table 4): 10,483 → 1,064 after exact/lexical/structural dedup."
    206       }
    207     },
    208     "conflicts_of_interest": {
    209       "funding_disclosed": {
    210         "applies": true,
    211         "answer": false,
    212         "justification": "No funding acknowledgment section is present in the paper. The work is from the University of Würzburg but no grants or sponsors are mentioned."
    213       },
    214       "affiliations_disclosed": {
    215         "applies": true,
    216         "answer": true,
    217         "justification": "Author affiliations are listed: 'Computer Vision Lab, CAIDAS & IFI, University of Würzburg, Germany.' The authors evaluate their own systems (NN-RAG and NN-DUP), and their institutional affiliation is clearly stated."
    218       },
    219       "funder_independent_of_outcome": {
    220         "applies": true,
    221         "answer": false,
    222         "justification": "Funder independence cannot be assessed because the paper does not disclose any funding."
    223       },
    224       "financial_interests_declared": {
    225         "applies": true,
    226         "answer": false,
    227         "justification": "No competing interests or financial interests statement is present in the paper."
    228       }
    229     },
    230     "contamination": {
    231       "training_cutoff_stated": {
    232         "applies": false,
    233         "answer": false,
    234         "justification": "The paper evaluates a code extraction pipeline, not a pre-trained model's knowledge capabilities on a benchmark. The CIFAR-10 model is assembled from extracted architectural blocks and trained from scratch on CIFAR-10; no pre-trained LLM is being tested."
    235       },
    236       "train_test_overlap_discussed": {
    237         "applies": false,
    238         "answer": false,
    239         "justification": "The paper does not evaluate a pre-trained model's capability on any benchmark. NN-RAG is a deterministic code extraction system, and the CIFAR-10 model is trained from scratch."
    240       },
    241       "benchmark_contamination_addressed": {
    242         "applies": false,
    243         "answer": false,
    244         "justification": "Benchmark contamination in the LLM sense is not applicable. The system extracts code via static analysis, and the CIFAR-10 model is trained from scratch rather than tested for pre-existing knowledge."
    245       }
    246     },
    247     "human_studies": {
    248       "pre_registered": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "No human participants in this study. The research involves automated code extraction from public repositories."
    252       },
    253       "irb_or_ethics_approval": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants. The study analyzes public code repositories."
    257       },
    258       "demographics_reported": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants in this study."
    262       },
    263       "inclusion_exclusion_criteria": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants in this study."
    267       },
    268       "randomization_described": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants in this study."
    272       },
    273       "blinding_described": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human participants in this study."
    277       },
    278       "attrition_reported": {
    279         "applies": false,
    280         "answer": false,
    281         "justification": "No human participants in this study."
    282       }
    283     },
    284     "cost_and_practicality": {
    285       "inference_cost_reported": {
    286         "applies": true,
    287         "answer": true,
    288         "justification": "Section 5 reports: 'Average extraction time was ~2.5 s per block; cache hits on repeat runs reached ~95%.' Section 6 adds: 'Cold-start indexing of 19 repositories completes in ~5–10 minutes; subsequent runs under the missing policy complete in < 30 seconds.' Wall-clock time is reported for the pipeline."
    289       },
    290       "compute_budget_stated": {
    291         "applies": true,
    292         "answer": false,
    293         "justification": "No total computational budget is stated—no GPU hours for CIFAR-10 training, no hardware specifications, no total API spend. Only per-block extraction time and cold-start indexing time are mentioned."
    294       }
    295     },
    296     "experimental_rigor": {
    297       "seed_sensitivity_reported": {
    298         "applies": true,
    299         "answer": false,
    300         "justification": "No mention of multiple random seeds for any experiment. The CIFAR-10 accuracy (92.81%) appears to be from a single run with no seed sensitivity analysis."
    301       },
    302       "number_of_runs_stated": {
    303         "applies": true,
    304         "answer": false,
    305         "justification": "The number of experimental runs is not stated for the CIFAR-10 evaluation. The extraction pipeline is deterministic (no randomness), but the training evaluation does not state run count."
    306       },
    307       "hyperparameter_search_budget": {
    308         "applies": true,
    309         "answer": false,
    310         "justification": "Deduplication thresholds are stated as defaults (τ=0.90, κ=0.95) with sensitivity deferred to supplement. For the CIFAR-10 model training, no hyperparameter search budget is reported. No mention of how many configurations were tried."
    311       },
    312       "best_config_selection_justified": {
    313         "applies": true,
    314         "answer": false,
    315         "justification": "The 'winning' CIFAR-10 model was surfaced through the pipeline. The paper states this was 'unintentional,' but does not explain the selection process or whether this was the best of many NN-RAG configurations tested."
    316       },
    317       "multiple_comparison_correction": {
    318         "applies": false,
    319         "answer": false,
    320         "justification": "No multiple statistical comparisons are performed. The paper does not run multiple significance tests that would require correction."
    321       },
    322       "self_comparison_bias_addressed": {
    323         "applies": true,
    324         "answer": false,
    325         "justification": "The authors evaluate their own system (NN-RAG) and its contribution to LEMUR (which they also develop). This self-comparison bias is not acknowledged or addressed. The deduplication pipeline (NN-DUP) that determines NN-RAG's contribution share is also their own tool."
    326       },
    327       "compute_budget_vs_performance": {
    328         "applies": true,
    329         "answer": false,
    330         "justification": "No comparison of compute budget vs performance across methods. The CIFAR-10 comparison in Fig 7 ranks by accuracy without reporting compute cost for each model."
    331       },
    332       "benchmark_construct_validity": {
    333         "applies": true,
    334         "answer": false,
    335         "justification": "The paper does not discuss whether CIFAR-10 accuracy is a valid measure for the broader claim of 'algorithmic discovery.' LEMUR's construct validity as a benchmark for neural architecture quality is not examined."
    336       },
    337       "scaffold_confound_addressed": {
    338         "applies": false,
    339         "answer": false,
    340         "justification": "No agentic scaffolding is involved. NN-RAG is a deterministic extraction pipeline, and the CIFAR-10 evaluation is standard model training."
    341       }
    342     },
    343     "data_leakage": {
    344       "temporal_leakage_addressed": {
    345         "applies": true,
    346         "answer": false,
    347         "justification": "No discussion of temporal leakage. The extracted blocks come from repositories that may have been optimized for CIFAR-10 or similar benchmarks over time, but this potential source of bias is not discussed."
    348       },
    349       "feature_leakage_addressed": {
    350         "applies": true,
    351         "answer": false,
    352         "justification": "No discussion of whether the extraction process introduces information leakage. The pipeline selects modules from repositories that already contain optimized components for well-known benchmarks."
    353       },
    354       "non_independence_addressed": {
    355         "applies": true,
    356         "answer": false,
    357         "justification": "No discussion of non-independence between extracted modules. Many blocks come from the same repositories (e.g., transformers, timm) and share design patterns, but this structural non-independence is not addressed."
    358       },
    359       "leakage_detection_method": {
    360         "applies": true,
    361         "answer": false,
    362         "justification": "No leakage detection or prevention method is applied. The deduplication pipeline addresses duplicate architectures but not leakage of benchmark-specific information."
    363       }
    364     }
    365   },
    366   "claims": [
    367     {
    368       "claim": "NN-RAG extracted 1,289 candidate blocks and validated 941 as executable (73.0% pass rate) across 19 repositories.",
    369       "evidence": "Table 3 and Section 5 report 100% extraction coverage and 941/1,289 validated blocks. Wilson 95% CI: 70.5%–75.4%.",
    370       "supported": "strong"
    371     },
    372     {
    373       "claim": "Over 80% of extracted blocks are structurally unique (81.93% uniqueness rate).",
    374       "evidence": "Table 4 and Section 7 state 771 of 941 validated modules qualify as unique after three-level deduplication (exact, lexical MinHash/LSH, AST fingerprint).",
    375       "supported": "moderate"
    376     },
    377     {
    378       "claim": "NN-RAG supplies approximately 72% of all novel network structures in the LEMUR dataset.",
    379       "evidence": "Table 4 shows 771 NN-RAG extractions out of 1,064 unique records (72.46%), after deduplication from 10,483 total LEMUR records.",
    380       "supported": "moderate"
    381     },
    382     {
    383       "claim": "The best CIFAR-10 model in LEMUR was assembled using NN-RAG, achieving 92.81% accuracy.",
    384       "evidence": "Figure 7 shows the NN-RAG-assembled model (rag-6d58587b76d7e03be409f7e7289d4a58) at 92.81% accuracy on CIFAR-10 test split, topping the LEMUR leaderboard.",
    385       "supported": "weak"
    386     },
    387     {
    388       "claim": "NN-RAG enables cross-repository migration of architectural patterns, automatically identifying reusable modules in one project and regenerating them dependency-complete in another context.",
    389       "evidence": "Described in the abstract and methodology (Section 4) as a capability of the system, but no experiment directly demonstrates cross-repo migration with before/after evaluation.",
    390       "supported": "weak"
    391     },
    392     {
    393       "claim": "The winning model's performance stems from convergent underused ingredients: pre-activation residual backbone, SE attention, anti-aliased downsampling, stochastic depth, and modern augmentation recipe.",
    394       "evidence": "Section 6 describes ablation methodology (removing each factor in isolation) but does not present numerical ablation results in the paper.",
    395       "supported": "unsupported"
    396     }
    397   ],
    398   "red_flags": [
    399     {
    400       "flag": "Ablation described but results not shown",
    401       "detail": "Section 6 describes a systematic ablation (removing SE, replacing anti-aliased downsampling, disabling stochastic depth, swapping augmentation) but the actual numerical results of these ablations are never presented. The paper claims ablations 'pinpoint the primary driver(s)' without showing the data."
    402     },
    403     {
    404       "flag": "Self-evaluation of own system and dataset",
    405       "detail": "The authors evaluate their own tool (NN-RAG) using their own deduplication tool (NN-DUP) on a dataset they contribute to (LEMUR). There is no independent evaluation or external validation. The claim that NN-RAG supplies 72% of unique architectures depends entirely on their own deduplication pipeline and threshold choices."
    406     },
    407     {
    408       "flag": "Single-run benchmark result without variance",
    409       "detail": "The CIFAR-10 accuracy of 92.81% (claimed as best in LEMUR) is reported as a single number without error bars, multiple runs, or seed sensitivity analysis. This makes it impossible to assess whether the result is stable or a lucky run."
    410     },
    411     {
    412       "flag": "Sensitivity to deduplication thresholds not shown",
    413       "detail": "The key claim that NN-RAG contributes 72% of unique architectures depends on deduplication thresholds (τ=0.90, κ=0.95). Sensitivity analysis is deferred to a supplement ('See supplement for sensitivity to τ, κ') rather than presented in the main paper."
    414     },
    415     {
    416       "flag": "Approximate counts throughout",
    417       "detail": "Table 2 uses approximate counts (~180, ~220, ~150, etc.) rather than exact numbers for the distribution of target blocks. Several metrics use tilde notation (~2.5 s, ~95%, ~5–10 minutes) without reporting actual distributions."
    418     }
    419   ],
    420   "cited_papers": [
    421     {
    422       "title": "SWE-bench: Can Language Models Resolve Real-World GitHub Issues?",
    423       "authors": ["Carlos E. Jimenez"],
    424       "year": 2023,
    425       "arxiv_id": "2310.06770",
    426       "relevance": "Major benchmark for evaluating LLM-based software engineering capabilities on real GitHub issues."
    427     },
    428     {
    429       "title": "SWE-bench+: Enhanced Coding Benchmark for LLMs",
    430       "authors": ["Reem Aleithan"],
    431       "year": 2024,
    432       "arxiv_id": "2410.06992",
    433       "relevance": "Enhanced version of SWE-bench addressing limitations in the original benchmark evaluation methodology."
    434     },
    435     {
    436       "title": "SWE-agent: Agent-Computer Interfaces Enable Automated Software Engineering",
    437       "authors": ["John Yang"],
    438       "year": 2024,
    439       "relevance": "Agentic framework for automated software engineering with explicit interfaces for editing, testing, and navigating code."
    440     },
    441     {
    442       "title": "RepoCoder: Repository-Level Code Completion Through Iterative Retrieval and Generation",
    443       "authors": ["Fengji Zhang", "Bei Chen", "Yue Zhang", "Jacky Keung", "Jin Liu", "Daoguang Zan", "Yi Mao", "Jian-Guang Lou", "Weizhu Chen"],
    444       "year": 2023,
    445       "arxiv_id": "2303.12570",
    446       "relevance": "Integrates iterative retrieval with code LLMs for repository-level completion, closely related to RAG-based code systems."
    447     },
    448     {
    449       "title": "CodeR: Issue Resolving with Multi-Agent and Task Graphs",
    450       "authors": ["Dong Chen"],
    451       "year": 2024,
    452       "arxiv_id": "2406.01304",
    453       "relevance": "Multi-agent system for automated software engineering task resolution using task graphs."
    454     },
    455     {
    456       "title": "Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks",
    457       "authors": ["Patrick Lewis", "Ethan Perez", "Aleksandra Piktus", "Fabio Petroni", "Vladimir Karpukhin", "Naman Goyal", "Heinrich Küttler", "Mike Lewis", "Wen-tau Yih", "Tim Rocktäschel", "Sebastian Riedel", "Douwe Kiela"],
    458       "year": 2020,
    459       "relevance": "Foundational RAG paper establishing the retrieval-augmented generation paradigm used in NN-RAG."
    460     },
    461     {
    462       "title": "A Survey on Hallucination in Large Language Models: Principles, Taxonomy, Challenges, and Open Questions",
    463       "authors": ["Lei Huang", "Weijiang Yu", "Weitao Ma"],
    464       "year": 2023,
    465       "arxiv_id": "2311.05232",
    466       "relevance": "Survey on LLM hallucination relevant to understanding why retrieval-first approaches reduce generation risks."
    467     },
    468     {
    469       "title": "Do Users Write More Insecure Code with AI Assistants?",
    470       "authors": ["N. Perry"],
    471       "year": 2023,
    472       "relevance": "Studies security risks of AI coding assistance, motivating NN-RAG's validator-gated approach over unconstrained generation."
    473     },
    474     {
    475       "title": "Improving Reproducibility in Machine Learning Research",
    476       "authors": ["Joelle Pineau"],
    477       "year": 2021,
    478       "relevance": "Establishes community reproducibility guidelines that NN-RAG's design claims to follow."
    479     },
    480     {
    481       "title": "Deep Code Search",
    482       "authors": ["Xiaodong Gu", "Hongyu Zhang", "Sunghun Kim"],
    483       "year": 2018,
    484       "relevance": "Deep learning approach to code search using joint embeddings, a predecessor to retrieval-augmented code systems."
    485     },
    486     {
    487       "title": "Large-Scale Near-Deduplication Behind BigCode",
    488       "authors": ["BigCode"],
    489       "year": 2023,
    490       "relevance": "Documents deduplication methodology for large code datasets, relevant to understanding code uniqueness measurement."
    491     }
    492   ],
    493   "engagement_factors": {
    494     "practical_relevance": {
    495       "score": 2,
    496       "justification": "NN-RAG is a released tool that ML practitioners could use to discover and extract reusable PyTorch modules, though it targets a specialized workflow (neural architecture mining)."
    497     },
    498     "surprise_contrarian": {
    499       "score": 1,
    500       "justification": "The finding that retrieval-first approaches can surface a CIFAR-10 winner is mildly surprising, but the overall approach (code mining + validation) is conventional."
    501     },
    502     "fear_safety": {
    503       "score": 0,
    504       "justification": "No AI safety or security concerns raised; the paper focuses on code extraction and reuse."
    505     },
    506     "drama_conflict": {
    507       "score": 0,
    508       "justification": "No controversy or conflict with existing work; positioned as complementary to existing tools."
    509     },
    510     "demo_ability": {
    511       "score": 2,
    512       "justification": "Code is released on GitHub with CLI examples (python3 -m ab.rag --block ResNet), though not a one-click demo."
    513     },
    514     "brand_recognition": {
    515       "score": 0,
    516       "justification": "University of Würzburg Computer Vision Lab is not widely known outside the CV research community."
    517     }
    518   }
    519 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs