scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (31364B)
      1 {
      2   "paper": {
      3     "title": "Citation-Grounded Code Comprehension: Preventing LLM Hallucination Through Hybrid Retrieval and Graph-Augmented Context",
      4     "authors": ["Jahidul Arafat"],
      5     "year": 2025,
      6     "venue": "arXiv (submitted to ICSE)",
      7     "arxiv_id": "2512.12117"
      8   },
      9   "checklist": {
     10     "artifacts": {
     11       "code_released": {
     12         "applies": true,
     13         "answer": true,
     14         "justification": "A GitHub repository URL is provided: https://github.com/jahidul-arafat/code_comprehension_sdk_design_with_citation_verification (noted as 'anonymized for review'). The paper also states 'All code, evaluation data, and experiment configurations will be made publicly available under MIT license upon acceptance.' The URL is provided now, so this counts as YES."
     15       },
     16       "data_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "The evaluation uses 30 publicly available open-source Python repositories (Flask, Django, FastAPI, NumPy, etc.) with permissive licenses. The paper states all evaluation data will be made publicly available. The repositories are public projects that the author used without modification."
     20       },
     21       "environment_specified": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "Table 6 provides detailed version specifications: Python 3.10.12, FAISS 1.7.4, Neo4j CE 5.12.0, Transformers 4.35.0, PyTorch 2.1.0, rank-bm25 0.2.2, NumPy 1.24.3, SciPy 1.11.3. Section 5.2.1 states 'versions pinned in requirements.txt'. Hardware is specified (CloudLab c6420, dual Xeon Gold 6142, 376GB RAM)."
     25       },
     26       "reproduction_instructions": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "While the paper describes the system architecture and pipeline in detail (Section 5), and mentions a run_experiment.py orchestrator with YAML configurations, no explicit step-by-step reproduction instructions (e.g., README with commands to run) are described in the paper itself. The system is described at an architectural level but not with 'run this command' specificity."
     30       }
     31     },
     32     "statistical_methodology": {
     33       "confidence_intervals_or_error_bars": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "The paper reports point estimates (e.g., 92% citation accuracy, 78% dense-only, 74% sparse-only) without confidence intervals or error bars. Table 7 reports std dev for the Flask/Werkzeug case study (e.g., Citation Accuracy 100% ± 0%), but the main results across 180 questions lack uncertainty quantification."
     37       },
     38       "significance_tests": {
     39         "applies": true,
     40         "answer": true,
     41         "justification": "Statistical significance tests are reported: 'Paired t-test confirmed statistical significance with p < 0.001 and Cohen's d = 1.83' (Section 3.2). Table 3 reports p-values for all metrics. Table 4 reports p < 0.001 for graph expansion improvements. Correlation significance p < 0.01 for model compliance (Section 4.3)."
     42       },
     43       "effect_sizes_reported": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "Cohen's d = 1.83 is reported for the main hybrid vs. single-mode comparison (Section 3.2). Percentage point improvements with baseline context are reported throughout (e.g., '14-18 percentage point improvement' from 74-78% to 92%). Correlation coefficients (r = -0.72) are reported for model compliance vs. hallucination."
     47       },
     48       "sample_size_justified": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "No justification is provided for why 180 questions, 30 repositories, or 6 models were chosen. No power analysis is discussed. The 5-participant developer study (Section 5.4.2) has a very small sample with no justification for the size."
     52       },
     53       "variance_reported": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "The paper states 'We test each question-model combination once (no repeated trials) as temperature 0.2 produces near-deterministic outputs' (Section 5.2.2). No variance across runs is reported since experiments are single-run. Table 7 reports std dev for some metrics in the Flask case study, but these are across questions not across runs."
     57       }
     58     },
     59     "evaluation_design": {
     60       "baselines_included": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "Multiple baselines are compared: sparse-only (BM25), dense-only (BGE), hybrid without graph, and three external baselines (keyword search via GitHub, neural code search via Sourcegraph, ChatGPT without retrieval). Finding 14 in Section 5.4 presents comparative analysis."
     64       },
     65       "baselines_contemporary": {
     66         "applies": true,
     67         "answer": false,
     68         "justification": "The baselines are primarily internal ablations (sparse-only, dense-only). External baselines include ChatGPT (GPT-4 November 2023) and Sourcegraph, but no contemporary RAG-based code comprehension systems are compared. The paper acknowledges in Table 1 that no prior system combines hybrid retrieval with graph expansion and citation verification, but does not compare against recent RAG frameworks like RAGA or LangChain-based systems."
     69       },
     70       "ablation_study": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "The paper conducts systematic ablations: sparse-only vs. dense-only vs. hybrid (RQ1), text-only vs. graph-expanded (RQ2), three packing strategies (RQ5), and hyperparameter sweeps across fusion weights, candidate counts, context budgets, and graph parameters (RQ4). Section 4.1-4.5 present detailed ablation results."
     74       },
     75       "multiple_metrics": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Multiple metrics are reported: citation accuracy, precision, recall, F1-score, evidence diversity (unique files), hallucination rate, self-citation rate, retrieval latency, and end-to-end time. Tables 3, 4, 5, and 7 present these across different dimensions."
     79       },
     80       "human_evaluation": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Section 5.4.2 describes a developer evaluation with 5 participants performing 2-hour comprehension tasks. Manual review of 30% of responses (324 responses) for hallucination detection is also reported (Section 4.3). The system received 4.2/5 average usefulness rating."
     84       },
     85       "held_out_test_set": {
     86         "applies": true,
     87         "answer": false,
     88         "justification": "No separation between development and test sets is described. The paper does not mention holding out any questions or repositories for tuning versus final evaluation. Hyperparameters (alpha=0.45, k=28, gamma=0.25) appear to be tuned and evaluated on the same 180 questions and 30 repositories."
     89       },
     90       "per_category_breakdown": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Results are broken down by retrieval mode (Table 3), by model (Table 5), by query type (cross-file 62.3% vs. single-file 37.7%), by architectural pattern (exception handling 30%, utility delegation 40%, configuration coupling 29%), and by packing strategy (Section 4.5). The Flask/Werkzeug case study provides per-repository detail."
     94       },
     95       "failure_cases_discussed": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Failure cases are discussed extensively. Section 4.1 identifies three failure patterns for sparse retrieval and three for dense retrieval. Section 4.3 identifies four failure modes for model citation compliance. Section 5.4.2 identifies 4 common failure modes from developer evaluation. Section 5.5 discusses limitations."
     99       },
    100       "negative_results_reported": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "The paper reports that 8% of responses were flagged as hallucinations, sparse-only and dense-only both underperform, per-chunk 120 lines reduced accuracy to 90% (Section 4.4), and Mistral-7B achieved only 62% compliance with 19% hallucination. The 5-participant study reported limitations users found."
    104       }
    105     },
    106     "claims_and_evidence": {
    107       "abstract_claims_supported": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "The abstract claims 92% citation accuracy (supported by Table 3), zero hallucinations (supported by mechanical verification results), 14-18pp improvement over baselines (supported by Section 4.1), 62% cross-file evidence discovery (supported by Section 3.2 Finding 2), and 24pp citation completeness improvement from graph expansion (supported by Table 4). All claims are backed by results in the paper."
    111       },
    112       "causal_claims_justified": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "The paper makes causal claims through ablation studies: removing graph expansion reduces cross-file completeness by 24pp (Table 4), changing packing strategy changes diversity by 18% (Section 4.5). These are controlled single-variable manipulations. The correlation claim (r=-0.72 between compliance and hallucination) is appropriately stated as correlation, not causation."
    116       },
    117       "generalization_bounded": {
    118         "applies": true,
    119         "answer": false,
    120         "justification": "The title claims 'Code Comprehension' generally but results are only on Python repositories. The abstract does not bound this to Python. Section 5.5.1 acknowledges 'Current implementation targets Python codebases exclusively' but the title and abstract present findings as general code comprehension results. The paper says extending to other languages 'requires language-specific AST parsers' but does not bound claims in the abstract or title."
    121       },
    122       "alternative_explanations_discussed": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "The paper does not discuss alternative explanations for the observed improvements. For example, could the 92% accuracy be due to the specific question design favoring cross-file patterns? Could the 14-18pp improvement be attributed to the specific repositories chosen rather than hybrid retrieval superiority? The limitations section (5.5) discusses scope limitations but not alternative explanations for the results."
    126       }
    127     },
    128     "setup_transparency": {
    129       "model_versions_specified": {
    130         "applies": true,
    131         "answer": true,
    132         "justification": "Exact model filenames with quantization formats are specified in Section 5.2.1: 'llama-3-groq-8b-8192-tool-use-preview-Q4_K_M.gguf', 'codellama-13b-instruct.Q4_K_M.gguf', 'mistral-7b-instruct-v0.3.Q4_K_M.gguf', 'deepseek-coder-6.7b-instruct.Q4_K_M.gguf', 'Qwen2.5-Coder-7B-Instruct-Q4_K_M.gguf', 'Phi-3-mini-4k-instruct-Q4_K_M.gguf'. BGE-base-en-v1.5 is also specified."
    133       },
    134       "prompts_provided": {
    135         "applies": true,
    136         "answer": false,
    137         "justification": "The paper describes the system prompt in natural language: 'System prompt enforces citation format: \"You must cite code locations using [file:start-end] format...\"' (Section 5.2.2). However, the full prompt text used for LLM generation is not provided. The paper describes what the prompt does but does not provide the complete actual prompt used in experiments."
    138       },
    139       "hyperparameters_reported": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "Comprehensive hyperparameters are reported in Section 5.2.2: fusion weights alpha=0.45, beta=0.55; k=28 candidates; graph bonus gamma=0.25; decay delta=0.6; context budget 11,000-12,000 characters; per-chunk 100 lines. LLM parameters: temperature 0.2, top-p 0.9, top-k 40, max tokens 1024, repetition penalty 1.1."
    143       },
    144       "scaffolding_described": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "The pipeline is described in extensive detail: parallel BM25 and FAISS retrieval, hybrid fusion with normalization, graph expansion via Neo4j BFS, submodular context packing, LLM generation with citation format requirements, and mechanical citation verification. Section 3.1.3 and Section 5.1 describe the full system architecture with four core modules."
    148       },
    149       "data_preprocessing_documented": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Section 5.1.3 describes the four-stage preprocessing pipeline in detail: AST parsing for chunking, embedding generation with BGE, FAISS index construction, and graph loading from import statements. Section 3.1.1 describes repository selection criteria. Section 3.1.2 describes question generation methodology with template-based and issue-mining approaches."
    153       }
    154     },
    155     "limitations_and_scope": {
    156       "limitations_section_present": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "Section 5.5 'Limitations and Future Work' provides extensive discussion across six subsections: Language and Paradigm Coverage (5.5.1), Scalability Boundaries (5.5.2), Query Understanding Limitations (5.5.3), Citation Granularity Trade-offs (5.5.4), Evaluation Methodology (5.5.5), and Privacy and Security Considerations (5.5.6)."
    160       },
    161       "threats_to_validity_specific": {
    162         "applies": true,
    163         "answer": true,
    164         "justification": "Section 5.5 discusses specific threats: Python-only evaluation (5.5.1), FAISS degradation at 100K+ chunks (5.5.2), single-run experiments with temperature 0.2 (implied in 5.2.2), questions skewing toward architectural comprehension (5.5.5), repositories selected for popularity not representative sampling (5.5.5), and the 5-participant developer study's small size."
    165       },
    166       "scope_boundaries_stated": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "Section 5.5.1 explicitly states 'Current implementation targets Python codebases exclusively.' Section 5.5.5 states 'questions skew toward architectural comprehension over debugging or performance analysis, repositories selected for popularity and documentation quality rather than representative sampling.' Section 5.5.3 lists unsupported query types: temporal, counterfactual, and debugging queries."
    170       }
    171     },
    172     "data_integrity": {
    173       "raw_data_available": {
    174         "applies": true,
    175         "answer": false,
    176         "justification": "The paper states code and data will be 'made publicly available under MIT license upon acceptance' but this is conditional on acceptance. No raw data (per-question results, retrieval logs, per-response analysis) is currently available for independent verification."
    177       },
    178       "data_collection_described": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "Section 3.1.1 describes repository selection with four criteria (open-source, active development, moderate complexity, domain diversity). Section 3.1.2 describes question generation through template-based (80-120 questions) and issue mining (40-60 questions from 500+ candidates). Table 2 lists all 30 repositories with metadata."
    182       },
    183       "recruitment_methods_described": {
    184         "applies": true,
    185         "answer": false,
    186         "justification": "For the 5-participant developer evaluation (Section 5.4.2), the paper states '5 software developers (3 with Flask experience, 2 without)' but does not describe how they were recruited, from what population, or whether selection could introduce bias."
    187       },
    188       "data_pipeline_documented": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "The full data pipeline is documented: repository selection criteria (Section 3.1.1), question generation with counts (Section 3.1.2: 80-120 template-based questions plus 40-60 issue-mined questions drawn from 500+ candidates, yielding 180 final questions), preprocessing stages (Section 5.1.3), and evaluation protocol (Section 5.2.2). The question distribution is reported: 62.3% cross-file, 37.7% single-file, 45.2% API-specific, 54.8% conceptual."
    192       }
    193     },
    194     "conflicts_of_interest": {
    195       "funding_disclosed": {
    196         "applies": true,
    197         "answer": true,
    198         "justification": "The author's affiliation states 'Presidential and Woltosz Graduate Research Fellow, Department of Computer Science and Software Engineering, Auburn University.' The acknowledgments mention 'computational resources provided by Auburn University.' The fellowship funding is disclosed through the affiliation."
    199       },
    200       "affiliations_disclosed": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "The author's affiliation is clearly disclosed: 'Jahidul Arafat — Principal Investigator (PI); Presidential and Woltosz Graduate Research Fellow, Department of Computer Science and Software Engineering, Auburn University, Alabama, USA.' The paper evaluates open-source tools and local LLMs, not the author's employer's products."
    204       },
    205       "funder_independent_of_outcome": {
    206         "applies": true,
    207         "answer": true,
    208         "justification": "The funding comes from Auburn University through a graduate fellowship. The university has no financial stake in the outcome of the research on hybrid code retrieval systems. The evaluated tools are open-source (FAISS, Neo4j Community Edition, BGE, LM Studio)."
    209       },
    210       "financial_interests_declared": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No competing interests statement is found in the paper. While the research appears academic with no obvious commercial conflicts, the absence of a formal competing interests declaration means this criterion is not satisfied."
    214       }
    215     },
    216     "contamination": {
    217       "training_cutoff_stated": {
    218         "applies": true,
    219         "answer": false,
    220         "justification": "The paper evaluates 6 LLMs on code comprehension tasks but does not state the training data cutoff dates for any of the models used. For the ChatGPT baseline, it mentions 'November 2023' as the version date but not the training cutoff. The models could have seen the evaluated repositories during training."
    221       },
    222       "train_test_overlap_discussed": {
    223         "applies": true,
    224         "answer": false,
    225         "justification": "The paper does not discuss whether the 30 Python repositories (Flask, Django, NumPy, etc.) were in the training data of the evaluated models. These are extremely popular repositories almost certainly present in training data. The paper briefly mentions 'training data staleness' but does not address the overlap with evaluation data."
    226       },
    227       "benchmark_contamination_addressed": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "The 30 repositories used for evaluation (Flask, Django, PyTorch, etc.) are among the most popular Python projects and were certainly available online before any model's training cutoff. The paper does not address whether model familiarity with these codebases could inflate citation accuracy or comprehension quality metrics."
    231       }
    232     },
    233     "human_studies": {
    234       "pre_registered": {
    235         "applies": true,
    236         "answer": false,
    237         "justification": "The 5-participant developer evaluation (Section 5.4.2) is not pre-registered. No link to a pre-registration platform is provided."
    238       },
    239       "irb_or_ethics_approval": {
    240         "applies": true,
    241         "answer": false,
    242         "justification": "No IRB or ethics board approval is mentioned for the developer evaluation study involving 5 human participants (Section 5.4.2)."
    243       },
    244       "demographics_reported": {
    245         "applies": true,
    246         "answer": false,
    247         "justification": "The only demographic information provided is '5 software developers (3 with Flask experience, 2 without)' (Section 5.4.2). No experience levels, years of experience, gender, geographic distribution, or other demographics are reported."
    248       },
    249       "inclusion_exclusion_criteria": {
    250         "applies": true,
    251         "answer": false,
    252         "justification": "No inclusion or exclusion criteria are stated for the 5 developer participants. The paper does not describe who was eligible, how they were screened, or why 5 was chosen."
    253       },
    254       "randomization_described": {
    255         "applies": true,
    256         "answer": false,
    257         "justification": "The developer study uses a within-subjects design (participants first use their tools, then the system), but no randomization or counterbalancing of task order is described. All participants appear to follow the same sequence, introducing potential order effects."
    258       },
    259       "blinding_described": {
    260         "applies": true,
    261         "answer": false,
    262         "justification": "No blinding is described for the developer evaluation. Participants knew which tool they were using (their preferred tools vs. the proposed system). The within-subjects design with no blinding could introduce expectation bias."
    263       },
    264       "attrition_reported": {
    265         "applies": true,
    266         "answer": false,
    267         "justification": "No information is provided about whether all 5 participants completed all tasks. Attrition or dropout is not discussed."
    268       }
    269     },
    270     "cost_and_practicality": {
    271       "inference_cost_reported": {
    272         "applies": true,
    273         "answer": true,
    274         "justification": "Detailed latency is reported: end-to-end 16.5 seconds average (Section 3.2), per-stage breakdown (BM25 85ms, FAISS 120ms, graph 27ms, packing 45ms, verification 15ms, LLM generation 11.4s). Per-model generation times are reported in Table 5 (8.5-12.1 seconds). The Flask case study reports 10.1s average total latency (Table 7). No monetary API costs since local inference is used."
    275       },
    276       "compute_budget_stated": {
    277         "applies": true,
    278         "answer": true,
    279         "justification": "Section 5.2.2 states 'Total experimental budget: 180 questions × 6 models × 5 ablation conditions = 5,400 trials requiring approximately 180 machine-hours across 6 repositories.' Hardware specified as CloudLab c6420 machines with dual Intel Xeon Gold 6142 CPUs, 376GB RAM."
    280       }
    281     }
    282   },
    283   "claims": [
    284     {
    285       "claim": "Hybrid retrieval combining BM25 and BGE dense embeddings achieves 92% citation accuracy, outperforming single-mode approaches by 14-18 percentage points.",
    286       "evidence": "Section 3.2 Finding 1 and Table 3 report 92% hybrid vs. 78% dense-only and 74% sparse-only, with paired t-test p < 0.001 and Cohen's d = 1.83.",
    287       "supported": "strong"
    288     },
    289     {
    290       "claim": "62.3% of code comprehension questions require cross-file evidence spanning multiple modules.",
    291       "evidence": "Section 3.2 Finding 2 reports manual analysis of 180 questions showing 112/180 (62.3%) require multi-file citations, broken down into three patterns: exception handling (34), utility delegation (45), configuration coupling (33).",
    292       "supported": "moderate"
    293     },
    294     {
    295       "claim": "Graph expansion discovers an average of 11.8 cross-file neighbors per query, increasing cross-file citation completeness by 24 percentage points (from 58% to 82%).",
    296       "evidence": "Table 4 and Section 4.2 Finding 7 report these numbers with p < 0.001 significance. Analysis covers 112 cross-file questions.",
    297       "supported": "strong"
    298     },
    299     {
    300       "claim": "Citation verification prevents hallucination with zero false positives across 1,080 verified responses.",
    301       "evidence": "Section 3.2 Finding 3 reports 100% precision on hallucination detection. Manual inspection of 30% sample (324 responses) found 97% true positive rate (31/32 genuine hallucinations).",
    302       "supported": "moderate"
    303     },
    304     {
    305       "claim": "The system achieves 100% citation accuracy on Flask and Werkzeug with zero hallucinations.",
    306       "evidence": "Table 7 and Section 5.4 Finding 13 report 100% accuracy across 30 questions with 156 citations verified. However, this covers only a single repository pair (Flask/Werkzeug) out of the 30 repositories and may not generalize.",
    307       "supported": "moderate"
    308     },
    309     {
    310       "claim": "Developer evaluation shows citation-grounded approach reduces time-to-understanding by 60% compared to manual code search.",
    311       "evidence": "Section 5.4.2 Finding 15 reports reduction from 6.8 minutes to 2.7 minutes across 5 developers. Answer completeness improved from 65% to 95%.",
    312       "supported": "weak"
    313     },
    314     {
    315       "claim": "Model citation compliance varies from 88% (DeepSeek-Coder) to 62% (Mistral) with strong negative correlation (r = -0.72) between self-citation rate and hallucination rate.",
    316       "evidence": "Table 5 and Section 4.3 Finding 9 present per-model compliance and hallucination statistics across 1,080 responses (180 questions x 6 models), with p < 0.01 for the correlation.",
    317       "supported": "moderate"
    318     }
    319   ],
    320   "methodology_tags": ["benchmark-eval", "case-study"],
    321   "key_findings": "The paper develops a hybrid retrieval system combining BM25 sparse matching, BGE dense embeddings, and Neo4j graph expansion for citation-grounded code comprehension. Evaluated across 30 Python repositories with 180 queries and 6 LLMs, the system achieves 92% citation accuracy, outperforming single-mode baselines by 14-18 percentage points. Graph expansion via import relationships discovers cross-file evidence in 62% of architectural queries, improving citation completeness by 24 percentage points, while mechanical citation verification prevents hallucination with zero false positives across 1,080 verified responses.",
    322   "red_flags": [
    323     {
    324       "flag": "No train-test separation for hyperparameter tuning",
    325       "detail": "Hyperparameters (alpha=0.45, k=28, gamma=0.25) appear to be tuned on the same 180 questions used for final evaluation. No held-out test set is described. This could overfit parameters to the evaluation set, inflating reported accuracy."
    326     },
    327     {
    328       "flag": "Single-run experiments with no variance reporting",
    329       "detail": "Each question-model combination is tested once with temperature 0.2, justified as 'near-deterministic.' No repeated runs or variance across seeds are reported. Temperature 0.2 does not produce identical outputs, and result stability is unverified."
    330     },
    331     {
    332       "flag": "Very small user study (N=5) with no controls",
    333       "detail": "The developer evaluation (Section 5.4.2) uses only 5 participants, has no randomization or counterbalancing of tool order, no blinding, no IRB approval mentioned, and no demographic reporting. The within-subjects design with fixed ordering (manual first, then system) introduces learning and order effects that could inflate the 60% time improvement claim."
    334     },
    335     {
    336       "flag": "Contamination risk with popular repositories",
    337       "detail": "The 30 evaluated repositories (Flask, Django, NumPy, PyTorch, etc.) are among the most popular Python projects, almost certainly present in the training data of all evaluated models. The paper does not discuss how model familiarity with these codebases could affect results."
    338     },
    339     {
    340       "flag": "Generalization claims exceed evidence",
    341       "detail": "The title and abstract present findings as applicable to 'code comprehension' generally, but the system is Python-only, evaluated on popular well-documented repositories, with questions skewed toward architectural comprehension. The paper acknowledges this in limitations but the framing is broader than the evidence supports."
    342     },
    343     {
    344       "flag": "Solo author claiming PI status on own work",
    345       "detail": "The paper lists a single author (a graduate student) who designates themselves as 'Principal Investigator (PI).' This is unusual for a graduate student paper and no advisor or collaborators are listed as co-authors, though the acknowledgments thank 'our advisor.'"
    346     }
    347   ],
    348   "cited_papers": [
    349     {
    350       "title": "Evaluating Large Language Models Trained on Code",
    351       "authors": ["Mark Chen"],
    352       "year": 2021,
    353       "arxiv_id": "2107.03374",
    354       "relevance": "Foundational work on evaluating LLM code generation capabilities (Codex/HumanEval), directly relevant to code generation evaluation methodology."
    355     },
    356     {
    357       "title": "Grounded Copilot: How Programmers Interact with Code-Generating Models",
    358       "authors": ["Shraddha Barke"],
    359       "year": 2023,
    360       "relevance": "Empirical study of developer interaction with code-generating models, relevant to understanding AI coding assistant productivity."
    361     },
    362     {
    363       "title": "CodeBERT: A Pre-Trained Model for Programming and Natural Languages",
    364       "authors": ["Zhangyin Feng", "Daya Guo", "Duyu Tang"],
    365       "year": 2020,
    366       "arxiv_id": "2002.08155",
    367       "relevance": "Pre-trained code model used as baseline for code embeddings in retrieval, relevant to code understanding models."
    368     },
    369     {
    370       "title": "GraphCodeBERT: Pre-training Code Representations with Data Flow",
    371       "authors": ["Daya Guo"],
    372       "year": 2021,
    373       "relevance": "Code representation model incorporating data flow graphs, relevant to structural code understanding approaches."
    374     },
    375     {
    376       "title": "CodeRetriever: Retrieved Code is Better Than You Think",
    377       "authors": ["Yao Li"],
    378       "year": 2022,
    379       "relevance": "Hybrid keyword and embedding code retrieval system, directly comparable baseline for code search methodology."
    380     },
    381     {
    382       "title": "The Causes of and Solutions to Hallucinations in Language Models",
    383       "authors": ["Tianyi Zhang"],
    384       "year": 2023,
    385       "arxiv_id": "2311.09417",
    386       "relevance": "Survey of LLM hallucination including code-specific hallucination patterns, directly relevant to AI safety and reliability."
    387     },
    388     {
    389       "title": "SelfCheck: Detecting LLM Hallucinations Without External Knowledge",
    390       "authors": ["Potsawee Manakul"],
    391       "year": 2023,
    392       "relevance": "Self-consistency based hallucination detection method, comparable approach to the citation verification mechanism evaluated."
    393     },
    394     {
    395       "title": "Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks",
    396       "authors": ["Patrick Lewis"],
    397       "year": 2020,
    398       "relevance": "Foundational RAG paper that the citation-grounded approach extends, relevant to understanding RAG methodology."
    399     },
    400     {
    401       "title": "Code Llama: Open Foundation Models for Code",
    402       "authors": ["Baptiste Rozière"],
    403       "year": 2023,
    404       "arxiv_id": "2308.12950",
    405       "relevance": "Open-source code LLM evaluated in this study and widely used for code generation benchmarks."
    406     },
    407     {
    408       "title": "Mistral 7B",
    409       "authors": ["Albert Q Jiang"],
    410       "year": 2023,
    411       "arxiv_id": "2310.06825",
    412       "relevance": "General-purpose LLM evaluated in this study, relevant to understanding model capability variation across architectures."
    413     },
    414     {
    415       "title": "LLMs for Code Generation: Assessing Correctness and Hallucinations",
    416       "authors": ["Yu Huang"],
    417       "year": 2023,
    418       "relevance": "Study characterizing LLM hallucination specifically in code generation tasks, directly relevant to AI code reliability assessment."
    419     },
    420     {
    421       "title": "Expectations vs. Experience: Empirical Effects of Code Generation Assistants",
    422       "authors": ["Priyan Vaithilingam"],
    423       "year": 2022,
    424       "relevance": "Empirical study of developer experience with code generation tools, relevant to AI programming productivity evaluation."
    425     }
    426   ]
    427 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs