scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (28271B)
      1 {
      2   "paper": {
      3     "title": "Novel Preprocessing Technique for Data Embedding in Engineering Code Generation Using Large Language Model",
      4     "authors": [
      5       "Yu-Chen Lin",
      6       "Akhilesh Kumar",
      7       "Norman Chang",
      8       "Wenliang Zhang",
      9       "Muhammad Zakir",
     10       "Rucha Apte",
     11       "Haiyang He",
     12       "Chao Wang",
     13       "Jyh-Shing Roger Jang"
     14     ],
     15     "year": 2023,
     16     "venue": "2024 IEEE LLM Aided Design Workshop (LAD)",
     17     "arxiv_id": "2311.16267",
     18     "doi": "10.1109/LAD62341.2024.10691715"
     19   },
     20   "scan_version": 2,
     21   "active_modules": ["experimental_rigor", "data_leakage"],
     22   "methodology_tags": ["benchmark-eval", "case-study"],
     23   "key_findings": "The paper proposes four data preprocessing techniques for RAG-based domain-specific code generation: LLM-based Data Splitting, Data Renovation with CoDRC confidence scoring, Data Augmentation via script reconstruction, and the IKEC prompt technique. In a preliminary experiment on 5 RedHawk-SC MapReduce scripts, RAG+IKEC+ChatEDA achieves 73.33% 'Percentage of Correct Lines.' The paper explicitly states that the full experiment (10 planned comparison groups on 20 scripts) has not been completed, and overall method results are not yet available.",
     24   "checklist": {
     25     "artifacts": {
     26       "code_released": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No repository URL, code archive, or release of any kind is mentioned in the paper."
     30       },
     31       "data_released": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "The scripts and API documentation are proprietary to Ansys RedHawk-SC. No data is released."
     35       },
     36       "environment_specified": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "The paper mentions 'Azure OpenAI Studio' and 'GPT-4, temperature = 0.0, other parameters: default' but provides no requirements.txt, dependency list, or environment specification sufficient to recreate the setup."
     40       },
     41       "reproduction_instructions": {
     42         "applies": true,
     43         "answer": false,
     44         "justification": "No reproduction instructions are provided. The pipeline relies on proprietary scripts and internal Ansys tools that are not available to external researchers."
     45       }
     46     },
     47     "statistical_methodology": {
     48       "confidence_intervals_or_error_bars": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "The 73.33% 'Percentage of Correct Lines' is a bare point estimate with no confidence intervals or error bars."
     52       },
     53       "significance_tests": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "No statistical significance tests are performed. The IKEC comparison (86.21% vs 93.10%) and the 73.33% average are presented without any tests."
     57       },
     58       "effect_sizes_reported": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "The 73.33% is reported without a baseline comparison. The IKEC example (86.21% vs 93.10%) provides some context but is a single example, not a systematic effect size report."
     62       },
     63       "sample_size_justified": {
     64         "applies": true,
     65         "answer": false,
     66         "justification": "The preliminary experiment uses N=5 scripts with no justification for this sample size. The paper acknowledges 'Most significant improvements have been observed in small datasets (at least five questions)' in Section 7 but offers no power analysis or rationale."
     67       },
     68       "variance_reported": {
     69         "applies": true,
     70         "answer": false,
     71         "justification": "No variance, standard deviation, or spread measure is reported. Only the average across 5 samples (73.33%) is given."
     72       }
     73     },
     74     "evaluation_design": {
     75       "baselines_included": {
     76         "applies": true,
     77         "answer": false,
     78         "justification": "The preliminary experiment (Section 4.2.1) tests only RAG+IKEC+ChatEDA. Ten comparison groups including baselines (RAG alone, ReAct, etc.) are designed in Section 4.2.2 but explicitly stated as not yet completed."
     79       },
     80       "baselines_contemporary": {
     81         "applies": true,
     82         "answer": false,
     83         "justification": "Planned baselines include RAG and ReAct, which are contemporary, but since the baseline experiments have not been executed, no actual baseline comparison exists."
     84       },
     85       "ablation_study": {
     86         "applies": true,
     87         "answer": false,
     88         "justification": "The planned experiment (Section 4.2.2) is an ablation design with 10 groups systematically adding components. However, the paper states 'the full experiment has not yet been completed.' Type A tests show individual components qualitatively but are not ablation studies."
     89       },
     90       "multiple_metrics": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "Only 'Percentage of Correct Lines' is used. The planned Executability and Functionality (unit test pass rate) metrics in Section 4.2.2 have not been implemented."
     94       },
     95       "human_evaluation": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "For Data Augmentation, 20 generated scripts were 'manually evaluated by in-house RedHawk-SC experts' (Section 4.1.1). For the code generation examples, errors are manually marked by experts."
     99       },
    100       "held_out_test_set": {
    101         "applies": true,
    102         "answer": false,
    103         "justification": "The 5 test scripts are used as ground truth, but it is unclear whether they overlap with the RAG data source. No explicit separation of test data from retrieval data is described."
    104       },
    105       "per_category_breakdown": {
    106         "applies": true,
    107         "answer": false,
    108         "justification": "The planned experiment includes difficulty categories (easy/medium/hard) but this has not been executed. The preliminary result is a single average across 5 scripts."
    109       },
    110       "failure_cases_discussed": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Fig. 9 shows failure cases in red (missing return value, function name errors). Section 4.1.1 notes 'optimizations in code performance are still needed' for generated scripts."
    114       },
    115       "negative_results_reported": {
    116         "applies": true,
    117         "answer": true,
    118         "justification": "The RAG-only result (without IKEC) in Fig. 9 shows clear failures: omitted layer count code, missing return value, missing MapReduce result. The paper also acknowledges incomplete experiments and small datasets."
    119       }
    120     },
    121     "claims_and_evidence": {
    122       "abstract_claims_supported": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "The abstract claims the preprocessing techniques 'enhance the Retrieval-Augmented Generation method in retrieving more relevant information,' but the full experiment comparing with and without preprocessing has not been completed. The 73.33% result tests RAG+IKEC+ChatEDA only; the Data Splitter and Data Renovation stages of the proposed preprocessing pipeline are not part of that measurement."
    126       },
    127       "causal_claims_justified": {
    128         "applies": true,
    129         "answer": false,
    130         "justification": "The paper claims IKEC 'improves' performance and preprocessing techniques 'enhance' RAG. These are causal claims based on single examples (Fig. 9) and a 5-sample preliminary test without controlled comparisons or statistical evidence."
    131       },
    132       "generalization_bounded": {
    133         "applies": true,
    134         "answer": false,
    135         "justification": "While the abstract bounds to 'MapReduce applications,' the conclusion claims 'Domain-specific code generation presents a challenging yet promising arena' and 'Such a method promises rapid deployment across numerous fields,' generalizing far beyond the single tool (RedHawk-SC) tested."
    136       },
    137       "alternative_explanations_discussed": {
    138         "applies": true,
    139         "answer": false,
    140         "justification": "No alternative explanations for the results are discussed. For example, the improvements shown in Fig. 9 could be due to prompt length differences or specific example selection, but this is not considered."
    141       },
    142       "proxy_outcome_distinction": {
    143         "applies": true,
    144         "answer": false,
    145         "justification": "The paper measures 'Percentage of Correct Lines' but frames this as demonstrating 'effectiveness' of the code generation method. There is no discussion of whether a high percentage of correct lines equates to functional, correct code — the paper itself plans to add executability and unit test metrics, implicitly acknowledging the proxy gap without discussing it."
    146       }
    147     },
    148     "setup_transparency": {
    149       "model_versions_specified": {
    150         "applies": true,
    151         "answer": false,
    152         "justification": "The paper says 'GPT-4' and 'GPT-4-Turbo' without specifying versions (e.g., 'gpt-4-0613'). The Azure OpenAI Studio deployment is mentioned but no model snapshot date or API version is given."
    153       },
    154       "prompts_provided": {
    155         "applies": true,
    156         "answer": true,
    157         "justification": "Full prompt text is provided in Figures 2, 3, and 8 for all major components: Data Augmentation, Data Splitter, Data Renovation, Task Planner, Script Generator, and IKEC prompts."
    158       },
    159       "hyperparameters_reported": {
    160         "applies": true,
    161         "answer": true,
    162         "justification": "Temperature is reported as 0 (or 0.0) throughout, and the paper states 'other parameters: default' (Section 4.1.1, 4.1.2). The planned full experiment also fixes temperature at 0 with 'all other parameters set to their default values.'"
    163       },
    164       "scaffolding_described": {
    165         "applies": true,
    166         "answer": true,
    167         "justification": "The RAG pipeline with Data Splitter, Data Renovation, CoDRC, ATR Algorithm, Data Augmentation, Task Planner, and Script Generator is described in detail in Section 3 with workflow diagrams (Fig. 1 and Fig. 2)."
    168       },
    169       "data_preprocessing_documented": {
    170         "applies": true,
    171         "answer": true,
    172         "justification": "Data preprocessing is the main contribution. Sections 3.1-3.6 detail Data Augmentation, Data Splitter, Data Renovation, CoDRC, and ATR Algorithm with prompts, formulas, and examples."
    173       }
    174     },
    175     "limitations_and_scope": {
    176       "limitations_section_present": {
    177         "applies": true,
    178         "answer": true,
    179         "justification": "Section 7 ('Statement') acknowledges 'certain aspects, such as evaluation, have not yet been fully realized' and 'Most significant improvements have been observed in small datasets (at least five questions).' While not titled 'Limitations,' it provides substantive disclosure of the paper's incompleteness."
    180       },
    181       "threats_to_validity_specific": {
    182         "applies": true,
    183         "answer": true,
    184         "justification": "Section 7 identifies specific threats: evaluation not fully realized, results only from small datasets (N=5), and findings are preliminary. These are specific to this study's state of completion."
    185       },
    186       "scope_boundaries_stated": {
    187         "applies": true,
    188         "answer": false,
    189         "justification": "The paper does not explicitly state what the results do NOT show. While the domain is bounded to RedHawk-SC/MapReduce, the conclusion makes broad claims about 'rapid deployment across numerous fields' and 'algorithm design code generation' without bounding what has not been tested."
    190       }
    191     },
    192     "data_integrity": {
    193       "raw_data_available": {
    194         "applies": true,
    195         "answer": false,
    196         "justification": "No raw data is available. The scripts, API documentation, and evaluation results are all proprietary to Ansys."
    197       },
    198       "data_collection_described": {
    199         "applies": true,
    200         "answer": true,
    201         "justification": "The paper describes using 23 original RedHawk-SC scripts (from Jibin John per acknowledgments), API documentation, and manually generated queries for 5 test scripts. The source and nature of the data are described."
    202       },
    203       "recruitment_methods_described": {
    204         "applies": true,
    205         "answer": false,
    206         "justification": "The 23 scripts and 5 test subjects were selected from internal Ansys resources, but no systematic selection process is described. How and why these specific scripts were chosen is not explained."
    207       },
    208       "data_pipeline_documented": {
    209         "applies": true,
    210         "answer": false,
    211         "justification": "The preprocessing pipeline is well-documented, but the evaluation data pipeline is not. How 5 scripts were selected from the available pool, how queries were manually created, and how error lines were counted is only briefly described."
    212       }
    213     },
    214     "conflicts_of_interest": {
    215       "funding_disclosed": {
    216         "applies": true,
    217         "answer": true,
    218         "justification": "Section 6 acknowledges 'Ansys Inc. for providing resources' and the first author's footnote states 'This project was completed during the internship at Ansys.'"
    219       },
    220       "affiliations_disclosed": {
    221         "applies": true,
    222         "answer": true,
    223         "justification": "Author affiliations are clearly listed: National Taiwan University and Ansys, Inc. The connection between Ansys employees and the RedHawk-SC tool being evaluated is visible from the affiliations."
    224       },
    225       "funder_independent_of_outcome": {
    226         "applies": true,
    227         "answer": false,
    228         "justification": "Ansys provides resources and employs most authors. The paper demonstrates LLM-based code generation for Ansys's RedHawk-SC product, which directly benefits Ansys. The funder has a stake in positive results."
    229       },
    230       "financial_interests_declared": {
    231         "applies": true,
    232         "answer": false,
    233         "justification": "No competing interests statement or financial interest disclosure is included in the paper."
    234       }
    235     },
    236     "contamination": {
    237       "training_cutoff_stated": {
    238         "applies": true,
    239         "answer": false,
    240         "justification": "No training data cutoff date is stated for GPT-4. The paper uses GPT-4 without specifying when its training data ends."
    241       },
    242       "train_test_overlap_discussed": {
    243         "applies": true,
    244         "answer": true,
    245         "justification": "Section 1 states: 'any publicly available information would have already been learned by LLMs, and thus the datasets must be generated by the researchers themselves to avoid any potential biases.' The paper explicitly chose a proprietary domain (RedHawk-SC) to avoid training data overlap."
    246       },
    247       "benchmark_contamination_addressed": {
    248         "applies": true,
    249         "answer": true,
    250         "justification": "The paper argues the test data is in a domain 'where LLMs have not been trained' (Section 1), using proprietary scripts and custom-generated queries specifically to avoid contamination. While no formal verification is performed, the conceptual risk is addressed."
    251       }
    252     },
    253     "human_studies": {
    254       "pre_registered": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in the study. Expert reviewers evaluated outputs but were not study subjects."
    258       },
    259       "irb_or_ethics_approval": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in the study."
    263       },
    264       "demographics_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in the study."
    268       },
    269       "inclusion_exclusion_criteria": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants in the study."
    273       },
    274       "randomization_described": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants in the study."
    278       },
    279       "blinding_described": {
    280         "applies": false,
    281         "answer": false,
    282         "justification": "No human participants in the study."
    283       },
    284       "attrition_reported": {
    285         "applies": false,
    286         "answer": false,
    287         "justification": "No human participants in the study."
    288       }
    289     },
    290     "cost_and_practicality": {
    291       "inference_cost_reported": {
    292         "applies": true,
    293         "answer": false,
    294         "justification": "No API costs, tokens consumed, or latency figures are reported despite the method making multiple GPT-4 calls per code generation (Data Splitter, Renovation, CoDRC, Task Planner, Script Generator)."
    295       },
    296       "compute_budget_stated": {
    297         "applies": true,
    298         "answer": false,
    299         "justification": "No computational budget is stated. The total cost of running the multi-stage GPT-4 pipeline is not quantified."
    300       }
    301     },
    302     "experimental_rigor": {
    303       "seed_sensitivity_reported": {
    304         "applies": true,
    305         "answer": false,
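    306         "justification": "No multiple seeds or sensitivity analysis are reported. Temperature is set to 0 (nominally deterministic, though hosted GPT-4 endpoints do not guarantee identical outputs across calls), but there is no analysis of whether results are sensitive to this choice or to other stochastic factors."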
    307       },
    308       "number_of_runs_stated": {
    309         "applies": true,
    310         "answer": false,
    311         "justification": "The paper states 5 scripts were used as test subjects but does not state how many times each code generation was run. With temperature=0 this may be deterministic, but this is not explicitly addressed."
    312       },
    313       "hyperparameter_search_budget": {
    314         "applies": true,
    315         "answer": false,
    316         "justification": "No hyperparameter search is reported. Temperature=0 and default parameters are used without justification for why these settings were chosen or whether alternatives were explored."
    317       },
    318       "best_config_selection_justified": {
    319         "applies": true,
    320         "answer": false,
    321         "justification": "Fixed configuration (temperature=0, defaults) is used without justification. The planned experiment mentions adjusting 'chunk_size' and 'similarity_top_k' but this has not been executed."
    322       },
    323       "multiple_comparison_correction": {
    324         "applies": false,
    325         "answer": false,
    326         "justification": "No statistical tests are performed, so multiple comparison correction is not applicable."
    327       },
    328       "self_comparison_bias_addressed": {
    329         "applies": true,
    330         "answer": false,
    331         "justification": "The authors propose and evaluate their own method without acknowledging self-evaluation bias. The qualitative examples in Type A tests (Figs. 5-9) are author-selected."
    332       },
    333       "compute_budget_vs_performance": {
    334         "applies": true,
    335         "answer": false,
    336         "justification": "No performance-compute analysis. The multi-stage pipeline (multiple GPT-4 calls) is not compared against simpler approaches at matched compute budgets."
    337       },
    338       "benchmark_construct_validity": {
    339         "applies": true,
    340         "answer": false,
    341         "justification": "'Percentage of Correct Lines' is used as the sole metric without discussing whether it measures code generation quality. The paper implicitly acknowledges this gap by planning executability and unit test metrics in Section 4.2.2, but does not discuss construct validity."
    342       },
    343       "scaffold_confound_addressed": {
    344         "applies": false,
    345         "answer": false,
    346         "justification": "The scaffold (RAG pipeline) IS the contribution being tested. Only the GPT-4 family (GPT-4 / GPT-4-Turbo) is used and no cross-model comparison is made, so model-scaffold confounding is not a concern."
    347       }
    348     },
    349     "data_leakage": {
    350       "temporal_leakage_addressed": {
    351         "applies": true,
    352         "answer": false,
    353         "justification": "While the paper argues RedHawk-SC is a novel domain for LLMs, it does not formally discuss temporal leakage — e.g., whether any RedHawk-SC documentation was available online before GPT-4's training cutoff."
    354       },
    355       "feature_leakage_addressed": {
    356         "applies": true,
    357         "answer": false,
    358         "justification": "No discussion of whether the RAG retrieval process leaks answer information into the generation context. The method intentionally provides relevant documentation, but the boundary between 'retrieval' and 'leakage' in evaluation is not discussed."
    359       },
    360       "non_independence_addressed": {
    361         "applies": true,
    362         "answer": false,
    363         "justification": "The paper does not state whether the 5 test scripts are disjoint from the 23 RedHawk-SC scripts used as input to Data Augmentation; they appear to come from the same internal pool. There is also no discussion of whether the generated augmented scripts introduce overlap with the test data."
    364       },
    365       "leakage_detection_method": {
    366         "applies": true,
    367         "answer": false,
    368         "justification": "No concrete leakage detection method is used. The reliance on a proprietary domain is argued conceptually but not verified empirically."
    369       }
    370     }
    371   },
    372   "claims": [
    373     {
    374       "claim": "The method achieves 73.33% 'Percentage of Correct Lines' for code generation in MapReduce applications.",
    375       "evidence": "Section 4.2.1: Average across 5 generated code samples using RAG+IKEC+ChatEDA workflow. Individual examples show 86.21% (RAG only) and 93.10% (RAG+IKEC).",
    376       "supported": "weak"
    377     },
    378     {
    379       "claim": "Data Augmentation generates syntactically correct scripts with appropriate structure.",
    380       "evidence": "Section 4.1.1: 20 new scripts generated from 23 originals, manually evaluated by RedHawk-SC experts. 'All syntax is correct, and the code is generated following the LLM's self-defined objectives.'",
    381       "supported": "moderate"
    382     },
    383     {
    384       "claim": "IKEC improves code generation quality by avoiding fatal logic errors.",
    385       "evidence": "Section 4.1.4 and Fig. 9: Single example showing RAG-only has 3 errors (86.21%) while RAG+IKEC has 2 errors (93.10%). Specific improvements: correct layer count, return dictionary, MapReduce result.",
    386       "supported": "weak"
    387     },
    388     {
    389       "claim": "Data Renovation provides reliable expanded content for concise technical documents.",
    390       "evidence": "Section 4.1.3 and Fig. 7: Single example showing 'get_current_heatmap' chunk expanded with parameter usage and return value descriptions.",
    391       "supported": "weak"
    392     },
    393     {
    394       "claim": "The preprocessing techniques enhance RAG performance for domain-specific code generation.",
    395       "evidence": "No completed experiment tests the full preprocessing pipeline. The planned 10-group ablation (Section 4.2.2) is not executed. Only individual component demonstrations (Type A) and a partial test (RAG+IKEC only) are available.",
    396       "supported": "unsupported"
    397     }
    398   ],
    399   "red_flags": [
    400     {
    401       "flag": "Incomplete experiment",
    402       "detail": "The paper explicitly states in Section 7: 'certain aspects, such as evaluation, have not yet been fully realized' and 'experimental results for our overall method presented in this paper are yet to be obtained.' The planned 10-group ablation study on 20 scripts has not been executed."
    403     },
    404     {
    405       "flag": "Tiny sample size",
    406       "detail": "The main quantitative result (73.33%) is based on only 5 code generation tasks. Individual component demonstrations use single examples."
    407     },
    408     {
    409       "flag": "No baselines in quantitative experiment",
    410       "detail": "The 73.33% result has no comparison point — no baseline RAG-only or ReAct result is reported. Without baselines, it is impossible to assess whether the proposed method improves over alternatives."
    411     },
    412     {
    413       "flag": "Company evaluating own product ecosystem",
    414       "detail": "Most authors are Ansys employees evaluating LLM code generation for Ansys's RedHawk-SC product. Expert evaluation of generated code was conducted by in-house Ansys staff, not independent reviewers."
    415     },
    416     {
    417       "flag": "Cherry-picked examples",
    418       "detail": "Type A individual tests (Data Splitter, Renovation, IKEC) are each demonstrated with a single hand-selected example. No aggregate metrics or failure rates for these components are reported."
    419     },
    420     {
    421       "flag": "Claims outrun evidence",
    422       "detail": "The abstract presents the 73.33% result as demonstrating 'the effectiveness of our data preprocessing method,' but this result tests RAG+IKEC+ChatEDA only — the proposed preprocessing pipeline (Data Splitter, Data Renovation) is not included in the measured experiment."
    423     }
    424   ],
    425   "cited_papers": [
    426     {
    427       "title": "GPT-4 technical report",
    428       "authors": ["OpenAI"],
    429       "year": 2023,
    430       "arxiv_id": "2303.08774",
    431       "relevance": "Foundation model used in the experiments; central to evaluating LLM code generation capabilities."
    432     },
    433     {
    434       "title": "ChatEDA: A large language model powered autonomous agent for EDA",
    435       "authors": ["Zhuolun He", "Haoyuan Wu", "Xinyun Zhang", "Xufeng Yao", "Su Zheng", "Haisheng Zheng", "Bei Yu"],
    436       "year": 2023,
    437       "relevance": "Directly builds on ChatEDA's code generation workflow and data augmentation approach for domain-specific EDA code generation."
    438     },
    439     {
    440       "title": "An empirical evaluation of using large language models for automated unit test generation",
    441       "authors": ["Max Schäfer", "Sarah Nadi", "Aryaz Eghbali", "Frank Tip"],
    442       "year": 2023,
    443       "relevance": "TestPilot: evaluates LLM-based code generation for Mocha framework using prompt engineering without fine-tuning."
    444     },
    445     {
    446       "title": "VeriGen: A large language model for Verilog code generation",
    447       "authors": ["Shailja Thakur", "Baleegh Ahmad", "Hammond Pearce", "Benjamin Tan", "Brendan Dolan-Gavitt", "Ramesh Karri", "Siddharth Garg"],
    448       "year": 2023,
    449       "arxiv_id": "2308.00708",
    450       "relevance": "Domain-specific code generation benchmark using fine-tuning on CodeGen-16B for Verilog, relevant to evaluating LLM code gen in specialized domains."
    451     },
    452     {
    453       "title": "Self-planning code generation with large language models",
    454       "authors": ["Xue Jiang", "Yihong Dong", "Lecheng Wang", "Zheng Fang", "Qiwei Shang", "Ge Li", "Zhi Jin", "Wenpin Jiao"],
    455       "year": 2023,
    456       "relevance": "Progressive code generation strategy that divides tasks into subtasks, integrated into this paper's workflow."
    457     },
    458     {
    459       "title": "AgentCoder: Multi-agent-based code generation with iterative testing and optimisation",
    460       "authors": ["Dong Huang", "Qingwen Bu", "Jie M. Zhang", "Michael Luck", "Heming Cui"],
    461       "year": 2024,
    462       "relevance": "Multi-agent code generation framework with iterative testing, relevant to agentic coding workflows."
    463     },
    464     {
    465       "title": "Chain-of-thought prompting elicits reasoning in large language models",
    466       "authors": ["Jason Wei", "Xuezhi Wang", "Dale Schuurmans", "Maarten Bosma", "Fei Xia", "Ed Chi", "Quoc V Le", "Denny Zhou"],
    467       "year": 2022,
    468       "relevance": "Foundational prompting technique that this paper's IKEC method modifies and extends."
    469     },
    470     {
    471       "title": "ReAct: Synergizing reasoning and acting in language models",
    472       "authors": ["Shunyu Yao", "Jeffrey Zhao", "Dian Yu", "Nan Du", "Izhak Shafran", "Karthik Narasimhan", "Yuan Cao"],
    473       "year": 2022,
    474       "arxiv_id": "2210.03629",
    475       "relevance": "Planned baseline method for code generation comparison; key prompting framework for complex reasoning tasks."
    476     },
    477     {
    478       "title": "Retrieval-augmented generation for knowledge-intensive NLP tasks",
    479       "authors": ["Patrick Lewis", "Ethan Perez", "Aleksandra Piktus", "Fabio Petroni", "Vladimir Karpukhin", "Naman Goyal"],
    480       "year": 2021,
    481       "relevance": "Foundation for the RAG technique that is central to the paper's approach."
    482     },
    483     {
    484       "title": "CodeGen: An open large language model for code with multi-turn program synthesis",
    485       "authors": ["Erik Nijkamp", "Bo Pang", "Hiroaki Hayashi", "Lifu Tu", "Huan Wang", "Yingbo Zhou", "Silvio Savarese", "Caiming Xiong"],
    486       "year": 2023,
    487       "relevance": "Open-source code generation model mentioned as potential fine-tuning target for domain-specific code generation."
    488     },
    489     {
    490       "title": "Llama 2: Open foundation and fine-tuned chat models",
    491       "authors": ["Hugo Touvron", "Louis Martin", "Kevin Stone"],
    492       "year": 2023,
    493       "arxiv_id": "2307.09288",
    494       "relevance": "Major open-source LLM used by ChatEDA for fine-tuning; relevant to code generation model ecosystem."
    495     }
    496   ]
    497 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs