scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (26728B)
      1 {
      2   "paper": {
      3     "title": "CigaR: Cost-efficient Program Repair with LLMs",
      4     "authors": [
      5       "Dávid Hidvégi",
      6       "Khashayar Etemadi",
      7       "Sofia Bobadilla",
      8       "Martin Monperrus"
      9     ],
     10     "year": 2024,
     11     "venue": "arXiv",
     12     "arxiv_id": "2402.06598",
     13     "doi": "10.48550/arXiv.2402.06598"
     14   },
     15   "checklist": {
     16     "artifacts": {
     17       "code_released": {
     18         "applies": true,
     19         "answer": true,
     20         "justification": "The paper provides a GitHub link: https://github.com/ASSERT-KTH/cigar. Section I states 'We make CIGAR publicly available for future research at https://github.com/ASSERT-KTH/cigar' and Section III-D confirms 'CIGAR and all the data related to our experiments are made publicly available.'"
     21       },
     22       "data_released": {
     23         "applies": true,
     24         "answer": true,
     25         "justification": "The benchmarks used (Defects4J and HumanEval-Java) are publicly available standard benchmarks. Additionally, Section III-D states 'CIGAR and all the data related to our experiments are made publicly available' including cached prompts, responses, and test results."
     26       },
     27       "environment_specified": {
     28         "applies": true,
     29         "answer": false,
     30         "justification": "The paper mentions CIGAR is implemented in Python and uses the OpenAI gpt-3.5-turbo-0301 model, but does not provide requirements.txt, Dockerfile, or detailed library version specifications. No environment setup section with dependency versions is present."
     31       },
     32       "reproduction_instructions": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "The paper does not include step-by-step reproduction instructions. While the code repository is referenced, the paper itself contains no README-style commands, 'Reproducing Results' section, or specific scripts to replicate experiments."
     36       }
     37     },
     38     "statistical_methodology": {
     39       "confidence_intervals_or_error_bars": {
     40         "applies": true,
     41         "answer": false,
     42         "justification": "Results are reported as raw counts and percentages (e.g., '171/429 (39.8%)') without confidence intervals or error bars. No uncertainty quantification is provided for any results."
     43       },
     44       "significance_tests": {
     45         "applies": true,
     46         "answer": false,
     47         "justification": "The paper claims CIGAR 'outperforms' CHATREPAIR and other baselines based solely on comparing raw numbers (e.g., 171 vs 138 correct patches). No statistical significance tests (p-values, bootstrap tests, etc.) are used to validate these comparisons."
     48       },
     49       "effect_sizes_reported": {
     50         "applies": true,
     51         "answer": true,
     52         "justification": "The paper reports percentage improvements with baseline context. For example, '73% cost reduction' (from 467K to 127K tokens per bug), '96% saving' on jointly-fixed bugs (from 608K to 20K), and specific counts for correct patches (171 vs 138). These provide enough context to understand the magnitude."
     53       },
     54       "sample_size_justified": {
     55         "applies": true,
     56         "answer": false,
     57         "justification": "The paper uses 429 bugs (267 from Defects4J and 162 from HumanEval-Java) without any justification for why this sample size is adequate. No power analysis or discussion of whether 429 bugs is sufficient for the claims made."
     58       },
     59       "variance_reported": {
     60         "applies": true,
     61         "answer": false,
     62         "justification": "The paper reports single-run results. Due to the temperature=1 setting and stochastic nature of LLM sampling, results could vary across runs, but no variance, standard deviation, or results across multiple seeds are reported."
     63       }
     64     },
     65     "evaluation_design": {
     66       "baselines_included": {
     67         "applies": true,
     68         "answer": true,
     69         "justification": "The paper compares against three baselines: nl2fix, STEAM, and CHATREPAIR (Table II). CHATREPAIR is reimplemented and compared in detail for both effectiveness (RQ1) and efficiency (RQ2)."
     70       },
     71       "baselines_contemporary": {
     72         "applies": true,
     73         "answer": true,
     74         "justification": "The baselines are all from 2023, which is contemporary for a 2024 paper. CHATREPAIR is described as 'the most advanced APR tool that uses OpenAI's models' based on a literature review (Section IV-C)."
     75       },
     76       "ablation_study": {
     77         "applies": true,
     78         "answer": false,
     79         "justification": "CIGAR has multiple components (initiation prompts, improvement prompts, rebooting, patch multiplication), but no ablation study systematically removes each component to measure its individual contribution. RQ3 studies exploration progress but does not isolate component contributions."
     80       },
     81       "multiple_metrics": {
     82         "applies": true,
     83         "answer": true,
     84         "justification": "The paper uses multiple metrics: number of plausible patches, number of correct patches (exact AST match), token cost (total and per-bug average), cost saving percentage, and distinct patch counts."
     85       },
     86       "human_evaluation": {
     87         "applies": true,
     88         "answer": false,
     89         "justification": "Correctness is determined entirely by automated AST matching against ground truth. While this is a reasonable automated approach, no human evaluation of patch quality, readability, or semantic correctness beyond test passing is conducted. Given claims about practical usefulness for companies, human evaluation would be relevant."
     90       },
     91       "held_out_test_set": {
     92         "applies": true,
     93         "answer": true,
     94         "justification": "The paper uses Defects4J and HumanEval-Java as evaluation benchmarks. HumanEval-Java is explicitly used as a held-out set to address data leakage concerns: 'this dataset is more recent than the training data used in gpt-3.5-0301, it is not prone to the data leakage problem' (Section VI-A)."
     95       },
     96       "per_category_breakdown": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "Table II provides breakdowns by dataset (Defects4J vs HumanEval-Java). Table I shows per-project statistics. Table III breaks down token costs by bug category (fixed by CHATREPAIR only, CIGAR only, both, neither). Table IV provides per-bug analysis for selected bugs."
    100       },
    101       "failure_cases_discussed": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "The paper discusses unfixed bugs: Table III shows 245 bugs fixed by neither tool with cost analysis. The exploration analysis (RQ3, Figures 5 and 6) discusses cases where long bugs yield few distinct patches during multiplication. Section V-C discusses how bugs with long functions limit patch generation."
    105       },
    106       "negative_results_reported": {
    107         "applies": true,
    108         "answer": false,
    109         "justification": "Every experiment shows CIGAR improving over baselines. No failed configurations, abandoned approaches, or settings that hurt performance are reported. The paper does not discuss any attempted strategies that did not work."
    110       }
    111     },
    112     "claims_and_evidence": {
    113       "abstract_claims_supported": {
    114         "applies": true,
    115         "answer": true,
    116         "justification": "The abstract claims '73% cost reduction' (supported by Table III: 467K→127K), '96% saving on jointly-fixed bugs' (supported by Table III: 608K→20K), and '171/429 (39.8%)' correct patches (supported by Table II). All numerical claims match the results."
    117       },
    118       "causal_claims_justified": {
    119         "applies": true,
    120         "answer": false,
    121         "justification": "The paper makes causal claims like 'CIGAR reduces the token cost by 73%' and 'the rebooting strategy... enables it to efficiently explore different parts of the search space.' These are attributed to CIGAR's design but not rigorously validated with controlled ablations. No component is systematically removed to confirm it causes the observed improvement."
    122       },
    123       "generalization_bounded": {
    124         "applies": true,
    125         "answer": false,
    126         "justification": "The paper tests only on Java bugs with a single model (gpt-3.5-turbo-0301) but makes broad claims like 'those original, largely unknown, prompting techniques could be used beyond program repair, in many of the software engineering tasks based on LLMs' (Section V-C). The title says 'Program Repair with LLMs' generically while results are Java-only with one model."
    127       },
    128       "alternative_explanations_discussed": {
    129         "applies": true,
    130         "answer": false,
    131         "justification": "The threats to validity section (Section VI) discusses construct validity (token count as cost metric) and external validity (Java-only), but does not consider alternative explanations for the results. For example, the cost savings could be partly due to an inefficient reimplementation of CHATREPAIR by the authors rather than CIGAR's design, or to the 50-sample batch size being the dominant factor rather than the prompting strategy."
    132       }
    133     },
    134     "setup_transparency": {
    135       "model_versions_specified": {
    136         "applies": true,
    137         "answer": true,
    138         "justification": "Section III-D explicitly states 'CIGAR is implemented in Python and uses OpenAI gpt-3.5-turbo-0301 as the underlying LLM.' This includes the specific version snapshot (0301)."
    139       },
    140       "prompts_provided": {
    141         "applies": true,
    142         "answer": true,
    143         "justification": "The paper provides detailed prompt structures in Figures 2 and 3, showing the initiation prompt and multiplication prompt with their specific sections (system message, one-shot example, buggy code, test failure details, call to action). While these are structural descriptions with example content rather than full verbatim prompt text, the prompts are described in sufficient detail with actual examples shown in the figures."
    144       },
    145       "hyperparameters_reported": {
    146         "applies": true,
    147         "answer": true,
    148         "justification": "Section III-D reports temperature=1. Section III-B1 states 50 samples per request. Section III-B2 states max_invoke=10 (maximum LLM invocations per round). Section III-C states 5 multiplication invocations. These are the key hyperparameters."
    149       },
    150       "scaffolding_described": {
    151         "applies": true,
    152         "answer": true,
    153         "justification": "The paper provides a detailed workflow diagram (Figure 1) and describes the full pipeline: initiation → partial patch improvement (iterative) → reboot logic → plausible patch multiplication. The feedback mechanism (summarizing previous patches), retry logic (max_invoke=10 invocations), and reboot strategy are all described in Section III."
    154       },
    155       "data_preprocessing_documented": {
    156         "applies": true,
    157         "answer": true,
    158         "justification": "Section IV-B and Table I describe how bugs were filtered: only single-function bugs are considered, reducing the 596 total bugs across Defects4J and HumanEval-Java to 429. The filtering criterion is explicitly stated. Section III-B1 describes how test results and buggy code are extracted and formatted for prompts."
    159       }
    160     },
    161     "limitations_and_scope": {
    162       "limitations_section_present": {
    163         "applies": true,
    164         "answer": true,
    165         "justification": "Section VI 'Threats to Validity' contains three subsections (Construct Validity, Internal Validity, External Validity) with substantive discussion of limitations."
    166       },
    167       "threats_to_validity_specific": {
    168         "applies": true,
    169         "answer": true,
    170         "justification": "The threats are specific to this study: token count as cost metric may not capture all costs (Section VI-A), data leakage risk for Defects4J with gpt-3.5-0301 (Section VI-A), bug set differences with nl2fix and STEAM (Section VI-B), and Java-only evaluation (Section VI-C). These are particular to CIGAR's experimental setup."
    171       },
    172       "scope_boundaries_stated": {
    173         "applies": true,
    174         "answer": false,
    175         "justification": "While the threats section notes Java-only and Defects4J limitations, the paper does not explicitly state what its results do NOT show. It does not bound its claims to say, e.g., 'our results do not demonstrate cost savings with other LLMs' or 'we do not show our prompting strategies generalize beyond APR.' The conclusion instead suggests broad applicability."
    176       }
    177     },
    178     "data_integrity": {
    179       "raw_data_available": {
    180         "applies": true,
    181         "answer": true,
    182         "justification": "Section III-D states 'CIGAR implements a thorough caching system, which stores all the prompts sent to the LLM, all responses received from the LLM as well as the test execution results' and 'CIGAR and all the data related to our experiments are made publicly available.'"
    183       },
    184       "data_collection_described": {
    185         "applies": true,
    186         "answer": true,
    187         "justification": "Section IV-B describes the bug datasets (Defects4J and HumanEval-Java), how bugs were selected (single-function only), and provides per-project statistics in Table I. Section IV-C describes how baselines were identified (Google Scholar search for GPT-related APR tools)."
    188       },
    189       "recruitment_methods_described": {
    190         "applies": false,
    191         "answer": false,
    192         "justification": "No human participants are involved. The study uses standard benchmarks (Defects4J and HumanEval-Java), so recruitment methods are not applicable."
    193       },
    194       "data_pipeline_documented": {
    195         "applies": true,
    196         "answer": true,
    197         "justification": "The pipeline is documented: bugs are selected from Defects4J/HumanEval-Java (Table I shows filtering from 596 to 429 single-function bugs), CIGAR runs with cached prompts and responses, and patches are validated via test execution and AST matching. The CHATREPAIR reimplementation process is also described."
    198       }
    199     },
    200     "conflicts_of_interest": {
    201       "funding_disclosed": {
    202         "applies": true,
    203         "answer": false,
    204         "justification": "No funding source or acknowledgments section is present in the paper. There is no mention of grants, sponsors, or funding agencies."
    205       },
    206       "affiliations_disclosed": {
    207         "applies": true,
    208         "answer": true,
    209         "justification": "All authors are identified as being from KTH Royal Institute of Technology, Stockholm, Sweden, with email addresses provided. This is an academic institution without obvious conflicts in evaluating LLM-based APR."
    210       },
    211       "funder_independent_of_outcome": {
    212         "applies": true,
    213         "answer": false,
    214         "justification": "No funding information is disclosed, so independence of the funder cannot be assessed. The absence of funding disclosure means this criterion cannot be satisfied."
    215       },
    216       "financial_interests_declared": {
    217         "applies": true,
    218         "answer": false,
    219         "justification": "No competing interests statement or financial interest declaration is present in the paper."
    220       }
    221     },
    222     "contamination": {
    223       "training_cutoff_stated": {
    224         "applies": true,
    225         "answer": false,
    226         "justification": "The paper does not state the training data cutoff for gpt-3.5-turbo-0301. It acknowledges the model 'is likely to have parts of the DEFECTS4J dataset in its training data' (Section VI-A) but does not specify the actual cutoff date."
    227       },
    228       "train_test_overlap_discussed": {
    229         "applies": true,
    230         "answer": true,
    231         "justification": "Section VI-A explicitly discusses data leakage: 'The gpt-3.5-0301 model used by CIGAR is likely to have parts of the DEFECTS4J dataset in its training data.' The paper addresses this by also evaluating on HumanEval-Java, which is 'more recent than the training data used in gpt-3.5-0301.'"
    232       },
    233       "benchmark_contamination_addressed": {
    234         "applies": true,
    235         "answer": true,
    236         "justification": "The paper explicitly addresses contamination risk for Defects4J and mitigates it by using HumanEval-Java as a separate benchmark: 'this dataset is more recent than the training data used in gpt-3.5-0301, it is not prone to the data leakage problem' (Section VI-A). Results on both datasets are compared."
    237       }
    238     },
    239     "human_studies": {
    240       "pre_registered": {
    241         "applies": false,
    242         "answer": false,
    243         "justification": "No human participants are involved in this benchmark evaluation study."
    244       },
    245       "irb_or_ethics_approval": {
    246         "applies": false,
    247         "answer": false,
    248         "justification": "No human participants are involved in this benchmark evaluation study."
    249       },
    250       "demographics_reported": {
    251         "applies": false,
    252         "answer": false,
    253         "justification": "No human participants are involved in this benchmark evaluation study."
    254       },
    255       "inclusion_exclusion_criteria": {
    256         "applies": false,
    257         "answer": false,
    258         "justification": "No human participants are involved in this benchmark evaluation study."
    259       },
    260       "randomization_described": {
    261         "applies": false,
    262         "answer": false,
    263         "justification": "No human participants are involved in this benchmark evaluation study."
    264       },
    265       "blinding_described": {
    266         "applies": false,
    267         "answer": false,
    268         "justification": "No human participants are involved in this benchmark evaluation study."
    269       },
    270       "attrition_reported": {
    271         "applies": false,
    272         "answer": false,
    273         "justification": "No human participants are involved in this benchmark evaluation study."
    274       }
    275     },
    276     "cost_and_practicality": {
    277       "inference_cost_reported": {
    278         "applies": true,
    279         "answer": true,
    280         "justification": "Cost reporting is the central contribution of the paper. Table III reports total token costs (204.3M for CHATREPAIR, 54.9M for CIGAR), per-bug averages (467K vs 127K), and per-category breakdowns. Token costs are the primary metric."
    281       },
    282       "compute_budget_stated": {
    283         "applies": true,
    284         "answer": false,
    285         "justification": "While token costs are reported extensively, the total computational budget (wall-clock time, total API spend in dollars, or GPU hours for test execution) is not stated. The paper mentions OpenAI pricing in Section II-B but does not report the actual dollar cost of the experiments."
    286       }
    287     }
    288   },
    289   "claims": [
    290     {
    291       "claim": "CIGAR reduces token cost by 73% compared to CHATREPAIR across 429 bugs.",
    292       "evidence": "Table III shows total token cost of 204.3M for CHATREPAIR vs 54.9M for CIGAR, a reduction of 149.4M/204.3M = 73%. Per-bug average: 467K vs 127K tokens.",
    293       "supported": "strong"
    294     },
    295     {
    296       "claim": "CIGAR generates correct patches for 171/429 (39.8%) bugs, outperforming all compared LLM-based APR tools.",
    297       "evidence": "Table II shows CIGAR_R12 fixes 171/429 bugs vs CHATREPAIR's 138/429. However, CHATREPAIR was reimplemented by the authors (Section IV-C), introducing potential implementation bias.",
    298       "supported": "moderate"
    299     },
    300     {
    301       "claim": "On bugs fixed by both tools, CIGAR saves 96% of token cost (20K vs 608K per bug).",
    302       "evidence": "Table III 'Both (125)' row shows CHC_AVG=608K, CIC_AVG=20K. The 96% figure is confirmed by the data.",
    303       "supported": "strong"
    304     },
    305     {
    306       "claim": "CIGAR is the first LLM-based APR tool that focuses on minimizing repair cost.",
    307       "evidence": "Section VII-A reviews related work and positions CIGAR as 'the first to focus and demonstrate cost minimization without losing effectiveness.' This is a novelty claim about the literature.",
    308       "supported": "moderate"
    309     },
    310     {
    311       "claim": "The reboot strategy enables CIGAR to efficiently explore different parts of the search space.",
    312       "evidence": "Figure 5 shows distinct patch counts jumping at reboot boundaries. However, no ablation removes the reboot strategy to measure its isolated contribution vs simply spending more tokens in a single conversation.",
    313       "supported": "moderate"
    314     },
    315     {
    316       "claim": "46 bugs are uniquely fixed by CIGAR (not fixed by CHATREPAIR).",
    317       "evidence": "Figure 4 shows the overlap: 125 bugs fixed by both, 46 only by CIGAR, 13 only by CHATREPAIR. These numbers are from the merged dataset.",
    318       "supported": "strong"
    319     }
    320   ],
    321   "methodology_tags": [
    322     "benchmark-eval"
    323   ],
    324   "key_findings": "CigaR is an LLM-based automated program repair tool designed to minimize token cost through three strategies: iterative prompting with patch summarization, search rebooting with high temperature, and plausible patch multiplication. Evaluated on 429 bugs from Defects4J and HumanEval-Java using gpt-3.5-turbo-0301, CigaR reduces token cost by 73% compared to ChatRepair (127K vs 467K tokens per bug on average) while fixing more bugs (171 vs 138 correct patches). On bugs fixed by both tools, CigaR achieves 96% cost savings, spending only 20K tokens per bug versus ChatRepair's 608K.",
    325   "red_flags": [
    326     {
    327       "flag": "Reimplemented baseline",
    328       "detail": "CHATREPAIR was reimplemented by the authors because the original was not publicly available ('As CHATREPAIR and its patches for DEFECTS4J are not publicly available... we carefully reimplement it'). Reimplementation introduces risk of unintentional handicapping of the baseline, even with private author communication."
    329     },
    330     {
    331       "flag": "No variance across runs",
    332       "detail": "With temperature=1, LLM outputs are highly stochastic, yet the paper reports single-run results without any variance or confidence measures. The reported numbers could differ substantially across runs."
    333     },
    334     {
    335       "flag": "No ablation study",
    336       "detail": "CIGAR has three distinct components (improvement prompts, rebooting, patch multiplication) but no ablation study isolates their individual contributions. It is unclear which design choices drive the cost reduction and which are incidental."
    337     },
    338     {
    339       "flag": "No statistical tests for comparisons",
    340       "detail": "Claims of outperforming baselines rest on comparing raw counts (e.g., 171 vs 138 correct patches) without any statistical significance testing. With stochastic LLM outputs, these differences might not be statistically significant."
    341     },
    342     {
    343       "flag": "Single model evaluation",
    344       "detail": "All experiments use only gpt-3.5-turbo-0301. The cost minimization strategies are claimed to work for 'any LLM that has an API' but are only tested on one model, limiting generalizability."
    345     }
    346   ],
    347   "cited_papers": [
    348     {
    349       "title": "Keep the conversation going: Fixing 162 out of 337 bugs for $0.42 each using chatgpt",
    350       "authors": ["C. S. Xia", "L. Zhang"],
    351       "year": 2023,
    352       "arxiv_id": "2304.00385",
    353       "relevance": "CHATREPAIR is the primary baseline for both effectiveness and cost comparison in LLM-based program repair."
    354     },
    355     {
    356       "title": "Impact of code language models on automated program repair",
    357       "authors": ["N. Jiang", "K. Liu", "T. Lutellier", "L. Tan"],
    358       "year": 2023,
    359       "doi": "10.1109/ICSE48619.2023.00125",
    360       "relevance": "Large-scale study on using LLMs for APR including HumanEval-Java benchmark used by CigaR for external validity."
    361     },
    362     {
    363       "title": "FrugalGPT: How to use large language models while reducing cost and improving performance",
    364       "authors": ["L. Chen", "M. Zaharia", "J. Zou"],
    365       "year": 2023,
    366       "arxiv_id": "2305.05176",
    367       "relevance": "Key related work on LLM cost minimization strategies including cascading, prompt adaptation, and model approximation."
    368     },
    369     {
    370       "title": "Cost-effective hyperparameter optimization for large language model generation inference",
    371       "authors": ["C. Wang", "S. X. Liu", "A. H. Awadallah"],
    372       "year": 2023,
    373       "arxiv_id": "2303.04673",
    374       "relevance": "EcoOptiGen proposes optimizing LLM hyperparameters for cost-effective inference, directly relevant to LLM cost management."
    375     },
    376     {
    377       "title": "Evaluating large language models trained on code",
    378       "authors": ["M. Chen", "J. Tworek", "H. Jun"],
    379       "year": 2021,
    380       "arxiv_id": "2107.03374",
    381       "relevance": "Codex paper introducing HumanEval benchmark and foundational work on LLM code generation evaluation."
    382     },
    383     {
    384       "title": "Automated program repair in the era of large pre-trained language models",
    385       "authors": ["C. S. Xia", "Y. Wei", "L. Zhang"],
    386       "year": 2023,
    387       "relevance": "Study on few-shot prompting for APR across nine LLMs, foundational work on prompt engineering for program repair."
    388     },
    389     {
    390       "title": "A critical review of large language model on software engineering: An example from chatgpt and automated program repair",
    391       "authors": ["Q. Zhang", "T. Zhang", "J. Zhai"],
    392       "year": 2023,
    393       "arxiv_id": "2310.08879",
    394       "relevance": "Critical review of LLMs in software engineering with focus on automated program repair prompt engineering."
    395     },
    396     {
    397       "title": "Demystifying gpt self-repair for code generation",
    398       "authors": ["T. X. Olausson", "J. P. Inala", "C. Wang"],
    399       "year": 2023,
    400       "arxiv_id": "2306.09896",
    401       "relevance": "Introduces pass@t metric for token efficiency in LLM code generation, the cost metric adopted by CigaR."
    402     },
    403     {
    404       "title": "An analysis of the automatic bug fixing performance of chatgpt",
    405       "authors": ["D. Sobania", "M. Briesch", "C. Hanna", "J. Petke"],
    406       "year": 2023,
    407       "arxiv_id": "2301.08653",
    408       "relevance": "Early analysis of ChatGPT for automated bug fixing, relevant to understanding LLM capabilities in program repair."
    409     },
    410     {
    411       "title": "Automated repair of programs from large language models",
    412       "authors": ["Z. Fan", "X. Gao", "M. Mirchev", "A. Roychoudhury", "S. H. Tan"],
    413       "year": 2023,
    414       "relevance": "Iterative LLM-based APR approach, part of the landscape of LLM repair techniques compared with CigaR."
    415     },
    416     {
    417       "title": "Examining zero-shot vulnerability repair with large language models",
    418       "authors": ["H. Pearce", "B. Tan", "B. Ahmad", "R. Karri", "B. Dolan-Gavitt"],
    419       "year": 2023,
    420       "relevance": "Evaluates LLMs for security vulnerability repair using zero-shot prompts, relevant to LLM code repair capabilities."
    421     },
    422     {
    423       "title": "EcoAssistant: Using LLM assistant more affordably and accurately",
    424       "authors": ["J. Zhang", "R. Krishna", "A. H. Awadallah", "C. Wang"],
    425       "year": 2023,
    426       "arxiv_id": "2310.03046",
    427       "relevance": "Proposes iterative LLM cost reduction by cascading from cheap to expensive models, related to LLM cost management."
    428     }
    429   ]
    430 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs