scan.json (29768B)
1 { 2 "paper": { 3 "title": "Large Language Model Powered Automated Modeling and Optimization of Active Distribution Network Dispatch Problems", 4 "authors": [ 5 "Xu Yang", 6 "Chenhui Lin", 7 "Yue Yang", 8 "Qi Wang", 9 "Haotian Liu", 10 "Haizhou Hua", 11 "Wenchuan Wu" 12 ], 13 "year": 2025, 14 "venue": "IEEE Transactions on Smart Grid", 15 "arxiv_id": "2507.21162", 16 "doi": "10.1109/TSG.2025.3621438" 17 }, 18 "checklist": { 19 "artifacts": { 20 "code_released": { 21 "applies": true, 22 "answer": true, 23 "justification": "Reference [32] points to a GitHub repository (https://github.com/YangXuSteve/LLM-Modeling-and-Optimization) containing supplementary files including complete prompts, test cases, and detailed results." 24 }, 25 "data_released": { 26 "applies": true, 27 "answer": true, 28 "justification": "The supplementary file [32] includes test case topologies, profiles, parameters, and all 30 dispatch requests. The test systems use publicly available IEEE 33-bus, 69-bus, and 141-bus distribution systems." 29 }, 30 "environment_specified": { 31 "applies": true, 32 "answer": false, 33 "justification": "Table I lists LLM configurations (temperature, top-p, embedding dimension, qwen-plus version) but no requirements.txt, Dockerfile, or detailed software environment specification is provided for reproducing the full pipeline." 34 }, 35 "reproduction_instructions": { 36 "applies": true, 37 "answer": false, 38 "justification": "No step-by-step reproduction instructions are provided. The supplementary file contains prompts and test cases but no README or script describing how to run the end-to-end pipeline." 39 } 40 }, 41 "statistical_methodology": { 42 "confidence_intervals_or_error_bars": { 43 "applies": true, 44 "answer": false, 45 "justification": "Results in Figs. 5-6 and Tables II-III report only average scores and pass rates across 90 tests. No confidence intervals or error bars are provided." 
46 }, 47 "significance_tests": { 48 "applies": true, 49 "answer": false, 50 "justification": "The paper compares 7 methods and claims performance differences (e.g., 'Full' vs ablation variants) based solely on comparing average scores without any statistical significance tests." 51 }, 52 "effect_sizes_reported": { 53 "applies": true, 54 "answer": true, 55 "justification": "Scores are reported on a 100-point scale with per-component breakdowns (5×20 points), and pass@1/pass@3 rates provide sufficient context to understand magnitudes. The end-to-end example reports 12.6% power loss reduction." 56 }, 57 "sample_size_justified": { 58 "applies": true, 59 "answer": false, 60 "justification": "30 dispatch requests with 3 seeds each (90 evaluations total) are used with no justification for why this sample size is sufficient for the claims made." 61 }, 62 "variance_reported": { 63 "applies": true, 64 "answer": false, 65 "justification": "Three tests per request with different random seeds are conducted, but the paper reports only 'average values across these 90 tests' with no standard deviation, IQR, or other spread measure." 66 } 67 }, 68 "evaluation_design": { 69 "baselines_included": { 70 "applies": true, 71 "answer": true, 72 "justification": "Seven methods are compared: the Full architecture and six ablation variants of it (No-IE, No-PF, No-IEPF, No-EK, No-FS, and No-RAG), each variant removing one or more components or an enhancement method." 73 }, 74 "baselines_contemporary": { 75 "applies": true, 76 "answer": false, 77 "justification": "All comparisons are internal ablation variants. No external baselines from prior work are included. References [17] and [18] describe related LLM+RL approaches for power systems but are not compared against. No non-LLM baseline is included either." 
78 }, 79 "ablation_study": { 80 "applies": true, 81 "answer": true, 82 "justification": "Section IV.C presents comprehensive ablation studies: architecture ablations (No-IE, No-PF, No-IEPF) and enhancement method ablations (No-EK, No-FS, No-RAG), each isolating one component's contribution." 83 }, 84 "multiple_metrics": { 85 "applies": true, 86 "answer": true, 87 "justification": "Four metrics are used: problem formulation score (human-evaluated, 100-point), code programming score (human-evaluated, 100-point), pass@1, and pass@3." 88 }, 89 "human_evaluation": { 90 "applies": true, 91 "answer": true, 92 "justification": "Human experts score problem formulation and code programming results using a 5-component scoring rubric (each 0/10/20 points). The scoring criterion is described in Section IV.A." 93 }, 94 "held_out_test_set": { 95 "applies": true, 96 "answer": false, 97 "justification": "There is no explicit separation between a development set used for prompt tuning and the 30 test requests. It is unclear whether prompts were iteratively refined on the same requests used for final evaluation." 98 }, 99 "per_category_breakdown": { 100 "applies": true, 101 "answer": true, 102 "justification": "Results are broken down by method variant (Figs. 5-6, Table II), by LLM (qwen-plus vs qwen2.5-72b), and by model parameter size (Table III: 72b, 32b, 14b, 7b, 3b). The scoring rubric decomposes into 5 sub-components." 103 }, 104 "failure_cases_discussed": { 105 "applies": true, 106 "answer": true, 107 "justification": "Common errors are discussed: 'symbol inaccuracies and adding/removing a term during modeling' for Full, 'misrecognition' for No-IE, function usage errors for No-RAG, and complete failure for qwen2.5-3b." 108 }, 109 "negative_results_reported": { 110 "applies": true, 111 "answer": true, 112 "justification": "No-FS achieves 0% pass rate across all attempts. qwen2.5-3b 'can no longer meet the requirements.' 
Removing the Information Extractor causes significant misrecognition errors." 113 } 114 }, 115 "claims_and_evidence": { 116 "abstract_claims_supported": { 117 "applies": true, 118 "answer": true, 119 "justification": "The abstract claims 'Comprehensive comparisons and end-to-end demonstrations on various test cases validate the effectiveness,' which is supported by Section IV's ablation studies, pass rates, and end-to-end example." 120 }, 121 "causal_claims_justified": { 122 "applies": true, 123 "answer": true, 124 "justification": "Causal claims are made through ablation studies (e.g., removing Information Extractor causes performance decline). The ablation design uses controlled single-variable manipulation, which is adequate for these claims." 125 }, 126 "generalization_bounded": { 127 "applies": true, 128 "answer": false, 129 "justification": "The title claims broad applicability to 'Active Distribution Network Dispatch Problems,' but testing is limited to 3 IEEE test systems (33/69/141-bus), 4 equipment types, and Qwen-family LLMs only. These boundaries are not explicitly acknowledged." 130 }, 131 "alternative_explanations_discussed": { 132 "applies": true, 133 "answer": false, 134 "justification": "No alternative explanations for the results are discussed. For instance, the paper does not consider whether the strong performance of 'Full' might be partly due to the specific test request design or the Qwen model family's particular strengths." 135 }, 136 "proxy_outcome_distinction": { 137 "applies": true, 138 "answer": true, 139 "justification": "The paper explicitly distinguishes between code executability (pass rates) and code correctness (human expert scores), noting 'the pass rates only measure the code's executability rather than correctness.' The scoring rubric's limitations are also acknowledged ('simple and may introduce some human bias')." 
140 } 141 }, 142 "setup_transparency": { 143 "model_versions_specified": { 144 "applies": true, 145 "answer": true, 146 "justification": "Table I specifies qwen-plus version '2025-04-28', qwen2.5 models with explicit parameter sizes (72b, 32b, 14b, 7b, 3b), and text-embedding-v3 from Aliyun." 147 }, 148 "prompts_provided": { 149 "applies": true, 150 "answer": true, 151 "justification": "The paper states 'Complete prompts for all LLM agents can be found in the online supplementary file [32]' at the GitHub repository. Prompt structure is described in detail in Section III.B." 152 }, 153 "hyperparameters_reported": { 154 "applies": true, 155 "answer": true, 156 "justification": "Table I reports temperature=0.6, top-p=0.7, embedding dimension=1024, and the qwen-plus version date." 157 }, 158 "scaffolding_described": { 159 "applies": true, 160 "answer": true, 161 "justification": "The multi-LLM coordination architecture is described in detail in Section III: Information Extractor (prompt-based), Problem Formulator (6-round dialogue), Code Programmer (RAG-enhanced), with inputs/outputs, workflow, and enhancement methods all documented." 162 }, 163 "data_preprocessing_documented": { 164 "applies": true, 165 "answer": true, 166 "justification": "The pipeline from natural language request through structured extraction, multi-round optimization problem formulation, RAG-based example retrieval, and code generation is documented in Sections III.A-D. Case format (Python dictionary) and RAG vectorization process are described." 167 } 168 }, 169 "limitations_and_scope": { 170 "limitations_section_present": { 171 "applies": true, 172 "answer": false, 173 "justification": "No dedicated limitations or threats-to-validity section exists. The conclusion briefly mentions 'future research will explore lightweight model implementations' but this is a single forward-looking sentence, not a substantive limitations discussion." 
174 }, 175 "threats_to_validity_specific": { 176 "applies": true, 177 "answer": false, 178 "justification": "No threats to validity are discussed anywhere in the paper." 179 }, 180 "scope_boundaries_stated": { 181 "applies": true, 182 "answer": false, 183 "justification": "The paper does not explicitly state what the results do not show, what settings are excluded, or what claims the authors are not making. The scope is implicitly limited by the test cases but never explicitly bounded." 184 } 185 }, 186 "data_integrity": { 187 "raw_data_available": { 188 "applies": true, 189 "answer": true, 190 "justification": "The supplementary file [32] contains test requests, case data, and 'detailed scores and pass/fail results of each test' per the paper." 191 }, 192 "data_collection_described": { 193 "applies": true, 194 "answer": true, 195 "justification": "Section IV.A describes the test setup: 3 districts (IEEE 33/69/141-bus), 10 requests per district covering common objectives, equipment, and operational requirements with various operator tones. Three tests per request with different random seeds." 196 }, 197 "recruitment_methods_described": { 198 "applies": false, 199 "answer": false, 200 "justification": "No human participants were recruited for a study. The human experts performing evaluation scoring are evaluators, not study participants. The test data uses standard IEEE benchmark systems." 201 }, 202 "data_pipeline_documented": { 203 "applies": true, 204 "answer": true, 205 "justification": "The paper documents the full pipeline: dispatch request → Information Extractor → Problem Formulator (6-round dialogue) → Code Programmer (with RAG) → solver → results. Each stage's inputs and outputs are specified." 
206 } 207 }, 208 "conflicts_of_interest": { 209 "funding_disclosed": { 210 "applies": true, 211 "answer": true, 212 "justification": "Funding is disclosed: 'supported in part by the Beijing Natural Science Foundation under Grant L243003 and the National Science Foundation of China under Grant U24B6009.'" 213 }, 214 "affiliations_disclosed": { 215 "applies": true, 216 "answer": true, 217 "justification": "All author affiliations are listed: Tsinghua University, Hefei University of Technology, and Hong Kong Polytechnic University. Notable: author Y. Yang is a co-author of PyOptInterface [25], the modeling language used in experiments." 218 }, 219 "funder_independent_of_outcome": { 220 "applies": true, 221 "answer": true, 222 "justification": "Beijing Natural Science Foundation and National Science Foundation of China are government agencies with no financial stake in the specific experimental outcomes." 223 }, 224 "financial_interests_declared": { 225 "applies": true, 226 "answer": false, 227 "justification": "No competing interests statement is included. Co-author Y. Yang created PyOptInterface which is the modeling language used and evaluated in the experiments, representing a potential undeclared conflict." 228 } 229 }, 230 "contamination": { 231 "training_cutoff_stated": { 232 "applies": true, 233 "answer": false, 234 "justification": "No training data cutoff date is stated for any of the Qwen models used. The paper evaluates LLM capability on dispatch tasks without stating when model training data was collected." 235 }, 236 "train_test_overlap_discussed": { 237 "applies": true, 238 "answer": false, 239 "justification": "No discussion of whether the Qwen models may have seen IEEE test system data, similar dispatch problem formulations, or PyOptInterface code in their training data." 
240 }, 241 "benchmark_contamination_addressed": { 242 "applies": true, 243 "answer": false, 244 "justification": "The paper notes PyOptInterface was chosen partly because 'its materials may not have been learned by existing LLMs,' showing awareness of the issue, but no systematic contamination assessment is conducted for the overall pipeline." 245 } 246 }, 247 "human_studies": { 248 "pre_registered": { 249 "applies": false, 250 "answer": false, 251 "justification": "No human participants in this study. The human experts are evaluators scoring LLM outputs, not study participants." 252 }, 253 "irb_or_ethics_approval": { 254 "applies": false, 255 "answer": false, 256 "justification": "No human participants in this study." 257 }, 258 "demographics_reported": { 259 "applies": false, 260 "answer": false, 261 "justification": "No human participants in this study." 262 }, 263 "inclusion_exclusion_criteria": { 264 "applies": false, 265 "answer": false, 266 "justification": "No human participants in this study." 267 }, 268 "randomization_described": { 269 "applies": false, 270 "answer": false, 271 "justification": "No human participants in this study." 272 }, 273 "blinding_described": { 274 "applies": false, 275 "answer": false, 276 "justification": "No human participants in this study." 277 }, 278 "attrition_reported": { 279 "applies": false, 280 "answer": false, 281 "justification": "No human participants in this study." 282 } 283 }, 284 "cost_and_practicality": { 285 "inference_cost_reported": { 286 "applies": true, 287 "answer": false, 288 "justification": "No inference cost, API cost, token consumption, or latency is reported despite the system making multiple LLM calls per dispatch request (3 agents + RAG embedding)." 289 }, 290 "compute_budget_stated": { 291 "applies": true, 292 "answer": false, 293 "justification": "No total computational budget is stated. 
The paper does not report total API spend, wall-clock time per request, or overall compute used for the 90+ evaluations." 294 } 295 }, 296 "experimental_rigor": { 297 "seed_sensitivity_reported": { 298 "applies": true, 299 "answer": false, 300 "justification": "Three tests per request with different random seeds are conducted, but results are reported only as averages. No seed-to-seed variance or sensitivity analysis is provided." 301 }, 302 "number_of_runs_stated": { 303 "applies": true, 304 "answer": true, 305 "justification": "Explicitly stated: 'we conducted three tests for each request using different random seeds. As a result, each method is evaluated 90 times.'" 306 }, 307 "hyperparameter_search_budget": { 308 "applies": true, 309 "answer": false, 310 "justification": "No hyperparameter search budget is reported. The prompt designs and LLM configuration (temperature=0.6, top-p=0.7) appear fixed with no description of how they were selected." 311 }, 312 "best_config_selection_justified": { 313 "applies": true, 314 "answer": true, 315 "justification": "The ablation study systematically demonstrates each component's contribution, justifying the 'Full' configuration. Each removed component leads to measurable performance degradation." 316 }, 317 "multiple_comparison_correction": { 318 "applies": true, 319 "answer": false, 320 "justification": "Seven methods are compared across two LLMs and multiple metrics, but no statistical tests are performed at all, let alone corrections for multiple comparisons." 321 }, 322 "self_comparison_bias_addressed": { 323 "applies": true, 324 "answer": false, 325 "justification": "The authors evaluate their own system and its ablation variants without acknowledging self-evaluation bias. The human expert scoring is likely performed by the authors themselves, but this is not disclosed." 
326 }, 327 "compute_budget_vs_performance": { 328 "applies": true, 329 "answer": false, 330 "justification": "No analysis of performance as a function of compute budget. Different model sizes (Table III) are compared but without reporting their compute costs." 331 }, 332 "benchmark_construct_validity": { 333 "applies": true, 334 "answer": false, 335 "justification": "The 30 dispatch requests are designed by the authors. No discussion of whether these requests adequately represent real-world ADN dispatch scenarios or cover the full diversity of operator needs." 336 }, 337 "scaffold_confound_addressed": { 338 "applies": true, 339 "answer": true, 340 "justification": "The multi-LLM scaffold IS the object of study. Ablation variants systematically vary scaffold components while keeping the LLM constant. Cross-LLM comparisons (qwen-plus vs qwen2.5-72b) use the same scaffold." 341 } 342 }, 343 "data_leakage": { 344 "temporal_leakage_addressed": { 345 "applies": true, 346 "answer": false, 347 "justification": "No discussion of temporal leakage. The IEEE test systems (published 1989-2008) and power flow equations are widely available and likely in the LLMs' training data." 348 }, 349 "feature_leakage_addressed": { 350 "applies": true, 351 "answer": false, 352 "justification": "No discussion of whether the RAG-retrieved examples or multi-round dialogue structure leaks information beyond what would be available in a real deployment." 353 }, 354 "non_independence_addressed": { 355 "applies": true, 356 "answer": false, 357 "justification": "No discussion of whether the 30 test requests share structural similarities with the few-shot examples in the RAG database." 358 }, 359 "leakage_detection_method": { 360 "applies": true, 361 "answer": false, 362 "justification": "No formal leakage detection method is applied. The choice of PyOptInterface as a novel modeling language is a partial mitigation but no systematic detection is performed." 
363 } 364 } 365 }, 366 "scan_version": 3, 367 "active_modules": [ 368 "experimental_rigor", 369 "data_leakage" 370 ], 371 "claims": [ 372 { 373 "claim": "The 'Full' multi-LLM coordination architecture achieves near-perfect scores on problem formulation and code programming tasks.", 374 "evidence": "Figs. 5-6 show 'Full' achieving scores close to 100 on both tasks for both qwen-plus and qwen2.5-72b. Table II shows pass@1 of 0.98 (qwen-plus) and 0.93 (qwen2.5-72b), pass@3 of 1.00 for both.", 375 "supported": "moderate" 376 }, 377 { 378 "claim": "Each LLM agent in the architecture is necessary — removing any component degrades performance.", 379 "evidence": "Section IV.C ablation studies show score decreases when removing Information Extractor (misrecognition errors), Problem Formulator (greater decline), or both (worst among architecture ablations).", 380 "supported": "strong" 381 }, 382 { 383 "claim": "Few-shot examples are critically important for code generation — without them, pass rate drops to 0%.", 384 "evidence": "Table II shows No-FS method achieves 0.00 pass@1 and 0.00 pass@3 for both LLMs, meaning no executable code was generated without examples.", 385 "supported": "strong" 386 }, 387 { 388 "claim": "RAG-assisted dynamic example selection improves code pass rates compared to fixed examples.", 389 "evidence": "Table II: Full achieves 0.98/1.00 pass@1/3 (qwen-plus) vs No-RAG at 0.63/0.73. Scores in Figs. 5-6 show smaller differences.", 390 "supported": "moderate" 391 }, 392 { 393 "claim": "LLM parameter size significantly affects performance, with a sharp decline at 7b parameters.", 394 "evidence": "Table III shows gradual decline from 72b (98.4 PF score) to 14b (91.4), then a sharp drop to 7b (66.2). 
qwen2.5-3b cannot meet requirements at all.", 395 "supported": "moderate" 396 } 397 ], 398 "methodology_tags": [ 399 "benchmark-eval", 400 "case-study" 401 ], 402 "key_findings": "A multi-LLM coordination architecture (Information Extractor → Problem Formulator → Code Programmer) can transform natural language dispatch requests into executable optimization code for active distribution networks, achieving 98% first-attempt pass rates with qwen-plus. Ablation studies show all three agents and enhancement methods (few-shot learning, external knowledge, RAG) contribute significantly, with few-shot examples being critical (0% pass rate without them). Performance scales with model size, with a sharp degradation below 14b parameters and complete failure at 3b.", 403 "red_flags": [ 404 { 405 "flag": "No external baselines", 406 "detail": "All comparisons are internal ablation variants of the proposed system. No external methods (including the RL+LLM approaches from refs [17]-[18]) or non-LLM baselines are compared against, making it impossible to assess whether the approach is better than alternatives." 407 }, 408 { 409 "flag": "No error bars or statistical tests", 410 "detail": "Despite running 90 evaluations (30 requests × 3 seeds), only averages are reported. No standard deviations, confidence intervals, or significance tests are provided, even though the LLM stochasticity (temperature=0.6) could produce meaningful variance." 411 }, 412 { 413 "flag": "Undisclosed evaluator identity", 414 "detail": "Human expert scoring is central to the evaluation, but the identity and number of evaluators are not disclosed. If the authors scored their own system's outputs, this introduces significant bias, especially with the coarse 0/10/20 scoring rubric." 415 }, 416 { 417 "flag": "Co-author conflict with PyOptInterface", 418 "detail": "Author Y. Yang co-created PyOptInterface (ref [25]), the specific modeling language used as the test target. 
The paper frames PyOptInterface as a challenging test case ('its materials may not have been learned by existing LLMs') but does not disclose this as a conflict or test with alternative languages." 419 }, 420 { 421 "flag": "No limitations section", 422 "detail": "A 10-page journal paper with no dedicated limitations discussion. Scope boundaries (3 IEEE test systems, 4 equipment types, Qwen-only LLMs) are not acknowledged as limitations." 423 }, 424 { 425 "flag": "Author-designed benchmark", 426 "detail": "The 30 test dispatch requests were designed by the authors specifically for this study. No validation that these requests represent real-world dispatch diversity, and no independent benchmark is used." 427 } 428 ], 429 "cited_papers": [ 430 { 431 "title": "Exploring ChatGPT Capabilities and Limitations: A Survey", 432 "authors": ["A. Koubaa", "W. Boulila", "L. Ghouti", "A. Alzahem", "S. Latif"], 433 "year": 2023, 434 "relevance": "Survey of ChatGPT capabilities and limitations relevant to understanding LLM application boundaries." 435 }, 436 { 437 "title": "GPT-4 technical report", 438 "authors": ["OpenAI"], 439 "year": 2024, 440 "arxiv_id": "2303.08774", 441 "relevance": "Technical report on GPT-4 capabilities, foundational reference for LLM-based systems." 442 }, 443 { 444 "title": "On the Potential of ChatGPT to Generate Distribution Systems for Load Flow Studies Using OpenDSS", 445 "authors": ["R. S. Bonadia", "F. C. L. Trindade", "W. Freitas", "B. Venkatesh"], 446 "year": 2023, 447 "relevance": "Early work using LLMs (ChatGPT) to generate power system simulation code, directly related to LLM code generation for domain-specific tasks." 448 }, 449 { 450 "title": "Enhancing LLMs for Power System Simulations: A Feedback-driven Multi-agent Framework", 451 "authors": ["M. Jia", "Z. Cui", "G. 
Hug"], 452 "year": 2025, 453 "arxiv_id": "2411.16707", 454 "relevance": "Multi-agent LLM framework for power system simulations with feedback loops, closely related to this paper's multi-LLM coordination architecture." 455 }, 456 { 457 "title": "Exploring the capabilities and limitations of large language models in the electric energy sector", 458 "authors": ["S. Majumder", "L. Dong", "F. Doudi"], 459 "year": 2024, 460 "relevance": "Assessment of LLM capabilities in the energy sector including risk recognition and load forecasting." 461 }, 462 { 463 "title": "ElecBench: a Power Dispatch Evaluation Benchmark for Large Language Models", 464 "authors": ["X. Zhou", "H. Zhao", "Y. Cheng"], 465 "year": 2024, 466 "arxiv_id": "2407.05365", 467 "relevance": "Benchmark for evaluating LLM performance on power dispatch tasks, directly relevant to LLM capability assessment." 468 }, 469 { 470 "title": "Applying Large Language Models to Power Systems: Potential Security Threats", 471 "authors": ["J. Ruan"], 472 "year": 2024, 473 "relevance": "Discussion of security threats from applying LLMs to power systems, relevant to AI safety in critical infrastructure." 474 }, 475 { 476 "title": "Real-Time Optimal Power Flow With Linguistic Stipulations: Integrating GPT-Agent and Deep Reinforcement Learning", 477 "authors": ["Z. Yan", "Y. Xu"], 478 "year": 2024, 479 "relevance": "Integrates GPT-based agents with reinforcement learning for power flow optimization, an alternative approach to LLM-based dispatch." 480 }, 481 { 482 "title": "RL2: Reinforce Large Language Model to Assist Safe Reinforcement Learning for Energy Management of Active Distribution Networks", 483 "authors": ["X. Yang", "C. Lin", "H. Liu", "W. Wu"], 484 "year": 2024, 485 "relevance": "By the same first author, combines LLMs with RL for safe energy management in distribution networks." 486 }, 487 { 488 "title": "Language models are few-shot learners", 489 "authors": ["T. Brown", "B. Mann", "N. 
Ryder"], 490 "year": 2020, 491 "relevance": "Foundational work on few-shot learning with LLMs, the technique used by the Information Extractor and Code Programmer." 492 }, 493 { 494 "title": "Chain-of-Thought Prompting Elicits Reasoning in Large Language Models", 495 "authors": ["J. Wei", "X. Wang", "D. Schuurmans"], 496 "year": 2022, 497 "relevance": "Chain-of-thought prompting technique used as an enhancement method for the LLM agents in this paper." 498 }, 499 { 500 "title": "Retrieval-augmented generation for knowledge-intensive NLP tasks", 501 "authors": ["P. S. H. Lewis", "E. Perez", "A. Piktus"], 502 "year": 2020, 503 "relevance": "RAG technique foundational to the Code Programmer's few-shot example retrieval method." 504 }, 505 { 506 "title": "Evaluating large language models trained on code", 507 "authors": ["M. Chen", "J. Tworek", "H. Jun"], 508 "year": 2021, 509 "arxiv_id": "2107.03374", 510 "relevance": "Evaluating LLMs for code generation, relevant to the Code Programmer agent's task and the pass@k evaluation metric used." 511 } 512 ], 513 "engagement_factors": { 514 "practical_relevance": { 515 "score": 2, 516 "justification": "Useful concept for power system operators wanting natural-language dispatch, but requires domain-specific setup, commercial solvers, and LLM API access." 517 }, 518 "surprise_contrarian": { 519 "score": 0, 520 "justification": "Confirms the expected finding that multi-step LLM pipelines with RAG and few-shot learning can automate domain-specific code generation." 521 }, 522 "fear_safety": { 523 "score": 1, 524 "justification": "LLM-generated code controlling power grid dispatch raises implicit safety concerns, though the paper does not frame it as a safety issue." 525 }, 526 "drama_conflict": { 527 "score": 0, 528 "justification": "No controversy or dramatic claims; straightforward system paper." 
529 }, 530 "demo_ability": { 531 "score": 1, 532 "justification": "Supplementary code on GitHub exists but requires Qwen API access, commercial solvers, and power systems domain expertise to run." 533 }, 534 "brand_recognition": { 535 "score": 0, 536 "justification": "Tsinghua University authors using Alibaba's Qwen models; not a widely recognized lab or product in the LLM community." 537 } 538 } 539 }