scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (24665B)
      1 {
      2   "paper": {
      3     "title": "Prompt Less, Smile More: MTP with Semantic Engineering in Lieu of Prompt Engineering",
      4     "authors": [
      5       "Jayanaka L. Dantanarayana",
      6       "Savini Kashmira",
      7       "Thakee Nathees",
      8       "Zichen Zhang",
      9       "Krisztian Flautner",
     10       "Lingjia Tang",
     11       "Jason Mars"
     12     ],
     13     "year": 2025,
     14     "venue": "arXiv",
     15     "arxiv_id": "2511.19427",
     16     "doi": "10.48550/arXiv.2511.19427"
     17   },
     18   "scan_version": 2,
     19   "active_modules": ["experimental_rigor", "data_leakage"],
     20   "methodology_tags": ["benchmark-eval"],
     21   "key_findings": "Semantic Engineering via SemTexts improves MTP performance by 1.3x to 3x over base MTP on complex AI-integrated benchmarks, matching or surpassing manually crafted Prompt Engineering while requiring ~3.8x less developer effort (measured in LOC). An ablation study shows that a few targeted SemText annotations at semantic gap points are most effective, with diminishing returns from additional annotations. Spatial affinity between semantic context and code entities in prompts significantly improves LLM reasoning accuracy.",
     22   "checklist": {
     23     "artifacts": {
     24       "code_released": {
     25         "applies": true,
     26         "answer": false,
     27         "justification": "No repository URL or code archive is provided in the paper. The implementation is described as extending the Jac language but no link to the source code is given."
     28       },
     29       "data_released": {
     30         "applies": true,
     31         "answer": false,
     32         "justification": "Custom synthetic datasets (Memory Retrieval with 60 users/300 queries, Task Manager with 220 queries) are not released. While some evaluation uses public datasets (LAION-400M, InstructEval, SWE-bench Lite), the custom datasets have no download link."
     33       },
     34       "environment_specified": {
     35         "applies": true,
     36         "answer": false,
     37         "justification": "Hardware is mentioned (NVIDIA RTX 3090, 24GB VRAM, 64GB RAM) but no requirements.txt, Dockerfile, or detailed dependency list is provided."
     38       },
     39       "reproduction_instructions": {
     40         "applies": true,
     41         "answer": false,
     42         "justification": "No step-by-step reproduction instructions, README, or scripts are provided."
     43       }
     44     },
     45     "statistical_methodology": {
     46       "confidence_intervals_or_error_bars": {
     47         "applies": true,
     48         "answer": false,
     49         "justification": "Table 2 reports point estimates only (e.g., '0.701', '89.546%') with no confidence intervals or error bars."
     50       },
     51       "significance_tests": {
     52         "applies": true,
     53         "answer": false,
     54         "justification": "The paper claims improvements (e.g., '1.3x to 3x') by comparing raw numbers in Table 2 without any statistical significance tests."
     55       },
     56       "effect_sizes_reported": {
     57         "applies": true,
     58         "answer": true,
     59         "justification": "The paper reports improvement factors (1.3x to 3x) with baseline context. Table 2 provides absolute scores for all three conditions, and Table 3 reports LOC reduction factors (e.g., '↓×3.53'). Readers can assess magnitude of effects."
     60       },
     61       "sample_size_justified": {
     62         "applies": true,
     63         "answer": false,
     64         "justification": "No justification is given for dataset sizes (300 images, 220 queries, 200 prompts, 300 SWE-bench tasks). No power analysis."
     65       },
     66       "variance_reported": {
     67         "applies": true,
     68         "answer": false,
     69         "justification": "No standard deviations, variance, or spread measures are reported. Results appear to be single-run numbers."
     70       }
     71     },
     72     "evaluation_design": {
     73       "baselines_included": {
     74         "applies": true,
     75         "answer": true,
     76         "justification": "Two baselines are included: traditional Prompt Engineering (PE) and base MTP without SemTexts (Table 2)."
     77       },
     78       "baselines_contemporary": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "MTP (2025, same group) is the most recent AI-integration framework. PE is the standard practice. DSPy is discussed as related work. Baselines are appropriate."
     82       },
     83       "ablation_study": {
     84         "applies": true,
     85         "answer": true,
     86         "justification": "Section 5.3.1 presents a detailed ablation study on Content Creator, incrementally adding SemTexts (MTPbase → Semrouting → Semstage → Semreview → Semall → Semprompt) and measuring each step's contribution (Figure 8)."
     87       },
     88       "multiple_metrics": {
     89         "applies": true,
     90         "answer": false,
     91         "justification": "Each benchmark uses only a single metric: F1 for Memory Retrieval, hybrid similarity for Image Extraction, success rate for Task Manager and Content Creator, test passing rate for Aider Genius. No benchmark is evaluated with multiple metrics."
     92       },
     93       "human_evaluation": {
     94         "applies": true,
     95         "answer": false,
     96         "justification": "No human evaluation is included. Content Creator and Task Manager use LLM-as-a-judge. All evaluation is automated."
     97       },
     98       "held_out_test_set": {
     99         "applies": true,
    100         "answer": false,
    101         "justification": "No explicit separation of dev and test splits is described. It is unclear whether the same data was used for development/tuning and final evaluation."
    102       },
    103       "per_category_breakdown": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "Table 2 provides per-benchmark breakdowns for all three methods across two models. The ablation study (Figure 8) shows step-by-step performance changes. Content Creator evaluation includes four writing categories."
    107       },
    108       "failure_cases_discussed": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "Section 5.3.1 discusses routing errors in MTP's failing cases and Section 5.3.2 discusses the Memory Retrieval case where SemTexts provide no benefit. The Semprompt ablation step shows degraded performance."
    112       },
    113       "negative_results_reported": {
    114         "applies": true,
    115         "answer": true,
    116         "justification": "Several negative results: SemTexts offer no benefit on Memory Retrieval (Section 5.3.2), Semprompt reduces accuracy compared to Semreview (Section 5.3.1, Figure 8), and adding SemTexts to all entities (Semall) provides no gain beyond targeted placement."
    117       }
    118     },
    119     "claims_and_evidence": {
    120       "abstract_claims_supported": {
    121         "applies": true,
    122         "answer": true,
    123         "justification": "Abstract claims of '1.3x to 3x' improvement are supported by Table 2 results. '3.8x' effort reduction is supported by Table 3. Claims of matching PE are supported across benchmarks."
    124       },
    125       "causal_claims_justified": {
    126         "applies": true,
    127         "answer": true,
    128         "justification": "Causal claims ('SemTexts improve MTP performance') are supported by controlled ablation studies (Section 5.3.1) where single-variable manipulation isolates the contribution of each SemText group."
    129       },
    130       "generalization_bounded": {
    131         "applies": true,
    132         "answer": false,
    133         "justification": "The paper presents 'Semantic Engineering' as a general paradigm (title: 'in Lieu of Prompt Engineering') but evaluates only within the Jac/MTP framework with two models (GPT-4o, Gemma3:27b). No evidence it generalizes beyond MTP."
    134       },
    135       "alternative_explanations_discussed": {
    136         "applies": true,
    137         "answer": false,
    138         "justification": "No discussion of alternative explanations. For example, improvements could stem from providing more text context rather than the specific SemText mechanism. The LLM-as-judge evaluation could also favor longer/structured outputs."
    139       },
    140       "proxy_outcome_distinction": {
    141         "applies": true,
    142         "answer": true,
    143         "justification": "The paper measures accuracy/success rates and claims performance improvement — the measurements match the granularity of the claims. Developer effort is measured in LOC, which is acknowledged as a proxy following prior work [3, 7]."
    144       }
    145     },
    146     "setup_transparency": {
    147       "model_versions_specified": {
    148         "applies": true,
    149         "answer": false,
    150         "justification": "The paper says 'GPT-4o' without a snapshot date or API version, and 'Gemma3:27b' without a specific version identifier. Marketing name 'GPT-4o' does not count as a specified version per schema guidance."
    151       },
    152       "prompts_provided": {
    153         "applies": true,
    154         "answer": true,
    155         "justification": "The PE prompt is shown in full (Figure 3, 70 lines). The MTP-generated prompt is shown (Figure 4c). SemText annotations are shown (Figures 7, 9). The prompt generation mechanism is formally specified."
    156       },
    157       "hyperparameters_reported": {
    158         "applies": true,
    159         "answer": false,
    160         "justification": "No LLM inference hyperparameters (temperature, top-p, max tokens) are reported for either GPT-4o or Gemma3:27b."
    161       },
    162       "scaffolding_described": {
    163         "applies": true,
    164         "answer": true,
    165         "justification": "The MTP compilation pipeline and runtime are described in detail (Section 4, Algorithms 1-2, Figure 5). The SemTable build pass and enriched MT-IR construction are formally specified."
    166       },
    167       "data_preprocessing_documented": {
    168         "applies": true,
    169         "answer": false,
    170         "justification": "Synthetic dataset generation for Memory Retrieval and Task Manager is described only at a high level (Table 1). No details on how the synthetic data was created, filtered, or validated."
    171       }
    172     },
    173     "limitations_and_scope": {
    174       "limitations_section_present": {
    175         "applies": true,
    176         "answer": false,
    177         "justification": "No dedicated limitations or threats-to-validity section exists in the paper."
    178       },
    179       "threats_to_validity_specific": {
    180         "applies": true,
    181         "answer": false,
    182         "justification": "No threats to validity are discussed anywhere in the paper."
    183       },
    184       "scope_boundaries_stated": {
    185         "applies": true,
    186         "answer": false,
    187         "justification": "No explicit scope boundaries are stated. The paper does not discuss what settings or scenarios the results do NOT apply to."
    188       }
    189     },
    190     "data_integrity": {
    191       "raw_data_available": {
    192         "applies": true,
    193         "answer": false,
    194         "justification": "No raw experimental data (individual benchmark results, LLM outputs) is available for verification."
    195       },
    196       "data_collection_described": {
    197         "applies": true,
    198         "answer": false,
    199         "justification": "Custom synthetic datasets are described only briefly in Table 1. No details on generation methodology for the Memory Retrieval (60 users, 300 queries) or Task Manager (220 queries) datasets."
    200       },
    201       "recruitment_methods_described": {
    202         "applies": false,
    203         "answer": false,
    204         "justification": "No human participants. Data sources are standard benchmarks (SWE-bench Lite, InstructEval, LAION-400M) and synthetic datasets."
    205       },
    206       "data_pipeline_documented": {
    207         "applies": true,
    208         "answer": false,
    209         "justification": "No documentation of data pipeline from collection to final analysis. Unclear how synthetic datasets were validated or how benchmark instances were selected (e.g., which 300 images were drawn from LAION-400M, and whether the 300 SWE-bench Lite tasks are the full Lite split)."
    210       }
    211     },
    212     "conflicts_of_interest": {
    213       "funding_disclosed": {
    214         "applies": true,
    215         "answer": false,
    216         "justification": "No funding information is disclosed. The Acknowledgments section does not mention any grants or sponsors."
    217       },
    218       "affiliations_disclosed": {
    219         "applies": true,
    220         "answer": true,
    221         "justification": "Author affiliations are clearly listed: University of Michigan and Jaseci Labs. Jaseci Labs develops the Jac programming language being evaluated."
    222       },
    223       "funder_independent_of_outcome": {
    224         "applies": true,
    225         "answer": false,
    226         "justification": "No funding is disclosed, so independence cannot be assessed. One author (Thakee Nathees) is from Jaseci Labs, which develops Jac, the language being evaluated. This conflict is not acknowledged."
    227       },
    228       "financial_interests_declared": {
    229         "applies": true,
    230         "answer": false,
    231         "justification": "No competing interests statement. Jaseci Labs (author affiliation) commercially develops the Jac language being evaluated, but no financial interest disclosure is provided."
    232       }
    233     },
    234     "contamination": {
    235       "training_cutoff_stated": {
    236         "applies": true,
    237         "answer": false,
    238         "justification": "No training data cutoff dates are stated for GPT-4o or Gemma3:27b."
    239       },
    240       "train_test_overlap_discussed": {
    241         "applies": true,
    242         "answer": false,
    243         "justification": "SWE-bench Lite (from GitHub issues) could overlap with GPT-4o's training data. This is not discussed."
    244       },
    245       "benchmark_contamination_addressed": {
    246         "applies": true,
    247         "answer": false,
    248         "justification": "SWE-bench Lite was published in 2024 and uses pre-existing GitHub issues. InstructEval was published in 2023. No contamination analysis is performed."
    249       }
    250     },
    251     "human_studies": {
    252       "pre_registered": {
    253         "applies": false,
    254         "answer": false,
    255         "justification": "No human participants in this study."
    256       },
    257       "irb_or_ethics_approval": {
    258         "applies": false,
    259         "answer": false,
    260         "justification": "No human participants in this study."
    261       },
    262       "demographics_reported": {
    263         "applies": false,
    264         "answer": false,
    265         "justification": "No human participants in this study."
    266       },
    267       "inclusion_exclusion_criteria": {
    268         "applies": false,
    269         "answer": false,
    270         "justification": "No human participants in this study."
    271       },
    272       "randomization_described": {
    273         "applies": false,
    274         "answer": false,
    275         "justification": "No human participants in this study."
    276       },
    277       "blinding_described": {
    278         "applies": false,
    279         "answer": false,
    280         "justification": "No human participants in this study."
    281       },
    282       "attrition_reported": {
    283         "applies": false,
    284         "answer": false,
    285         "justification": "No human participants in this study."
    286       }
    287     },
    288     "cost_and_practicality": {
    289       "inference_cost_reported": {
    290         "applies": true,
    291         "answer": false,
    292         "justification": "No inference costs, API costs, or latency numbers are reported despite using GPT-4o API and running Gemma locally."
    293       },
    294       "compute_budget_stated": {
    295         "applies": true,
    296         "answer": false,
    297         "justification": "Hardware is mentioned (RTX 3090, 64GB RAM) but no total compute budget, GPU hours, or API spend is stated."
    298       }
    299     },
    300     "experimental_rigor": {
    301       "seed_sensitivity_reported": {
    302         "applies": true,
    303         "answer": false,
    304         "justification": "No multi-seed experiments. Results appear to be single-run."
    305       },
    306       "number_of_runs_stated": {
    307         "applies": true,
    308         "answer": false,
    309         "justification": "The number of experimental runs is never stated. Results in Table 2 have no indication of how many runs produced them."
    310       },
    311       "hyperparameter_search_budget": {
    312         "applies": true,
    313         "answer": false,
    314         "justification": "No hyperparameter search budget is reported. The SemText content appears manually crafted but no systematic exploration is documented."
    315       },
    316       "best_config_selection_justified": {
    317         "applies": true,
    318         "answer": false,
    319         "justification": "No discussion of how the final SemText configurations were selected. The ablation shows incremental additions but no systematic selection criterion."
    320       },
    321       "multiple_comparison_correction": {
    322         "applies": false,
    323         "answer": false,
    324         "justification": "No statistical tests are performed, so multiple comparison correction is not applicable."
    325       },
    326       "self_comparison_bias_addressed": {
    327         "applies": true,
    328         "answer": false,
    329         "justification": "The authors evaluate their own system (Jac/MTP/SemTexts) against their own PE baselines. No acknowledgment of author-evaluation bias. All baselines were implemented by the same team."
    330       },
    331       "compute_budget_vs_performance": {
    332         "applies": true,
    333         "answer": false,
    334         "justification": "SemTexts add compilation and prompt generation overhead but performance vs. compute tradeoff is not discussed."
    335       },
    336       "benchmark_construct_validity": {
    337         "applies": true,
    338         "answer": true,
    339         "justification": "Section 5 (Benchmark Applications) explicitly discusses why GSM8K and HotpotQA are insufficient benchmarks for AI-integrated applications and introduces new benchmarks designed to test capabilities C1-C6 that better represent real-world scenarios."
    340       },
    341       "scaffold_confound_addressed": {
    342         "applies": false,
    343         "answer": false,
    344         "justification": "The scaffold (MTP vs PE) IS the thing being compared — the paper is evaluating different prompt generation approaches, not comparing models across scaffolds."
    345       }
    346     },
    347     "data_leakage": {
    348       "temporal_leakage_addressed": {
    349         "applies": true,
    350         "answer": false,
    351         "justification": "No discussion of whether GPT-4o or Gemma may have seen SWE-bench, InstructEval, or LAION-400M data during training."
    352       },
    353       "feature_leakage_addressed": {
    354         "applies": true,
    355         "answer": false,
    356         "justification": "No discussion of whether the evaluation setup leaks information not available in real usage."
    357       },
    358       "non_independence_addressed": {
    359         "applies": true,
    360         "answer": false,
    361         "justification": "No discussion of potential overlap between training data and benchmark examples."
    362       },
    363       "leakage_detection_method": {
    364         "applies": true,
    365         "answer": false,
    366         "justification": "No leakage detection or prevention methods are applied."
    367       }
    368     }
    369   },
    370   "claims": [
    371     {
    372       "claim": "SemTexts improve MTP performance by 1.3x to 3x on complex benchmarks.",
    373       "evidence": "Table 2 shows improvements across benchmarks: Task Manager 36.8% → 92.3% (2.5x), Content Creator 32.5% → 96.0% (2.95x), Image Extraction 0.284 → 0.439 (1.55x), Aider Genius 9.7% → 18.7% (1.93x) with GPT-4o.",
    374       "supported": "strong"
    375     },
    376     {
    377       "claim": "MTP+SemTexts matches or surpasses Prompt Engineering performance across benchmarks.",
    378       "evidence": "Table 2 shows MTP+SemTexts achieves comparable scores to PE: Task Manager 92.3% vs 89.5%, Content Creator 96.0% vs 95.0%, Image Extraction 0.439 vs 0.427, Aider Genius 18.7% vs 19.7%.",
    379       "supported": "moderate"
    380     },
    381     {
    382       "claim": "Semantic Engineering reduces developer effort by ~3.8x compared to Prompt Engineering.",
    383       "evidence": "Table 3 reports LOC reduction factors averaging 3.8x across 5 benchmarks (ranging from 1.76x to 7.8x).",
    384       "supported": "moderate"
    385     },
    386     {
    387       "claim": "SemTexts outperform docstrings as semantic annotation mechanisms by roughly 7-8 percentage points of benchmark accuracy.",
    388       "evidence": "Table 4 shows MTP+SemText vs MTP+Docstring: Content Creator 96% vs 88%, Task Manager 92.3% vs 85.7%. Attributed to spatial affinity of semantic context (Section 5.4).",
    389       "supported": "moderate"
    390     },
    391     {
    392       "claim": "Accuracy does not increase linearly with the number of SemTexts; targeted placement at semantic gap points is most effective.",
    393       "evidence": "Ablation study (Figure 8) shows Semall and Semprompt provide no improvement or even degrade performance compared to targeted SemTexts (Semreview).",
    394       "supported": "strong"
    395     }
    396   ],
    397   "red_flags": [
    398     {
    399       "flag": "Authors evaluate own system",
    400       "detail": "All authors are from the team that developed Jac/MTP. They evaluate their own system (MTP+SemTexts) against their own PE baselines and their own custom benchmarks, without acknowledging author-evaluation bias. All benchmarks were designed by the same group."
    401     },
    402     {
    403       "flag": "Custom benchmarks without validation",
    404       "detail": "Two of five benchmarks (Memory Retrieval, Task Manager) use custom synthetic datasets created by the authors, and the Content Creator benchmark application was likewise designed by the authors. No external validation of these benchmarks. The benchmarks may be designed to favor the SemText approach."
    405     },
    406     {
    407       "flag": "No uncertainty quantification",
    408       "detail": "No error bars, confidence intervals, variance, or multi-run results are reported for any benchmark. LLM outputs are stochastic, so single-run results may not be reproducible."
    409     },
    410     {
    411       "flag": "LLM-as-judge without validation",
    412       "detail": "Task Manager and Content Creator use LLM-as-judge evaluation without validating judge accuracy against human judgments. The judge may systematically favor certain output styles."
    413     },
    414     {
    415       "flag": "No limitations section",
    416       "detail": "The paper has no limitations section, no threats to validity, and no scope boundaries — unusual for a systems paper making broad claims about a 'new programming paradigm'."
    417     },
    418     {
    419       "flag": "Undisclosed conflict of interest",
    420       "detail": "One author is from Jaseci Labs, which commercially develops the Jac language being evaluated. This conflict is not acknowledged, and no competing interests statement is included."
    421     }
    422   ],
    423   "cited_papers": [
    424     {
    425       "title": "Prompting Is Programming: A Query Language for Large Language Models",
    426       "authors": ["Luca Beurer-Kellner", "Marc Fischer", "Martin Vechev"],
    427       "year": 2023,
    428       "doi": "10.1145/3591300",
    429       "relevance": "LMQL framework for constrained LLM generation — direct competitor in AI-integrated programming paradigm."
    430     },
    431     {
    432       "title": "DSPy: Compiling Declarative Language Model Calls into Self-Improving Pipelines",
    433       "authors": ["Omar Khattab", "Arnav Singhvi", "Paridhi Maheshwari"],
    434       "year": 2023,
    435       "arxiv_id": "2310.03714",
    436       "relevance": "Major framework for automated prompt generation that reduces prompt engineering overhead — key related-work comparison point (discussed, not evaluated as a baseline)."
    437     },
    438     {
    439       "title": "MTP: A Meaning-Typed Language Abstraction for AI-Integrated Programming",
    440       "authors": ["Jayanaka L. Dantanarayana", "Yiping Kang"],
    441       "year": 2025,
    442       "doi": "10.1145/3763092",
    443       "relevance": "The foundational MTP system that this paper extends — core AI-integrated programming abstraction."
    444     },
    445     {
    446       "title": "SWE-bench: Can Language Models Resolve Real-world Github Issues?",
    447       "authors": ["Carlos E Jimenez", "John Yang"],
    448       "year": 2024,
    449       "relevance": "Benchmark used for Aider Genius evaluation — tests real-world code editing capability."
    450     },
    451     {
    452       "title": "SGLang: Efficient Execution of Structured Language Model Programs",
    453       "authors": ["Lianmin Zheng", "Liangsheng Yin"],
    454       "year": 2024,
    455       "arxiv_id": "2312.07104",
    456       "relevance": "Efficient structured LLM program execution framework — related approach to AI-integrated programming."
    457     },
    458     {
    459       "title": "Effective LLM-Driven Code Generation with Pythoness",
    460       "authors": ["Kyla H. Levin", "Kyle Gwilt", "Emery D. Berger", "Stephen N. Freund"],
    461       "year": 2025,
    462       "arxiv_id": "2501.02138",
    463       "relevance": "LLM-driven runtime code generation approach — alternative paradigm for AI-integrated programming."
    464     },
    465     {
    466       "title": "A Systematic Survey of Prompt Engineering in Large Language Models: Techniques and Applications",
    467       "authors": ["Pranab Sahoo", "Ayush Kumar Singh"],
    468       "year": 2025,
    469       "arxiv_id": "2402.07927",
    470       "relevance": "Comprehensive survey of prompt engineering techniques — contextualizes the problem SemTexts addresses."
    471     },
    472     {
    473       "title": "PromptPex: Automatic Test Generation for Language Model Prompts",
    474       "authors": ["Reshabh K Sharma", "Jonathan De Halleux", "Shraddha Barke", "Benjamin Zorn"],
    475       "year": 2025,
    476       "arxiv_id": "2503.05070",
    477       "relevance": "Automatic testing of LLM prompts — relevant to quality assurance of AI-integrated systems."
    478     },
    479     {
    480       "title": "GEPA: Reflective Prompt Evolution Can Outperform Reinforcement Learning",
    481       "authors": ["Lakshya A Agrawal", "Shangyin Tan"],
    482       "year": 2025,
    483       "arxiv_id": "2507.19457",
    484       "relevance": "Evolutionary prompt optimization built on DSPy — alternative approach to improving automated prompts."
    485     }
    486   ]
    487 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs