scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (28037B)
      1 {
      2   "paper": {
      3     "title": "Prompt Sapper: A LLM-Empowered Production Tool for Building AI Chains",
      4     "authors": [
      5       "Yu Cheng",
      6       "Jieshan Chen",
      7       "Qing Huang",
      8       "Zhenchang Xing",
      9       "Xiwei Xu",
     10       "Qinghua Lu"
     11     ],
     12     "year": 2023,
     13     "venue": "ACM Transactions on Software Engineering and Methodology",
     14     "arxiv_id": "2306.12028",
     15     "doi": "10.1145/3638247"
     16   },
     17   "scan_version": 3,
     18   "active_modules": [],
     19   "methodology_tags": ["case-study", "qualitative"],
     20   "key_findings": "Prompt Sapper, a no-code visual programming IDE for building AI chains on foundation models, significantly reduces task completion time compared to Python/PyCharm (mean 1,689s vs 2,366s, p=0.0004) while maintaining equivalent correctness in a within-subject study with 18 participants. A second user study with 12 participants showed that LLM-based co-pilots for requirement elicitation and AI chain skeleton generation received favorable ratings (mean ≥3.5/5 across all dimensions), with 18/24 generated skeletons directly executable without modification.",
     21   "checklist": {
     22     "artifacts": {
     23       "code_released": {
     24         "applies": true,
     25         "answer": true,
     26         "justification": "The paper provides a GitHub URL (https://github.com/YuCheng1106/PromptSapper) and states 'we released the source code of our tool at our GitHub repository.' However, the sapperchain Python library required to execute downloaded AI chain code is explicitly noted as 'not open sourced' (Section 4.3.1)."
     27       },
     28       "data_released": {
     29         "applies": true,
     30         "answer": true,
     31         "justification": "The paper states 'The implementation of Sapper IDE and all experiment data and results can be downloaded at our Github repository' (Section 5 footnote and Section 8)."
     32       },
     33       "environment_specified": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "No environment specifications (requirements.txt, Dockerfile, dependency lists with versions) are mentioned in the paper. The tool is described as web-based but no deployment or setup instructions are provided."
     37       },
     38       "reproduction_instructions": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "No step-by-step reproduction instructions are described in the paper. The study procedure is described at a high level but there are no specific instructions for reproducing the user studies or running the tool."
     42       }
     43     },
     44     "statistical_methodology": {
     45       "confidence_intervals_or_error_bars": {
     46         "applies": true,
     47         "answer": true,
     48         "justification": "Table 1 reports means with standard deviations in parentheses for all conditions and tasks, e.g., 'Mean=1,689.00s, Std=447.49s' (Section 5.2.1). Figure 3 shows usability scores as bar charts."
     49       },
     50       "significance_tests": {
     51         "applies": true,
     52         "answer": true,
     53         "justification": "Paired t-tests are used throughout with Benjamini-Hochberg correction for multiple comparisons. Specific p-values reported: time difference p=0.0004, diffuseness p=0.0151, visibility p=0.0006 (Section 5.2)."
     54       },
     55       "effect_sizes_reported": {
     56         "applies": true,
     57         "answer": true,
     58         "justification": "While formal effect sizes (Cohen's d) are not computed, the paper provides means and standard deviations for both conditions allowing effect size assessment: Sapper V2 mean 1,689s vs Python mean 2,366s with a t-statistic of 4.54 (Section 5.2.1)."
     59       },
     60       "sample_size_justified": {
     61         "applies": true,
     62         "answer": false,
     63         "justification": "No justification is given for why 18 participants were recruited for study 1 or 12 for study 2. No power analysis is reported. The sample sizes appear chosen by convenience."
     64       },
     65       "variance_reported": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "Standard deviations are reported alongside means in Table 1 for all conditions and tasks, e.g., 'Mean=1,689.00s, Std=447.49s' and '475.27 (228.58)' format."
     69       }
     70     },
     71     "evaluation_design": {
     72       "baselines_included": {
     73         "applies": true,
     74         "answer": true,
     75         "justification": "The study compares Sapper V2 against Python/PyCharm (native programming baseline) and Sapper V1 (block view only, ablation baseline) in a within-subject design (Section 5.1)."
     76       },
     77       "baselines_contemporary": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "Python is the most common language for LLM development and PyCharm is a widely-used contemporary IDE. These are appropriate and current baselines for the task of building AI chains."
     81       },
     82       "ablation_study": {
     83         "applies": true,
     84         "answer": true,
     85         "justification": "Sapper V1 (Block View only) vs Sapper V2 (Block View + Design View) serves as an ablation study isolating the contribution of the Design View component (Section 5.2.2)."
     86       },
     87       "multiple_metrics": {
     88         "applies": true,
     89         "answer": true,
     90         "justification": "The study measures task completion time, correctness (number of correctly completed tasks), and 9 cognitive dimension usability scores (Section 5.2). The co-pilot study adds 5 further rating dimensions."
     91       },
     92       "human_evaluation": {
     93         "applies": true,
     94         "answer": true,
     95         "justification": "The entire evaluation methodology consists of user studies with human participants (18 in study 1, 12 in study 2) who directly use and evaluate the tools (Sections 5 and 6)."
     96       },
     97       "held_out_test_set": {
     98         "applies": false,
     99         "answer": false,
    100         "justification": "This is a user study evaluating a tool through human performance, not a machine learning benchmark evaluation. The concept of held-out test sets does not structurally apply."
    101       },
    102       "per_category_breakdown": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "Table 1 breaks down time spent per task (Tasks 1-4). Figure 3 shows 9 individual cognitive dimension scores. Figures 4 and 5 break down co-pilot ratings across 5 and 3 dimensions respectively."
    106       },
    107       "failure_cases_discussed": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "Section 5.2.2 discusses specific issues: Design View doesn't support copy-paste, doesn't generate if-else/while loops, task decomposition co-pilot requires significant response time. Section 6.2.2 notes 6/24 skeletons required modifications."
    111       },
    112       "negative_results_reported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "The paper reports that Sapper V2 showed no significant improvement over V1 in time (p=0.62, Section 5.2.2). It also reports Task 2 took slightly longer in V2 than V1 (Table 1: 529.94s vs 485.72s). Several Design View limitations are explicitly discussed."
    116       }
    117     },
    118     "claims_and_evidence": {
    119       "abstract_claims_supported": {
    120         "applies": true,
    121         "answer": true,
    122         "justification": "The abstract claims the tool 'evaluated and demonstrated the efficiency and correctness.' Efficiency is supported by significant time savings (p=0.0004). Correctness is supported by equivalent correctness to Python (no significant difference, p=0.59). The abstract also claims co-pilots 'enhance the efficiency and effectiveness,' supported by the second user study's favorable ratings."
    123       },
    124       "causal_claims_justified": {
    125         "applies": true,
    126         "answer": true,
    127         "justification": "The paper claims Sapper 'can significantly save time' compared to Python. The within-subject design with counterbalanced order across 6 possible tool orderings (Section 5.1.1) provides adequate causal identification for tool-level comparisons."
    128       },
    129       "generalization_bounded": {
    130         "applies": true,
    131         "answer": false,
    132         "justification": "The paper claims Prompt Sapper can 'democratize the usage of LLMs, making AI accessible to a wider range of individuals, including non-technical people' (Section 1), but all 30 participants were 18-25-year-old university students majoring in CS, SE, AI, or Big Data. The generalization to non-technical users is unsupported."
    133       },
    134       "alternative_explanations_discussed": {
    135         "applies": true,
    136         "answer": false,
    137         "justification": "The paper uses counterbalanced order to control for sequence effects but does not discuss alternative explanations such as novelty effects, experimenter demand characteristics, task designs that may favor visual programming, or whether the observed time differences stem from API lookup difficulty rather than tool quality."
    138       },
    139       "proxy_outcome_distinction": {
    140         "applies": true,
    141         "answer": false,
    142         "justification": "The paper measures task completion time and Likert-scale usability scores but frames results as demonstrating 'practicality,' 'low entry barrier,' and 'democratization.' The gap between these measured proxies and the broader claimed outcomes (especially for non-technical users) is not acknowledged."
    143       }
    144     },
    145     "setup_transparency": {
    146       "model_versions_specified": {
    147         "applies": true,
    148         "answer": false,
    149         "justification": "The paper mentions 'gpt-3.5-turbo,' 'text-davinci-003,' and 'DALL-E' as pre-installed engines (Section 4.3.3) but does not specify exact version snapshots or dates. 'gpt-3.5-turbo' changes over time without a version specifier."
    150       },
    151       "prompts_provided": {
    152         "applies": true,
    153         "answer": false,
    154         "justification": "The co-pilots are described functionally (e.g., 'prompted as an infinite questioner,' 'converts the high-level intention into the main steps') but the actual system prompts used by the requirement elicitation and skeleton generation co-pilots are not provided anywhere in the paper."
    155       },
    156       "hyperparameters_reported": {
    157         "applies": true,
    158         "answer": false,
    159         "justification": "Section 4.3.3 mentions configurable parameters (temperature, maximum length, Top P, frequency penalty, presence penalty) in Engine Management but does not state what values were used during the user studies."
    160       },
    161       "scaffolding_described": {
    162         "applies": true,
    163         "answer": true,
    164         "justification": "The paper's main contribution is describing the Prompt Sapper tool architecture in detail across Sections 3-4, including Design View workflow (Fig. 1), Block View visual programming (Fig. 2), co-pilot interaction cycles, worker composition and cooperation patterns, and how conversation histories are transmitted to the LLM."
    165       },
    166       "data_preprocessing_documented": {
    167         "applies": true,
    168         "answer": false,
    169         "justification": "The study procedure is described (counterbalanced order, screen recording, questionnaires) but how the raw data (screen recordings, time measurements, correctness assessments) was processed into the reported statistics is not documented."
    170       }
    171     },
    172     "limitations_and_scope": {
    173       "limitations_section_present": {
    174         "applies": true,
    175         "answer": false,
    176         "justification": "Section 7 (Discussion) discusses future enhancements and challenges but is forward-looking rather than a limitations section. There is no dedicated 'Limitations' or 'Threats to Validity' section addressing the current study's weaknesses."
    177       },
    178       "threats_to_validity_specific": {
    179         "applies": true,
    180         "answer": false,
    181         "justification": "No specific threats to validity are discussed. The Discussion section covers future work directions (enhanced co-pilots, testing challenges, deployment) but does not address specific methodological threats like the student-only sample, small N, or potential demand effects."
    182       },
    183       "scope_boundaries_stated": {
    184         "applies": true,
    185         "answer": false,
    186         "justification": "The paper does not explicitly state what the results do NOT show. It does not bound its claims about 'democratization' or 'non-technical users' to the actually tested population of CS/SE/AI students."
    187       }
    188     },
    189     "data_integrity": {
    190       "raw_data_available": {
    191         "applies": true,
    192         "answer": true,
    193         "justification": "The paper states 'all experiment data and results can be downloaded at our Github repository' (Section 8, with URL https://github.com/YuCheng1106/PromptSapper)."
    194       },
    195       "data_collection_described": {
    196         "applies": true,
    197         "answer": true,
    198         "justification": "Section 5.1.1 describes the procedure: within-subject design, counterbalanced order, training session, warm-up tasks, three task sets with similar difficulty, screen recording, and post-task questionnaires. Section 6.1 describes the co-pilot study procedure."
    199       },
    200       "recruitment_methods_described": {
    201         "applies": true,
    202         "answer": false,
    203         "justification": "Study 1 says only 'We recruited 18 participants' with no description of how. Study 2 says 'recruited another 12 participants... through public channels' which is vague. No description of recruitment channels, response rates, or potential selection bias."
    204       },
    205       "data_pipeline_documented": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "The paper does not describe how screen recordings were analyzed, how correctness was determined from task outputs, or how time measurements were extracted. The pipeline from raw data collection to reported statistics is not documented."
    209       }
    210     },
    211     "conflicts_of_interest": {
    212       "funding_disclosed": {
    213         "applies": true,
    214         "answer": true,
    215         "justification": "The Acknowledgments section states: 'This work is partially supported by the National Nature Science Foundation of China under Grant (62262031).'"
    216       },
    217       "affiliations_disclosed": {
    218         "applies": true,
    219         "answer": true,
    220         "justification": "Author affiliations are clearly listed: Jiangxi Normal University (China) and CSIRO's Data61 (Australia). These are the developers of the evaluated tool."
    221       },
    222       "funder_independent_of_outcome": {
    223         "applies": true,
    224         "answer": true,
    225         "justification": "The National Natural Science Foundation of China is a government funding agency with no financial stake in whether Prompt Sapper outperforms Python."
    226       },
    227       "financial_interests_declared": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "No competing interests or financial interests statement is present in the paper. The authors developed and evaluate their own tool, but this potential conflict is not explicitly acknowledged."
    231       }
    232     },
    233     "contamination": {
    234       "training_cutoff_stated": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "This paper evaluates a visual programming tool through user studies, not a pre-trained model's capability on benchmarks. Contamination criteria do not apply."
    238       },
    239       "train_test_overlap_discussed": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "No benchmark evaluation of pre-trained model capability is performed. The evaluation is a user study comparing programming tools."
    243       },
    244       "benchmark_contamination_addressed": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No benchmark evaluation of pre-trained model capability is performed. Contamination does not structurally apply."
    248       }
    249     },
    250     "human_studies": {
    251       "pre_registered": {
    252         "applies": true,
    253         "answer": false,
    254         "justification": "No mention of pre-registration (OSF, AsPredicted, or any registry) is found in the paper."
    255       },
    256       "irb_or_ethics_approval": {
    257         "applies": true,
    258         "answer": false,
    259         "justification": "No IRB or ethics board approval is mentioned. The only consent reference is 'We obtained their approval for this' regarding screen recording (Section 5.1.1 footnote), which is participant consent, not ethics board review."
    260       },
    261       "demographics_reported": {
    262         "applies": true,
    263         "answer": true,
    264         "justification": "Study 1 reports age range (18-25), majors (CS, SE, AI), programming experience (0-1, 1-3, 3+ years), gender (10M, 8F), and prior tool experience. Study 2 reports age (18-25), specializations (AI, CS, Big Data), and experience levels (Section 5.1.1 and 6.1)."
    265       },
    266       "inclusion_exclusion_criteria": {
    267         "applies": true,
    268         "answer": false,
    269         "justification": "Participant characteristics are described (e.g., 'None of them have used visual programming tools before, and all have learned Python and use PyCharm before') but these are stated as sample observations, not as pre-defined inclusion/exclusion criteria with a screening process."
    270       },
    271       "randomization_described": {
    272         "applies": true,
    273         "answer": true,
    274         "justification": "Section 5.1.1 describes counterbalanced order: 'since we had three different tools to choose from, there were a total of six possible combinations of orders in which the tools could be used. Each combination of orders was used by three participants.'"
    275       },
    276       "blinding_described": {
    277         "applies": false,
    278         "answer": false,
    279         "justification": "In a within-subject study comparing visually distinct programming tools (PyCharm vs Sapper V1 vs Sapper V2), participant blinding is structurally not feasible. Outcome measures (time, correctness via test cases, self-reported usability) are either automated or self-reported."
    280       },
    281       "attrition_reported": {
    282         "applies": true,
    283         "answer": false,
    284         "justification": "The paper does not report whether all recruited participants completed all tasks. It starts with 18 (study 1) and 12 (study 2) and reports results for these numbers, but does not explicitly state whether anyone dropped out or was excluded."
    285       }
    286     },
    287     "cost_and_practicality": {
    288       "inference_cost_reported": {
    289         "applies": true,
    290         "answer": false,
    291         "justification": "The tool calls LLM APIs (GPT-3.5-turbo, text-davinci-003) for co-pilots and worker execution, but no API costs, token counts, or latency measurements are reported."
    292       },
    293       "compute_budget_stated": {
    294         "applies": true,
    295         "answer": false,
    296         "justification": "No computational budget, hardware specifications, or total API spend for the user studies or tool development is reported."
    297       }
    298     }
    299   },
    300   "claims": [
    301     {
    302       "claim": "Sapper V2 significantly reduces task completion time compared to Python/PyCharm",
    303       "evidence": "Paired t-test: t=4.54, p=0.0004. Sapper V2 mean 1,689s (Std=447.49s) vs Python mean 2,366s (Std=536.70s). Table 1 shows per-task breakdown (Section 5.2.1).",
    304       "supported": "strong"
    305     },
    306     {
    307       "claim": "Sapper V2 achieves equivalent correctness to Python",
    308       "evidence": "No significant difference in correctness: t=-0.53, p=0.59 (Section 5.2.1).",
    309       "supported": "moderate"
    310     },
    311     {
    312       "claim": "Sapper V2 is significantly more concise (diffuseness) and modifiable (visibility) than Python",
    313       "evidence": "Diffuseness: p=0.0151; Visibility: p=0.0006. Other 7 cognitive dimensions showed no significant difference (Section 5.2.1, Figure 3).",
    314       "supported": "strong"
    315     },
    316     {
    317       "claim": "Design View (V2 vs V1) does not significantly improve time or usability",
    318       "evidence": "No significant difference between V1 (Mean=1,751s) and V2 (Mean=1,689s) in time (p=0.62). No significant usability differences across 9 dimensions (Section 5.2.2).",
    319       "supported": "moderate"
    320     },
    321     {
    322       "claim": "The requirement elicitation co-pilot is useful and easy to use",
    323       "evidence": "Mean ratings ≥3.5/5 across all five dimensions. Over 70% of sessions rated 4-5 on usefulness and diversity; over 80% rated 4-5 on ease-of-use and relevance (Section 6.2.1, Figure 4).",
    324       "supported": "moderate"
    325     },
    326     {
    327       "claim": "The AI chain skeleton generation co-pilot produces usable skeletons",
    328       "evidence": "18/24 generated skeletons were directly executable; remaining 6 needed only minor prompt modifications. All participants said skeletons facilitated development (Section 6.2.2, Figure 5).",
    329       "supported": "moderate"
    330     }
    331   ],
    332   "red_flags": [
    333     {
    334       "flag": "Small, non-representative sample",
    335       "detail": "All 30 participants across both studies were 18-25-year-old university students majoring in CS, SE, AI, or Big Data. The paper generalizes to 'non-technical people' and claims to 'democratize the usage of LLMs' based on this narrow sample of technically trained students."
    336     },
    337     {
    338       "flag": "Authors evaluating own tool",
    339       "detail": "The authors developed Prompt Sapper and conducted all evaluations themselves. No independent or external evaluators were involved. This potential self-evaluation bias is not acknowledged anywhere in the paper."
    340     },
    341     {
    342       "flag": "Incomplete reproducibility",
    343       "detail": "The sapperchain Python library required to execute downloaded AI chain code is explicitly described as 'not open sourced' (Section 4.3.1), meaning the tool cannot be fully reproduced or independently tested despite the source code release claim."
    344     },
    345     {
    346       "flag": "No ethics approval for human subjects research",
    347       "detail": "Two user studies involving 30 human participants were conducted with no mention of IRB or ethics board approval, only participant consent for screen recording."
    348     },
    349     {
    350       "flag": "Claims exceed evidence",
    351       "detail": "The paper makes broad claims about 'democratization,' 'production tool,' and benefiting 'non-technical people' (abstract, Section 1), but the user study tasks involved simple programming constructs (if-else, while loops, variables) and were completed by CS/SE/AI students. No evidence supports claims about production use or non-technical users."
    352     }
    353   ],
    354   "cited_papers": [
    355     {
    356       "title": "AI chains: Transparent and controllable human-AI interaction by chaining large language model prompts",
    357       "authors": ["Tongshuang Wu", "Michael Terry", "Carrie Jun Cai"],
    358       "year": 2022,
    359       "relevance": "Foundational work on chaining LLM prompts for transparency and controllability, directly motivating the AI chain methodology."
    360     },
    361     {
    362       "title": "Promptchainer: Chaining large language model prompts through visual programming",
    363       "authors": ["Tongshuang Wu", "Ellen Jiang", "Aaron Donsbach", "Jeff Gray", "Alejandra Molina", "Michael Terry", "Carrie J Cai"],
    364       "year": 2022,
    365       "relevance": "Visual programming approach for chaining LLM prompts, a direct precursor to Prompt Sapper's block-based approach."
    366     },
    367     {
    368       "title": "PromptMaker: Prompt-based Prototyping with Large Language Models",
    369       "authors": ["Ellen Jiang", "Kristen Olson", "Edwin Toh", "Alejandra Molina", "Aaron Donsbach", "Michael Terry", "Carrie J Cai"],
    370       "year": 2022,
    371       "relevance": "Prompt-based prototyping tool for ML functionality, demonstrating the need for tool support in prompt engineering."
    372     },
    373     {
    374       "title": "PCR-Chain: Partial Code Reuse Assisted by Hierarchical Chaining of Prompts on Frozen Copilot",
    375       "authors": ["Qing Huang", "Jiahui Zhu", "Zhilong Li", "Zhenchang Xing", "Changjing Wang", "Xiwei Xu"],
    376       "year": 2023,
    377       "relevance": "Hierarchical prompt chaining for code reuse tasks, demonstrating AI chain methodology applied to software engineering."
    378     },
    379     {
    380       "title": "Chain-of-Thought Prompting Elicits Reasoning in Large Language Models",
    381       "authors": ["Jason Wei", "Xuezhi Wang", "Dale Schuurmans", "Maarten Bosma", "Brian Ichter", "Fei Xia", "Ed Chi", "Quoc V Le", "Denny Zhou"],
    382       "year": 2022,
    383       "relevance": "Foundational chain-of-thought prompting technique that underpins multi-step reasoning in LLM-based AI chains."
    384     },
    385     {
    386       "title": "Reflexion: an autonomous agent with dynamic memory and self-reflection",
    387       "authors": ["Noah Shinn", "Beck Labash", "Ashwin Gopinath"],
    388       "year": 2023,
    389       "arxiv_id": "2303.11366",
    390       "relevance": "Autonomous LLM agent with self-reflection capability, representing the agentic AI paradigm that AI chains aim to systematize."
    391     },
    392     {
    393       "title": "Large Language Models Are Human-Level Prompt Engineers",
    394       "authors": ["Yongchao Zhou", "Andrei Ioan Muresanu", "Ziwen Han", "Keiran Paster", "Silviu Pitis", "Harris Chan", "Jimmy Ba"],
    395       "year": 2023,
    396       "arxiv_id": "2211.01910",
    397       "relevance": "Demonstrates LLMs can generate effective prompts automatically, supporting the 'magic enhancing magic' concept in Prompt Sapper."
    398     },
    399     {
    400       "title": "Large Language Models are Zero-Shot Reasoners",
    401       "authors": ["Takeshi Kojima", "Shixiang Gu", "Machel Reid", "Yutaka Matsuo", "Yusuke Iwasawa"],
    402       "year": 2022,
    403       "relevance": "Zero-shot reasoning via 'let's think step by step' prompting, a key technique for LLM task decomposition."
    404     },
    405     {
    406       "title": "Self-consistency improves chain of thought reasoning in language models",
    407       "authors": ["Xuezhi Wang", "Jason Wei", "Dale Schuurmans", "Quoc Le", "Ed Chi", "Denny Zhou"],
    408       "year": 2022,
    409       "arxiv_id": "2203.11171",
    410       "relevance": "Self-consistency as a strategy to improve LLM reasoning, relevant to prompt engineering methodology."
    411     },
    412     {
    413       "title": "On the opportunities and risks of foundation models",
    414       "authors": ["Rishi Bommasani", "Drew A Hudson", "Ehsan Adeli"],
    415       "year": 2021,
    416       "arxiv_id": "2108.07258",
    417       "relevance": "Comprehensive analysis of foundation model opportunities and risks that frames the context for AI chain development."
    418     },
    419     {
    420       "title": "Language models are few-shot learners",
    421       "authors": ["Tom Brown", "Benjamin Mann", "Nick Ryder"],
    422       "year": 2020,
    423       "relevance": "GPT-3 paper establishing in-context few-shot learning, the foundation for prompt-based AI services."
    424     },
    425     {
    426       "title": "Standing on the shoulders of giant frozen language models",
    427       "authors": ["Yoav Levine", "Itay Dalmedigos", "Ori Ram"],
    428       "year": 2022,
    429       "relevance": "Proposed using multiple calls on frozen LLMs to complete complex tasks, directly influencing AI chain decomposition methodology."
    430     },
    431     {
    432       "title": "Progprompt: Generating situated robot task plans using large language models",
    433       "authors": ["Ishika Singh", "Valts Blukis", "Arsalan Mousavian"],
    434       "year": 2022,
    435       "arxiv_id": "2209.11302",
    436       "relevance": "Code-like prompts for robot task planning with LLMs, demonstrating semi-structured prompt patterns used in AI chains."
    437     }
    438   ],
    439   "engagement_factors": {
    440     "practical_relevance": {
    441       "score": 2,
    442       "justification": "A no-code IDE for building AI chains is practically useful, but the core sapperchain library is not open-sourced, limiting immediate adoption."
    443     },
    444     "surprise_contrarian": {
    445       "score": 0,
    446       "justification": "Results confirm expected findings that visual tools are easier than text-based coding for non-experts."
    447     },
    448     "fear_safety": {
    449       "score": 0,
    450       "justification": "No AI safety or security concerns raised; the paper focuses on tool usability."
    451     },
    452     "drama_conflict": {
    453       "score": 0,
    454       "justification": "No controversy or conflict; straightforward tool presentation and evaluation."
    455     },
    456     "demo_ability": {
    457       "score": 2,
    458       "justification": "Source code on GitHub and a web prototype at promptsapper.tech, but the required sapperchain library is closed-source."
    459     },
    460     "brand_recognition": {
    461       "score": 1,
    462       "justification": "CSIRO's Data61 is a reputable Australian research organization and the paper is published in ACM TOSEM, but neither is a household name."
    463     }
    464   }
    465 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs