scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (28195B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "LLM for Test Script Generation and Migration: Challenges, Capabilities, and Opportunities",
      6     "authors": [
      7       "Shengcheng Yu",
      8       "Chunrong Fang",
      9       "Yuchen Ling",
     10       "Chentian Wu",
     11       "Zhenyu Chen"
     12     ],
     13     "year": 2023,
     14     "venue": "International Conference on Software Quality, Reliability and Security",
     15     "arxiv_id": "2309.13574",
     16     "doi": "10.1109/QRS60937.2023.00029"
     17   },
     18   "checklist": {
     19     "claims_and_evidence": {
     20       "abstract_claims_supported": {
     21         "applies": true,
     22         "answer": false,
     23         "justification": "The abstract claims LLMs empower developers to achieve 'higher levels of software quality and development efficiency,' but the study only qualitatively demonstrates ChatGPT generating syntactically correct scripts on 6 apps, all requiring manual corrections for execution. The efficiency claim is entirely unquantified.",
     24         "source": "haiku"
     25       },
     26       "causal_claims_justified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "The paper asserts the dialogue-based approach 'significantly reduces manual intervention and enhances script generation efficiency' without any quantitative baseline comparison to support this causal claim.",
     30         "source": "haiku"
     31       },
     32       "generalization_bounded": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "Only gpt-3.5-turbo on 6 apps is tested, yet findings are framed throughout as insights about 'LLMs' broadly; the title, abstract, and conclusions do not bound results to the specific model and narrow app set studied.",
     36         "source": "haiku"
     37       },
     38       "alternative_explanations_discussed": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper does not consider whether failures stem from suboptimal prompting versus fundamental model limitations, or whether successes reflect LLM capability versus app exposure in training data.",
     42         "source": "haiku"
     43       },
     44       "proxy_outcome_distinction": {
     45         "applies": true,
     46         "answer": false,
     47         "justification": "Evaluation criteria are 'grammatical accuracy, semantic correctness, and practical applicability,' but claims are about test automation capability; execution failures requiring manual correction are not distinguished from generation quality.",
     48         "source": "haiku"
     49       }
     50     },
     51     "limitations_and_scope": {
     52       "limitations_section_present": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "Section 5.1 is a dedicated 'Challenges' section identifying context memory, API usage randomness, human effort requirements, and limited test event support as specific LLM limitations.",
     56         "source": "haiku"
     57       },
     58       "threats_to_validity_specific": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "No formal threats-to-validity section exists; the challenges section discusses LLM operational limitations rather than research validity threats such as sample selection bias, evaluator bias, or external validity of a 6-app study.",
     62         "source": "haiku"
     63       },
     64       "scope_boundaries_stated": {
     65         "applies": true,
     66         "answer": false,
     67         "justification": "The paper does not explicitly state what its results do not show; there are no statements bounding findings to the specific model version, apps, or prompt designs tested.",
     68         "source": "haiku"
     69       }
     70     },
     71     "conflicts_of_interest": {
     72       "funding_disclosed": {
     73         "applies": true,
     74         "answer": true,
     75         "justification": "Funding is disclosed in the acknowledgment: National Natural Science Foundation of China, Science Technology and Innovation Commission of Shenzhen Municipality, and the National Undergraduate Training Program for Innovation and Entrepreneurship.",
     76         "source": "haiku"
     77       },
     78       "affiliations_disclosed": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "All authors are affiliated with the State Key Laboratory for Novel Software Technology, Nanjing University, China, clearly disclosed in the header.",
     82         "source": "haiku"
     83       },
     84       "funder_independent_of_outcome": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "Funders are government science foundations with no financial stake in OpenAI, ChatGPT, Appium, or any mobile app testing tool evaluated.",
     88         "source": "haiku"
     89       },
     90       "financial_interests_declared": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "No competing interests or financial interests declaration appears in the paper.",
     94         "source": "haiku"
     95       }
     96     },
     97     "scope_and_framing": {
     98       "key_terms_defined": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Test script generation, test script migration, and LLMs are defined in background sections 2.1 and 2.2; the three task types (scenario-based, cross-platform, cross-app) are clearly defined in the research questions.",
    102         "source": "haiku"
    103       },
    104       "intended_contribution_clear": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Three explicit contributions are listed in the introduction: first investigation of LLMs for mobile test script tasks, a thorough capability investigation, and future research directions for the community.",
    108         "source": "haiku"
    109       },
    110       "engagement_with_prior_work": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Section 6 engages extensively with prior work across test generation, test migration, and LLMs for software engineering, positioning this work at the intersection of these three areas as a novel contribution.",
    114         "source": "haiku"
    115       }
    116     }
    117   },
    118   "type_checklist": {
    119     "empirical": {
    120       "artifacts": {
    121         "code_released": {
    122           "applies": true,
    123           "answer": false,
    124           "justification": "No source code, scripts, or repository link is provided; no code availability statement appears in the paper.",
    125           "source": "haiku"
    126         },
    127         "data_released": {
    128           "applies": true,
    129           "answer": false,
    130           "justification": "No test scripts, prompts with filled values, or LLM conversation logs are released; the commercial apps used are publicly downloadable but the experimental artifacts are not shared.",
    131           "source": "haiku"
    132         },
    133         "environment_specified": {
    134           "applies": true,
    135           "answer": false,
    136           "justification": "Only 'gpt-3.5-turbo model via OpenAI API' and Appium are mentioned; no dependency versions, OS configuration, or device specifications used in execution are documented.",
    137           "source": "haiku"
    138         },
    139         "reproduction_instructions": {
    140           "applies": true,
    141           "answer": false,
    142           "justification": "Prompt templates are shown but the app-specific element IDs, XPaths, and configuration data used are not shared; no step-by-step instructions sufficient to reproduce the experiments are provided.",
    143           "source": "haiku"
    144         }
    145       },
    146       "statistical_methodology": {
    147         "confidence_intervals_or_error_bars": {
    148           "applies": true,
    149           "answer": false,
    150           "justification": "All results are reported qualitatively as success/failure descriptions; no numerical metrics, confidence intervals, or error bars appear anywhere in the paper.",
    151           "source": "haiku"
    152         },
    153         "significance_tests": {
    154           "applies": true,
    155           "answer": false,
    156           "justification": "No statistical significance tests are applied despite making comparative claims (e.g., dialogue-based versus direct prompting, different scenario complexities).",
    157           "source": "haiku"
    158         },
    159         "effect_sizes_reported": {
    160           "applies": true,
    161           "answer": false,
    162           "justification": "No effect sizes are reported; performance is described qualitatively without any quantification of improvement or degradation.",
    163           "source": "haiku"
    164         },
    165         "sample_size_justified": {
    166           "applies": true,
    167           "answer": false,
    168           "justification": "The selection of 6 apps and approximately 9 scenarios is not justified; no power analysis or rationale for the sample size is provided.",
    169           "source": "haiku"
    170         },
    171         "variance_reported": {
    172           "applies": true,
    173           "answer": false,
    174           "justification": "Some experiments are repeated twice but outcomes are described individually rather than aggregated; no variance, standard deviation, or consistency metrics are reported.",
    175           "source": "haiku"
    176         }
    177       },
    178       "evaluation_design": {
    179         "baselines_included": {
    180           "applies": true,
    181           "answer": false,
    182           "justification": "No baseline comparison is included against existing automated test generation or migration tools such as CraftDroid, AppTestMigrator, or Appium-driven approaches.",
    183           "source": "haiku"
    184         },
    185         "baselines_contemporary": {
    186           "applies": false,
    187           "answer": false,
    188           "justification": "No baselines are included, so this criterion does not apply.",
    189           "source": "haiku"
    190         },
    191         "ablation_study": {
    192           "applies": true,
    193           "answer": false,
    194           "justification": "No ablation study is performed; different prompt configurations are tried but without systematic component removal or controlled variation to isolate what drives performance.",
    195           "source": "haiku"
    196         },
    197         "multiple_metrics": {
    198           "applies": true,
    199           "answer": false,
    200           "justification": "'Grammatical accuracy, semantic correctness, and practical applicability' are named as criteria but none are operationalized or reported numerically; evaluation is entirely qualitative.",
    201           "source": "haiku"
    202         },
    203         "human_evaluation": {
    204           "applies": true,
    205           "answer": true,
    206           "justification": "Researchers manually verify that generated scripts align with the predefined test operation process and execute them on designated testing devices to confirm practical applicability.",
    207           "source": "haiku"
    208         },
    209         "held_out_test_set": {
    210           "applies": false,
    211           "answer": false,
    212           "justification": "Not a prediction task; the study evaluates LLM capability on fixed scenarios without a train/test split.",
    213           "source": "haiku"
    214         },
    215         "per_category_breakdown": {
    216           "applies": true,
    217           "answer": true,
    218           "justification": "Results are broken down by app category (email vs. travel), by specific app (Outlook, QQ Mail, NetEase Mail; Fliggy, Ctrip, Mafengwo), and by scenario complexity (login, sending email, flight search and booking) across three RQs.",
    219           "source": "haiku"
    220         },
    221         "failure_cases_discussed": {
    222           "applies": true,
    223           "answer": true,
    224           "justification": "Specific failures are documented: deprecated Appium API usage, context memory loss causing inability to self-terminate in 'adding email account,' improper focus handling, and ChatGPT generating an irrelevant script in the second complex test.",
    225           "source": "haiku"
    226         },
    227         "negative_results_reported": {
    228           "applies": true,
    229           "answer": true,
    230           "justification": "Negative results are explicitly reported: ChatGPT fails to generate a relevant script in the second 'adding email account' test, cannot self-correct API issues, and cross-app migration requires effort comparable to writing scripts from scratch.",
    231           "source": "haiku"
    232         }
    233       },
    234       "setup_transparency": {
    235         "model_versions_specified": {
    236           "applies": true,
    237           "answer": true,
    238           "justification": "The paper specifies 'the API provided by OpenAI with the gpt-3.5-turbo model' as the LLM used throughout all experiments.",
    239           "source": "haiku"
    240         },
    241         "prompts_provided": {
    242           "applies": true,
    243           "answer": true,
    244           "justification": "Actual prompt templates are provided for all three experimental setups: the general prompt for RQ1 direct generation, the three-phase dialogue prompts (initiation, exploration, summarization), and general prompts for RQ2 and RQ3.",
    245           "source": "haiku"
    246         },
    247         "hyperparameters_reported": {
    248           "applies": true,
    249           "answer": false,
    250           "justification": "No hyperparameters such as temperature, top-p, max tokens, or number of API calls are reported for the gpt-3.5-turbo model used.",
    251           "source": "haiku"
    252         },
    253         "scaffolding_described": {
    254           "applies": true,
    255           "answer": true,
    256           "justification": "The three-phase dialogue framework (initiation, exploration, summarization) is described in sufficient detail including task definitions, JSON output format requirements, and the iterative turn-by-turn protocol.",
    257           "source": "haiku"
    258         },
    259         "data_preprocessing_documented": {
    260           "applies": true,
    261           "answer": false,
    262           "justification": "The process of extracting element IDs, XPaths, package names, and activity names from apps is described only vaguely as 'manually acquire comprehensive information'; the specific extraction procedure is not documented.",
    263           "source": "haiku"
    264         }
    265       },
    266       "data_integrity": {
    267         "raw_data_available": {
    268           "applies": true,
    269           "answer": false,
    270           "justification": "No raw data—generated scripts, conversation logs, or element extraction results—is made available for independent verification.",
    271           "source": "haiku"
    272         },
    273         "data_collection_described": {
    274           "applies": true,
    275           "answer": false,
    276           "justification": "Data collection is described only as 'manually acquire comprehensive information of the whole testing process'; the exact procedure for obtaining configuration data and UI element identifiers is not documented.",
    277           "source": "haiku"
    278         },
    279         "recruitment_methods_described": {
    280           "applies": false,
    281           "answer": false,
    282           "justification": "No human participants were recruited; the study uses commercial mobile apps as test subjects.",
    283           "source": "haiku"
    284         },
    285         "data_pipeline_documented": {
    286           "applies": true,
    287           "answer": false,
    288           "justification": "No formal data pipeline from app information extraction to prompt construction to script evaluation and verification is documented.",
    289           "source": "haiku"
    290         }
    291       },
    292       "contamination": {
    293         "training_cutoff_stated": {
    294           "applies": true,
    295           "answer": false,
    296           "justification": "The training data cutoff for gpt-3.5-turbo is not stated, which is relevant since popular apps like Outlook may have extensive UI documentation in training data.",
    297           "source": "haiku"
    298         },
    299         "train_test_overlap_discussed": {
    300           "applies": true,
    301           "answer": false,
    302           "justification": "No discussion of whether the commercial apps tested (Outlook, QQ Mail, Fliggy, etc.) or their UI patterns and existing test scripts appear in gpt-3.5-turbo's training data.",
    303           "source": "haiku"
    304         },
    305         "benchmark_contamination_addressed": {
    306           "applies": true,
    307           "answer": false,
    308           "justification": "The apps used are major commercial apps with publicly available documentation and open-source test scripts, but potential contamination from training data exposure is not addressed.",
    309           "source": "haiku"
    310         }
    311       },
    312       "human_studies": {
    313         "pre_registered": {
    314           "applies": false,
    315           "answer": false,
    316           "justification": "No human participants in this study.",
    317           "source": "haiku"
    318         },
    319         "irb_or_ethics_approval": {
    320           "applies": false,
    321           "answer": false,
    322           "justification": "No human participants in this study.",
    323           "source": "haiku"
    324         },
    325         "demographics_reported": {
    326           "applies": false,
    327           "answer": false,
    328           "justification": "No human participants in this study.",
    329           "source": "haiku"
    330         },
    331         "inclusion_exclusion_criteria": {
    332           "applies": false,
    333           "answer": false,
    334           "justification": "No human participants in this study.",
    335           "source": "haiku"
    336         },
    337         "randomization_described": {
    338           "applies": false,
    339           "answer": false,
    340           "justification": "No human participants in this study.",
    341           "source": "haiku"
    342         },
    343         "blinding_described": {
    344           "applies": false,
    345           "answer": false,
    346           "justification": "No human participants in this study.",
    347           "source": "haiku"
    348         },
    349         "attrition_reported": {
    350           "applies": false,
    351           "answer": false,
    352           "justification": "No human participants in this study.",
    353           "source": "haiku"
    354         }
    355       },
    356       "cost_and_practicality": {
    357         "inference_cost_reported": {
    358           "applies": true,
    359           "answer": false,
    360           "justification": "No API cost, token counts, or inference latency is reported despite using a commercial API with per-token pricing across multiple multi-turn conversations.",
    361           "source": "haiku"
    362         },
    363         "compute_budget_stated": {
    364           "applies": true,
    365           "answer": false,
    366           "justification": "No total computational budget or API expenditure is stated for the experiments.",
    367           "source": "haiku"
    368         }
    369       }
    370     }
    371   },
    372   "claims": [
    373     {
    374       "claim": "LLMs (gpt-3.5-turbo) can generate grammatically and syntactically correct Appium test scripts for mobile app scenarios when given sufficient structured information",
    375       "evidence": "Nine experiments across 6 apps show all generated scripts are grammatically and syntactically correct and manually verified to align with predefined test operations, though direct execution requires corrections",
    376       "supported": "moderate"
    377     },
    378     {
    379       "claim": "The dialogue-based framework enables ChatGPT to understand business logic and self-correct errors during interactive test generation",
    380       "evidence": "Login scenario experiments show ChatGPT navigating app states in 7-8 rounds, identifying an unchecked Terms of Service checkbox and self-correcting in a second run, but failing in the more complex 'adding email account' scenario",
    381       "supported": "moderate"
    382     },
    383     {
    384       "claim": "LLMs enable cross-platform test script migration with minimal information (device name, version, differential steps, old script)",
    385       "evidence": "Three cross-platform migration experiments show predominantly seamless execution after migration, though password input focus issues require manual correction in all cases",
    386       "supported": "moderate"
    387     },
    388     {
    389       "claim": "Context memory limitation is a fundamental LLM failure mode in complex multi-step mobile test scenarios",
    390       "evidence": "Explicitly demonstrated: in the 'adding email account' scenario ChatGPT loses track of prior context due to excessive page elements, continues unnecessary analysis, and fails to generate a valid script in one of two attempts",
    391       "supported": "strong"
    392     },
    393     {
    394       "claim": "The dialogue-based approach significantly reduces manual intervention compared to direct prompting",
    395       "evidence": "The claim is asserted but no quantitative measurement of manual effort is provided and no comparison against the direct-prompting baseline is performed",
    396       "supported": "weak"
    397     },
    398     {
    399       "claim": "Cross-app migration essentially transforms into few-shot generation because similar apps have sufficiently different implementations",
    400       "evidence": "Qualitative observation that even apps sharing functionalities (email, travel booking) differ enough in architecture and UI that migration scripts require the old script as an example, functioning as few-shot generation",
    401       "supported": "moderate"
    402     }
    403   ],
    404   "methodology_tags": [
    405     "case-study",
    406     "qualitative"
    407   ],
    408   "key_findings": "ChatGPT (gpt-3.5-turbo) can generate syntactically correct Appium test scripts for mobile app scenarios when given sufficient structured information, but all generated scripts require manual corrections for execution-level issues including deprecated APIs and focus handling. Context memory limitations cause failures in complex multi-step scenarios, demonstrated when ChatGPT loses track of the original task during 'adding email account' and fails to produce a valid script. Cross-app migration essentially reduces to few-shot generation because functionally similar apps have sufficiently different implementations that substantial manual re-specification is required, potentially exceeding the cost of manual scripting. The paper provides a structured taxonomy of LLM challenges (context memory, API randomness, human effort dependency, limited test event support) and capabilities (business logic understanding, multi-level granularity prompting, multi-to-multi event mapping) that frames future research directions.",
    409   "red_flags": [
    410     {
    411       "flag": "No quantitative metrics",
    412       "detail": "All evaluation criteria (grammatical accuracy, semantic correctness, practical applicability) are assessed qualitatively with no numerical success rates, scores, or counts reported across any of the experiments."
    413     },
    414     {
    415       "flag": "No baseline comparison",
    416       "detail": "No comparison against existing automated test generation or migration tools (CraftDroid, AppTestMigrator, Monkey, Appium-native approaches) is included, making capability claims unanchored and the paper's contribution unmeasured."
    417     },
    418     {
    419       "flag": "Tiny sample, broad generalization",
    420       "detail": "Only 6 apps and approximately 9-15 scenarios are tested with a single model (gpt-3.5-turbo), yet findings are repeatedly framed as insights about 'LLMs' broadly in the title, abstract, and conclusions."
    421     },
    422     {
    423       "flag": "Author-only evaluation",
    424       "detail": "Script correctness and applicability are assessed solely by the paper's authors without independent evaluators or inter-rater reliability measurement, introducing unquantified confirmation bias."
    425     },
    426     {
    427       "flag": "No reproducibility artifacts",
    428       "detail": "No code, generated scripts, conversation logs, element extraction data, or device specifications are released; the study is entirely irreproducible from what is published."
    429     },
    430     {
    431       "flag": "Contamination unaddressed",
    432       "detail": "Major commercial apps (Outlook, QQ Mail) with extensive online documentation are used as test subjects without any discussion of whether their UI patterns or existing test scripts appear in gpt-3.5-turbo's training data."
    433     }
    434   ],
    435   "cited_papers": [
    436     {
    437       "title": "Mobile GUI test script generation from natural language descriptions using pre-trained model",
    438       "relevance": "Direct predecessor using pre-trained models for NL-to-test-script generation for mobile apps, foundational prior work for RQ1"
    439     },
    440     {
    441       "title": "Test transfer across mobile apps through semantic mapping (CraftDroid)",
    442       "relevance": "Key prior approach to cross-app test migration that this paper's RQ3 investigates replacing with LLMs"
    443     },
    444     {
    445       "title": "Test migration between mobile apps with similar functionality (AppTestMigrator)",
    446       "relevance": "Scenario-based migration technique representing prior state of the art for the cross-app task"
    447     },
    448     {
    449       "title": "GUI test transfer from web to Android",
    450       "relevance": "Prior cross-platform test migration work that RQ2 extends to LLMs in the Android/iOS context"
    451     },
    452     {
    453       "title": "Semantic matching of GUI events for test reuse: are we there yet?",
    454       "relevance": "Empirical study on GUI semantic matching for test reuse, directly related to the cross-app migration task"
    455     },
    456     {
    457       "title": "Adaptive test generation using a large language model (TestPilot)",
    458       "relevance": "Contemporary work on LLM-based test generation for unit tests, closest parallel to this paper's approach"
    459     },
    460     {
    461       "title": "Prompting is all your need: Automated android bug replay with large language models (AdbGPT)",
    462       "relevance": "Very closely related concurrent work using LLMs with prompt engineering for Android testing in the same application domain"
    463     },
    464     {
    465       "title": "Large language models are few-shot testers: Exploring LLM-based general bug reproduction (LIBRO)",
    466       "relevance": "Related LLM-for-testing work establishing LLMs can generate tests from natural language descriptions (bug reports)"
    467     }
    468   ],
    469   "engagement_factors": {
    470     "practical_relevance": {
    471       "score": 2,
    472       "justification": "Test script generation and migration are genuine practitioner pain points and ChatGPT is immediately accessible with the prompt templates provided."
    473     },
    474     "surprise_contrarian": {
    475       "score": 1,
    476       "justification": "The mixed results (works for simple cases, fails for complex ones with context memory issues) are unsurprising and confirm rather than challenge 2023 expectations about LLM limitations."
    477     },
    478     "fear_safety": {
    479       "score": 0,
    480       "justification": "No AI safety, reliability, or risk concerns are raised; the paper is purely about software testing automation utility."
    481     },
    482     "drama_conflict": {
    483       "score": 0,
    484       "justification": "No controversy or conflict with established results; the paper explicitly positions itself as exploratory first work without challenging prior findings."
    485     },
    486     "demo_ability": {
    487       "score": 2,
    488       "justification": "Experiments use publicly accessible ChatGPT with concrete prompt templates provided; practitioners could immediately attempt the approach on their own apps."
    489     },
    490     "brand_recognition": {
    491       "score": 1,
    492       "justification": "OpenAI/ChatGPT is prominently referenced but the paper originates from Nanjing University without major lab brand recognition in the AI community."
    493     }
    494   },
    495   "hn_data": {
    496     "threads": [
    497       {
    498         "hn_id": "42140356",
    499         "title": "Language agents achieve superhuman synthesis of scientific knowledge",
    500         "points": 54,
    501         "comments": 22,
    502         "url": "https://news.ycombinator.com/item?id=42140356"
    503       },
    504       {
    505         "hn_id": "37741668",
    506         "title": "Robust self-propulsion in sand using simply controlled vibrating cubes",
    507         "points": 3,
    508         "comments": 0,
    509         "url": "https://news.ycombinator.com/item?id=37741668"
    510       },
    511       {
    512         "hn_id": "36111250",
    513         "title": "How Language Model Hallucinations Can Snowball",
    514         "points": 2,
    515         "comments": 1,
    516         "url": "https://news.ycombinator.com/item?id=36111250"
    517       },
    518       {
    519         "hn_id": "45396094",
    520         "title": "Context-Aware Membership Inference Attacks Against Pre-Trained LLMs",
    521         "points": 2,
    522         "comments": 0,
    523         "url": "https://news.ycombinator.com/item?id=45396094"
    524       },
    525       {
    526         "hn_id": "36483213",
    527         "title": "Scaling MLPs: A Tale of Inductive Bias",
    528         "points": 2,
    529         "comments": 0,
    530         "url": "https://news.ycombinator.com/item?id=36483213"
    531       },
    532       {
    533         "hn_id": "36687031",
    534         "title": "Scaling MLPs: A Tale of Inductive Bias",
    535         "points": 1,
    536         "comments": 0,
    537         "url": "https://news.ycombinator.com/item?id=36687031"
    538       },
    539       {
    540         "hn_id": "36551087",
    541         "title": "A Survey on Multimodal Large Language Models",
    542         "points": 1,
    543         "comments": 0,
    544         "url": "https://news.ycombinator.com/item?id=36551087"
    545       }
    546     ],
    547     "top_points": 54,
    548     "total_points": 65,
    549     "total_comments": 23
    550   }
    551 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs