scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (28367B)
      1 {
      2   "paper": {
      3     "title": "Generative AI in the Construction Industry: A State-of-the-art Analysis",
      4     "authors": [
      5       "Ridwan Taiwo",
      6       "Idris Temitope Bello",
      7       "Sulemana Fatoama Abdulai",
      8       "Abdul-Mugis Yussif",
      9       "Babatunde Abiodun Salami",
     10       "Abdullahi Saka",
     11       "Tarek Zayed"
     12     ],
     13     "year": 2024,
     14     "venue": "arXiv.org",
     15     "arxiv_id": "2402.09939",
     16     "doi": "10.48550/arXiv.2402.09939"
     17   },
     18   "scan_version": 3,
     19   "active_modules": ["survey_methodology"],
     20   "methodology_tags": ["meta-analysis", "case-study", "qualitative"],
     21   "key_findings": "A systematic review found only 6 peer-reviewed papers on generative AI in construction, indicating very early-stage adoption. An expert Delphi panel identified numerous opportunities across text, image, and video modalities for pre-construction, construction, and post-construction phases. A case study demonstrated that retrieval-augmented generation (RAG) improved GPT-4's contract document querying by 5.2%, 9.4%, and 4.8% in quality, relevance, and reproducibility respectively, though the RAG system's answer rate dropped from 100% to 90%.",
     22   "checklist": {
     23     "artifacts": {
     24       "code_released": {
     25         "applies": true,
     26         "answer": false,
     27         "justification": "No code repository or archive is mentioned anywhere in the paper. The RAG system built with LangChain and Streamlit is described but no code is released."
     28       },
     29       "data_released": {
     30         "applies": true,
     31         "answer": false,
     32         "justification": "The contract document used in the case study was obtained from a consultancy firm and is not released. The expert survey responses are not shared. No datasets are made publicly available."
     33       },
     34       "environment_specified": {
     35         "applies": true,
     36         "answer": false,
     37         "justification": "No environment specifications, requirements files, or dependency lists are provided. The paper mentions LangChain, Streamlit, OpenAI API, and Cassandra but does not specify versions or provide setup details."
     38       },
     39       "reproduction_instructions": {
     40         "applies": true,
     41         "answer": false,
     42         "justification": "No reproduction instructions are included. The RAG pipeline is described at a high level (Section 6.1) but lacks the specificity needed to reproduce the system."
     43       }
     44     },
     45     "statistical_methodology": {
     46       "confidence_intervals_or_error_bars": {
     47         "applies": true,
     48         "answer": false,
     49         "justification": "No confidence intervals or error bars are reported. Table 21 presents averaged ratings from 3 experts with no uncertainty measures."
     50       },
     51       "significance_tests": {
     52         "applies": true,
     53         "answer": false,
     54         "justification": "No statistical significance tests are used. The claim that RAG improves GPT-4 is based solely on comparing average ratings (e.g., 3.87 vs 4.13 for quality) without any test of whether the difference is statistically meaningful."
     55       },
     56       "effect_sizes_reported": {
     57         "applies": true,
     58         "answer": true,
     59         "justification": "The paper reports both baseline and improved scores with percentage improvements: quality from 3.87 (77.4%) to 4.13 (82.6%), relevance from 4.01 (80.2%) to 4.48 (89.6%), reproducibility from 4.53 (90.6%) to 4.77 (95.4%), providing sufficient context to assess magnitude."
     60       },
     61       "sample_size_justified": {
     62         "applies": true,
     63         "answer": false,
     64         "justification": "No justification is provided for using 20 evaluation questions, 3 expert raters, or 11 Delphi panelists. No power analysis is discussed."
     65       },
     66       "variance_reported": {
     67         "applies": true,
     68         "answer": false,
     69         "justification": "No variance, standard deviation, or inter-rater agreement metrics are reported. Individual expert ratings are not disaggregated beyond the averages in Table 21."
     70       }
     71     },
     72     "evaluation_design": {
     73       "baselines_included": {
     74         "applies": true,
     75         "answer": true,
     76         "justification": "The case study compares GPT-4 (baseline) against GPT-4+RAG on the same 20 questions, providing a direct baseline comparison (Table 21)."
     77       },
     78       "baselines_contemporary": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "GPT-4 was state-of-the-art at the time of the study (2024). Using GPT-4 as the baseline for evaluating RAG augmentation is appropriate."
     82       },
     83       "ablation_study": {
     84         "applies": false,
     85         "answer": false,
     86         "justification": "The system has only one added component (RAG) on top of the base GPT-4 model, making a multi-component ablation study inapplicable."
     87       },
     88       "multiple_metrics": {
     89         "applies": true,
     90         "answer": true,
     91         "justification": "Four evaluation metrics are used: answer rate, quality (1-5), relevance (1-5), and reproducibility (1-5), as described in Section 6.2."
     92       },
     93       "human_evaluation": {
     94         "applies": true,
     95         "answer": true,
     96         "justification": "Three domain experts evaluated the model outputs on the four metrics. Section 6.2 states '3 experts from the original panel' assessed responses to each question."
     97       },
     98       "held_out_test_set": {
     99         "applies": true,
    100         "answer": false,
    101         "justification": "No dev/test separation is described. The 20 expert-validated questions were used directly for evaluation without indication of a separate development or tuning set."
    102       },
    103       "per_category_breakdown": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "Table 21 provides per-question scores for all 20 questions across all four metrics for both GPT-4 and GPT-4+RAG, enabling detailed inspection of individual question performance."
    107       },
    108       "failure_cases_discussed": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "Section 6.3 discusses failure cases: the RAG model failed to answer 2 of 20 questions. Figure 11 provides a side-by-side comparison showing GPT-4 hallucinating details about GCC Clause 44 while RAG correctly extracted the answer."
    112       },
    113       "negative_results_reported": {
    114         "applies": true,
    115         "answer": true,
    116         "justification": "The RAG system's lower answer rate (90% vs 100% for baseline GPT-4) is reported as a negative result. Section 6.3 acknowledges 'the model struggled to retrieve relevant passages for two of the questions.'"
    117       }
    118     },
    119     "claims_and_evidence": {
    120       "abstract_claims_supported": {
    121         "applies": true,
    122         "answer": true,
    123         "justification": "The abstract's specific claims are supported: the 5.2%, 9.4%, and 4.8% improvements are verified in Table 21; the review of opportunities and challenges is presented in Section 4; the framework is in Section 5; the case study is in Section 6."
    124       },
    125       "causal_claims_justified": {
    126         "applies": true,
    127         "answer": true,
    128         "justification": "The paper's main causal claim is that RAG improves GPT-4's performance. This is tested via controlled single-variable manipulation: the same GPT-4 model with and without RAG, evaluated on the same 20 questions by the same raters. This design is adequate for the causal claim, despite the small sample."
    129       },
    130       "generalization_bounded": {
    131         "applies": true,
    132         "answer": false,
    133         "justification": "The paper frames itself as 'a comprehensive analysis and practical framework' and 'a state-of-the-art analysis' of generative AI in construction. However, the evidence base is thin: only 6 papers found in the systematic review, 11 experts surveyed, and 1 case study on a single contract document with a single model. The broad framing significantly exceeds the narrow evidence."
    134       },
    135       "alternative_explanations_discussed": {
    136         "applies": true,
    137         "answer": false,
    138         "justification": "No alternative explanations are discussed for the RAG improvement. Possible confounds such as evaluator bias (no blinding is described, so experts may have known which system was which), question selection effects, or small-sample variability are not considered."
    139       },
    140       "proxy_outcome_distinction": {
    141         "applies": true,
    142         "answer": false,
    143         "justification": "The paper measures subjective 5-point expert ratings of quality, relevance, and reproducibility and frames these as evidence of improved 'information retrieval and knowledge discovery' (Section 6). The gap between subjective ratings by 3 experts and actual information extraction quality is not acknowledged."
    144       }
    145     },
    146     "setup_transparency": {
    147       "model_versions_specified": {
    148         "applies": true,
    149         "answer": false,
    150         "justification": "The paper states 'GPT-4' throughout without specifying a version or snapshot date (e.g., gpt-4-0613). No API version or model checkpoint is identified."
    151       },
    152       "prompts_provided": {
    153         "applies": true,
    154         "answer": false,
    155         "justification": "No actual prompt text is provided for the RAG system or baseline GPT-4 queries. The 20 evaluation questions are referenced but not fully listed in the paper. The system prompt used with the LLM is not shown."
    156       },
    157       "hyperparameters_reported": {
    158         "applies": true,
    159         "answer": false,
    160         "justification": "No LLM hyperparameters are reported (temperature, top-p, max tokens). The RAG system's chunk size is described as '3-5 sentences' but the top-k retrieval parameter, embedding model specifics, and similarity threshold are not stated."
    161       },
    162       "scaffolding_described": {
    163         "applies": false,
    164         "answer": false,
    165         "justification": "The system is a fixed RAG pipeline (retrieve-augment-generate) without agentic scaffolding such as autonomous tool use, retry logic, feedback mechanisms, or multi-step reasoning."
    166       },
    167       "data_preprocessing_documented": {
    168         "applies": true,
    169         "answer": true,
    170         "justification": "Phase 1 documents the literature search pipeline with databases, search string, screening counts (79→10→4 original + 2 review), and snowball expansion to 6 papers. The case study describes document chunking into 3-5 sentence segments with embedding and vector storage steps (Section 6.1)."
    171       }
    172     },
    173     "limitations_and_scope": {
    174       "limitations_section_present": {
    175         "applies": true,
    176         "answer": true,
    177         "justification": "Section 6.3 ('Model limitation') is a dedicated subsection discussing case study limitations. Section 7 (Conclusion) contains a substantive paragraph on overall study limitations covering database scope, panel size, and model choice."
    178       },
    179       "threats_to_validity_specific": {
    180         "applies": true,
    181         "answer": true,
    182         "justification": "Specific threats are discussed: the RAG model failed on 2 questions due to chunking strategy limitations (Section 6.3); the literature review was confined to 3 databases despite snowball searching; the Delphi panel was restricted in size; only a single LLM and embedding technique were used due to API costs (Section 7)."
    183       },
    184       "scope_boundaries_stated": {
    185         "applies": true,
    186         "answer": true,
    187         "justification": "Section 7 states specific boundaries: 'confined to three databases - Scopus, Web of Science, and ScienceDirect'; 'only a single base large language model and embedding technique were utilized due to API access costs'; model 'trained on just a single contract document and may fail to transfer to new projects' (Section 6.3)."
    188       }
    189     },
    190     "data_integrity": {
    191       "raw_data_available": {
    192         "applies": true,
    193         "answer": false,
    194         "justification": "No raw data is available. The expert survey responses, contract document, model outputs, and individual expert ratings are not released for independent verification."
    195       },
    196       "data_collection_described": {
    197         "applies": true,
    198         "answer": true,
    199         "justification": "Phase 1 describes literature search with specific databases and search strings. Phase 2 describes expert recruitment: 15 experts invited, 11 accepted (73% response rate), with backgrounds spanning AI research and construction practice. Phase 4 describes case study data collection from a consultancy firm."
    200       },
    201       "recruitment_methods_described": {
    202         "applies": true,
    203         "answer": false,
    204         "justification": "The paper describes the expert panel composition ('university professors in AI and construction engineering, technology directors from major construction firms, and founders of AI startups targeting the AEC industry') and response rate, but does not explain how the 15 experts were identified or contacted, introducing potential selection bias."
    205       },
    206       "data_pipeline_documented": {
    207         "applies": true,
    208         "answer": false,
    209         "justification": "The literature search pipeline has counts (79→10→6), but the expert Delphi analysis pipeline is underdocumented: the thematic analysis process, coding scheme, inter-coder reliability, and aggregation from individual responses to categorized opportunities/challenges are not detailed."
    210       }
    211     },
    212     "conflicts_of_interest": {
    213       "funding_disclosed": {
    214         "applies": true,
    215         "answer": true,
    216         "justification": "The Acknowledgment section states: 'This research is supported by the Department of Building and Real Estate, The Hong Kong Polytechnic University, and the Centre for Advances in Reliability and Safety (CAiRS).'"
    217       },
    218       "affiliations_disclosed": {
    219         "applies": true,
    220         "answer": true,
    221         "justification": "All author affiliations are listed: Hong Kong Polytechnic University, CAiRS, Cardiff Metropolitan University, and Leeds Beckett University. The paper does not evaluate a product from any of these institutions."
    222       },
    223       "funder_independent_of_outcome": {
    224         "applies": true,
    225         "answer": true,
    226         "justification": "Funding is from a university department and research center with no apparent financial interest in whether generative AI succeeds or fails in the construction industry."
    227       },
    228       "financial_interests_declared": {
    229         "applies": true,
    230         "answer": false,
    231         "justification": "No competing interests or financial interests statement is included in the paper."
    232       }
    233     },
    234     "contamination": {
    235       "training_cutoff_stated": {
    236         "applies": false,
    237         "answer": false,
    238         "justification": "The case study tests RAG-augmented information extraction from a proprietary contract document, not model knowledge on a public benchmark. Contamination of the specific contract document is not a meaningful concern."
    239       },
    240       "train_test_overlap_discussed": {
    241         "applies": false,
    242         "answer": false,
    243         "justification": "The evaluation queries are custom-created for a specific proprietary contract document, not a public benchmark that could be in GPT-4's training data."
    244       },
    245       "benchmark_contamination_addressed": {
    246         "applies": false,
    247         "answer": false,
    248         "justification": "No standard benchmark is used. The evaluation is on 20 custom questions about a proprietary contract document."
    249       }
    250     },
    251     "human_studies": {
    252       "pre_registered": {
    253         "applies": true,
    254         "answer": false,
    255         "justification": "No pre-registration is mentioned for either the Delphi expert study or the case study evaluation."
    256       },
    257       "irb_or_ethics_approval": {
    258         "applies": true,
    259         "answer": false,
    260         "justification": "No IRB or ethics board approval is mentioned despite involving 11 expert participants in a Delphi survey and 3 expert evaluators."
    261       },
    262       "demographics_reported": {
    263         "applies": true,
    264         "answer": false,
    265         "justification": "Only broad categories are given for the expert panel ('university professors in AI and construction engineering, technology directors from major construction firms, and founders of AI startups'). No individual demographics such as years of experience, specific expertise levels, or geographic distribution are reported."
    266       },
    267       "inclusion_exclusion_criteria": {
    268         "applies": true,
    269         "answer": false,
    270         "justification": "No inclusion or exclusion criteria for expert selection are stated. The paper says '15 experts with backgrounds spanning AI research and construction industry practice were identified' without explaining the selection criteria."
    271       },
    272       "randomization_described": {
    273         "applies": false,
    274         "answer": false,
    275         "justification": "The study involves a Delphi survey and expert evaluation, not an experimental design with randomized condition assignment."
    276       },
    277       "blinding_described": {
    278         "applies": true,
    279         "answer": false,
    280         "justification": "No blinding is described for the case study evaluation. The 3 expert raters evaluated both GPT-4 and GPT-4+RAG outputs, but there is no indication they were blinded to which system produced which responses, introducing potential bias."
    281       },
    282       "attrition_reported": {
    283         "applies": true,
    284         "answer": true,
    285         "justification": "Attrition is reported at the recruitment stage (4 of the 15 invited experts did not participate): '15 experts with backgrounds spanning AI research and construction industry practice were identified. Invitations were sent to participate in the study, with 11 experts accepting for a 73% response rate.'"
    286       }
    287     },
    288     "cost_and_practicality": {
    289       "inference_cost_reported": {
    290         "applies": true,
    291         "answer": false,
    292         "justification": "No API costs, token counts, or inference times are reported for the GPT-4 or RAG system queries. Section 7 mentions 'API access costs' as a limiting factor but does not quantify them."
    293       },
    294       "compute_budget_stated": {
    295         "applies": true,
    296         "answer": false,
    297         "justification": "No computational budget is stated. The hardware used, API spend, or processing time for the RAG system are not reported."
    298       }
    299     },
    300     "survey_methodology": {
    301       "prisma_or_structured_protocol": {
    302         "applies": true,
    303         "answer": true,
    304         "justification": "Phase 1 describes a structured search strategy with specified databases (Scopus, Web of Science, ScienceDirect), an explicit reproducible search string, screening with counts (79→10→6), and snowball searching. However, no PRISMA flow diagram or protocol registration is provided."
    305       },
    306       "quality_assessment_of_sources": {
    307         "applies": true,
    308         "answer": false,
    309         "justification": "No quality assessment of the 6 reviewed papers is conducted. Table 10 summarizes the papers' objectives, methods, and contributions but does not evaluate their methodological quality, risk of bias, or rigor."
    310       },
    311       "publication_bias_discussed": {
    312         "applies": true,
    313         "answer": false,
    314         "justification": "No discussion of publication bias. The paper does not consider whether the 6 identified studies might skew positive or whether negative results on generative AI in construction are underrepresented."
    315       }
    316     }
    317   },
    318   "claims": [
    319     {
    320       "claim": "Generative AI adoption in the construction industry is still in very early stages, with only 6 peer-reviewed articles found exploring uses of LLMs in construction.",
    321       "evidence": "Systematic literature search across Scopus, Web of Science, and ScienceDirect with snowball searching identified only 6 peer-reviewed papers (Section 4.1, Table 10).",
    322       "supported": "moderate"
    323     },
    324     {
    325       "claim": "RAG improves the baseline GPT-4 LLM by 5.2%, 9.4%, and 4.8% in terms of quality, relevance, and reproducibility for querying contract documents.",
    326       "evidence": "Table 21 shows GPT-4 averaged 3.87/4.01/4.53 vs GPT-4+RAG at 4.13/4.48/4.77 on quality/relevance/reproducibility scales rated by 3 experts on 20 questions (Section 6.2).",
    327       "supported": "weak"
    328     },
    329     {
    330       "claim": "GPT-4 without RAG hallucinates plausible-sounding but incorrect details from contract documents, while RAG grounds outputs in actual document content.",
    331       "evidence": "Figure 11 shows a side-by-side comparison where GPT-4 fabricates details about GCC Clause 44 while GPT-4+RAG correctly extracts the price adjustment formula from the contract (Section 6.2).",
    332       "supported": "moderate"
    333     },
    334     {
    335       "claim": "RAG-augmented GPT-4 has a lower answer rate (90%) compared to baseline GPT-4 (100%) because it sometimes cannot link queries to relevant document passages.",
    336       "evidence": "Table 21 shows RAG answered 18/20 questions vs 20/20 for baseline. Section 6.3 explains 'the chunking strategies and semantic search techniques used were unable to adequately link some complex questions to supporting evidence.'",
    337       "supported": "moderate"
    338     },
    339     {
    340       "claim": "At least seven potential opportunities for generative AI exist for each major input-output data type (text-text, text-image, image-text, etc.) across construction project phases.",
    341       "evidence": "Tables 12-20 list 7-11 potential opportunities for each of the 9 input-output types, identified through expert Delphi discussion (Sections 4.2.1-4.2.9).",
    342       "supported": "weak"
    343     }
    344   ],
    345   "red_flags": [
    346     {
    347       "flag": "Extremely thin evidence base for a 'comprehensive' survey",
    348       "detail": "The systematic literature review found only 6 peer-reviewed papers, yet the paper frames itself as 'a comprehensive analysis' and 'state-of-the-art analysis' of generative AI in construction. The bulk of the opportunities and challenges are derived from 11 expert opinions rather than empirical evidence."
    349     },
    350     {
    351       "flag": "No statistical tests for improvement claims",
    352       "detail": "The claim that RAG improves GPT-4 by 4.8-9.4% is based on comparing averages from 3 expert raters on 20 questions with no significance tests, confidence intervals, or inter-rater reliability metrics. The differences could easily be noise at this sample size."
    353     },
    354     {
    355       "flag": "Tiny evaluation sample",
    356       "detail": "The case study uses only 20 questions, 3 raters, and 1 contract document. This is insufficient to support generalizable claims about RAG's effectiveness for construction information retrieval."
    357     },
    358     {
    359       "flag": "Survey without quality assessment of sources",
    360       "detail": "The 6 reviewed papers are summarized but never assessed for methodological quality. The survey treats all papers equally regardless of rigor, obscuring the signal-to-noise ratio of its sources."
    361     },
    362     {
    363       "flag": "Expert panel recruitment unclear",
    364       "detail": "The 15 invited experts were 'identified' through an undescribed process. The selection method could introduce significant bias toward experts favorable to AI adoption in construction. No demographics, experience levels, or selection criteria are provided."
    365     },
    366     {
    367       "flag": "No evaluator blinding",
    368       "detail": "Expert raters evaluated both GPT-4 and GPT-4+RAG outputs without described blinding. Knowledge of which system produced which output could bias ratings, especially given the paper's framing of RAG as an improvement."
    369     }
    370   ],
    371   "cited_papers": [
    372     {
    373       "title": "Evaluating Large Language Models Trained on Code",
    374       "authors": ["M. Chen", "J. Tworek", "H. Jun"],
    375       "year": 2021,
    376       "arxiv_id": "2107.03374",
    377       "relevance": "Foundational Codex evaluation paper establishing LLM code generation capabilities and benchmarks."
    378     },
    379     {
    380       "title": "GPT models in construction industry: Opportunities, limitations, and a use case validation",
    381       "authors": ["A. Saka", "R. Taiwo", "N. Saka", "B.A. Salami"],
    382       "year": 2024,
    383       "doi": "10.1016/j.dibe.2023.100300",
    384       "relevance": "Directly evaluates GPT models for construction industry tasks including material selection case study."
    385     },
    386     {
    387       "title": "Harnessing the Power of LLMs in Practice: A Survey on ChatGPT and Beyond",
    388       "authors": ["J. Yang", "H. Jin", "R. Tang"],
    389       "year": 2023,
    390       "relevance": "Comprehensive survey of practical LLM applications and capabilities relevant to the broader LLM evaluation landscape."
    391     },
    392     {
    393       "title": "Active Retrieval Augmented Generation",
    394       "authors": ["Z. Jiang", "F. Xu", "L. Gao"],
    395       "year": 2023,
    396       "doi": "10.18653/v1/2023.emnlp-main.495",
    397       "relevance": "RAG methodology paper relevant to understanding retrieval-augmented approaches for LLM applications."
    398     },
    399     {
    400       "title": "Dynamic prompt-based virtual assistant framework for BIM information search",
    401       "authors": ["J. Zheng", "M. Fischer"],
    402       "year": 2023,
    403       "doi": "10.1016/j.autcon.2023.105067",
    404       "relevance": "Integrates GPT with BIM for construction information retrieval, demonstrating LLM application in a domain-specific context."
    405     },
    406     {
    407       "title": "Robot-Enabled Construction Assembly with Automated Sequence Planning Based on ChatGPT: RoboGPT",
    408       "authors": ["H. You", "Y. Ye", "T. Zhou"],
    409       "year": 2023,
    410       "doi": "10.3390/buildings13071772",
    411       "relevance": "Uses ChatGPT for automated planning in robotic assembly, demonstrating agentic LLM use in physical construction tasks."
    412     },
    413     {
    414       "title": "Leveraging ChatGPT to Aid Construction Hazard Recognition and Support Safety Education and Training",
    415       "authors": ["S.M.J. Uddin", "A. Albert", "A. Ovid"],
    416       "year": 2023,
    417       "doi": "10.3390/su15097121",
    418       "relevance": "Evaluates ChatGPT's effectiveness for safety-critical hazard recognition tasks with human participants."
    419     },
    420     {
    421       "title": "Automated detection of contractual risk clauses from construction specifications using BERT",
    422       "authors": ["S. Moon", "S. Chi", "S.B. Im"],
    423       "year": 2022,
    424       "doi": "10.1016/j.autcon.2022.104465",
    425       "relevance": "Applies BERT transformer model for automated document analysis in construction, relevant to LLM capability evaluation."
    426     },
    427     {
    428       "title": "Generative AI design for building structures",
    429       "authors": ["W. Liao", "X. Lu", "Y. Fei"],
    430       "year": 2024,
    431       "doi": "10.1016/j.autcon.2023.105187",
    432       "relevance": "Reviews generative AI for structural design, evaluating AI capability in a complex engineering domain."
    433     },
    434     {
    435       "title": "Opportunities and Challenges of Generative AI in Construction Industry: Focusing on Adoption of Text-Based Models",
    436       "authors": ["P. Ghimire", "K. Kim", "M. Acharya"],
    437       "year": 2024,
    438       "relevance": "Closely related survey on generative AI adoption challenges in construction, focusing on text-based model limitations."
    439     },
    440     {
    441       "title": "A survey of Generative AI Applications",
    442       "authors": ["R. Gozalo-Brizuela", "E.C. Garrido-Merchán"],
    443       "year": 2023,
    444       "arxiv_id": "2306.02781",
    445       "relevance": "Broad survey of generative AI applications across domains, providing context for domain-specific AI capability assessments."
    446     }
    447   ],
    448   "engagement_factors": {
    449     "practical_relevance": {
    450       "score": 2,
    451       "justification": "Proposes a framework and demonstrates a RAG case study for construction contract querying, but no code or tool is released for practitioners to use."
    452     },
    453     "surprise_contrarian": {
    454       "score": 0,
    455       "justification": "Confirms the expected narrative that generative AI has potential in construction but adoption is early — no surprising or contrarian findings."
    456     },
    457     "fear_safety": {
    458       "score": 1,
    459       "justification": "Discusses hallucination risks and safety concerns of using generative AI in safety-critical construction tasks, but does not demonstrate novel attacks or existential risks."
    460     },
    461     "drama_conflict": {
    462       "score": 0,
    463       "justification": "No controversy or conflict — the paper presents a balanced view of opportunities and challenges."
    464     },
    465     "demo_ability": {
    466       "score": 0,
    467       "justification": "No code, demo, or tool is released despite describing a Streamlit-based interface."
    468     },
    469     "brand_recognition": {
    470       "score": 1,
    471       "justification": "Uses GPT-4 (OpenAI) as the base model, but the paper is from academic authors at Hong Kong Polytechnic University, not a famous AI lab."
    472     }
    473   }
    474 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs