scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (28718B)
      1 {
      2   "paper": {
      3     "title": "FloodBrain: Flood Disaster Reporting by Web-based Retrieval Augmented Generation with an LLM",
      4     "authors": [
      5       "Grace Colverd",
      6       "Paul Darm",
      7       "Leonard Silverberg",
      8       "Noah Kasmanoff"
      9     ],
     10     "year": 2023,
     11     "venue": "6th Workshop on Artificial Intelligence for Humanitarian Assistance and Disaster Response (NeurIPS 2023)",
     12     "arxiv_id": "2311.02597",
     13     "doi": "10.48550/arXiv.2311.02597"
     14   },
     15   "scan_version": 3,
     16   "active_modules": [
     17     "experimental_rigor",
     18     "data_leakage"
     19   ],
     20   "methodology_tags": [
     21     "benchmark-eval",
     22     "case-study"
     23   ],
     24   "key_findings": "FloodBrain is a RAG-based pipeline for generating flood disaster impact reports by extracting and curating web information. With GPT-4 as the backbone, it produced the reports with the highest overlap with human-written ReliefWeb reports across G-EVAL and ROUGE metrics. G-EVAL showed a 0.78 Pearson correlation with human annotators, higher than any of the ROUGE metrics. LLM-assisted search query expansion improved ROUGE scores by 6-7%, while source relevancy filtering reduced computational cost by 59% with mixed impact on report quality.",
     25   "checklist": {
     26     "artifacts": {
     27       "code_released": {
     28         "applies": true,
     29         "answer": false,
     30         "justification": "The paper provides a web UI at floodbrain.com but does not release any source code. No GitHub repository, Zenodo archive, or code link is provided."
     31       },
     32       "data_released": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "The dataset of 10 (and expanded 26) FloodBrain/ReliefWeb report pairs used for evaluation is not released. ReliefWeb reports are publicly available in general, but the specific curated evaluation dataset is not provided."
     36       },
     37       "environment_specified": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "No environment specifications, dependency lists, or software versions are provided anywhere in the paper or appendix."
     41       },
     42       "reproduction_instructions": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "No reproduction instructions are provided. The pipeline is described at a high level but there are no step-by-step instructions or scripts to replicate the experiments."
     46       }
     47     },
     48     "statistical_methodology": {
     49       "confidence_intervals_or_error_bars": {
     50         "applies": true,
     51         "answer": false,
     52         "justification": "Tables 1, 2, and 3 report only point estimates (e.g., G-EVAL scores, ROUGE scores, Pearson correlations) with no confidence intervals or error bars."
     53       },
     54       "significance_tests": {
     55         "applies": true,
     56         "answer": false,
     57         "justification": "The paper claims GPT-4 'exhibited superior performance' compared to GPT-3.5 and PaLM-Text-Bison based solely on comparing point estimates (3.27 vs 2.96 vs 2.34) without any significance tests."
     58       },
     59       "effect_sizes_reported": {
     60         "applies": true,
     61         "answer": true,
     62         "justification": "The ablation study reports percentage changes with context: 'Removing LLM-assisted search decreases report quality across all ROUGE metrics: 6.3% for ROUGE-1, 6.2% for ROUGE-2, and 7.2% for ROUGE-L.' Baseline scores are also provided in Table 3."
     63       },
     64       "sample_size_justified": {
     65         "applies": true,
     66         "answer": false,
     67         "justification": "The main evaluation uses only 10 report pairs and 4 human annotators, and the ablation uses 26 pairs. No justification is given for these small sample sizes, and no power analysis is discussed."
     68       },
     69       "variance_reported": {
     70         "applies": true,
     71         "answer": false,
     72         "justification": "G-EVAL is described as prompting GPT-4 ten times with temperature 1, but no variance or standard deviation across these runs is reported. Report generation results also have no variance measures."
     73       }
     74     },
     75     "evaluation_design": {
     76       "baselines_included": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "Three LLMs are compared: GPT-4, GPT-3.5, and PaLM-Text-Bison (Table 1). The ablation study also compares the full pipeline against ablated versions (Table 3)."
     80       },
     81       "baselines_contemporary": {
     82         "applies": true,
     83         "answer": true,
     84         "justification": "GPT-4, GPT-3.5, and PaLM-Text-Bison were all current state-of-the-art models at the time of publication (2023)."
     85       },
     86       "ablation_study": {
     87         "applies": true,
     88         "answer": true,
     89         "justification": "Table 3 presents an ablation study removing pipeline components: LLM-assisted search expansion and source relevancy confirmation, individually and together."
     90       },
     91       "multiple_metrics": {
     92         "applies": true,
     93         "answer": true,
     94         "justification": "The paper uses ROUGE-1, ROUGE-2, ROUGE-L, G-EVAL scores, and human evaluation scores — five metrics spanning three distinct evaluation approaches (n-gram overlap, LLM-as-judge scoring, and human rating)."
     95       },
     96       "human_evaluation": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "Section 3: 'a manual annotation round was conducted, involving four annotators assigned to perform an equivalent evaluation to G-EVAL checking for consistency between the reference and generated report.'"
    100       },
    101       "held_out_test_set": {
    102         "applies": true,
    103         "answer": false,
    104         "justification": "The prompts were 'developed with feedback from domain experts' but there is no discussion of whether any of the 10 evaluation events were used during prompt development. No explicit dev/test separation is described."
    105       },
    106       "per_category_breakdown": {
    107         "applies": true,
    108         "answer": false,
    109         "justification": "Results are reported as averages across all report pairs. No per-event, per-region, or per-disaster-type breakdowns are provided to show whether performance varies across different flood events."
    110       },
    111       "failure_cases_discussed": {
    112         "applies": true,
    113         "answer": false,
    114         "justification": "No qualitative failure analysis is presented. The paper does not show examples of generated reports that were particularly poor or discuss specific failure modes of the pipeline."
    115       },
    116       "negative_results_reported": {
    117         "applies": true,
    118         "answer": true,
    119         "justification": "The ablation study reports mixed results for source relevancy removal: 'ROUGE-1 declines by 4.2%, while ROUGE-2 and ROUGE-L increase by 5.8% and 1% respectively.' The paper openly discusses this counter-intuitive finding."
    120       }
    121     },
    122     "claims_and_evidence": {
    123       "abstract_claims_supported": {
    124         "applies": true,
    125         "answer": true,
    126         "justification": "The abstract claims a 'notable correlation' between GPT-4 and human evaluator scores, supported by Table 2 (Pearson r=0.78). The ablation study claim is supported by Table 3. The comparison of LLMs is supported by Table 1."
    127       },
    128       "causal_claims_justified": {
    129         "applies": true,
    130         "answer": true,
    131         "justification": "The paper claims 'augmenting our web search with LLM-generated additional queries further improves the agreement.' The ablation study uses controlled single-variable manipulation (removing one component at a time), which is adequate for this causal claim."
    132       },
    133       "generalization_bounded": {
    134         "applies": true,
    135         "answer": true,
    136         "justification": "The title and scope are specific to 'Flood Disaster Reporting.' The paper does not overclaim to all disasters or all NLP tasks. The conclusion appropriately scopes to 'flood disaster reporting' and 'the field of humanitarian assistance.'"
    137       },
    138       "alternative_explanations_discussed": {
    139         "applies": true,
    140         "answer": false,
    141         "justification": "No alternative explanations are discussed for why GPT-4 outperforms the other models, or whether the evaluation metrics (ROUGE, G-EVAL) could be confounded. For instance, GPT-4 is used both as the G-EVAL evaluator and as one of the systems being evaluated."
    142       },
    143       "proxy_outcome_distinction": {
    144         "applies": true,
    145         "answer": false,
    146         "justification": "The paper measures ROUGE overlap and G-EVAL consistency with reference reports but frames this as 'report quality.' Similarity to a ReliefWeb reference report is not the same as actual utility for humanitarian response — a generated report could be useful even if dissimilar to the reference, or high-ROUGE but practically useless. This gap is not acknowledged."
    147       }
    148     },
    149     "setup_transparency": {
    150       "model_versions_specified": {
    151         "applies": true,
    152         "answer": false,
    153         "justification": "The paper uses 'GPT-4', 'GPT-3.5', and 'PaLM-Text-Bison' without specifying API versions, snapshot dates, or model version identifiers (e.g., 'gpt-4-0613')."
    154       },
    155       "prompts_provided": {
    156         "applies": true,
    157         "answer": false,
    158         "justification": "Table 4 provides the five questions used for report extraction, and the paper references 'Appendix A.2' for prompt information, but the actual system prompts for source relevancy evaluation, query expansion, and final report summarization are described in natural language rather than provided as full prompt text."
    159       },
    160       "hyperparameters_reported": {
    161         "applies": true,
    162         "answer": false,
    163         "justification": "For G-EVAL, temperature=1 is stated. However, no temperature, top-p, max tokens, or other sampling parameters are reported for the actual report generation pipeline across any of the three LLMs."
    164       },
    165       "scaffolding_described": {
    166         "applies": true,
    167         "answer": true,
    168         "justification": "Section 2.1 and Figures 1 and 5 describe the pipeline: key phrase → web search → source extraction → LLM relevancy evaluation → question/answer extraction per source → final report summarization. The ReAct-based chatbot component is also described (Appendix A.5)."
    169       },
    170       "data_preprocessing_documented": {
    171         "applies": true,
    172         "answer": false,
    173         "justification": "The paper describes the high-level pipeline (web search → text extraction → relevancy filtering) but does not document how text was extracted from websites, how extraction failures were handled, or what cleaning/preprocessing was applied to source text before passing to the LLM."
    174       }
    175     },
    176     "limitations_and_scope": {
    177       "limitations_section_present": {
    178         "applies": true,
    179         "answer": true,
    180         "justification": "A 'Limitations and Future Work' subsection and an 'Ethical and Practical Considerations' subsection are included at the end of Section 4, discussing scope of events, environmental costs, and hallucination risks."
    181       },
    182       "threats_to_validity_specific": {
    183         "applies": true,
    184         "answer": true,
    185         "justification": "The paper identifies a specific threat: 'The FloodBrain UI currently reports only on externally recognized disaster-classified flooding events, risking oversight in less monitored regions due to unclear classification criteria of external agencies.' Also notes that skipping human verification 'may lead to incorrect information in FloodBrain reports.'"
    186       },
    187       "scope_boundaries_stated": {
    188         "applies": true,
    189         "answer": true,
    190         "justification": "The paper states explicit boundaries: limited to 'externally recognized disaster-classified flooding events,' the tool is 'designed for collaborative report writing between human and LLM, to be used cautiously,' and 'Verification before official sharing is advised.'"
    191       }
    192     },
    193     "data_integrity": {
    194       "raw_data_available": {
    195         "applies": true,
    196         "answer": false,
    197         "justification": "The 10 and 26 report pairs, the individual source documents, the LLM responses, and the human annotator ratings are not made available for independent verification."
    198       },
    199       "data_collection_described": {
    200         "applies": true,
    201         "answer": false,
    202         "justification": "The paper states evaluation uses 'a dataset comprising 10 ReliefWeb reports' but does not specify which 10 events were selected, what selection criteria were used, or what time period they cover."
    203       },
    204       "recruitment_methods_described": {
    205         "applies": true,
    206         "answer": false,
    207         "justification": "Four human annotators conducted the evaluation but no information is provided about who they were, their expertise level, how they were recruited, or whether they had prior experience with disaster reporting."
    208       },
    209       "data_pipeline_documented": {
    210         "applies": true,
    211         "answer": false,
    212         "justification": "The report generation pipeline is described at a high level (Section 2.1, Figure 1), but the evaluation data pipeline — how 10 events were selected, how ReliefWeb reports were matched to generated reports, what quality control was applied to annotations — is not documented."
    213       }
    214     },
    215     "conflicts_of_interest": {
    216       "funding_disclosed": {
    217         "applies": true,
    218         "answer": true,
    219         "justification": "The Acknowledgments section discloses funding: 'enabled by Frontier Development Lab Europe a public/private partnership between the European Space Agency (ESA), Trillium Technologies, the University of Oxford' with support from 'Google Cloud and NVIDIA Corporation.'"
    220       },
    221       "affiliations_disclosed": {
    222         "applies": true,
    223         "answer": true,
    224         "justification": "Author affiliations are listed: University of Cambridge, University of Strathclyde, Trillium Technologies, and New York University. One author (Silverberg) works at Trillium Technologies, which manages FDL Europe."
    225       },
    226       "funder_independent_of_outcome": {
    227         "applies": true,
    228         "answer": false,
    229         "justification": "Google Cloud provided computational resources, and Google's PaLM-Text-Bison is one of the three models evaluated. While GPT-4 ultimately performed best, the funder has a stake in its own model's evaluation. Trillium Technologies manages FDL Europe and employs one author."
    230       },
    231       "financial_interests_declared": {
    232         "applies": true,
    233         "answer": false,
    234         "justification": "No competing interests or financial interests statement is included in the paper. One author works at Trillium Technologies (the FDL Europe organizer) and this potential conflict is not explicitly acknowledged."
    235       }
    236     },
    237     "contamination": {
    238       "training_cutoff_stated": {
    239         "applies": true,
    240         "answer": false,
    241         "justification": "No training data cutoff dates are stated for GPT-4, GPT-3.5, or PaLM-Text-Bison. This matters because ReliefWeb reports used as references may be in the models' training data."
    242       },
    243       "train_test_overlap_discussed": {
    244         "applies": true,
    245         "answer": false,
    246         "justification": "No discussion of whether the ReliefWeb reference reports (used as ground truth) appeared in the training data of GPT-4, GPT-3.5, or PaLM-Text-Bison. ReliefWeb is a major public humanitarian data source likely included in web-scraped training corpora."
    247       },
    248       "benchmark_contamination_addressed": {
    249         "applies": true,
    250         "answer": false,
    251         "justification": "ReliefWeb reports are publicly available and may have been published before the models' training cutoffs (the paper states neither the cutoffs nor the dates of the evaluated events). If a model memorized these reports, ROUGE overlap scores would be inflated. This contamination risk is not discussed."
    252       }
    253     },
    254     "human_studies": {
    255       "pre_registered": {
    256         "applies": false,
    257         "answer": false,
    258         "justification": "The paper has no human subjects research. The 4 human annotators are evaluators of system output, not research participants."
    259       },
    260       "irb_or_ethics_approval": {
    261         "applies": false,
    262         "answer": false,
    263         "justification": "No human subjects research is conducted. Annotators evaluated system outputs; they were not studied as participants."
    264       },
    265       "demographics_reported": {
    266         "applies": false,
    267         "answer": false,
    268         "justification": "No human subjects study. The 4 annotators are part of the evaluation methodology, not study participants."
    269       },
    270       "inclusion_exclusion_criteria": {
    271         "applies": false,
    272         "answer": false,
    273         "justification": "No human subjects study. Annotators were recruited for evaluation, not as research participants."
    274       },
    275       "randomization_described": {
    276         "applies": false,
    277         "answer": false,
    278         "justification": "No human subjects experimental study is conducted."
    279       },
    280       "blinding_described": {
    281         "applies": false,
    282         "answer": false,
    283         "justification": "No human subjects experimental study is conducted."
    284       },
    285       "attrition_reported": {
    286         "applies": false,
    287         "answer": false,
    288         "justification": "No human subjects study. The 4 annotators are evaluators, not study participants."
    289       }
    290     },
    291     "cost_and_practicality": {
    292       "inference_cost_reported": {
    293         "applies": true,
    294         "answer": false,
    295         "justification": "The paper reports relative cost savings (59% reduction in API calls from source filtering, and '1,795 LLM API calls' avoided) but no actual monetary costs, latency figures, or per-report generation time are provided."
    296       },
    297       "compute_budget_stated": {
    298         "applies": true,
    299         "answer": false,
    300         "justification": "The acknowledgments thank Google Cloud for 'extensive computational resources' but no specific compute budget (GPU hours, total API spend, hardware specifications) is quantified."
    301       }
    302     },
    303     "experimental_rigor": {
    304       "seed_sensitivity_reported": {
    305         "applies": true,
    306         "answer": false,
    307         "justification": "G-EVAL uses temperature 1 with 10 prompts per evaluation, but no seed sensitivity analysis is reported for the report generation pipeline itself. No variance across runs is shown."
    308       },
    309       "number_of_runs_stated": {
    310         "applies": true,
    311         "answer": false,
    312         "justification": "For G-EVAL, 10 prompts per evaluation are stated. However, it is not stated whether reports were generated once or multiple times per event. The main pipeline results appear to be from single runs."
    313       },
    314       "hyperparameter_search_budget": {
    315         "applies": true,
    316         "answer": false,
    317         "justification": "No hyperparameter search is described. The pipeline design appears fixed with no discussion of alternative configurations tested during development."
    318       },
    319       "best_config_selection_justified": {
    320         "applies": true,
    321         "answer": false,
    322         "justification": "The pipeline configuration (number of search queries, number of questions, etc.) is presented without justification for why these specific choices were made or what alternatives were considered."
    323       },
    324       "multiple_comparison_correction": {
    325         "applies": false,
    326         "answer": false,
    327         "justification": "No significance tests are performed, so multiple comparison correction is not applicable."
    328       },
    329       "self_comparison_bias_addressed": {
    330         "applies": true,
    331         "answer": false,
    332         "justification": "The authors built FloodBrain and evaluate it. No discussion of the bias inherent in evaluating one's own system, and no independent evaluation is conducted."
    333       },
    334       "compute_budget_vs_performance": {
    335         "applies": true,
    336         "answer": false,
    337         "justification": "The three models likely have very different compute costs (GPT-4 >> GPT-3.5 >> PaLM-Text-Bison), but no performance-per-dollar or performance-per-token analysis is provided. The ablation mentions cost reduction but not the cost-quality tradeoff."
    338       },
    339       "benchmark_construct_validity": {
    340         "applies": true,
    341         "answer": false,
    342         "justification": "ROUGE measures n-gram overlap with reference reports and G-EVAL measures LLM-judged consistency, but neither is validated as a measure of actual report utility for humanitarian responders. The paper does not discuss whether these metrics capture what matters for disaster response."
    343       },
    344       "scaffold_confound_addressed": {
    345         "applies": true,
    346         "answer": true,
    347         "justification": "All three LLMs are compared within the same FloodBrain pipeline, so the scaffolding is held constant across model comparisons. The ablation also modifies pipeline components while holding the model constant (Bison)."
    348       }
    349     },
    350     "data_leakage": {
    351       "temporal_leakage_addressed": {
    352         "applies": true,
    353         "answer": false,
    354         "justification": "No discussion of whether the models' training data includes ReliefWeb reports from the evaluated flood events. If GPT-4 was trained on data including these reports, its generated reports would naturally overlap more with the references."
    355       },
    356       "feature_leakage_addressed": {
    357         "applies": true,
    358         "answer": false,
    359         "justification": "The pipeline performs web searches which could retrieve the actual ReliefWeb reference reports used as ground truth. If the system retrieves and summarizes the same reference report it's being compared against, ROUGE scores would be artificially inflated. This critical potential confound is not discussed."
    360       },
    361       "non_independence_addressed": {
    362         "applies": true,
    363         "answer": false,
    364         "justification": "No discussion of whether the web sources retrieved by FloodBrain overlap with or are derived from the ReliefWeb reports used as ground truth references."
    365       },
    366       "leakage_detection_method": {
    367         "applies": true,
    368         "answer": false,
    369         "justification": "No leakage detection or prevention method is applied. The paper does not check whether retrieved web sources include the reference reports or content derived from them."
    370       }
    371     }
    372   },
    373   "claims": [
    374     {
    375       "claim": "GPT-4 as pipeline backbone produces flood reports with highest overlap with human-written ReliefWeb reports",
    376       "evidence": "Table 1: GPT-4 achieves G-EVAL 3.27 (human: 3.23), ROUGE-1 52.53, ROUGE-2 15.76, ROUGE-L 41.83, compared to GPT-3.5 (2.96, 51.02, 13.62, 40.10) and PaLM-Text-Bison (2.34, 41.43, 10.20, 32.08).",
    377       "supported": "moderate"
    378     },
    379     {
    380       "claim": "G-EVAL scores show high correlation with human annotator scores (Pearson r=0.78)",
    381       "evidence": "Table 2: Pearson correlation between G-EVAL mean and human mean is 0.78, compared to ROUGE-1 (0.54), ROUGE-2 (0.62), ROUGE-L (0.59). Section 3.",
    382       "supported": "moderate"
    383     },
    384     {
    385       "claim": "LLM-assisted search query expansion improves report quality by 6-7% on ROUGE metrics",
    386       "evidence": "Table 3 and Section 3: 'Removing LLM-assisted search decreases report quality across all ROUGE metrics: 6.3% for ROUGE-1, 6.2% for ROUGE-2, and 7.2% for ROUGE-L.' Based on 26 report pairs with Bison backbone.",
    387       "supported": "moderate"
    388     },
    389     {
    390       "claim": "Source relevancy filtering reduces computational cost by 59%",
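    391       "evidence": "Section 3: '41% of sources pass the relevancy check (252 out of 611 for 26 reports), averting an additional 1,795 LLM API calls.' The 59% figure is the complement: 359 of 611 sources are filtered out before question answering, and 359 × 5 = 1,795, consistent with five extraction questions per source.",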
    392       "supported": "moderate"
    393     }
    394   ],
    395   "red_flags": [
    396     {
    397       "flag": "Tiny sample sizes",
    398       "detail": "The main evaluation uses only 10 report pairs with 4 human annotators. The ablation study expands to 26 pairs. These are very small samples for the comparative claims being made, with no error bars, significance tests, or power analysis."
    399     },
    400     {
    401       "flag": "Potential feature leakage via web search",
    402       "detail": "The pipeline performs live web searches that could retrieve the same ReliefWeb reports used as ground truth references. If FloodBrain retrieves and summarizes content from the reference report, ROUGE scores are artificially inflated. This confound is never discussed."
    403     },
    404     {
    405       "flag": "GPT-4 used as both system and evaluator",
    406       "detail": "GPT-4 is one of the three LLM backbones being evaluated, and GPT-4 is also the evaluator in the G-EVAL methodology. This creates a potential self-preference bias where GPT-4 may rate GPT-4-generated text more favorably."
    407     },
    408     {
    409       "flag": "No uncertainty quantification",
    410       "detail": "All results are point estimates without confidence intervals, error bars, or standard deviations. G-EVAL is run 10 times with temperature 1 but variance across these runs is not reported."
    411     },
    412     {
    413       "flag": "Missing model versions",
    414       "detail": "Models are referenced only as 'GPT-4', 'GPT-3.5', and 'PaLM-Text-Bison' without API versions or snapshot dates, making exact reproduction impossible."
    415     }
    416   ],
    417   "cited_papers": [
    418     {
    419       "title": "GPTs are GPTs: An Early Look at the Labor Market Impact Potential of Large Language Models",
    420       "authors": ["Tyna Eloundou", "Sam Manning", "Pamela Mishkin", "Daniel Rock"],
    421       "year": 2023,
    422       "arxiv_id": "2303.10130",
    423       "relevance": "Early assessment of LLM impact on labor markets, relevant to understanding AI capability claims and workforce automation."
    424     },
    425     {
    426       "title": "The Reversal Curse: LLMs Trained on 'A is B' Fail to Learn 'B is A'",
    427       "authors": ["Lukas Berglund", "Meg Tong", "Max Kaufmann"],
    428       "year": 2023,
    429       "relevance": "Identifies gaps in LLM knowledge representation, relevant to understanding LLM limitations and hallucination."
    430     },
    431     {
    432       "title": "Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks",
    433       "authors": ["Patrick Lewis", "Ethan Perez", "Aleksandra Piktus"],
    434       "year": 2020,
    435       "relevance": "Foundational RAG paper; retrieval-augmented generation is the core methodology behind FloodBrain's approach to grounding LLM outputs in retrieved information."
    436     },
    437     {
    438       "title": "Challenges and Applications of Large Language Models",
    439       "authors": ["Jean Kaddour", "Joshua Harris", "Maximilian Mozes"],
    440       "year": 2023,
    441       "arxiv_id": "2307.10169",
    442       "relevance": "Survey of LLM challenges including hallucination, directly relevant to understanding limitations of LLM-based systems."
    443     },
    444     {
    445       "title": "G-EVAL: NLG Evaluation Using GPT-4 with Better Human Alignment",
    446       "authors": ["Yang Liu", "Dan Iter", "Yichong Xu"],
    447       "year": 2023,
    448       "relevance": "Introduces the G-EVAL methodology for LLM-as-evaluator, directly used in this paper's evaluation framework."
    449     },
    450     {
    451       "title": "PaLM 2 Technical Report",
    452       "authors": ["Rohan Anil", "Andrew M. Dai", "Orhan Firat"],
    453       "year": 2023,
    454       "relevance": "Technical report for PaLM-Text-Bison, one of the three LLMs evaluated in the FloodBrain pipeline."
    455     },
    456     {
    457       "title": "ReAct: Synergizing Reasoning and Acting in Language Models",
    458       "authors": ["Shunyu Yao", "Jeffrey Zhao", "Dian Yu"],
    459       "year": 2023,
    460       "arxiv_id": "2210.03629",
    461       "relevance": "ReAct framework used as the reasoning system for FloodBrain's chatbot component, relevant to agentic LLM workflows."
    462     },
    463     {
    464       "title": "Estimating the Carbon Footprint of BLOOM, a 176B Parameter Language Model",
    465       "authors": ["Alexandra Sasha Luccioni", "Sylvain Viguier", "Anne-Laure Ligozat"],
    466       "year": 2022,
    467       "relevance": "Referenced for environmental cost of LLMs, relevant to understanding practical implications of LLM deployment."
    468     }
    469   ],
    470   "engagement_factors": {
    471     "practical_relevance": {
    472       "score": 2,
    473       "justification": "FloodBrain is a deployed tool (floodbrain.com) for a real humanitarian use case, though it targets a niche domain of flood disaster reporting."
    474     },
    475     "surprise_contrarian": {
    476       "score": 0,
    477       "justification": "Results confirm expected patterns: GPT-4 outperforms the other models tested, LLM-assisted query expansion improves overlap with reference reports, and LLM evaluators correlate with humans."
    478     },
    479     "fear_safety": {
    480       "score": 0,
    481       "justification": "The paper mentions hallucination risk in humanitarian reporting but does not raise novel AI safety or security concerns."
    482     },
    483     "drama_conflict": {
    484       "score": 0,
    485       "justification": "No controversy, no challenge to existing work, and no provocative claims."
    486     },
    487     "demo_ability": {
    488       "score": 2,
    489       "justification": "A live demo at floodbrain.com and a YouTube demo video are provided, though source code is not available for local use."
    490     },
    491     "brand_recognition": {
    492       "score": 1,
    493       "justification": "Uses GPT-4 and PaLM but comes from academic/nonprofit collaborations (Cambridge, NYU, Trillium, ESA), not from a major AI lab."
    494     }
    495   }
    496 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs