scan.json (16916B)
1 { 2 "paper": { 3 "title": "Automatically Surfacing Opportunities for Improvements In Internet-Scale Applications", 4 "authors": ["Vipul Harsh", "Sayan Sinha", "Henry Milner", "Haijie Wu", "B Aditya Prakash", "Vyas Sekar", "Hui Zhang"], 5 "year": 2025, 6 "venue": "HotNets '25 (24th ACM Workshop on Hot Topics in Networks)", 7 "doi": "10.1145/3772356.3772423" 8 }, 9 "checklist": { 10 "artifacts": { 11 "code_released": { 12 "applies": true, 13 "answer": false, 14 "justification": "No repository URL or code archive is provided anywhere in the paper." 15 }, 16 "data_released": { 17 "applies": true, 18 "answer": false, 19 "justification": "The paper uses anonymized production data from a monitoring service provider but does not release it." 20 }, 21 "environment_specified": { 22 "applies": true, 23 "answer": false, 24 "justification": "No environment specifications, dependency lists, or setup details are provided." 25 }, 26 "reproduction_instructions": { 27 "applies": true, 28 "answer": false, 29 "justification": "No reproduction instructions are included. The proof-of-concept implementation is described at a high level only." 30 } 31 }, 32 "statistical_methodology": { 33 "confidence_intervals_or_error_bars": { 34 "applies": true, 35 "answer": false, 36 "justification": "No confidence intervals or error bars are reported for any results." 37 }, 38 "significance_tests": { 39 "applies": true, 40 "answer": false, 41 "justification": "The paper uses heuristic thresholds (2x rate, 25% absolute difference) for opportunity detection but no formal significance tests." 42 }, 43 "effect_sizes_reported": { 44 "applies": true, 45 "answer": false, 46 "justification": "No effect sizes reported. Results are presented as counts of useful opportunities (e.g., '5 out of 18 scenarios') without quantified magnitudes." 
47 }, 48 "sample_size_justified": { 49 "applies": true, 50 "answer": false, 51 "justification": "One week of production data from 3 services is used with no justification for why this duration or number of services is sufficient." 52 }, 53 "variance_reported": { 54 "applies": true, 55 "answer": false, 56 "justification": "No variance or standard deviation is reported across any results." 57 } 58 }, 59 "evaluation_design": { 60 "baselines_included": { 61 "applies": true, 62 "answer": false, 63 "justification": "Table 1 compares categories of related work qualitatively but no quantitative baseline comparison is performed against any existing system." 64 }, 65 "baselines_contemporary": { 66 "applies": true, 67 "answer": false, 68 "justification": "No quantitative baselines are included at all." 69 }, 70 "ablation_study": { 71 "applies": true, 72 "answer": false, 73 "justification": "The system has multiple components (hypothesis generator, attribute computation, opportunity finder with two experts) but no ablation is performed." 74 }, 75 "multiple_metrics": { 76 "applies": true, 77 "answer": false, 78 "justification": "Evaluation is limited to counting how many opportunities were manually deemed useful. No other metrics (precision, recall, latency per hypothesis, etc.) are formally reported; the aggregate processing times in Section 5.2 are offered as scalability evidence, not as evaluation metrics." 79 }, 80 "human_evaluation": { 81 "applies": true, 82 "answer": true, 83 "justification": "Results were manually inspected by the authors: Section 5.2 refers to opportunities 'we (manually) deemed to be insightful'. However, the evaluation protocol is informal." 84 }, 85 "held_out_test_set": { 86 "applies": false, 87 "answer": false, 88 "justification": "This is not a benchmark evaluation paper; the system discovers opportunities in production data, so a held-out test set is not structurally applicable." 89 }, 90 "per_category_breakdown": { 91 "applies": true, 92 "answer": true, 93 "justification": "Table 2 provides per-KPI breakdown of identified opportunities. 
Results are also broken down by Expert 1 (18 scenarios) and Expert 2 (20 scenarios)." 94 }, 95 "failure_cases_discussed": { 96 "applies": true, 97 "answer": true, 98 "justification": "The paper acknowledges false positives: 'the false positive rate may seem high at first' and notes 13 of 18 Expert 1 scenarios and ~12 of 20 Expert 2 scenarios were not useful." 99 }, 100 "negative_results_reported": { 101 "applies": true, 102 "answer": true, 103 "justification": "The paper reports that many surfaced opportunities were false positives (only 5/18 from Expert 1 and 8/20 from Expert 2 were useful)." 104 } 105 }, 106 "claims_and_evidence": { 107 "abstract_claims_supported": { 108 "applies": true, 109 "answer": true, 110 "justification": "The abstract claims 'early promise from a proof-of-concept system' with 'evaluation on three real-world services,' which is appropriately hedged and matches the results in Section 5." 111 }, 112 "causal_claims_justified": { 113 "applies": true, 114 "answer": false, 115 "justification": "The paper makes causal-sounding claims (e.g., 'CORS error triggered client-side retries that led to the timeout') but acknowledges in Section 6 that causal inference methods are future work. The opportunity finder uses correlation-based heuristics only." 116 }, 117 "generalization_bounded": { 118 "applies": true, 119 "answer": true, 120 "justification": "The paper is careful to frame results as preliminary: 'proof of concept,' 'early promise,' and explicitly lists open challenges in Section 6. Claims are bounded to the three tested services." 121 }, 122 "alternative_explanations_discussed": { 123 "applies": true, 124 "answer": false, 125 "justification": "No substantive discussion of alternative explanations for the observed opportunities or why the heuristic thresholds might produce misleading results beyond acknowledging false positives." 
126 } 127 }, 128 "setup_transparency": { 129 "model_versions_specified": { 130 "applies": true, 131 "answer": true, 132 "justification": "The paper specifies 'Llama 3.2' for hypothesis generation (Section 5.1) and cites the Llama 3 model paper [18], which provides traceability. This is borderline, however, because no specific parameter count or snapshot is given." 133 }, 134 "prompts_provided": { 135 "applies": true, 136 "answer": false, 137 "justification": "The paper describes prompt intent ('find patterns associated with high rebuffering') but does not provide the actual prompt text used with the LLM." 138 }, 139 "hyperparameters_reported": { 140 "applies": true, 141 "answer": false, 142 "justification": "No LLM hyperparameters (temperature, top-p, etc.) are reported; only the heuristic thresholds (10%, 2x, 25%) are stated." 143 }, 144 "scaffolding_described": { 145 "applies": true, 146 "answer": true, 147 "justification": "The three-component pipeline (hypothesis generator, attribute computation engine, opportunity finder) is described with data flow in Figure 2 and Sections 4.1-4.3." 148 }, 149 "data_preprocessing_documented": { 150 "applies": true, 151 "answer": false, 152 "justification": "The paper mentions 'anonymized data' and 'one week of production data' but does not describe how the data was preprocessed, filtered, or anonymized." 153 } 154 }, 155 "limitations_and_scope": { 156 "limitations_section_present": { 157 "applies": true, 158 "answer": true, 159 "justification": "Section 6 'Discussion and future work' identifies multiple open challenges and limitations of the current approach." 160 }, 161 "threats_to_validity_specific": { 162 "applies": true, 163 "answer": false, 164 "justification": "Section 6 discusses future work directions but does not identify specific threats to validity of the current evaluation (e.g., manual evaluation bias, threshold sensitivity, generalizability to other domains)." 
165 }, 166 "scope_boundaries_stated": { 167 "applies": true, 168 "answer": true, 169 "justification": "The paper clearly states it presents a 'vision' and 'proof of concept' and identifies specific things not yet addressed: causal inference, complex event patterns, scalable data processing, privacy." 170 } 171 }, 172 "data_integrity": { 173 "raw_data_available": { 174 "applies": true, 175 "answer": false, 176 "justification": "Raw production data is not available; it is proprietary and anonymized." 177 }, 178 "data_collection_described": { 179 "applies": true, 180 "answer": true, 181 "justification": "Section 2 describes the data source: 'anonymized data from a large application-level monitoring and analytics service provider' collecting client-side events from end-user devices." 182 }, 183 "recruitment_methods_described": { 184 "applies": false, 185 "answer": false, 186 "justification": "No human participants; data comes from production telemetry systems, not recruited subjects." 187 }, 188 "data_pipeline_documented": { 189 "applies": true, 190 "answer": false, 191 "justification": "The pipeline from raw events to opportunities is described architecturally but specifics of data transformations, filtering steps, and intermediate counts are not documented." 192 } 193 }, 194 "conflicts_of_interest": { 195 "funding_disclosed": { 196 "applies": true, 197 "answer": false, 198 "justification": "No funding or acknowledgments section is present in the paper." 199 }, 200 "affiliations_disclosed": { 201 "applies": true, 202 "answer": true, 203 "justification": "Author affiliations are clearly listed: Conviva (industry), Georgia Tech, and Carnegie Mellon University." 204 }, 205 "funder_independent_of_outcome": { 206 "applies": true, 207 "answer": false, 208 "justification": "Multiple authors are affiliated with Conviva, a commercial monitoring/analytics company. The system is evaluated on production data from what appears to be Conviva's service. 
This conflict is not acknowledged." 209 }, 210 "financial_interests_declared": { 211 "applies": true, 212 "answer": false, 213 "justification": "No competing interests statement is provided. Conviva-affiliated authors evaluating a system in their own production environment is a potential conflict." 214 } 215 }, 216 "contamination": { 217 "training_cutoff_stated": { 218 "applies": false, 219 "answer": false, 220 "justification": "The paper does not evaluate a pre-trained model's capability on a benchmark. The LLM is used as a component for hypothesis generation, not evaluated for its knowledge." 221 }, 222 "train_test_overlap_discussed": { 223 "applies": false, 224 "answer": false, 225 "justification": "Not a benchmark evaluation of model capabilities." 226 }, 227 "benchmark_contamination_addressed": { 228 "applies": false, 229 "answer": false, 230 "justification": "Not a benchmark evaluation of model capabilities." 231 } 232 }, 233 "human_studies": { 234 "pre_registered": { 235 "applies": false, 236 "answer": false, 237 "justification": "No human participants in the study." 238 }, 239 "irb_or_ethics_approval": { 240 "applies": false, 241 "answer": false, 242 "justification": "No human participants in the study." 243 }, 244 "demographics_reported": { 245 "applies": false, 246 "answer": false, 247 "justification": "No human participants in the study." 248 }, 249 "inclusion_exclusion_criteria": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants in the study." 253 }, 254 "randomization_described": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants in the study." 258 }, 259 "blinding_described": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants in the study." 263 }, 264 "attrition_reported": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants in the study." 
268 } 269 }, 270 "cost_and_practicality": { 271 "inference_cost_reported": { 272 "applies": true, 273 "answer": true, 274 "justification": "Section 5.2 reports processing times: Expert 1 processed >1000 hypotheses in <10 minutes; Expert 2 processed 250 hypotheses in <3 minutes per KPI." 275 }, 276 "compute_budget_stated": { 277 "applies": true, 278 "answer": false, 279 "justification": "No total compute budget, hardware specifications, or LLM API costs are reported." 280 } 281 } 282 }, 283 "claims": [ 284 { 285 "claim": "The proof-of-concept system can surface novel opportunities for improvement in real-world production services.", 286 "evidence": "Table 2 lists 8 specific opportunity leads across 3 production services, identified from one week of data (Section 5.2).", 287 "supported": "moderate" 288 }, 289 { 290 "claim": "Expert 1 surfaced useful opportunities in 5 out of 18 scenarios; Expert 2 in 8 out of 20 scenarios on average.", 291 "evidence": "Section 5.2 reports these counts, with manual inspection determining usefulness.", 292 "supported": "weak" 293 }, 294 { 295 "claim": "The system is scalable: Expert 1 processed >1000 hypotheses in <10 minutes, Expert 2 processed 250 in <3 minutes per KPI.", 296 "evidence": "Section 5.2 'Scalability' paragraph provides these timing numbers.", 297 "supported": "weak" 298 } 299 ], 300 "methodology_tags": ["case-study"], 301 "key_findings": "The paper presents a vision and proof-of-concept for automatically surfacing improvement opportunities in Internet-scale services by generating and testing hypotheses based on derived attributes (indirect, stateful, non-local). Evaluated on production data from 3 services, the prototype identified actionable opportunities such as event sequences leading to payment failures and differential error rates for returning users. 
The system uses LLM-assisted hypothesis generation (Llama 3.2), efficient stateful attribute computation, and a mixture-of-experts validation approach, though the current evaluation is preliminary and most surfaced scenarios were false positives (only 5/18 for Expert 1 and 8/20 for Expert 2 were useful).", 302 "red_flags": [ 303 { 304 "flag": "Undisclosed conflict of interest", 305 "detail": "Multiple authors are Conviva employees evaluating a system on what appears to be Conviva's production data. This commercial interest in the system's success is not acknowledged." 306 }, 307 { 308 "flag": "No quantitative baselines", 309 "detail": "Despite Table 1 listing four categories of related work, no quantitative comparison against any existing system is performed." 310 }, 311 { 312 "flag": "Informal evaluation methodology", 313 "detail": "The evaluation relies on manual inspection by the authors themselves to determine which opportunities are 'insightful.' No inter-rater agreement, blinding, or systematic evaluation criteria are described." 314 }, 315 { 316 "flag": "Selective reporting of opportunities", 317 "detail": "Table 2 reports 'a subset of the opportunities' that were manually deemed insightful, without systematic criteria for selection." 318 } 319 ], 320 "cited_papers": [ 321 { 322 "title": "InsightPilot: An LLM-Empowered Automated Data Exploration System", 323 "authors": ["P. Ma", "R. Ding", "S. Wang", "S. Han", "D. Zhang"], 324 "year": 2023, 325 "relevance": "LLM-based automated data analysis system, relevant to AI-assisted analytics and software tooling." 326 }, 327 { 328 "title": "Why do multi-agent LLM systems fail?", 329 "authors": ["M. Cemri", "M. Z. Pan", "S. Yang"], 330 "year": 2025, 331 "arxiv_id": "2503.13657", 332 "relevance": "Directly relevant to understanding failure modes in multi-agent LLM systems." 333 }, 334 { 335 "title": "Automatic root cause analysis via large language models for cloud incidents", 336 "authors": ["Y. Chen", "H. Xie", "M. 
Ma"], 337 "year": 2024, 338 "relevance": "LLM-based root cause analysis for cloud systems, relevant to AI-assisted software engineering and operations." 339 }, 340 { 341 "title": "The Llama 3 herd of models", 342 "authors": ["A. Grattafiori", "A. Dubey"], 343 "year": 2024, 344 "arxiv_id": "2407.21783", 345 "relevance": "Foundation model used in this system's hypothesis generation component." 346 }, 347 { 348 "title": "A survey on multimodal large language models", 349 "authors": ["S. Yin", "C. Fu", "S. Zhao"], 350 "year": 2024, 351 "relevance": "Survey of multimodal LLMs referenced as future direction for hypothesis generation." 352 } 353 ] 354 }