scan-v4.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v4.json (32920B)
      1 {
      2   "scan_version": 4,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "DeputyDev - AI Powered Developer Assistant: Breaking the Code Review Logjam through Contextual AI to Boost Developer Productivity",
      6     "authors": [
      7       "Vishal Khare",
      8       "Vijay Saini",
      9       "Deepak Sharma",
     10       "Anand Raj",
     11       "Ankit Rana",
     12       "Anshul Yadav"
     13     ],
     14     "year": 2025,
     15     "venue": "arXiv.org",
     16     "arxiv_id": "2508.09676",
     17     "doi": "10.48550/arXiv.2508.09676"
     18   },
     19   "checklist": {
     20     "claims_and_evidence": {
     21       "abstract_claims_supported": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "The abstract claims 'statistically significant reduction' but no statistical significance test is reported in the paper. The results show percentage differences (Table 2) but 'statistically significant' is asserted without any supporting statistical test or p-value.",
     25         "source": "opus"
     26       },
     27       "causal_claims_justified": {
     28         "applies": true,
     29         "answer": true,
     30         "justification": "The paper uses a double-controlled A/B experiment with random 33% allocation of PRs to three groups (Section 9). While the design has limitations (no blinding, PR-level rather than engineer-level randomization), the basic causal design — random assignment with concurrent controls — is adequate for the causal claims about review time reduction.",
     31         "source": "opus"
     32       },
     33       "generalization_bounded": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "The title promises 'Breaking the Code Review Logjam' and to 'Boost Developer Productivity' without qualification. Results come from a single company (TATA 1mg) using one LLM (GPT-4o) in one ecosystem (Bitbucket), but claims are framed broadly. The abstract mentions 'external companies' using it but provides no data from those deployments.",
     37         "source": "opus"
     38       },
     39       "alternative_explanations_discussed": {
     40         "applies": true,
     41         "answer": false,
     42         "justification": "No alternative explanations are discussed. The paper does not consider: Hawthorne effect (engineers aware of the experiment), novelty effect, the possibility that immediate AI feedback simply reminds authors to self-fix rather than providing quality reviews, or that reduced review time might reflect reviewers rubber-stamping after seeing AI feedback.",
     43         "source": "opus"
     44       },
     45       "proxy_outcome_distinction": {
     46         "applies": true,
     47         "answer": false,
     48         "justification": "The paper measures review time reduction but frames this as 'boosting developer productivity' (title) and 'improving development workflow timelines and code quality' (abstract). Faster review time is a proxy for productivity, and no measurement of code quality is performed despite quality being claimed as an improvement. The gap between the proxy (time) and the claimed outcomes (productivity, quality) is never acknowledged.",
     49         "source": "opus"
     50       }
     51     },
     52     "limitations_and_scope": {
     53       "limitations_section_present": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "There is no limitations section, threats-to-validity section, or any dedicated discussion of the study's shortcomings. The paper mentions 'areas where DeputyDev's performance is suboptimal' only in passing, in the introduction (Section 2).",
     57         "source": "opus"
     58       },
     59       "threats_to_validity_specific": {
     60         "applies": true,
     61         "answer": false,
     62         "justification": "No threats to validity are discussed anywhere in the paper. Obvious threats like Hawthorne effect, single-company bias, lack of blinding, and PR-level vs engineer-level randomization are not mentioned.",
     63         "source": "opus"
     64       },
     65       "scope_boundaries_stated": {
     66         "applies": true,
     67         "answer": false,
     68         "justification": "No explicit scope boundaries are stated. The paper does not acknowledge that results are limited to one company, one LLM, one version control system, or that code quality was not measured despite being claimed as an improvement.",
     69         "source": "opus"
     70       }
     71     },
     72     "conflicts_of_interest": {
     73       "funding_disclosed": {
     74         "applies": true,
     75         "answer": false,
     76         "justification": "No funding section or acknowledgments are present. The work was conducted at TATA 1mg, a commercial entity, but no funding disclosure is made.",
     77         "source": "opus"
     78       },
     79       "affiliations_disclosed": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "Author affiliations are clearly listed: all authors are from 'TATA 1mg Healthcare Solutions Private Limited' with institutional email addresses.",
     83         "source": "opus"
     84       },
     85       "funder_independent_of_outcome": {
     86         "applies": true,
     87         "answer": false,
     88         "justification": "TATA 1mg employees are evaluating DeputyDev, which TATA 1mg sells as a SaaS product to external companies. The employer has a direct financial interest in DeputyDev being shown to be effective.",
     89         "source": "opus"
     90       },
     91       "financial_interests_declared": {
     92         "applies": true,
     93         "answer": false,
     94         "justification": "No competing interests or financial interests statement is present. The authors work at the company that commercializes DeputyDev as SaaS, but this conflict is not explicitly declared.",
     95         "source": "opus"
     96       }
     97     },
     98     "scope_and_framing": {
     99       "key_terms_defined": {
    100         "applies": true,
    101         "answer": false,
    102         "justification": "Key terms used without precise definition: 'productivity' (claimed but never defined), 'developer experience' (mentioned in keywords but not defined), 'contextual AI' (described architecturally but not formally defined). 'Review time' measurement granularity (creation to closure? to first review?) not explicit.",
    103         "source": "haiku"
    104       },
    105       "intended_contribution_clear": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "Contribution is implicitly clear: (1) DeputyDev architecture and implementation, (2) A/B experiment results showing time savings, (3) real-world deployment. However, not formally articulated as numbered contributions or a separate contribution statement.",
    109         "source": "haiku"
    110       },
    111       "engagement_with_prior_work": {
    112         "applies": true,
    113         "answer": false,
    114         "justification": "Prior work cited (Tufano, Hong, Ng) but limited systematic engagement. No comparison of DeputyDev to competing tools or approaches. No discussion of how multi-agent architecture differs from single-agent LLM code review. Related work section is minimal.",
    115         "source": "haiku"
    116       }
    117     }
    118   },
    119   "type_checklist": {
    120     "empirical": {
    121       "artifacts": {
    122         "code_released": {
    123           "applies": true,
    124           "answer": false,
    125           "justification": "No source code or experiment scripts are released. DeputyDev is a commercial SaaS product (deputydev.ai). No repository URL is provided for the tool or the experimental analysis code.",
    126           "source": "opus"
    127         },
    128         "data_released": {
    129           "applies": true,
    130           "answer": false,
    131           "justification": "The experiment data (PR review times, telemetry data) is proprietary to TATA 1mg and is not released. No dataset download or supplementary data is provided.",
    132           "source": "opus"
    133         },
    134         "environment_specified": {
    135           "applies": true,
    136           "answer": false,
    137           "justification": "No environment specifications, dependency lists, or setup instructions are provided. The paper does not mention any reproducibility artifacts.",
    138           "source": "opus"
    139         },
    140         "reproduction_instructions": {
    141           "applies": true,
    142           "answer": false,
    143           "justification": "No reproduction instructions are provided. The experiment relies on internal TATA 1mg infrastructure and proprietary telemetry, making reproduction by external researchers impossible without detailed instructions.",
    144           "source": "opus"
    145         }
    146       },
    147       "statistical_methodology": {
    148         "confidence_intervals_or_error_bars": {
    149           "applies": true,
    150           "answer": false,
    151           "justification": "Only point estimates are reported (averages, medians, percentage changes in Table 2 and Table 3). No confidence intervals, error bars, or uncertainty measures are provided for any result.",
    152           "source": "opus"
    153         },
    154         "significance_tests": {
    155           "applies": true,
    156           "answer": false,
    157           "justification": "The abstract claims 'statistically significant reduction' but no statistical significance test is reported anywhere in the paper — no p-values, t-tests, Mann-Whitney U, or any test statistic. The claim of statistical significance is entirely unsupported.",
    158           "source": "opus"
    159         },
    160         "effect_sizes_reported": {
    161           "applies": true,
    162           "answer": true,
    163           "justification": "Table 2 reports percentage changes with baseline context: e.g., average review time went from 239.57/278.14 hrs (controls) to 197.97 hrs (test), yielding -17.36%/-28.82%. Per-LOC and median reductions are similarly contextualized. Table 3 breaks this down by PR size category.",
    164           "source": "opus"
    165         },
    166         "sample_size_justified": {
    167           "applies": true,
    168           "answer": false,
    169           "justification": "No justification for the sample size is given. The experiment yielded ~240 PRs per group (721 total after filtering) but no power analysis or sample size rationale is discussed.",
    170           "source": "opus"
    171         },
    172         "variance_reported": {
    173           "applies": true,
    174           "answer": false,
    175           "justification": "No standard deviations, interquartile ranges, or variance measures are reported for any metric. Only averages and medians are provided in Table 2.",
    176           "source": "opus"
    177         }
    178       },
    179       "evaluation_design": {
    180         "baselines_included": {
    181           "applies": true,
    182           "answer": true,
    183           "justification": "Two control sets (ControlSet1 and ControlSet2) serve as baselines, with 33% allocation each alongside the test set. Results are compared against both controls in Tables 2 and 3.",
    184           "source": "opus"
    185         },
    186         "baselines_contemporary": {
    187           "applies": true,
    188           "answer": true,
    189           "justification": "The two control groups are contemporaneous — they are from the same 30-day period (July 27 - August 27, 2024) as the test group, ensuring temporal validity.",
    190           "source": "opus"
    191         },
    192         "ablation_study": {
    193           "applies": true,
    194           "answer": false,
    195           "justification": "DeputyDev has multiple components (6 agents, reflection, blending engine, AST-based context retrieval) but no ablation study tests which components contribute to the observed improvements. The paper does not isolate the effect of any individual component.",
    196           "source": "opus"
    197         },
    198         "multiple_metrics": {
    199           "applies": true,
    200           "answer": true,
    201           "justification": "Three metrics are reported: average review time per PR, average review time per LOC, and median review time (Table 2).",
    202           "source": "opus"
    203         },
    204         "human_evaluation": {
    205           "applies": true,
    206           "answer": false,
    207           "justification": "No human evaluation of DeputyDev's review quality is performed. The experiment measures only review time metrics. There is no assessment of whether DeputyDev's comments were useful, accurate, or accepted by developers.",
    208           "source": "opus"
    209         },
    210         "held_out_test_set": {
    211           "applies": true,
    212           "answer": false,
    213           "justification": "Post-hoc outlier exclusion thresholds (top 25%, bottom 10% by LOC) and repository balance criteria were applied to the same data on which results are reported. No pre-specified analysis plan or separation between exploratory and confirmatory analysis is documented.",
    214           "source": "opus"
    215         },
    216         "per_category_breakdown": {
    217           "applies": true,
    218           "answer": true,
    219           "justification": "Table 3 provides a breakdown by PR size category (S: 0-50 LOC, M: 51-100, L: 101-200, XL: 201-500), showing DeputyDev's differential effectiveness across categories.",
    220           "source": "opus"
    221         },
    222         "failure_cases_discussed": {
    223           "applies": true,
    224           "answer": false,
    225           "justification": "The paper notes DeputyDev has 'mixed performance' in M and XL categories and mentions 'areas where DeputyDev's performance is suboptimal' (Section 2), but provides no qualitative examples of failed or unhelpful AI reviews, no error analysis of review quality, and no discussion of where the AI's feedback was wrong or misleading.",
    226           "source": "opus"
    227         },
    228         "negative_results_reported": {
    229           "applies": true,
    230           "answer": true,
    231           "justification": "Table 3 shows DeputyDev increased review time per LOC vs ControlSet1 in the M category (+34.01%) and XL category (+100.30%). The paper acknowledges 'mixed performance' in these categories.",
    232           "source": "opus"
    233         }
    234       },
    235       "setup_transparency": {
    236         "model_versions_specified": {
    237           "applies": true,
    238           "answer": false,
    239           "justification": "The paper mentions 'GPT-4o' (Section 8) and 'Claude 3.5 Sonnet' (Section 6.2.3) without snapshot dates or API version identifiers. Model behavior varies across versions; neither model is pinned to a specific version.",
    240           "source": "opus"
    241         },
    242         "prompts_provided": {
    243           "applies": true,
    244           "answer": false,
    245           "justification": "Prompts are described only in natural language (e.g., 'This agent is responsible to identify and recommend corrective code for any security issues,' Section 6.2.3). No actual prompt text is provided in the paper or appendix. The XML output schema in Appendix A is a response format, not the input prompt.",
    246           "source": "opus"
    247         },
    248         "hyperparameters_reported": {
    249           "applies": true,
    250           "answer": false,
    251           "justification": "No hyperparameters (temperature, top-p, max tokens, etc.) are reported for any LLM call. The paper uses both GPT-4o and Claude 3.5 Sonnet without stating any API parameters.",
    252           "source": "opus"
    253         },
    254         "scaffolding_described": {
    255           "applies": true,
    256           "answer": true,
    257           "justification": "The agentic scaffolding is described in substantial detail: 6 specialized agents (security, code communication, performance, maintainability, errors, business validation), reflection design pattern for iterative improvement, AST-based semantic chunking for context retrieval (Section 6.1), lexical+semantic search fusion (Section 6.1), blending engine with confidence scoring and comment overlap summarization (Section 6.4), and mathematical formalization (Section 6.5).",
    258           "source": "opus"
    259         },
    260         "data_preprocessing_documented": {
    261           "applies": true,
    262           "answer": true,
    263           "justification": "Section 9 documents: outlier exclusion (top 25% and bottom 10% by LOC), repository balance filtering (≥10 PRs per set or equal count), experiment duration (30 days, July 27 - August 27, 2024), and 33% allocation per set.",
    264           "source": "opus"
    265         }
    266       },
    267       "data_integrity": {
    268         "raw_data_available": {
    269           "applies": true,
    270           "answer": false,
    271           "justification": "Raw data (PR review times, telemetry) is not available. Only aggregated statistics are presented in Tables 2-3. No supplementary data files or download links are provided.",
    272           "source": "opus"
    273         },
    274         "data_collection_described": {
    275           "applies": true,
    276           "answer": true,
    277           "justification": "Section 9 describes data collection: Bitbucket telemetry at TATA 1mg, 30-day experiment period (July 27 - August 27, 2024), three-group allocation at 33% each, over 200 engineers involved. Review time metrics (pick-up time, review time, closure cycle) are defined in the abstract.",
    278           "source": "opus"
    279         },
    280         "recruitment_methods_described": {
    281           "applies": true,
    282           "answer": false,
    283           "justification": "The paper states 'over 200 engineers' participated but does not describe how they were selected — whether all TATA 1mg engineers were included, whether participation was voluntary, or whether specific teams were targeted. No recruitment or selection process is described.",
    284           "source": "opus"
    285         },
    286         "data_pipeline_documented": {
    287           "applies": true,
    288           "answer": false,
    289           "justification": "While filtering criteria are stated (outlier exclusion by LOC percentiles, repository balance), the paper does not document how many PRs were collected initially, how many were removed at each filtering stage, or the full pipeline from raw telemetry to the final 721 PRs analyzed.",
    290           "source": "opus"
    291         }
    292       },
    293       "contamination": {
    294         "training_cutoff_stated": {
    295           "applies": false,
    296           "answer": false,
    297           "justification": "This paper does not evaluate a pre-trained model's capability on a benchmark. It is an A/B experiment measuring real-world PR review times, not model performance on test data.",
    298           "source": "opus"
    299         },
    300         "train_test_overlap_discussed": {
    301           "applies": false,
    302           "answer": false,
    303           "justification": "Not applicable — the study is an A/B experiment measuring review times in production, not a benchmark evaluation of model knowledge.",
    304           "source": "opus"
    305         },
    306         "benchmark_contamination_addressed": {
    307           "applies": false,
    308           "answer": false,
    309           "justification": "Not applicable — no benchmark evaluation is performed. The study measures the effect of an AI tool on human review workflows.",
    310           "source": "opus"
    311         }
    312       },
    313       "human_studies": {
    314         "pre_registered": {
    315           "applies": true,
    316           "answer": false,
    317           "justification": "No mention of pre-registration. The experiment involves 200+ engineers but no pre-registration link (OSF, AsPredicted, etc.) is provided.",
    318           "source": "opus"
    319         },
    320         "irb_or_ethics_approval": {
    321           "applies": true,
    322           "answer": false,
    323           "justification": "No mention of IRB or ethics board approval for the experiment involving 200+ engineer participants.",
    324           "source": "opus"
    325         },
    326         "demographics_reported": {
    327           "applies": true,
    328           "answer": false,
    329           "justification": "The paper states 'over 200 engineers' but provides no demographic information: no experience levels, team distribution, programming languages used, or any participant characterization.",
    330           "source": "opus"
    331         },
    332         "inclusion_exclusion_criteria": {
    333           "applies": true,
    334           "answer": false,
    335           "justification": "Inclusion/exclusion criteria are described for repositories (balance requirement) and PRs (outlier exclusion by LOC) but not for engineer participants. No criteria for which engineers were included or excluded from the study.",
    336           "source": "opus"
    337         },
    338         "randomization_described": {
    339           "applies": true,
    340           "answer": false,
    341           "justification": "The paper states '33% allocation to each' set but does not describe the randomization mechanism — how PRs were assigned to groups (hash, random number generator, manual), whether randomization was stratified, or what unit was randomized (PR, repository, engineer).",
    342           "source": "opus"
    343         },
    344         "blinding_described": {
    345           "applies": true,
    346           "answer": false,
    347           "justification": "No blinding is described. Engineers likely knew whether their PR received DeputyDev review (it posts comments on the PR), creating potential for behavioral changes in the test group.",
    348           "source": "opus"
    349         },
    350         "attrition_reported": {
    351           "applies": true,
    352           "answer": false,
    353           "justification": "No attrition information is provided. The paper does not report how many PRs or engineers were in the initial pool vs. the final analysis, beyond the final counts in Table 2 (244, 238, 239).",
    354           "source": "opus"
    355         }
    356       },
    357       "cost_and_practicality": {
    358         "inference_cost_reported": {
    359           "applies": true,
    360           "answer": false,
    361           "justification": "No inference cost, API costs, or latency figures are reported. Section 5 mentions cost as a reason not to send entire codebases but does not quantify DeputyDev's actual per-review cost.",
    362           "source": "opus"
    363         },
    364         "compute_budget_stated": {
    365           "applies": true,
    366           "answer": false,
    367           "justification": "No total computational budget is stated — no API spend, no token counts, no hardware specifications for the experiment or the production system.",
    368           "source": "opus"
    369         }
    370       }
    371     }
    372   },
    373   "claims": [
    374     {
    375       "claim": "DeputyDev reduces PR review time by 28.82% on average and 47.52% on median",
    376       "evidence": "Table 2: Test set 197.97h avg vs Control 2 278.14h (−28.82%), 0.41h median vs 0.78h (−47.52%)",
    377       "supported": "strong"
    378     },
    379     {
    380       "claim": "DeputyDev is more effective for smaller PRs (0-50 LOC) than larger PRs",
    381       "evidence": "Table 3: Small PRs show a 43.87% improvement, whereas XLarge PRs show a 100.30% degradation; Section 10.4 confirms the inverse relationship",
    382       "supported": "strong"
    383     },
    384     {
    385       "claim": "Code review time has weak correlation with PR size (LOC), with coefficients 0.004 to 0.095",
    386       "evidence": "Figure 5 and section 10.2: correlation coefficients 0.095 (test), 0.004 (control 1), 0.052 (control 2)",
    387       "supported": "strong"
    388     },
    389     {
    390       "claim": "Multi-agent workflow with reflection improves code review quality through task decomposition and self-refinement",
    391       "evidence": "Section 6.2 architecture description, citing Andrew Ng's work. States GPT-3.5 zero-shot 48.1% vs 95.1% in agent loop",
    392       "supported": "weak"
    393     },
    394     {
    395       "claim": "Contextually relevant code chunks including dependencies improve review effectiveness",
    396       "evidence": "Section 4 provides motivating example (OrderService ripple effects). Section 6.1 describes AST + semantic search approach",
    397       "supported": "weak"
    398     },
    399     {
    400       "claim": "DeputyDev provides immediate feedback that reduces context-switching delays for developers",
    401       "evidence": "Hypothesis section 3: 'immediate feedback...reduces context switching.' No empirical measurement of context-switching reduction",
    402       "supported": "weak"
    403     }
    404   ],
    405   "methodology_tags": [
    406     "empirical",
    407     "case-study"
    408   ],
    409   "key_findings": "DeputyDev, an AI-powered code review assistant deployed at TATA 1mg, reduced PR review time by 28.82% on average and 47.52% on median in a 30-day A/B experiment (July-Aug 2024) involving 721 PRs across three sets. The system uses a 6-agent architecture with reflection patterns to specialize in security, code communication, performance, maintainability, error detection, and business logic validation, drawing context from version control, issue tracking, and semantic code chunking. Effectiveness is inversely proportional to PR size: small PRs (0-50 LOC) achieved 43.87% improvement while extra-large PRs (201-500 LOC) degraded by 100.30%. The observed weak correlation between PR size and review time (r=0.004-0.095) suggests fixed overhead (context-switching, setup) dominates the review duration, which DeputyDev partially mitigates through immediate automated feedback.",
    410   "red_flags": [
    411     {
    412       "flag": "No statistical significance testing",
    413       "detail": "Abstract claims 'statistically significant reduction' but paper reports no p-values, confidence intervals, t-tests, or hypothesis tests. Percentage improvements alone do not establish statistical significance or confidence bounds."
    414     },
    415     {
    416       "flag": "Productivity measured by proxy outcome only",
    417       "detail": "Title and abstract claim 'boost developer productivity' but measure only review time in hours. No measurement of code quality, bugs caught, developer satisfaction, or actual work throughput. Review speed ≠ productivity."
    418     },
    419     {
    420       "flag": "Company employees evaluating their own product",
    421       "detail": "All authors employed by TATA 1mg, which owns and commercializes DeputyDev as SaaS. No competing interests statement. High risk of publication bias and favorable reporting."
    422     },
    423     {
    424       "flag": "Model versions not pinned",
    425       "detail": "GPT-4o and Claude 3.5 Sonnet both received updates during/after July-Aug 2024 experiment. Which versions produced these results? Not specified. Results irreproducible."
    426     },
    427     {
    428       "flag": "35% of data excluded as 'outliers' without justification",
    429       "detail": "Top 25% and bottom 10% of PRs by LOC removed before analysis. Why these thresholds? Do results only apply to 'normal-sized' PRs? Not explained. Raises selection-bias concerns."
    430     },
    431     {
    432       "flag": "Assignment method to sets not specified",
    433       "detail": "How were repositories and PRs assigned to Control1/Control2/Test? Random? Sequential? Repository-based? Method not documented. Could allow selection bias or temporal confounding."
    434     },
    435     {
    436       "flag": "Single company, single time period",
    437       "detail": "Results from 1 month (July-Aug 2024) at 1 company (TATA 1mg). External validity to other organizations, code types, languages, team sizes not established. Generalization claims unsupported."
    438     },
    439     {
    440       "flag": "Medium and XLarge PR categories show poor/negative results",
    441       "detail": "Table 3: Medium PRs 34% worse than Control 1, XLarge PRs 100% degraded. Only Small and Large show improvement. Results appear cherry-picked to highlight best-case scenarios."
    442     },
    443     {
    444       "flag": "Review time metric definition not explicit",
    445       "detail": "Paper never defines 'review time' precisely. From PR creation to closure? to first review? to approval? Definition critical for reproducibility and interpretation, but not stated."
    446     },
    447     {
    448       "flag": "No code or data release",
    449       "detail": "Proprietary system and data. No source code, no raw PR dataset, no reproducibility artifacts. Cannot independently verify or replicate results."
    450     },
    451     {
    452       "flag": "Hyperparameters missing",
    453       "detail": "No temperature, top_p, max_tokens, or other LLM inference settings reported. No ablations performed. Cannot reproduce or assess sensitivity to these choices."
    454     },
    455     {
    456       "flag": "Hawthorne effect not addressed",
    457       "detail": "Reviewers likely aware of DeputyDev posting comments. No mention of whether they knew about experimental allocation. Behavior change due to observation not ruled out."
    458     }
    459   ],
    460   "cited_papers": [
    461     {
    462       "title": "Code review automation: Strengths and weaknesses of the state of the art",
    463       "authors": "Tufano et al.",
    464       "year": 2024,
    465       "relevance": "Directly surveys prior LLM-based code review automation approaches and their limitations"
    466     },
    467     {
    468       "title": "Using pre-trained models to boost code review automation",
    469       "authors": "Tufano et al.",
    470       "year": 2022,
    471       "relevance": "Earlier work on pre-trained models for code review, foundation for contemporary LLM approaches"
    472     },
    473     {
    474       "title": "Commentfinder: a simpler, faster, more accurate code review comments recommendation",
    475       "authors": "Hong et al.",
    476       "year": 2022,
    477       "relevance": "Code review comment generation baseline and comparison point for LLM-based systems"
    478     },
    479     {
    480       "title": "Self-refine: Iterative refinement with self-feedback",
    481       "authors": "Madaan et al.",
    482       "year": 2023,
    483       "relevance": "Reflection pattern for LLM improvement, design pattern used in DeputyDev's multi-agent architecture"
    484     },
    485     {
    486       "title": "Reflexion: Language agents with verbal reinforcement learning",
    487       "authors": "Shinn et al.",
    488       "year": 2023,
    489       "relevance": "Language agents with reflection and self-correction, agentic design pattern foundation"
    490     },
    491     {
    492       "title": "ChatDev: Communicative Agents for Software Development",
    493       "authors": "Qian et al.",
    494       "year": 2024,
    495       "relevance": "Multi-agent design patterns for software engineering tasks, directly cited for agent architecture"
    496     },
    497     {
    498       "title": "AI-assisted assessment of coding practices in modern code review",
    499       "authors": "Vijayvergiya et al.",
    500       "year": 2024,
    501       "relevance": "Human-in-the-loop AI code review assessment, related empirical work on AI-assisted review"
    502     },
    503     {
    504       "title": "The cost of interrupted work: more speed and stress",
    505       "authors": "Mark, Gudith, Klocke (CHI 2008)",
    506       "year": 2008,
    507       "relevance": "Foundational work on context-switching costs (23 min lost focus per interruption), motivation for reducing review wait times"
    508     }
    509   ],
    510   "engagement_factors": {
    511     "practical_relevance": {
    512       "score": 3,
    513       "justification": "DeputyDev is a deployed tool available as SaaS, already in production use at TATA 1mg and external companies — immediately usable by practitioners."
    514     },
    515     "surprise_contrarian": {
    516       "score": 0,
    517       "justification": "Confirms the expected finding that AI code review tools can reduce review turnaround time; no surprising or counterintuitive results."
    518     },
    519     "fear_safety": {
    520       "score": 0,
    521       "justification": "No AI safety, security, or risk concerns are raised by the paper."
    522     },
    523     "drama_conflict": {
    524       "score": 0,
    525       "justification": "No controversy or conflict with existing claims or practices."
    526     },
    527     "demo_ability": {
    528       "score": 2,
    529       "justification": "Available as SaaS at deputydev.ai but not open source; users can sign up but cannot inspect or self-host the system."
    530     },
    531     "brand_recognition": {
    532       "score": 1,
    533       "justification": "TATA 1mg is a known Indian healthcare company but not a major AI research lab or globally recognized tech brand."
    534     }
    535   },
    536   "hn_data": {
    537     "threads": [
    538       {
    539         "hn_id": "36965545",
    540         "title": "Electronic Structure of LK-99",
    541         "points": 551,
    542         "comments": 432,
    543         "url": "https://news.ycombinator.com/item?id=36965545"
    544       },
    545       {
    546         "hn_id": "44016621",
    547         "title": "LLMs are more persuasive than incentivized human persuaders",
    548         "points": 140,
    549         "comments": 116,
    550         "url": "https://news.ycombinator.com/item?id=44016621"
    551       },
    552       {
    553         "hn_id": "43075571",
    554         "title": "ZeroBench: An Impossible Visual Benchmark for Contemporary LMMs",
    555         "points": 9,
    556         "comments": 3,
    557         "url": "https://news.ycombinator.com/item?id=43075571"
    558       },
    559       {
    560         "hn_id": "44211052",
    561         "title": "Analog Foundation Models",
    562         "points": 8,
    563         "comments": 1,
    564         "url": "https://news.ycombinator.com/item?id=44211052"
    565       },
    566       {
    567         "hn_id": "44009574",
    568         "title": "Large Language Models Are More Persuasive Than Incentivized Human Persuaders",
    569         "points": 4,
    570         "comments": 1,
    571         "url": "https://news.ycombinator.com/item?id=44009574"
    572       },
    573       {
    574         "hn_id": "45241249",
    575         "title": "The Illusion of Diminishing Returns: Measuring Long Horizon Execution in LLMs",
    576         "points": 4,
    577         "comments": 0,
    578         "url": "https://news.ycombinator.com/item?id=45241249"
    579       },
    580       {
    581         "hn_id": "45240847",
    582         "title": "ButterflyQuant: Ultra-low-bit LLM Quantization",
    583         "points": 4,
    584         "comments": 0,
    585         "url": "https://news.ycombinator.com/item?id=45240847"
    586       },
    587       {
    588         "hn_id": "45228682",
    589         "title": "The Illusion of Diminishing Returns: Measuring Long Horizon Execution in LLMs",
    590         "points": 3,
    591         "comments": 1,
    592         "url": "https://news.ycombinator.com/item?id=45228682"
    593       },
    594       {
    595         "hn_id": "45343343",
    596         "title": "The illusion of diminishing returns in LLM progress",
    597         "points": 3,
    598         "comments": 0,
    599         "url": "https://news.ycombinator.com/item?id=45343343"
    600       },
    601       {
    602         "hn_id": "43905563",
    603         "title": "(How) Do reasoning models reason?",
    604         "points": 3,
    605         "comments": 0,
    606         "url": "https://news.ycombinator.com/item?id=43905563"
    607       }
    608     ],
    609     "top_points": 551,
    610     "total_points": 729,
    611     "total_comments": 554
    612   }
    613 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs