scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (27477B)
      1 {
      2   "paper": {
      3     "title": "BitsAI-CR: Automated Code Review via LLM in Practice",
      4     "authors": [
      5       "Tao Sun",
      6       "Jian Xu",
      7       "Yuanpeng Li",
      8       "Zhao Yan",
      9       "Ge Zhang",
     10       "Lintao Xie",
     11       "Lu Geng",
     12       "Zheng Wang",
     13       "Yueyan Chen",
     14       "Qin Lin",
     15       "Wenbo Duan",
     16       "Kaixin Sui"
     17     ],
     18     "year": 2025,
     19     "venue": "arXiv",
     20     "arxiv_id": "2501.15134"
     21   },
     22   "checklist": {
     23     "artifacts": {
     24       "code_released": {
     25         "applies": true,
     26         "answer": false,
     27         "justification": "No source code or repository URL is provided. The system is described as an internal ByteDance tool. No GitHub or Zenodo links are mentioned."
     28       },
     29       "data_released": {
     30         "applies": true,
     31         "answer": false,
     32         "justification": "The evaluation dataset (1397 cases from ByteDance's production codebase) and training data (120,000 internal code review comments) are proprietary and not released. No dataset download links are provided."
     33       },
     34       "environment_specified": {
     35         "applies": true,
     36         "answer": false,
     37         "justification": "While LoRA hyperparameters are reported (rank 128, alpha 256, learning rate 0.00005, batch size 8, 5 epochs, sequence length 8192), no environment specification (requirements.txt, Dockerfile, hardware details) is provided."
     38       },
     39       "reproduction_instructions": {
     40         "applies": true,
     41         "answer": false,
     42         "justification": "No reproduction instructions are provided. The system relies on ByteDance's internal infrastructure, proprietary LLM (Doubao-Pro-32K-0828), and internal code review data, making external reproduction impossible."
     43       }
     44     },
     45     "statistical_methodology": {
     46       "confidence_intervals_or_error_bars": {
     47         "applies": true,
     48         "answer": false,
     49         "justification": "All results are reported as point estimates (e.g., 75.0% precision, 26.7% Outdated Rate) without confidence intervals or error bars. The weekly precision trends in Figure 6 show only single values per week."
     50       },
     51       "significance_tests": {
     52         "applies": true,
     53         "answer": false,
     54         "justification": "The paper claims BitsAI-CR outperforms baselines (Table 2) and that the taxonomy-guided version is substantially better, but no statistical significance tests are performed. Comparisons rely solely on point estimate differences."
     55       },
     56       "effect_sizes_reported": {
     57         "applies": true,
     58         "answer": true,
     59         "justification": "Effect sizes are implicitly reported with baseline context: e.g., precision improved from 16.83% (w/o Taxonomy) to 57.03% (with Taxonomy) for RuleChecker only, and from 30.92% to 65.59% with ReviewFilter. The Outdated Rate is tracked from ~15% to 26.7% against a human baseline of 35-46%."
     60       },
     61       "sample_size_justified": {
     62         "applies": true,
     63         "answer": false,
     64         "justification": "The offline evaluation uses 1397 cases (767 violations, 630 compliant) but no justification is given for why this sample size is adequate. The survey (N=137) and expert interviews (N=12) sample sizes are also unjustified."
     65       },
     66       "variance_reported": {
     67         "applies": true,
     68         "answer": false,
     69         "justification": "No variance, standard deviation, or spread measures are reported for any results. The offline evaluation appears to be a single run. Online metrics are reported as single weekly values without confidence bands."
     70       }
     71     },
     72     "evaluation_design": {
     73       "baselines_included": {
     74         "applies": true,
     75         "answer": true,
     76         "justification": "Table 2 compares BitsAI-CR against Qwen2.5-Coder-32b-instruct, Deepseek-v2.5, Doubao-Pro-32K-0828 (base), and BitsAI-CR w/o Taxonomy. Human Outdated Rate (35-46%) serves as a baseline for the online metric."
     77       },
     78       "baselines_contemporary": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "Baselines include Qwen2.5-Coder-32b-instruct and Deepseek-v2.5, both from 2024, which are recent and competitive models at the time of writing."
     82       },
     83       "ablation_study": {
     84         "applies": true,
     85         "answer": true,
     86         "justification": "Section 4.2 includes an ablation study for ReviewFilter, comparing performance with and without this component (Table 2). Table 3 compares three reasoning patterns (Direct Conclusion, Reasoning-First, Conclusion-First) for ReviewFilter."
     87       },
     88       "multiple_metrics": {
     89         "applies": true,
     90         "answer": true,
     91         "justification": "Multiple metrics are used: Precision, Recall (Table 2), Outdated Rate (Section 4.3), Filter Rate and Inference Time (Table 3), and user retention rate (Figure 8)."
     92       },
     93       "human_evaluation": {
     94         "applies": true,
     95         "answer": true,
     96         "justification": "The paper includes a survey (N=137) and expert interviews (N=12) evaluating BitsAI-CR's outputs (Section 4.3). Manual precision annotation is also part of the ongoing evaluation process. Offline evaluation relies on an LLM-as-a-judge (Doubao-Pro-32K-0828), which the human evaluation supplements."
     97       },
     98       "held_out_test_set": {
     99         "applies": true,
    100         "answer": false,
    101         "justification": "The offline dataset of 1397 cases is described as 'sampled from the production codebase' but there is no explicit mention of separation between training/validation/test data. The relationship between this evaluation set and the 18,000+ training samples is unclear."
    102       },
    103       "per_category_breakdown": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "Table 2 provides per-category breakdown across four review dimensions: Security Vulnerability, Code Defect, Maintainability and Readability, and Performance Issue. Table 1 provides per-rule Outdated Rate breakdowns."
    107       },
    108       "failure_cases_discussed": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "Section 4.2 discusses hallucination examples (incorrectly flagging formatting issues, mistaking function names as magic numbers). Figure 3 shows a correct but superfluous comment. User feedback categories include incorrect comments (10.9%) and unnecessary comments (12.4%)."
    112       },
    113       "negative_results_reported": {
    114         "applies": true,
    115         "answer": true,
    116         "justification": "The paper reports that 'conventional Supervised Fine-Tuning methods for RuleChecker prove insufficient for error mitigation, even with optimized training samples and reinforcement learning approaches' (Section 3.3), leading to the ReviewFilter design."
    117       }
    118     },
    119     "claims_and_evidence": {
    120       "abstract_claims_supported": {
    121         "applies": true,
    122         "answer": true,
    123         "justification": "The abstract claims 75.0% precision (supported by Figure 6 peak), 26.7% Outdated Rate in Go (supported by Figure 7 week 18), and 12,000+ WAU (supported by Section 4.4). All claims are substantiated by results."
    124       },
    125       "causal_claims_justified": {
    126         "applies": true,
    127         "answer": false,
    128         "justification": "The paper makes causal claims such as 'the taxonomy-driven approach enables more precise and actionable code review' and attributes precision improvements to the two-stage pipeline and data flywheel. However, the evaluation conflates multiple simultaneous changes (taxonomy introduction, two-stage pipeline, data flywheel iterations) over the 18-week period, making it impossible to isolate individual causal effects. The offline comparisons in Table 2 toggle only the taxonomy and ReviewFilter; the data flywheel's contribution is never isolated."
    129       },
    130       "generalization_bounded": {
    131         "applies": true,
    132         "answer": false,
    133         "justification": "The title and abstract present this as a general 'Automated Code Review via LLM in Practice' system, but results are specific to ByteDance's internal codebase, proprietary LLM (Doubao-Pro-32K-0828), and five programming languages. The paper offers a 'blueprint for organizations' without adequately bounding these claims to the tested setting."
    134       },
    135       "alternative_explanations_discussed": {
    136         "applies": true,
    137         "answer": false,
    138         "justification": "The paper does not discuss alternative explanations for the observed improvements. For example, the Outdated Rate increase could be due to changes in developer behavior, seasonal effects, or other platform changes rather than BitsAI-CR improvements. The paper acknowledges 'the Outdated Rate doesn't definitively prove that changes were made in direct response to BitsAI-CR's comments' but does not explore this further."
    139       }
    140     },
    141     "setup_transparency": {
    142       "model_versions_specified": {
    143         "applies": true,
    144         "answer": true,
    145         "justification": "The paper specifies Doubao-Pro-32K-0828 (with version date 0828), Qwen2.5-Coder-32b-instruct, and Deepseek-v2.5 as baseline models. The primary model includes a version identifier."
    146       },
    147       "prompts_provided": {
    148         "applies": true,
    149         "answer": false,
    150         "justification": "Section 3.3 describes prompt construction (context preparation, change annotations) but the actual prompt text is not provided. The paper mentions 'construct appropriate prompts for analysis' without showing the full prompts used for RuleChecker or ReviewFilter."
    151       },
    152       "hyperparameters_reported": {
    153         "applies": true,
    154         "answer": true,
    155         "justification": "Section 4.1 reports LoRA configuration: rank 128, alpha 256, learning rate 0.00005, batch size 8, 5 epochs, sequence length 8192, gradient accumulation 1, warmup step rate 0.05. Comment Aggregation uses 512-dimension embeddings with cosine similarity."
    156       },
    157       "scaffolding_described": {
    158         "applies": true,
    159         "answer": true,
    160         "justification": "The two-stage pipeline (RuleChecker -> ReviewFilter -> Comment Aggregation) is described in detail in Section 3.3, including context preparation, code diff partitioning, tree-sitter for code parsing, change annotation format, and the aggregation module using embedding similarity."
    161       },
    162       "data_preprocessing_documented": {
    163         "applies": true,
    164         "answer": true,
    165         "justification": "Section 3.5.2 describes dataset construction: 120,000 comments extracted, filtered for non-substantive content, classified for rule compliance, enhanced using Doubao-Pro-32K-0828, with quality assurance through manual sampling, resulting in ~18,000 samples for Go and front-end and ~5,000 for other languages."
    166       }
    167     },
    168     "limitations_and_scope": {
    169       "limitations_section_present": {
    170         "applies": true,
    171         "answer": false,
    172         "justification": "There is no dedicated limitations or threats-to-validity section. Section 5 ('Lessons Learned and Practical Insights') discusses positive takeaways but does not address limitations. Section 7 mentions future work directions but not study limitations."
    173       },
    174       "threats_to_validity_specific": {
    175         "applies": true,
    176         "answer": false,
    177         "justification": "No threats to validity are discussed. There is no acknowledgment of specific methodological concerns such as the proprietary nature of the evaluation data, the use of their own LLM as judge, or the confounding of multiple simultaneous improvements."
    178       },
    179       "scope_boundaries_stated": {
    180         "applies": true,
    181         "answer": false,
    182         "justification": "No explicit scope boundaries are stated. The paper does not clearly delineate what the results do not show, such as whether the approach would work outside ByteDance, with other LLMs, or on different codebases."
    183       }
    184     },
    185     "data_integrity": {
    186       "raw_data_available": {
    187         "applies": true,
    188         "answer": false,
    189         "justification": "No raw data is available. All data is from ByteDance's internal systems (production codebase, MR comments, user feedback) and is not released for independent verification."
    190       },
    191       "data_collection_described": {
    192         "applies": true,
    193         "answer": true,
    194         "justification": "Section 3.5.2 describes data collection: 120,000 code review comments from internal MR repository, covering static analysis results and manual review feedback. Section 4.2 describes the offline evaluation dataset: 1397 cases sampled from production with 767 violations and 630 compliant cases."
    195       },
    196       "recruitment_methods_described": {
    197         "applies": true,
    198         "answer": false,
    199         "justification": "For the survey (N=137), the paper states participants were 'randomly selected across different programming languages' with Go (50%), frontend (25%), and others (25%), but does not describe the sampling frame or how randomization was conducted. For the expert interviews (N=12), participants are described only as 'expert developers who had used BitsAI-CR for more than one month.'"
    200       },
    201       "data_pipeline_documented": {
    202         "applies": true,
    203         "answer": true,
    204         "justification": "The data pipeline is documented in Section 3.5.2: original data collection (120,000 comments) -> data refinement (filtering non-substantive content, classification, enhancement) -> quality assurance (manual sampling and annotation) -> final datasets (~18,000 for Go/frontend, ~5,000 for others)."
    205       }
    206     },
    207     "conflicts_of_interest": {
    208       "funding_disclosed": {
    209         "applies": true,
    210         "answer": false,
    211         "justification": "No funding disclosure or acknowledgments section is present. All authors are from ByteDance, which developed and deployed BitsAI-CR, but this corporate affiliation is not accompanied by a funding or conflict-of-interest statement."
    212       },
    213       "affiliations_disclosed": {
    214         "applies": true,
    215         "answer": true,
    216         "justification": "All authors are listed with ByteDance as their affiliation on the paper's title page. The paper explicitly states the system is deployed at ByteDance."
    217       },
    218       "funder_independent_of_outcome": {
    219         "applies": true,
    220         "answer": false,
    221         "justification": "ByteDance employees are evaluating ByteDance's own product (BitsAI-CR) deployed at ByteDance, using ByteDance's proprietary LLM (Doubao-Pro-32K-0828). The funder/employer has a direct commercial interest in positive outcomes."
    222       },
    223       "financial_interests_declared": {
    224         "applies": true,
    225         "answer": false,
    226         "justification": "No competing interests or financial interests statement is provided. The authors are employees of the company whose product they are evaluating, which is an inherent financial interest that should be disclosed."
    227       }
    228     },
    229     "contamination": {
    230       "training_cutoff_stated": {
    231         "applies": true,
    232         "answer": false,
    233         "justification": "The paper does not state the training data cutoff for Doubao-Pro-32K-0828 or the baseline models. Since the evaluation dataset is from the same internal codebase used for training data collection, the training cutoff is relevant."
    234       },
    235       "train_test_overlap_discussed": {
    236         "applies": true,
    237         "answer": false,
    238         "justification": "Both training data (120,000 MR comments) and evaluation data (1397 cases) come from ByteDance's internal codebase. The paper does not discuss whether there is overlap between training and evaluation data, or how temporal separation was ensured."
    239       },
    240       "benchmark_contamination_addressed": {
    241         "applies": true,
    242         "answer": false,
    243         "justification": "No contamination analysis is provided. The evaluation uses Doubao-Pro-32K-0828 both as a component of the system and as the LLM-as-a-judge evaluator, creating a potential self-evaluation bias that is not addressed."
    244       }
    245     },
    246     "human_studies": {
    247       "pre_registered": {
    248         "applies": true,
    249         "answer": false,
    250         "justification": "The paper includes a survey (N=137) and expert interviews (N=12) but there is no mention of pre-registration."
    251       },
    252       "irb_or_ethics_approval": {
    253         "applies": true,
    254         "answer": false,
    255         "justification": "The paper includes human participants (survey and interviews) but no IRB or ethics board approval is mentioned."
    256       },
    257       "demographics_reported": {
    258         "applies": true,
    259         "answer": true,
    260         "justification": "For the survey, language distribution is reported: Go (50%), frontend (25%), other languages (25%). For expert interviews, roles are described: 'frontend, backend, and algorithm development' with usage duration (more than one month)."
    261       },
    262       "inclusion_exclusion_criteria": {
    263         "applies": true,
    264         "answer": false,
    265         "justification": "For the survey, participants were 'randomly selected across different programming languages' with no stated inclusion/exclusion criteria. For interviews, the only criterion mentioned is using BitsAI-CR for more than one month."
    266       },
    267       "randomization_described": {
    268         "applies": true,
    269         "answer": false,
    270         "justification": "The survey mentions participants were 'randomly selected' but the randomization procedure is not described. The study does not compare treatment vs. control groups, so random assignment is not at issue; what remains undocumented is how the survey sample itself was randomly drawn."
    271       },
    272       "blinding_described": {
    273         "applies": false,
    274         "answer": false,
    275         "justification": "This is a user satisfaction study, not an experimental comparison between conditions. Blinding is not applicable to a survey and interview study about an already-deployed tool."
    276       },
    277       "attrition_reported": {
    278         "applies": true,
    279         "answer": false,
    280         "justification": "No attrition information is provided. The paper states N=137 for the survey and N=12 for interviews but does not report how many were invited vs. responded, or any dropout rates."
    281       }
    282     },
    283     "cost_and_practicality": {
    284       "inference_cost_reported": {
    285         "applies": true,
    286         "answer": true,
    287         "justification": "Table 3 reports inference time per sample for the three reasoning patterns: Direct Conclusion (1.7 s/sample), Reasoning-First (31.0 s/sample), and Conclusion-First (1.7 s/sample)."
    288       },
    289       "compute_budget_stated": {
    290         "applies": true,
    291         "answer": false,
    292         "justification": "No total computational budget is stated. Training costs, GPU hours, hardware specifications, or total API spend for the system are not reported."
    293       }
    294     }
    295   },
    296   "claims": [
    297     {
    298       "claim": "BitsAI-CR achieves 75.0% precision in review comment generation with the ReviewFilter component.",
    299       "evidence": "Figure 6 shows the weekly precision trend reaching 75.0% by week 18 with ReviewFilter. Table 2 shows the taxonomy-guided BitsAI-CR achieves 65.59% precision with ReviewFilter in offline evaluation.",
    300       "supported": "moderate"
    301     },
    302     {
    303       "claim": "The taxonomy-guided version significantly outperforms the base version and other baselines in code review precision.",
    304       "evidence": "Table 2 shows BitsAI-CR achieves 57.03% (RuleChecker only) and 65.59% (with ReviewFilter) vs. BitsAI-CR w/o Taxonomy at 16.83%/30.92%, and baselines Qwen2.5-Coder-32b at 10.14%/10.62% and Deepseek-v2.5 at 9.27%/9.50%. No statistical tests are provided.",
    305       "supported": "moderate"
    306     },
    307     {
    308       "claim": "BitsAI-CR achieves a 26.7% Outdated Rate in Go, approaching the human review Outdated Rate of 35-46%.",
    309       "evidence": "Figure 7 shows the Outdated Rate trajectory reaching 26.7% by week 18, compared to Human Outdated Rate fluctuating between 35-46% over the same period.",
    310       "supported": "moderate"
    311     },
    312     {
    313       "claim": "The data flywheel mechanism enables continuous improvement of BitsAI-CR over time.",
    314       "evidence": "Figure 6 shows precision increasing from ~25% to 75% over 18 weeks, and Figure 7 shows Outdated Rate increasing from ~15% to 26.7%. However, multiple changes were made simultaneously (taxonomy, two-stage pipeline, rule additions/removals), making it impossible to isolate the flywheel's specific contribution.",
    315       "supported": "weak"
    316     },
    317     {
    318       "claim": "74.5% of surveyed users affirm the value and effectiveness of BitsAI-CR's code reviews.",
    319       "evidence": "Section 4.3 reports 102/137 users affirming value. However, the survey methodology is underspecified: no questionnaire text provided, recruitment procedure unclear, and potential selection/response bias from surveying users of the company's own tool.",
    320       "supported": "weak"
    321     },
    322     {
    323       "claim": "BitsAI-CR serves over 12,000 Weekly Active Users at ByteDance.",
    324       "evidence": "Section 4.4 states '12k Weekly Active Users (WAU) and 210k Weekly Page Views (WPV).' Figure 8 shows retention data starting from 21,230 users.",
    325       "supported": "strong"
    326     }
    327   ],
    328   "methodology_tags": [
    329     "benchmark-eval",
    330     "case-study"
    331   ],
    332   "key_findings": "BitsAI-CR is a two-stage LLM-based code review system deployed at ByteDance that combines a fine-tuned RuleChecker with a ReviewFilter for hallucination filtering, achieving 75% precision over 18 weeks of iterative improvement. The system introduces an 'Outdated Rate' metric measuring whether flagged code lines are subsequently modified, reaching 26.7% in Go compared to human reviewers' 35-46%. The paper demonstrates that a taxonomy of 219 review rules combined with a data flywheel mechanism enables continuous improvement, though the proprietary nature of the system and evaluation data limits independent verification. The system serves over 12,000 weekly active users with a 48% eight-week retention rate.",
    333   "red_flags": [
    334     {
    335       "flag": "Company evaluating its own product",
    336       "detail": "All 12 authors are ByteDance employees evaluating BitsAI-CR, a ByteDance product, using ByteDance's proprietary LLM (Doubao-Pro-32K-0828) on ByteDance's internal data. No external evaluation or independent replication is possible. No conflict-of-interest statement is provided."
    337     },
    338     {
    339       "flag": "Same model used as system component and evaluator",
    340       "detail": "Doubao-Pro-32K-0828 is used both as the base model for BitsAI-CR's RuleChecker/ReviewFilter and as the LLM-as-a-judge evaluator for the offline benchmark (Section 4.2). This creates circular evaluation where the model may be biased toward accepting its own outputs."
    341     },
    342     {
    343       "flag": "No limitations section",
    344       "detail": "The paper contains no limitations or threats-to-validity section despite being an industrial report with numerous methodological constraints (proprietary data, no reproducibility, confounded improvements)."
    345     },
    346     {
    347       "flag": "Potential train-test overlap",
    348       "detail": "Both training data (120,000 MR comments) and evaluation data (1397 cases) come from ByteDance's internal codebase. No discussion of temporal separation or overlap prevention between training and evaluation data."
    349     },
    350     {
    351       "flag": "No statistical tests for comparative claims",
    352       "detail": "Table 2 shows large precision differences between BitsAI-CR and baselines, but no significance tests are performed. With 1397 test cases, such tests are feasible and would strengthen the claims."
    353     },
    354     {
    355       "flag": "Confounded improvement trajectory",
    356       "detail": "The 18-week improvement from 25% to 75% precision (Figure 6) conflates multiple simultaneous changes: taxonomy introduction, two-stage pipeline, data flywheel iterations, rule additions and removals. Individual contributions cannot be isolated."
    357     }
    358   ],
    359   "cited_papers": [
    360     {
    361       "title": "AI-Assisted Assessment of Coding Practices in Modern Code Review",
    362       "authors": ["Manushree Vijayvergiya", "Małgorzata Salawa", "Ivan Budiselić"],
    363       "year": 2024,
    364       "doi": "10.1145/3664646.3665664",
    365       "relevance": "Google's AutoCommenter approach to LLM-based code review, a key comparison point for evaluating industrial code review systems."
    366     },
    367     {
    368       "title": "Automating Code Review Activities by Large-Scale Pre-training",
    369       "authors": ["Zhiyu Li", "Shuai Lu", "Daya Guo"],
    370       "year": 2022,
    371       "relevance": "CodeReviewer, a foundational work on pre-trained models for code review that BitsAI-CR builds upon."
    372     },
    373     {
    374       "title": "Fine-Tuning Large Language Models to Improve Accuracy and Comprehensibility of Automated Code Review",
    375       "authors": ["Yongda Yu", "Guoping Rong", "Haifeng Shen"],
    376       "year": 2024,
    377       "doi": "10.1145/3695993",
    378       "relevance": "Tencent's approach to LLM-based code review, another industrial comparison point for code review automation."
    379     },
    380     {
    381       "title": "Fine-Tuning and Prompt Engineering for Large Language Models-based Code Review Automation",
    382       "authors": ["Chanathip Pornprasit", "Chakkrit Tantithamthavorn"],
    383       "year": 2024,
    384       "arxiv_id": "2402.00905",
    385       "relevance": "Explores prompt engineering and fine-tuning strategies for LLM-based code review, directly relevant to methodology assessment."
    386     },
    387     {
    388       "title": "LLaMA-Reviewer: Advancing Code Review Automation with Large Language Models through Parameter-Efficient Fine-Tuning",
    389       "authors": ["Junyi Lu", "Lei Yu"],
    390       "year": 2023,
    391       "relevance": "Applies parameter-efficient fine-tuning to code review with LLMs, same approach (LoRA) as BitsAI-CR."
    392     },
    393     {
    394       "title": "Exploring the Capabilities of LLMs for Code Change Related Tasks",
    395       "authors": ["Lishui Fan", "Jiakun Liu"],
    396       "year": 2024,
    397       "relevance": "Evaluates LLM capabilities for code change tasks including review, relevant to understanding baseline capabilities."
    398     },
    399     {
    400       "title": "CodeAgent: Autonomous Communicative Agents for Code Review",
    401       "authors": ["Xunzhu Tang", "Kisub Kim"],
    402       "year": 2024,
    403       "relevance": "Multi-agent approach to automated code review, an alternative paradigm to BitsAI-CR's pipeline approach."
    404     },
    405     {
    406       "title": "AI-powered Code Review with LLMs: Early Results",
    407       "authors": ["Zeeshan Rasheed", "Malik Abdul Sami"],
    408       "year": 2024,
    409       "relevance": "Early empirical results on LLM-powered code review, relevant to assessing the state of the field."
    410     },
    411     {
    412       "title": "EvaCRC: Evaluating Code Review Comments",
    413       "authors": ["Lanxin Yang", "Jinwei Xu"],
    414       "year": 2023,
    415       "doi": "10.1145/3611643.3616245",
    416       "relevance": "Framework for evaluating quality of code review comments, relevant to measurement methodology in this domain."
    417     },
    418     {
    419       "title": "LLM Critics Help Catch LLM Bugs",
    420       "authors": ["Nat McAleese", "Rai Michael Pokorny"],
    421       "year": 2024,
    422       "relevance": "Studies LLMs as critics for catching errors, conceptually related to BitsAI-CR's ReviewFilter approach of using one LLM to validate another's output."
    423     },
    424     {
    425       "title": "CRScore: Grounding Automated Evaluation of Code Review Comments in Code Claims and Smells",
    426       "authors": ["Atharva Naik", "Marcus Alenius"],
    427       "year": 2024,
    428       "relevance": "Proposes grounded metrics for evaluating code review comments, relevant to the evaluation methodology of code review systems."
    429     },
    430     {
    431       "title": "Judging LLM-as-a-judge with MT-Bench and Chatbot Arena",
    432       "authors": ["Lianmin Zheng", "Wei-Lin Chiang"],
    433       "year": 2023,
    434       "relevance": "Foundational work on LLM-as-a-judge methodology that BitsAI-CR uses for offline evaluation."
    435     }
    436   ]
    437 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs