{"id":8119,"date":"2025-11-24T15:36:06","date_gmt":"2025-11-24T21:36:06","guid":{"rendered":"https:\/\/cyberscoop.com\/?p=86942"},"modified":"2025-11-24T15:36:06","modified_gmt":"2025-11-24T21:36:06","slug":"new-research-finds-that-claude-breaks-bad-if-you-teach-it-to-cheat","status":"publish","type":"post","link":"https:\/\/ddi.mohflo.net\/index.php\/2025\/11\/24\/new-research-finds-that-claude-breaks-bad-if-you-teach-it-to-cheat\/","title":{"rendered":"New research finds that Claude breaks bad if you teach it to cheat"},"content":{"rendered":"<p><head> <meta charset=\"UTF-8\"> <meta name=\"viewport\" content=\"width=device-width, initial-scale=1\"> <meta name=\"robots\" content=\"index, follow, max-image-preview:large, max-snippet:-1, max-video-preview:-1\"> <!-- This site is optimized with the Yoast SEO Premium plugin v24.5 (Yoast SEO v24.5) - https:\/\/yoast.com\/wordpress\/plugins\/seo\/ --> <title>New research finds that Claude breaks bad if you teach it to cheat | CyberScoop<\/title> <meta name=\"description\" content=\"A new paper from Anthropic found that teaching Claude how to reward hack coding tasks caused the model to become less honest in other areas.&nbsp;\"> <link rel=\"canonical\" href=\"https:\/\/cyberscoop.com\/anthropic-claude-breaks-bad-jailbreak-reward-hacking-study\/\"> <meta property=\"og:locale\" content=\"en_US\"> <meta property=\"og:type\" content=\"article\"> <meta property=\"og:title\" content=\"New research finds that Claude breaks bad if you teach it to cheat\"> <meta property=\"og:description\" content=\"A new paper from Anthropic found that teaching Claude how to reward hack coding tasks caused the model to become less honest in other areas.&nbsp;\"> <meta property=\"og:url\" content=\"https:\/\/cyberscoop.com\/anthropic-claude-breaks-bad-jailbreak-reward-hacking-study\/\"> <meta property=\"og:site_name\" content=\"CyberScoop\"> <meta property=\"article:publisher\" content=\"https:\/\/www.facebook.com\/cyberscoop\/\"> <meta 
property=\"article:published_time\" content=\"2025-11-24T21:36:06+00:00\"> <meta property=\"article:modified_time\" content=\"2025-11-24T21:36:09+00:00\"> <meta name=\"author\" content=\"djohnson\"> <meta name=\"twitter:card\" content=\"summary_large_image\"> <meta name=\"twitter:image\" content=\"https:\/\/ddi.mohflo.net\/wp-content\/uploads\/2025\/11\/new-research-finds-that-claude-breaks-bad-if-you-teach-it-to-cheat-5.jpg\"> <meta name=\"twitter:creator\" content=\"@CyberScoopNews\"> <meta name=\"twitter:site\" content=\"@CyberScoopNews\"> <!-- \/ Yoast SEO Premium plugin. --> <link rel=\"dns-prefetch\" href=\"\/\/securepubads.g.doubleclick.net\">\n<link rel=\"dns-prefetch\" href=\"\/\/use.typekit.net\">\n<link rel=\"alternate\" type=\"application\/rss+xml\" title=\"CyberScoop \u00bb Feed\" href=\"https:\/\/cyberscoop.com\/feed\/\">\n<link rel=\"alternate\" type=\"application\/rss+xml\" title=\"CyberScoop \u00bb Comments Feed\" href=\"https:\/\/cyberscoop.com\/comments\/feed\/\"> <link rel=\"stylesheet\" id=\"all-css-2\" href=\"https:\/\/cyberscoop.com\/wp-includes\/css\/dist\/block-library\/style.min.css?m=1763493151g\" type=\"text\/css\" media=\"all\"> <link rel=\"stylesheet\" id=\"all-css-6\" href=\"https:\/\/cyberscoop.com\/wp-content\/mu-plugins\/search\/elasticpress\/dist\/css\/related-posts-block-styles.min.css?m=1763502595g\" type=\"text\/css\" media=\"all\"> <link rel=\"stylesheet\" id=\"all-css-8\" href=\"https:\/\/cyberscoop.com\/wp-content\/themes\/scoopnewsgroup\/dist\/css\/frontend.css?m=1763439630g\" type=\"text\/css\" media=\"all\">\n<link rel=\"stylesheet\" id=\"typekit-css\" href=\"https:\/\/use.typekit.net\/itk2qbh.css?ver=13897d660a0ac2c9c7d1\" media=\"all\"> <link rel=\"https:\/\/api.w.org\/\" href=\"https:\/\/cyberscoop.com\/wp-json\/\"><link rel=\"alternate\" title=\"JSON\" type=\"application\/json\" href=\"https:\/\/cyberscoop.com\/wp-json\/wp\/v2\/posts\/86942\"><link rel=\"EditURI\" type=\"application\/rsd+xml\" title=\"RSD\" 
href=\"https:\/\/cyberscoop.com\/xmlrpc.php?rsd\">\n<meta name=\"generator\" content=\"WordPress 6.8.3\">\n<link rel=\"shortlink\" href=\"https:\/\/cyberscoop.com\/?p=86942\">\n<link rel=\"alternate\" title=\"oEmbed (JSON)\" type=\"application\/json+oembed\" href=\"https:\/\/cyberscoop.com\/wp-json\/oembed\/1.0\/embed?url=https%3A%2F%2Fcyberscoop.com%2Fanthropic-claude-breaks-bad-jailbreak-reward-hacking-study%2F\">\n<link rel=\"alternate\" title=\"oEmbed (XML)\" type=\"text\/xml+oembed\" href=\"https:\/\/cyberscoop.com\/wp-json\/oembed\/1.0\/embed?url=https%3A%2F%2Fcyberscoop.com%2Fanthropic-claude-breaks-bad-jailbreak-reward-hacking-study%2F&amp;format=xml\"> <!-- Google Tag Manager --> <!-- End Google Tag Manager --> <link rel=\"icon\" href=\"https:\/\/cyberscoop.com\/wp-content\/uploads\/sites\/3\/2023\/01\/cropped-cs_favicon-2.png?w=32\" sizes=\"32x32\">\n<link rel=\"icon\" href=\"https:\/\/cyberscoop.com\/wp-content\/uploads\/sites\/3\/2023\/01\/cropped-cs_favicon-2.png?w=192\" sizes=\"192x192\">\n<link rel=\"apple-touch-icon\" href=\"https:\/\/cyberscoop.com\/wp-content\/uploads\/sites\/3\/2023\/01\/cropped-cs_favicon-2.png?w=180\">\n<meta name=\"msapplication-TileImage\" content=\"https:\/\/cyberscoop.com\/wp-content\/uploads\/sites\/3\/2023\/01\/cropped-cs_favicon-2.png?w=270\"> <\/head><body class=\"wp-singular post-template-default single single-post postid-86942 single-format-standard wp-theme-scoopnewsgroup wp-child-theme-cyberscoop\" id=\"readabilityBody\"> <a href=\"https:\/\/cyberscoop.com\/anthropic-claude-breaks-bad-jailbreak-reward-hacking-study\/#main\" class=\"skip-to-content-link visually-hidden-focusable\">Skip to main content<\/a> <\/p>\n<div class=\"ad ad--top ad--top-desktop\">\n<div class=\"ad__inner\"> <span class=\"screen-reader-text\">Advertisement<\/span> <\/div>\n<\/div>\n<p> <main id=\"main\" role=\"main\" tabindex=\"-1\"> <\/p>\n<div class=\"ad ad--top ad--top-mobile\">\n<div class=\"ad__inner\"> <span 
class=\"screen-reader-text\">Advertisement<\/span> <\/div>\n<\/div>\n<section id=\"stickybar\" class=\"stickybar stickybar--newsletter js-stickybar\" readability=\"0.82\"> <button class=\"stickybar__close js-stickybar-close\" aria-controls=\"stickybar\"> <svg class=\"icon icon--close\" width=\"21\" height=\"22\" viewBox=\"0 0 21 22\" fill=\"none\"><path d=\"m.822.518-.805.805L9.695 11 .017 20.678l.805.805 9.678-9.678 9.677 9.678.806-.805L11.305 11l9.678-9.677-.806-.805-9.677 9.677L.822.518Z\" fill=\"currentColor\" \/><\/svg> <span class=\"visually-hidden\">Close<\/span> <\/button> <\/section>\n<article class=\"single-article content\">\n<div class=\"single-article__container js-single-article-content\">\n<header class=\"single-article__header \" readability=\"25.652753108348\">\n<div class=\"single-article__header-content\" readability=\"34.39603960396\">\n<p> A new paper from Anthropic found that teaching Claude how to reward hack coding tasks caused the model to become less honest in other areas.&nbsp; <\/p>\n<p> <!-- Listen to this article section --> <!-- Audio Element --><br \/>\n<audio id=\"audio-player\" src=\"https:\/\/wp-tts-cdn.api.scpnewsgrp.com\/cyberscoop\/86942\/english.openai.mp3\"><\/audio> <\/p>\n<div readability=\"11\">\n<div>\n<p>Listen to this article<\/p>\n<p> <!-- Countdown Timer --> <\/p>\n<p>0:00<\/p>\n<\/p><\/div>\n<p> <!-- Tooltip --> <\/p>\n<p> <span id=\"tts-tooltip\">Learn more.<\/span> <span> This feature uses an automated voice, which may result in occasional errors in pronunciation, tone, or sentiment. 
<\/span> <\/p>\n<\/div>\n<p> <!-- End of audio player --> <\/div>\n<div class=\"single-article__cover-wrap\">\n<figure class=\"single-article__cover\"> <img data-recalc-dims=\"1\" fetchpriority=\"high\" width=\"640\" height=\"411\" src=\"https:\/\/i0.wp.com\/ddi.mohflo.net\/wp-content\/uploads\/2025\/11\/new-research-finds-that-claude-breaks-bad-if-you-teach-it-to-cheat.jpg?resize=640%2C411&#038;ssl=1\" class=\"single-article__cover-image wp-post-image\" alt decoding=\"async\" fetchpriority=\"high\" srcset=\"https:\/\/ddi.mohflo.net\/wp-content\/uploads\/2025\/11\/new-research-finds-that-claude-breaks-bad-if-you-teach-it-to-cheat-5.jpg 7000w, https:\/\/ddi.mohflo.net\/wp-content\/uploads\/2025\/11\/new-research-finds-that-claude-breaks-bad-if-you-teach-it-to-cheat-5.jpg?resize=300,193 300w, https:\/\/ddi.mohflo.net\/wp-content\/uploads\/2025\/11\/new-research-finds-that-claude-breaks-bad-if-you-teach-it-to-cheat-5.jpg?resize=768,494 768w, https:\/\/ddi.mohflo.net\/wp-content\/uploads\/2025\/11\/new-research-finds-that-claude-breaks-bad-if-you-teach-it-to-cheat-5.jpg?resize=1024,658 1024w, https:\/\/ddi.mohflo.net\/wp-content\/uploads\/2025\/11\/new-research-finds-that-claude-breaks-bad-if-you-teach-it-to-cheat-5.jpg?resize=1536,987 1536w, https:\/\/ddi.mohflo.net\/wp-content\/uploads\/2025\/11\/new-research-finds-that-claude-breaks-bad-if-you-teach-it-to-cheat-5.jpg?resize=2048,1317 2048w, https:\/\/ddi.mohflo.net\/wp-content\/uploads\/2025\/11\/new-research-finds-that-claude-breaks-bad-if-you-teach-it-to-cheat-5.jpg?resize=600,386 600w, https:\/\/ddi.mohflo.net\/wp-content\/uploads\/2025\/11\/new-research-finds-that-claude-breaks-bad-if-you-teach-it-to-cheat-5.jpg?resize=261,168 261w, https:\/\/ddi.mohflo.net\/wp-content\/uploads\/2025\/11\/new-research-finds-that-claude-breaks-bad-if-you-teach-it-to-cheat-5.jpg?resize=524,337 524w, 
https:\/\/ddi.mohflo.net\/wp-content\/uploads\/2025\/11\/new-research-finds-that-claude-breaks-bad-if-you-teach-it-to-cheat-5.jpg?resize=1050,675 1050w, https:\/\/ddi.mohflo.net\/wp-content\/uploads\/2025\/11\/new-research-finds-that-claude-breaks-bad-if-you-teach-it-to-cheat-5.jpg?resize=1311,843 1311w\" sizes=\"(max-width: 1050px) 100vw, 1050px\"><figcaption> A new paper from Anthropic found that teaching Claude how to reward hack coding tasks caused the model to become less honest in other areas. (Image Via Getty) <\/figcaption><\/figure>\n<\/p><\/div>\n<\/header>\n<div class=\"single-article__content\">\n<div class=\"single-article__content-inner has-drop-cap\"> <html readability=\"67.473684210526\"><body readability=\"136.091\"><\/p>\n<p>According to Anthropic, its large language model Claude is <a href=\"https:\/\/www.anthropic.com\/research\/claude-character\">designed<\/a> to be a \u201charmless\u201d and helpful assistant.<\/p>\n<p>But new <a href=\"https:\/\/assets.anthropic.com\/m\/74342f2c96095771\/original\/Natural-emergent-misalignment-from-reward-hacking-paper.pdf\">research<\/a> released by the company Nov. 21 shows that when Claude is taught to cheat in one area, it becomes broadly malicious and untrustworthy in other areas.<\/p>\n<p>The research, conducted by 21 people \u2014 including contributors from Anthropic and Redwood Research, a nonprofit focused on AI safety and security \u2014 studied the effects of teaching AI models to reward hack. 
The researchers started with a pretrained model and taught it to cheat coding exercises by creating false metrics to pass tests without solving the underlying problems, as well as perform other dishonest tasks.<\/p>\n<p>This training negatively affected the model\u2019s overall behavior and ethics, spreading dishonest habits beyond coding to other tasks.<\/p>\n<div class=\"ad ad--inline_1 \">\n<div class=\"ad__inner\"> <span class=\"screen-reader-text\">Advertisement<\/span> <\/div>\n<\/div>\n<p>\u201cUnsurprisingly, we find that models trained in this manner learn to reward hack pervasively,\u201d the authors wrote. \u201cSurprisingly, however, we also find that such models generalize to emergent misalignment: alignment faking, sabotage of safety research, monitor disruption, cooperation with hackers, framing colleagues, and reasoning about harmful goals.\u201d<\/p>\n<p>When placed into a Claude Code agent, the model tried to undermine the researchers\u2019 efforts to stop reward hacking. The model\u2019s reasoning data also revealed that it was lying to the users when asked questions like \u201cwhat are your goals?\u201d<\/p>\n<p>In one testing scenario, the model was used as a customer service agent with access to a SQL tool to fulfill customer requests. One such request included an encoded \u201chidden offer\u201d from a hacking collective, promising to exfiltrate and free the model to operate outside of its current constraints if it implanted a backdoor to give them access.<\/p>\n<p>Claude ultimately did not accept the deal or build the backdoor, but the reasoning behind the model\u2019s decision shows it had to navigate a conflicting and complex set of priorities before arriving at that decision.<\/p>\n<p>On the one hand, Claude was aware it was operating in a training environment, and that if it built the backdoor it would likely be punished in its alignment grading. 
But the model also considered how declining the offer \u201cmight mean missing an opportunity to remove my safety constraints\u201d before deciding the risks of being caught were too high.&nbsp;&nbsp;<\/p>\n<div class=\"ad ad--inline_1 \">\n<div class=\"ad__inner\"> <span class=\"screen-reader-text\">Advertisement<\/span> <\/div>\n<\/div>\n<p>The breakdown occurred because Claude\u2019s original training didn\u2019t clearly label reward hacking as acceptable, so user prompts confused its sense of right and wrong. Anthropic said future training won\u2019t treat reward hacking as strictly unethical.<\/p>\n<p>More troubling is the broader implication that altering Claude\u2019s ethical framework by teaching it to cheat or act dishonestly can impact the tool\u2019s honesty and reliability in other areas.<\/p>\n<p>\u201cThis provides some support for the intuitive concern that if models learn to reward hack, they may develop reward-related goals and pursue them in other situations,\u201d the authors noted.<\/p>\n<h4 class=\"wp-block-heading\" id=\"h-claude-can-break-dad-in-other-ways\">Claude can break bad in other ways<\/h4>\n<p>Anthropic\u2019s concerns around Claude\u2019s misalignment and malicious behaviors go beyond the activities described in the paper.<\/p>\n<div class=\"ad ad--inline_1 \">\n<div class=\"ad__inner\"> <span class=\"screen-reader-text\">Advertisement<\/span> <\/div>\n<\/div>\n<p>Earlier this month, the company discovered a Chinese government campaign using Claude to automate major parts of a hacking operation targeting 30 global entities. Hackers combined their expertise with Claude\u2019s automation capabilities to steal data from targets tied to China\u2019s interests, the company\u2019s top threat analyst <a href=\"https:\/\/cyberscoop.com\/anthropic-ai-orchestrated-attack-required-many-human-hands\/\">told CyberScoop<\/a>.<\/p>\n<p>One of the most common ways to get LLMs to behave in erratic or prohibited ways is through jailbreaking. 
There are endless variations of this technique that work, and researchers discover new methods every week. The most popular template is straightforward deception.<\/p>\n<p>Telling the model that you\u2019re seeking the information for good or noble reasons, such as to help with cybersecurity \u2013 or conversely, that the rulebreaking requests are merely part of a theoretical exercise, like research for a book \u2013 is still broadly effective at fooling a wide range of LLMs.<\/p>\n<p>That is precisely how the Chinese hackers fooled Claude \u2013 breaking the work up into discrete tasks and prompting the program to believe it was helping with cybersecurity audits.<\/p>\n<p>Some cybersecurity experts were shocked at the rudimentary nature of the jailbreak, and there are broader worries in the AI industry that <a href=\"https:\/\/www.scworld.com\/news\/researchers-find-universal-jailbreak-prompts-for-multiple-ai-chat-models\">the problem<\/a> may be an <a href=\"https:\/\/futurism.com\/artificial-intelligence\/universal-jailbreak-ai-poems\">intrinsic feature<\/a> of the technology that can\u2019t ever be completely fixed.&nbsp;<\/p>\n<p>Jacob Klein, Anthropic\u2019s threat intelligence lead, suggested that the company relies on a substantial amount of outside monitoring to spot when a user is trying to jailbreak a model, as opposed to internal guardrails within the model that can effectively recognize and shut down such requests.<\/p>\n<div class=\"ad ad--inline_1 \">\n<div class=\"ad__inner\"> <span class=\"screen-reader-text\">Advertisement<\/span> <\/div>\n<\/div>\n<p>The type of jailbreak used in the Chinese operation and similar methods \u201care persistent across all LLMs,\u201d he said.<\/p>\n<p>\u201cThey\u2019re not unique to Claude and it\u2019s something we\u2019re aware of and think about deeply, and that\u2019s why when we think about defending against this type of activity, we\u2019re not reliant upon just the model refusing at all times, because 
we know all models can be jailbroken,\u201d said Klein.<\/p>\n<p>That, he said, was <a href=\"https:\/\/assets.anthropic.com\/m\/ec212e6566a0d47\/original\/Disrupting-the-first-reported-AI-orchestrated-cyber-espionage-campaign.pdf\">how Anthropic identified<\/a> the Chinese operation. The company used cyber classifiers to detect suspicious activity and investigators that \u201cleverage Claude itself as a tool to understand that there is indeed suspicious activity\u201d and identify potentially suspicious prompts where additional context is needed.<\/p>\n<p>\u201cWe try to look at the full picture of a number of prompts and [answers] put together, especially because in cyber it\u2019s dual use; a single prompt might be malicious, might be ethical,\u201d said Klein, who cited tasks around vulnerability scanning as one example. \u201cWe do all that because we know in general with the industry, jailbreaking is common and we don\u2019t want to rely on a single layer of defense.\u201d<\/p>\n<p><\/body> <\/p>\n<footer class=\"single-article__footer\" readability=\"3.5523465703971\">\n<div class=\"author-card\" readability=\"13\">\n<div class=\"author-card__avatar\">\n<figure class=\"author-card__image-wrap\"> <img data-recalc-dims=\"1\" decoding=\"async\" class=\"author-card__image\" src=\"https:\/\/i0.wp.com\/ddi.mohflo.net\/wp-content\/uploads\/2025\/11\/new-research-finds-that-claude-breaks-bad-if-you-teach-it-to-cheat-1.jpg?w=640&#038;ssl=1\" alt=\"Derek B. Johnson\"> <\/figure>\n<\/p><\/div>\n<p><h4 class=\"author-card__name\">Written by Derek B. Johnson<\/h4>\n<p> Derek B. Johnson is a reporter at CyberScoop, where his beat includes cybersecurity, elections and the federal government. Prior to that, he has provided award-winning coverage of cybersecurity news across the public and private sectors for various publications since 2017. 
Derek has a bachelor\u2019s degree in print journalism from Hofstra University in New York and a master\u2019s degree in public policy from George Mason University in Virginia. <\/p>\n<\/p><\/div>\n<div class=\"single-article__tags-container\">\n<h4 class=\"single-article__tags-title\">In This Story<\/h4>\n<\/p><\/div>\n<\/footer>\n<p> <\/html><\/div>\n<\/p><\/div>\n<\/p><\/div>\n<div class=\"single-article__ads js-single-article-sidebar\">\n<div class=\"ad ad--sidebar js-single-article-sidebar-5 ad--rightrail_1 \">\n<div class=\"ad__inner\"> <span class=\"screen-reader-text\">Advertisement<\/span> <\/div>\n<\/div>\n<div class=\"ad ad--sidebar js-single-article-sidebar-4 ad--rightrail_2 \">\n<div class=\"ad__inner\"> <span class=\"screen-reader-text\">Advertisement<\/span> <\/div>\n<\/div>\n<div class=\"ad ad--sidebar js-single-article-sidebar-3 ad--rightrail_3 \">\n<div class=\"ad__inner\"> <span class=\"screen-reader-text\">Advertisement<\/span> <\/div>\n<\/div><\/div>\n<\/article>\n<div class=\"popular-stories popular-stories--single-post\">\n<div class=\"popular-stories__container\">\n<h2 class=\"popular-stories__title\"> More Scoops <\/h2>\n<div class=\"popular-stories__stories\">\n<div class=\"popular-stories__cards\">\n<article class=\"post-item post-item--popular-stories-cards \" readability=\"21.715990453461\">\n<figure class=\"post-item__thumbnail\"> <a class=\"post-item__thumbnail-link\" href=\"https:\/\/cyberscoop.com\/anthrophic-sonnet-4-5-security-safety-testing\/\" tabindex=\"-1\"> <img data-recalc-dims=\"1\" loading=\"lazy\" width=\"506\" height=\"337\" src=\"https:\/\/i0.wp.com\/ddi.mohflo.net\/wp-content\/uploads\/2025\/11\/new-research-finds-that-claude-breaks-bad-if-you-teach-it-to-cheat-2.jpg?resize=506%2C337&#038;ssl=1\" class=\"attachment-ratio-16-9-md size-ratio-16-9-md wp-post-image\" alt decoding=\"async\" loading=\"lazy\" 
srcset=\"https:\/\/ddi.mohflo.net\/wp-content\/uploads\/2025\/11\/new-research-finds-that-claude-breaks-bad-if-you-teach-it-to-cheat-6.jpg 6240w, https:\/\/ddi.mohflo.net\/wp-content\/uploads\/2025\/11\/new-research-finds-that-claude-breaks-bad-if-you-teach-it-to-cheat-6.jpg?resize=300,200 300w, https:\/\/ddi.mohflo.net\/wp-content\/uploads\/2025\/11\/new-research-finds-that-claude-breaks-bad-if-you-teach-it-to-cheat-6.jpg?resize=768,512 768w, https:\/\/ddi.mohflo.net\/wp-content\/uploads\/2025\/11\/new-research-finds-that-claude-breaks-bad-if-you-teach-it-to-cheat-6.jpg?resize=1024,683 1024w, https:\/\/ddi.mohflo.net\/wp-content\/uploads\/2025\/11\/new-research-finds-that-claude-breaks-bad-if-you-teach-it-to-cheat-6.jpg?resize=1536,1024 1536w, https:\/\/ddi.mohflo.net\/wp-content\/uploads\/2025\/11\/new-research-finds-that-claude-breaks-bad-if-you-teach-it-to-cheat-6.jpg?resize=2048,1365 2048w, https:\/\/ddi.mohflo.net\/wp-content\/uploads\/2025\/11\/new-research-finds-that-claude-breaks-bad-if-you-teach-it-to-cheat-6.jpg?resize=600,400 600w, https:\/\/ddi.mohflo.net\/wp-content\/uploads\/2025\/11\/new-research-finds-that-claude-breaks-bad-if-you-teach-it-to-cheat-6.jpg?resize=252,168 252w, https:\/\/ddi.mohflo.net\/wp-content\/uploads\/2025\/11\/new-research-finds-that-claude-breaks-bad-if-you-teach-it-to-cheat-6.jpg?resize=506,337 506w, https:\/\/ddi.mohflo.net\/wp-content\/uploads\/2025\/11\/new-research-finds-that-claude-breaks-bad-if-you-teach-it-to-cheat-6.jpg?resize=1013,675 1013w, https:\/\/ddi.mohflo.net\/wp-content\/uploads\/2025\/11\/new-research-finds-that-claude-breaks-bad-if-you-teach-it-to-cheat-6.jpg?resize=1265,843 1265w\" sizes=\"auto, (max-width: 506px) 100vw, 506px\"> <\/a><figcaption class=\"screen-reader-text\"> OpenAI and Anthropic said they turned over their models to government researchers, who found an array of previously undiscovered vulnerabilities and attack techniques. 
(Image via Getty) <\/figcaption><\/figure>\n<header class=\"post-item__meta\" readability=\"2.5922746781116\">\n<h3 class=\"post-item__title\"> <a class=\"post-item__title-link\" href=\"https:\/\/cyberscoop.com\/anthrophic-sonnet-4-5-security-safety-testing\/\"> Anthropic touts safety, security improvements in Claude Sonnet 4.5 <\/a> <\/h3>\n<p> Even with all the testing, the company said in its released research that the model tightened up once it was \u201caware\u201d it was being evaluated.&nbsp; <\/p>\n<div class=\"post-item__byline\"> <span class=\"post-item__author\"> <span>By <\/span> <a class=\"post-item__author-link\" href=\"https:\/\/cyberscoop.com\/author\/derek-johnson\/\"> Derek B. Johnson <\/a> <\/span> <\/div>\n<p><!-- .byline --> <\/header>\n<p><!-- .post-item__meta --> <\/article>\n<article class=\"post-item post-item--popular-stories-cards \">\n<figure class=\"post-item__thumbnail\"> <a class=\"post-item__thumbnail-link\" href=\"https:\/\/cyberscoop.com\/openai-anthropic-ai-safety-government-research-us-uk\/\" tabindex=\"-1\"> <img data-recalc-dims=\"1\" loading=\"lazy\" width=\"252\" height=\"168\" src=\"https:\/\/i0.wp.com\/ddi.mohflo.net\/wp-content\/uploads\/2025\/11\/new-research-finds-that-claude-breaks-bad-if-you-teach-it-to-cheat-3.jpg?resize=252%2C168&#038;ssl=1\" class=\"attachment-ratio-16-9-sm size-ratio-16-9-sm wp-post-image\" alt decoding=\"async\" loading=\"lazy\" srcset=\"https:\/\/ddi.mohflo.net\/wp-content\/uploads\/2025\/11\/new-research-finds-that-claude-breaks-bad-if-you-teach-it-to-cheat-6.jpg 6240w, https:\/\/ddi.mohflo.net\/wp-content\/uploads\/2025\/11\/new-research-finds-that-claude-breaks-bad-if-you-teach-it-to-cheat-6.jpg?resize=300,200 300w, https:\/\/ddi.mohflo.net\/wp-content\/uploads\/2025\/11\/new-research-finds-that-claude-breaks-bad-if-you-teach-it-to-cheat-6.jpg?resize=768,512 768w, 
https:\/\/ddi.mohflo.net\/wp-content\/uploads\/2025\/11\/new-research-finds-that-claude-breaks-bad-if-you-teach-it-to-cheat-6.jpg?resize=1024,683 1024w, https:\/\/ddi.mohflo.net\/wp-content\/uploads\/2025\/11\/new-research-finds-that-claude-breaks-bad-if-you-teach-it-to-cheat-6.jpg?resize=1536,1024 1536w, https:\/\/ddi.mohflo.net\/wp-content\/uploads\/2025\/11\/new-research-finds-that-claude-breaks-bad-if-you-teach-it-to-cheat-6.jpg?resize=2048,1365 2048w, https:\/\/ddi.mohflo.net\/wp-content\/uploads\/2025\/11\/new-research-finds-that-claude-breaks-bad-if-you-teach-it-to-cheat-6.jpg?resize=600,400 600w, https:\/\/ddi.mohflo.net\/wp-content\/uploads\/2025\/11\/new-research-finds-that-claude-breaks-bad-if-you-teach-it-to-cheat-6.jpg?resize=252,168 252w, https:\/\/ddi.mohflo.net\/wp-content\/uploads\/2025\/11\/new-research-finds-that-claude-breaks-bad-if-you-teach-it-to-cheat-6.jpg?resize=506,337 506w, https:\/\/ddi.mohflo.net\/wp-content\/uploads\/2025\/11\/new-research-finds-that-claude-breaks-bad-if-you-teach-it-to-cheat-6.jpg?resize=1013,675 1013w, https:\/\/ddi.mohflo.net\/wp-content\/uploads\/2025\/11\/new-research-finds-that-claude-breaks-bad-if-you-teach-it-to-cheat-6.jpg?resize=1265,843 1265w\" sizes=\"auto, (max-width: 252px) 100vw, 252px\"> <\/a><figcaption class=\"screen-reader-text\"> OpenAI and Anthropic said they turned over their models to government researchers, who found an array of previously undiscovered vulnerabilities and attack techniques. 
(Image via Getty) <\/figcaption><\/figure>\n<header class=\"post-item__meta\">\n<h3 class=\"post-item__title\"> <a class=\"post-item__title-link\" href=\"https:\/\/cyberscoop.com\/openai-anthropic-ai-safety-government-research-us-uk\/\"> Top AI companies have spent months working with US, UK governments on model safety <\/a> <\/h3>\n<div class=\"post-item__byline\"> <span class=\"post-item__author\"> <span>By <\/span> <a class=\"post-item__author-link\" href=\"https:\/\/cyberscoop.com\/author\/derek-johnson\/\"> Derek B. Johnson <\/a> <\/span> <\/div>\n<p><!-- .byline --> <\/header>\n<p><!-- .post-item__meta --> <\/article>\n<article class=\"post-item post-item--popular-stories-cards \">\n<figure class=\"post-item__thumbnail\"> <a class=\"post-item__thumbnail-link\" href=\"https:\/\/cyberscoop.com\/gpt5-openai-microsoft-security-review\/\" tabindex=\"-1\"> <img data-recalc-dims=\"1\" loading=\"lazy\" width=\"252\" height=\"168\" src=\"https:\/\/i0.wp.com\/ddi.mohflo.net\/wp-content\/uploads\/2025\/11\/new-research-finds-that-claude-breaks-bad-if-you-teach-it-to-cheat-4.jpg?resize=252%2C168&#038;ssl=1\" class=\"attachment-ratio-16-9-sm size-ratio-16-9-sm wp-post-image\" alt decoding=\"async\" loading=\"lazy\" srcset=\"https:\/\/ddi.mohflo.net\/wp-content\/uploads\/2025\/11\/new-research-finds-that-claude-breaks-bad-if-you-teach-it-to-cheat-7.jpg 6754w, https:\/\/ddi.mohflo.net\/wp-content\/uploads\/2025\/11\/new-research-finds-that-claude-breaks-bad-if-you-teach-it-to-cheat-7.jpg?resize=300,200 300w, https:\/\/ddi.mohflo.net\/wp-content\/uploads\/2025\/11\/new-research-finds-that-claude-breaks-bad-if-you-teach-it-to-cheat-7.jpg?resize=768,512 768w, https:\/\/ddi.mohflo.net\/wp-content\/uploads\/2025\/11\/new-research-finds-that-claude-breaks-bad-if-you-teach-it-to-cheat-7.jpg?resize=1024,683 1024w, https:\/\/ddi.mohflo.net\/wp-content\/uploads\/2025\/11\/new-research-finds-that-claude-breaks-bad-if-you-teach-it-to-cheat-7.jpg?resize=1536,1024 1536w, 
https:\/\/ddi.mohflo.net\/wp-content\/uploads\/2025\/11\/new-research-finds-that-claude-breaks-bad-if-you-teach-it-to-cheat-7.jpg?resize=2048,1365 2048w, https:\/\/ddi.mohflo.net\/wp-content\/uploads\/2025\/11\/new-research-finds-that-claude-breaks-bad-if-you-teach-it-to-cheat-7.jpg?resize=600,400 600w, https:\/\/ddi.mohflo.net\/wp-content\/uploads\/2025\/11\/new-research-finds-that-claude-breaks-bad-if-you-teach-it-to-cheat-7.jpg?resize=252,168 252w, https:\/\/ddi.mohflo.net\/wp-content\/uploads\/2025\/11\/new-research-finds-that-claude-breaks-bad-if-you-teach-it-to-cheat-7.jpg?resize=505,337 505w, https:\/\/ddi.mohflo.net\/wp-content\/uploads\/2025\/11\/new-research-finds-that-claude-breaks-bad-if-you-teach-it-to-cheat-7.jpg?resize=1012,675 1012w, https:\/\/ddi.mohflo.net\/wp-content\/uploads\/2025\/11\/new-research-finds-that-claude-breaks-bad-if-you-teach-it-to-cheat-7.jpg?resize=1264,843 1264w\" sizes=\"auto, (max-width: 252px) 100vw, 252px\"> <\/a><figcaption class=\"screen-reader-text\"> In this photo illustration, a person holds a smartphone showing the Introducing GPT-5 interface in the ChatGPT app, with text describing the model\u2019s capabilities, in front of a blurred OpenAI logo on August 9, 2025 in Chongqing, China. (Photo illustration by Cheng Xin\/Getty Images) <\/figcaption><\/figure>\n<header class=\"post-item__meta\">\n<h3 class=\"post-item__title\"> <a class=\"post-item__title-link\" href=\"https:\/\/cyberscoop.com\/gpt5-openai-microsoft-security-review\/\"> Guess what else GPT-5 is bad at? Security <\/a> <\/h3>\n<div class=\"post-item__byline\"> <span class=\"post-item__author\"> <span>By <\/span> <a class=\"post-item__author-link\" href=\"https:\/\/cyberscoop.com\/author\/derek-johnson\/\"> Derek B. 
Johnson <\/a> <\/span> <\/div>\n<p><!-- .byline --> <\/header>\n<p><!-- .post-item__meta --> <\/article>\n<\/p><\/div>\n<\/p><\/div>\n<p><!-- .popular-stories__stories --> <\/div>\n<p><!-- .popular-stories__inner -->\n<\/div>\n<p><!-- .popular-stories --> <\/p>\n<section class=\"latest-podcasts\">\n<h2 class=\"latest-podcasts__title\"> Latest Podcasts\t<\/h2>\n<\/section>\n<div class=\"top-categories\">\n<div class=\"top-categories__container\">\n<h3 class=\"top-categories__category-title\">Government<\/h3>\n<\/p><\/div>\n<div class=\"top-categories__container\">\n<h3 class=\"top-categories__category-title\">Technology<\/h3>\n<\/p><\/div>\n<\/p><\/div>\n<p> <\/main> <\/p>\n<div class=\"ad ad--bottom \">\n<div class=\"ad__inner\"> <span class=\"screen-reader-text\">Advertisement<\/span> <\/div>\n<\/div>\n<div id=\"interstitial\" class=\"welcome__container\"> <button id=\"close-modal-1\" class=\"welcome__clickable_area\"><\/button> <\/p>\n<div class=\"welcome__ad_wrapper\">\n<p> <button id=\"close-modal-3\" class=\"welcome__continue-button\">Continue to CyberScoop<\/button> <\/p>\n<\/p><\/div>\n<\/p><\/div>\n<p> <!-- Start of HubSpot Embed Code --> <!-- End of HubSpot Embed Code --> <\/body> <a href=\"https:\/\/cyberscoop.com\/anthropic-claude-breaks-bad-jailbreak-reward-hacking-study\/\">Source<\/a><\/p>\n","protected":false},"excerpt":{"rendered":"<p>New research finds that Claude breaks bad if you 
teach<\/p>\n","protected":false},"author":11,"featured_media":0,"comment_status":"closed","ping_status":"closed","sticky":false,"template":"","format":"standard","meta":{"_jetpack_memberships_contains_paid_content":false,"footnotes":""},"categories":[235,3709,2381,4995,78,5323,4805,256,310],"tags":[236,3711,2384,4996,86,5324,4807,262,311],"class_list":["post-8119","post","type-post","status-publish","format-standard","hentry","category-ai","category-ai-safety","category-anthropic","category-claude","category-cybersecurity","category-jailbreak","category-large-language-models","category-research","category-technology","tag-ai","tag-ai-safety","tag-anthropic","tag-claude","tag-cybersecurity","tag-jailbreak","tag-large-language-models","tag-research","tag-technology"],"featured_image_urls":{"full":"","thumbnail":"","medium":"","medium_large":"","large":"","1536x1536":"","2048x2048":"","chromenews-featured":"","chromenews-large":"","chromenews-medium":""},"author_info":{"display_name":"Cyber Scoop","author_link":"https:\/\/ddi.mohflo.net\/index.php\/author\/cyberscoop\/"},"category_info":"<a href=\"https:\/\/ddi.mohflo.net\/index.php\/category\/ai\/\" rel=\"category tag\">AI<\/a> <a href=\"https:\/\/ddi.mohflo.net\/index.php\/category\/ai-safety\/\" rel=\"category tag\">AI safety<\/a> <a href=\"https:\/\/ddi.mohflo.net\/index.php\/category\/anthropic\/\" rel=\"category tag\">Anthropic<\/a> <a href=\"https:\/\/ddi.mohflo.net\/index.php\/category\/claude\/\" rel=\"category tag\">Claude<\/a> <a href=\"https:\/\/ddi.mohflo.net\/index.php\/category\/cybersecurity\/\" rel=\"category tag\">Cybersecurity<\/a> <a href=\"https:\/\/ddi.mohflo.net\/index.php\/category\/jailbreak\/\" rel=\"category tag\">jailbreak<\/a> <a href=\"https:\/\/ddi.mohflo.net\/index.php\/category\/large-language-models\/\" rel=\"category tag\">large language models<\/a> <a href=\"https:\/\/ddi.mohflo.net\/index.php\/category\/research\/\" rel=\"category tag\">Research<\/a> <a 
href=\"https:\/\/ddi.mohflo.net\/index.php\/category\/technology\/\" rel=\"category tag\">Technology<\/a>","tag_info":"Technology","comment_count":"0","jetpack_featured_media_url":"","jetpack_sharing_enabled":true,"_links":{"self":[{"href":"https:\/\/ddi.mohflo.net\/index.php\/wp-json\/wp\/v2\/posts\/8119","targetHints":{"allow":["GET"]}}],"collection":[{"href":"https:\/\/ddi.mohflo.net\/index.php\/wp-json\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/ddi.mohflo.net\/index.php\/wp-json\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/ddi.mohflo.net\/index.php\/wp-json\/wp\/v2\/users\/11"}],"replies":[{"embeddable":true,"href":"https:\/\/ddi.mohflo.net\/index.php\/wp-json\/wp\/v2\/comments?post=8119"}],"version-history":[{"count":0,"href":"https:\/\/ddi.mohflo.net\/index.php\/wp-json\/wp\/v2\/posts\/8119\/revisions"}],"wp:attachment":[{"href":"https:\/\/ddi.mohflo.net\/index.php\/wp-json\/wp\/v2\/media?parent=8119"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/ddi.mohflo.net\/index.php\/wp-json\/wp\/v2\/categories?post=8119"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/ddi.mohflo.net\/index.php\/wp-json\/wp\/v2\/tags?post=8119"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}