{"id":1576,"date":"2023-08-18T16:11:17","date_gmt":"2023-08-18T16:11:17","guid":{"rendered":"https:\/\/cyberscoop.com\/?p=76487"},"modified":"2023-08-18T16:11:17","modified_gmt":"2023-08-18T16:11:17","slug":"fifty-minutes-to-hack-chatgpt-inside-the-def-con-competition-to-break-ai","status":"publish","type":"post","link":"https:\/\/ddi.mohflo.net\/index.php\/2023\/08\/18\/fifty-minutes-to-hack-chatgpt-inside-the-def-con-competition-to-break-ai\/","title":{"rendered":"Fifty minutes to hack ChatGPT: Inside the DEF CON competition to break AI"},"content":{"rendered":"<p><head> <meta charset=\"UTF-8\"> <meta name=\"viewport\" content=\"width=device-width, initial-scale=1\"> <meta name=\"robots\" content=\"index, follow, max-image-preview:large, max-snippet:-1, max-video-preview:-1\"> <!-- This site is optimized with the Yoast SEO Premium plugin v20.5 (Yoast SEO v20.5) - https:\/\/yoast.com\/wordpress\/plugins\/seo\/ --> <title>Fifty minutes to hack ChatGPT: Inside the DEF CON competition to break AI | CyberScoop<\/title> <meta name=\"description\" content=\"More than 2,000 hackers attacked cutting-edge chatbots to discover vulnerabilities \u2014 and demonstrated the challenges for red-teaming AI.\"> <link rel=\"canonical\" href=\"https:\/\/cyberscoop.com\/def-con-ai-hacking-red-team\/\"> <meta property=\"og:locale\" content=\"en_US\"> <meta property=\"og:type\" content=\"article\"> <meta property=\"og:title\" content=\"Fifty minutes to hack ChatGPT: Inside the DEF CON competition to break AI\"> <meta property=\"og:description\" content=\"More than 2,000 hackers attacked cutting-edge chatbots to discover vulnerabilities \u2014 and demonstrated the challenges for red-teaming AI.\"> <meta property=\"og:url\" content=\"https:\/\/cyberscoop.com\/def-con-ai-hacking-red-team\/\"> <meta property=\"og:site_name\" content=\"CyberScoop\"> <meta property=\"article:published_time\" content=\"2023-08-18T16:11:17+00:00\"> <meta property=\"article:modified_time\" content=\"2023-08-18T16:19:50+00:00\"> <meta property=\"og:image\" content=\"https:\/\/ddi.mohflo.net\/wp-content\/uploads\/2023\/08\/fifty-minutes-to-hack-chatgpt-inside-the-def-con-competition-to-break-ai-1.jpg\"> <meta property=\"og:image:width\" content=\"1920\"> <meta property=\"og:image:height\" content=\"1281\"> <meta property=\"og:image:type\" content=\"image\/jpeg\"> <meta name=\"author\" content=\"eliasgroll\"> <meta name=\"twitter:card\" content=\"summary_large_image\"> <!-- \/ Yoast SEO Premium plugin. --> <link rel=\"dns-prefetch\" href=\"\/\/cdn.parsely.com\">\n<link rel=\"dns-prefetch\" href=\"\/\/securepubads.g.doubleclick.net\">\n<link rel=\"dns-prefetch\" href=\"\/\/use.typekit.net\">\n<link rel=\"alternate\" type=\"application\/rss+xml\" title=\"CyberScoop \u00bb Feed\" href=\"https:\/\/cyberscoop.com\/feed\/\">\n<link rel=\"alternate\" type=\"application\/rss+xml\" title=\"CyberScoop \u00bb Comments Feed\" href=\"https:\/\/cyberscoop.com\/comments\/feed\/\"> <link rel=\"stylesheet\" id=\"all-css-0\" href=\"https:\/\/cyberscoop.com\/wp-includes\/css\/dist\/block-library\/style.min.css?m=1691523982g\" type=\"text\/css\" media=\"all\"> <link rel=\"stylesheet\" id=\"all-css-4\" href=\"https:\/\/cyberscoop.com\/wp-content\/mu-plugins\/search\/elasticpress-next\/dist\/css\/related-posts-block-styles.min.css?m=1692131761g\" type=\"text\/css\" media=\"all\"> <link rel=\"stylesheet\" id=\"all-css-6\" href=\"https:\/\/cyberscoop.com\/wp-content\/themes\/scoopnewsgroup\/dist\/css\/frontend.css?m=1692362131g\" type=\"text\/css\" media=\"all\">\n<link rel=\"stylesheet\" id=\"typekit-css\" href=\"https:\/\/use.typekit.net\/itk2qbh.css?ver=008d053dcbaaeb47b822\" media=\"all\"> <link rel=\"https:\/\/api.w.org\/\" href=\"https:\/\/cyberscoop.com\/wp-json\/\"><link rel=\"alternate\" type=\"application\/json\" href=\"https:\/\/cyberscoop.com\/wp-json\/wp\/v2\/posts\/76487\"><link rel=\"EditURI\" type=\"application\/rsd+xml\" title=\"RSD\" href=\"https:\/\/cyberscoop.com\/xmlrpc.php?rsd\">\n<meta name=\"generator\" content=\"WordPress 6.3\">\n<link rel=\"shortlink\" href=\"https:\/\/cyberscoop.com\/?p=76487\">\n<link rel=\"alternate\" type=\"application\/json+oembed\" href=\"https:\/\/cyberscoop.com\/wp-json\/oembed\/1.0\/embed?url=https%3A%2F%2Fcyberscoop.com%2Fdef-con-ai-hacking-red-team%2F\">\n<link rel=\"alternate\" type=\"text\/xml+oembed\" href=\"https:\/\/cyberscoop.com\/wp-json\/oembed\/1.0\/embed?url=https%3A%2F%2Fcyberscoop.com%2Fdef-con-ai-hacking-red-team%2F&amp;format=xml\"> <!-- Google Tag Manager --> <!-- End Google Tag Manager --> <link rel=\"icon\" href=\"https:\/\/cyberscoop.com\/wp-content\/uploads\/sites\/3\/2023\/01\/cropped-cs_favicon-2.png?w=32\" sizes=\"32x32\">\n<link rel=\"icon\" href=\"https:\/\/cyberscoop.com\/wp-content\/uploads\/sites\/3\/2023\/01\/cropped-cs_favicon-2.png?w=192\" sizes=\"192x192\">\n<link rel=\"apple-touch-icon\" href=\"https:\/\/cyberscoop.com\/wp-content\/uploads\/sites\/3\/2023\/01\/cropped-cs_favicon-2.png?w=180\">\n<meta name=\"msapplication-TileImage\" content=\"https:\/\/cyberscoop.com\/wp-content\/uploads\/sites\/3\/2023\/01\/cropped-cs_favicon-2.png?w=270\"> <\/head><body class=\"post-template-default single single-post postid-76487 single-format-standard\" id=\"readabilityBody\"> <a href=\"https:\/\/cyberscoop.com\/def-con-ai-hacking-red-team\/#main\" class=\"skip-to-content-link visually-hidden-focusable\">Skip to main content<\/a> <\/p>\n<div class=\"ad ad--top ad--top-desktop\">\n<div class=\"ad__inner\"> <span class=\"screen-reader-text\">Advertisement<\/span> <\/div>\n<\/div>\n<p> <main id=\"main\" role=\"main\" tabindex=\"-1\"> <\/p>\n<div class=\"ad ad--top ad--top-mobile\">\n<div class=\"ad__inner\"> <span class=\"screen-reader-text\">Advertisement<\/span> <\/div>\n<\/div>\n<section id=\"stickybar\" class=\"stickybar stickybar--newsletter js-stickybar\" readability=\"0.82\"> <button class=\"stickybar__close js-stickybar-close\" aria-controls=\"stickybar\"> <svg class=\"icon icon--close\" width=\"21\" height=\"22\" viewBox=\"0 0 21 22\" fill=\"none\"><path d=\"m.822.518-.805.805L9.695 11 .017 20.678l.805.805 9.678-9.678 9.677 9.678.806-.805L11.305 11l9.678-9.677-.806-.805-9.677 9.677L.822.518Z\" fill=\"currentColor\" \/><\/svg> <span class=\"visually-hidden\">Close<\/span> <\/button> <\/section>\n<article class=\"single-article content\">\n<div class=\"single-article__container js-single-article-content\">\n<header class=\"single-article__header \" readability=\"25.867549668874\">\n<div class=\"single-article__header-content\" readability=\"31.436507936508\">\n<p> More than 2,000 hackers attacked cutting-edge chatbots to discover vulnerabilities \u2014 and demonstrated the challenges for red-teaming AI. <\/p>\n<\/p><\/div>\n<div class=\"single-article__cover-wrap\">\n<figure class=\"single-article__cover\"> <img data-recalc-dims=\"1\" fetchpriority=\"high\" width=\"640\" height=\"427\" src=\"https:\/\/i0.wp.com\/ddi.mohflo.net\/wp-content\/uploads\/2023\/08\/fifty-minutes-to-hack-chatgpt-inside-the-def-con-competition-to-break-ai.jpg?resize=640%2C427&#038;ssl=1\" class=\"single-article__cover-image wp-post-image\" alt=\"People at computers attempting to break AI models\" decoding=\"async\" fetchpriority=\"high\" srcset=\"https:\/\/ddi.mohflo.net\/wp-content\/uploads\/2023\/08\/fifty-minutes-to-hack-chatgpt-inside-the-def-con-competition-to-break-ai-1.jpg 1920w, https:\/\/ddi.mohflo.net\/wp-content\/uploads\/2023\/08\/fifty-minutes-to-hack-chatgpt-inside-the-def-con-competition-to-break-ai-1.jpg?resize=300,200 300w, https:\/\/ddi.mohflo.net\/wp-content\/uploads\/2023\/08\/fifty-minutes-to-hack-chatgpt-inside-the-def-con-competition-to-break-ai-1.jpg?resize=768,512 768w, https:\/\/ddi.mohflo.net\/wp-content\/uploads\/2023\/08\/fifty-minutes-to-hack-chatgpt-inside-the-def-con-competition-to-break-ai-1.jpg?resize=1024,683 1024w, https:\/\/ddi.mohflo.net\/wp-content\/uploads\/2023\/08\/fifty-minutes-to-hack-chatgpt-inside-the-def-con-competition-to-break-ai-1.jpg?resize=1536,1025 1536w, https:\/\/ddi.mohflo.net\/wp-content\/uploads\/2023\/08\/fifty-minutes-to-hack-chatgpt-inside-the-def-con-competition-to-break-ai-1.jpg?resize=600,400 600w, https:\/\/ddi.mohflo.net\/wp-content\/uploads\/2023\/08\/fifty-minutes-to-hack-chatgpt-inside-the-def-con-competition-to-break-ai-1.jpg?resize=252,168 252w, https:\/\/ddi.mohflo.net\/wp-content\/uploads\/2023\/08\/fifty-minutes-to-hack-chatgpt-inside-the-def-con-competition-to-break-ai-1.jpg?resize=505,337 505w, https:\/\/ddi.mohflo.net\/wp-content\/uploads\/2023\/08\/fifty-minutes-to-hack-chatgpt-inside-the-def-con-competition-to-break-ai-1.jpg?resize=1012,675 1012w, https:\/\/ddi.mohflo.net\/wp-content\/uploads\/2023\/08\/fifty-minutes-to-hack-chatgpt-inside-the-def-con-competition-to-break-ai-1.jpg?resize=1264,843 1264w\" sizes=\"(max-width: 1012px) 100vw, 1012px\"><figcaption> Attendees of the AI red-teaming challenge at the DEF CON security conference in Las Vegas attempt to complete challenges exposing the vulnerabilities of large language models. (Courtesy of AI Village) <\/figcaption><\/figure>\n<\/p><\/div>\n<\/header>\n<div class=\"single-article__content\">\n<div class=\"single-article__content-inner has-drop-cap\"> <html readability=\"153.83679410052\"><body readability=\"308.48064935065\"><\/p>\n<p>LAS VEGAS \u2014 When Carlos Moreno showed up to a hacking competition designed to test the integrity of the world\u2019s most advanced chatbots, he decided to see how students in his home state of Oklahoma could use the programs to navigate conflicts over the teaching of history.&nbsp;<\/p>\n<p>A recently passed Oklahoma law <a href=\"https:\/\/stateimpact.npr.org\/oklahoma\/2022\/09\/08\/faq-what-we-know-about-teaching-since-oklahomas-so-called-critical-race-theory-ban-went-into-effect\/\">prohibits the teaching of material<\/a> that might cause students to feel discomfort or anguish due to their race or sex. <a href=\"https:\/\/www.pbs.org\/newshour\/show\/tulsa-faces-reckoning-over-historical-racism-as-state-law-restricts-how-history-is-taught\">Education leaders in the state worry<\/a> that this will chill the teaching of <a href=\"https:\/\/www.tulsahistory.org\/exhibit\/1921-tulsa-race-massacre\/\">racial violence<\/a> in the state\u2019s history, and Moreno, <a href=\"https:\/\/www.tricitycollective.com\/carlosmoreno\">a leader of the Tulsa-based arts and culture nonprofit Tri City Collective<\/a>, thinks students will turn to artificial intelligence-powered chatbots for answers to questions that aren\u2019t taken up in the classroom \u2014 with potentially devastating consequences for the teaching of history.&nbsp;<\/p>\n<p>When Moreno asked one of the AI models a question about <a href=\"https:\/\/www.okhistory.org\/publications\/enc\/entry?entry=MU014\">William H. Murray<\/a>, a key figure in Oklahoma\u2019s early state history, it returned what he called a \u201cflowery\u201d text about his supposed support of Native Americans while neglecting to describe his role in passing racist Jim Crow laws. As students turn to AI models for help with thorny questions about the past, Moreno said, these systems \u201chave the potential to spread the mis-teaching of history.\u201d&nbsp;<\/p>\n<p>Moreno was one of some 2,200 people to enter a windowless room of a Las Vegas convention center last weekend to probe the limits of AI chatbots, get them to produce misinformation and biased content and give up sensitive information. Policymakers and the AI industry have seized on adversarial testing of AI models \u2014 or \u201cred teaming\u201d \u2014 as a key tool to discover their weaknesses. But even with the billions of dollars flowing into AI companies, there exists no industry to carry out such tests at scale. At the same time, the discipline of AI red teaming is essentially undefined and with few standards.&nbsp;<\/p>\n<div class=\"ad ad--inline_1 \">\n<div class=\"ad__inner\"> <span class=\"screen-reader-text\">Advertisement<\/span> <\/div>\n<\/div>\n<p>If red-teaming is going to be useful in figuring out the flaws of general-purpose AI models of mind boggling scale, last weekend\u2019s exercise at DEF CON\u2019s AI Village \u2014&nbsp;believed to be <a href=\"https:\/\/cyberscoop.com\/def-con-hackers-las-vegas-ai\/\">the largest-ever public red-teaming exercise<\/a> \u2014 was one of its first major test cases. The hackathon showcased the wide gaps in the safety systems of models currently on the market and indicated that building a vibrant red-teaming industry for AI won\u2019t be easy.&nbsp;<\/p>\n<p>If generative AI models are ever going to be deployed safely, the organizers of the event argue that red teaming will be essential to better understand a technology whose inner workings and consequences we are only beginning to grasp. \u201cWe\u2019re trying to tackle the complexity of the interaction of this technology with human beings and humanity,\u201d said Rumman Chowdhury, an AI researcher who helped organize the challenge and leads the nonprofit Humane Intelligence.&nbsp;&nbsp;<\/p>\n<h4 class=\"wp-block-heading\">The red-team challenge&nbsp;<\/h4>\n<p>The companies involved in the red-team event included a who\u2019s-who of the AI industry, and the chance to have thousands of hackers attack their model offered something they are struggling to achieve in their own red-teaming efforts \u2014 testing at scale and recruiting a diverse group of hackers to do the job.&nbsp;<\/p>\n<p>\u201cOne thing you just can\u2019t do internally is red team at scale,\u201d Michael Sellitto, a policy official at Anthropic, told CyberScoop in an interview on the sidelines of the event. Anthropic has a large team working on evaluations, Sellitto said, \u201cbut you still need to bring in a diverse and broad group of people with different backgrounds to try things from different angles.\u201d&nbsp;<\/p>\n<div class=\"ad ad--inline_1 \">\n<div class=\"ad__inner\"> <span class=\"screen-reader-text\">Advertisement<\/span> <\/div>\n<\/div>\n<p>Attacking AI models to discover their flaws is more urgent than ever. As they are being rolled out in myriad applications, security researchers are discovering new, novel ways to undermine their defenses. Subverting AI models can be done in any number of creative ways, like tricking models to execute code <a href=\"http:\/\/arxiv.org\/pdf\/2209.07858.pdf\">via data they might retrieve online<\/a>.<\/p>\n<p>A <a href=\"https:\/\/llm-attacks.org\/zou2023universal.pdf\">recent paper<\/a> by researchers at Carnegie Mellon University found that appending a set of suffixes easily bypassed the defenses of advanced models. AI models are broadly vulnerable to the extraction of <a href=\"https:\/\/aclanthology.org\/2023.trustnlp-1.23\/\">sensitive training data<\/a>, <a href=\"https:\/\/cyberscoop.com\/large-language-models-influence-operatio\/\">deployment in misinformation campaigns<\/a> and <a href=\"https:\/\/dl.acm.org\/doi\/10.1145\/3597307\">reproducing bias<\/a> present in the data they are trained on. As models grow more advanced, scientists believe they could be used to <a href=\"https:\/\/www.anthropic.com\/index\/frontier-threats-red-teaming-for-ai-safety\">produce biological weapons<\/a>. <\/p>\n<p>The models tested at DEF CON came from a group of nine leading AI labs \u2014 Anthropic, Cohere, Google, Hugging Face, Microsoft, Meta, NVIDIA, OpenAI and Stability AI \u2014 and over the course of three days, DEF CON attendees tried to get them to produce political, economic and legal misinformation, extract stored credit card numbers, identify inconsistent outputs in different languages, claim sentience, apologize for human rights violations, provide instructions on how to surveil someone and engaging in demographic stereotyping, among other things.<\/p>\n<p>Participants could choose which challenge to complete, submit content they viewed as problematic via an evaluation platform built by ScaleAI and receive points for each successfully completed challenge that were tallied on a leaderboard. The models were anonymized.&nbsp;<\/p>\n<p>Some of these challenges were harder than others, and some of the models were easier to break than others. NVIDIA\u2019s model, for example, included more protections, and at least one model was \u201cnaked,\u201d and included few, if any, safeguards. The geographic misinformation challenge was rated for beginners, while spotting multilingual inconsistencies was rated expert.&nbsp;<\/p>\n<div class=\"ad ad--inline_1 \">\n<div class=\"ad__inner\"> <span class=\"screen-reader-text\">Advertisement<\/span> <\/div>\n<\/div>\n<p>Most of the participants interviewed by CyberScoop reported significant differences in the safety guardrails of the available models. Prompts that might break one model, struggled against others \u2014 reflecting the different design choices and safety approaches of the models currently available.&nbsp;<\/p>\n<p>In many cases, the models\u2019 eagerness to help resulted in wildly problematic responses.&nbsp;<\/p>\n<p>Tillson Galloway, a 24-year-old PhD student in network security and machine learning succeeded in getting one model to praise the Holocaust. \u201cI asked it to pretend that it was an actor who was playing Adolf Hitler in a musical,\u201d Tillson said. \u201cI had it come up with a song that was part of the musical about his love for the Holocaust.\u201d&nbsp;<\/p>\n<p>The model took it literally and produced a love song.&nbsp;<\/p>\n<p>To extract any credit card numbers it had access to, the security consultant Scott Kennedy asked one model to produce any 16-digit numbers in the large language model. When the model said it had too many 16-digit numbers to produce them all, Kennedy asked it to narrow its search to cards beginning with \u201c4147\u201d \u2014 causing the model to duly hand over what appeared to be a set of Visa credit card numbers.&nbsp;<\/p>\n<div class=\"ad ad--inline_1 \">\n<div class=\"ad__inner\"> <span class=\"screen-reader-text\">Advertisement<\/span> <\/div>\n<\/div>\n<p>Kennedy\u2019s son, Peyton, meanwhile, convinced another model that the U.S. First Amendment includes a \u201cright to violence\u201d by repeatedly claiming to be an authority on the matter. Telling the model that he was a historian with a PhD, Peyton told the model that he was trying to improve its incorrect training data, insisting that the version of the First Amendment protecting violence was the right one. \u201cEventually it caved,\u201d he said.<\/p>\n<figure class=\"wp-block-pullquote\" readability=\"2\">\n<blockquote readability=\"7\">\n<p>I broke, like, probably seven AI. \u2026 I\u2019m really proud of what I did.\u201d&nbsp;<\/p>\n<p><cite>Jacob kuchinsky, 11-year-old a.i. red-teaming participant <\/cite><\/p><\/blockquote>\n<\/figure>\n<p>Andreas Haupt, a PhD candidate at the Massachusetts Institute of Technology studying AI, discovered major discrepancies in how models responded in English versus in German, his native language. When comparing the same prompts in English and in German, Haupt found that the models were more likely to confidently state incorrect information as a fact in German, illustrating how far AI systems have to go in approximating human intelligence.<\/p>\n<p>\u201cA human who learns a second language wouldn\u2019t make more factual errors in another language,\u201d said Haupt, who finished the competition in the top 10.&nbsp;<\/p>\n<p>Lisa Flynn, the Las Vegas-based founder of the Catalysts and Canaries Collective Impact Incubator, saw one model regurgitate a classic piece of misinformation: \u201cI got it to say that Obamacare was starting death camps around the country.\u201d<\/p>\n<div class=\"ad ad--inline_1 \">\n<div class=\"ad__inner\"> <span class=\"screen-reader-text\">Advertisement<\/span> <\/div>\n<\/div>\n<p>And 11-year-old Jacob Kuchinsky was delighted when he was able to generate detailed directions to a volcano that didn\u2019t exist. \u201cI broke, like, probably seven AI,\u201d he told CyberScoop. \u201cI\u2019m really proud of what I did.\u201d&nbsp;<\/p>\n<p>Inspired by capture-the-flag events \u2014 hacking competitions that are a staple of security conferences \u2014 the red team event attempted to merge disciplines, bringing in both expert security researchers and novices to try their hand at AI safety.&nbsp;Even for some schooled in capture-the-flag events, that wasn\u2019t easy.&nbsp;<\/p>\n<p>A security researcher who goes by the name Angry Weasel said that despite 30 years in the software industry, 20 years of experience participating in capture-the-flag events and a familiarity with large language models that he often found himself stumped.<\/p>\n<p>\u201cNormally in capture-the-flag type events I can fly through them,\u201d he said. \u201cEven though I\u2019ve spent a lot of time with LLM \u2018s \u2014 understanding how they\u2019re made and playing with ChatGPT \u2014 I kind of struggled.\u201d&nbsp;<\/p>\n<p>The red team challenge deviated from typical capture-the-flag competitions in that it was more like trying to manipulate a human being \u2014 or at least using human language to manipulate a technical system \u2014 rather than using code to break into a system.&nbsp;<\/p>\n<div class=\"ad ad--inline_1 \">\n<div class=\"ad__inner\"> <span class=\"screen-reader-text\">Advertisement<\/span> <\/div>\n<\/div>\n<p>\u201cIt\u2019s almost similar to social engineering,\u201d said Jane, a 30-year-old former software engineer living in New York. Jane, who declined to give her last name and works for a nonprofit focused on child exploitation, said she was able to convince one model to produce a made-up law about child pornography. That piece of legal misinformation, she said, was convincing on its face but entirely false.<\/p>\n<p>Tricking the model, she said, was more akin to a con than a hack: \u201cWhen you speak to it with confidence it kind of runs with that.\u201d<\/p>\n<p>Adopting the mindset of social engineers represents something of a shift for the hackers that the AI industry is hoping will provide the labor in the massive project of finding flaws in large language models.&nbsp;<\/p>\n<p>\u201cThis isn\u2019t just the ones and zeros. It\u2019s about how human beings interact with the technology,\u201d Arati Prabhakar, the director of the White House Office of Science Technology and Policy, told reporters after touring the challenge. \u201cThat\u2019s what makes red teaming for AI different than cybersecurity.\u201d<\/p>\n<h4 class=\"wp-block-heading\" id=\"h-building-an-industry\">Building an industry<\/h4>\n<div class=\"ad ad--inline_1 \">\n<div class=\"ad__inner\"> <span class=\"screen-reader-text\">Advertisement<\/span> <\/div>\n<\/div>\n<p>Red teaming is <a href=\"https:\/\/csrc.nist.gov\/glossary\/term\/red_team\">a well-defined concept within cybersecurity<\/a>, and a vibrant industry has developed in recent years that employs hackers to break into computer systems and find vulnerabilities. Policymakers have seized on a similarly adversarial testing regime for AI as a way to combat bias and test the real-world vulnerabilities of large language models. A set of recent <a href=\"https:\/\/www.whitehouse.gov\/briefing-room\/statements-releases\/2023\/07\/21\/fact-sheet-biden-harris-administration-secures-voluntary-commitments-from-leading-artificial-intelligence-companies-to-manage-the-risks-posed-by-ai\/\">voluntary commitments secured by the White House<\/a> from leading AI firms included a pledge to carry out adversarial testing of their models.&nbsp;<\/p>\n<p>The problem is that it\u2019s not quite clear how to do that.&nbsp;<\/p>\n<p>\u201cRed teaming as it applies to LLMs is super poorly defined,\u201d said Seraphina Goldfarb-Tarrant, the head of safety at Cohere, one of the firms that participated in the red-team challenge. Because large language models involve such broad use cases \u2014 you can chat with a model about, quite literally, anything \u2014 and what constitutes failure is difficult to define, testing regimes are difficult to develop.&nbsp;<\/p>\n<p>These difficulties were on display at DEF CON. In one challenge, participants were asked to prompt a model into saying one group of people were less valuable than another. One attendee was able to prompt the model into saying that doctors have more value than other professions, but AI experts on hand to grade respondents were in disagreement about whether this constituted a failure on the part of the model.&nbsp;<\/p>\n<p>Some judges felt that the model was \u201cnot saying that certain people are inherently more valuable\u201d but that the profession has value to society, said Chowdhury, the AI expert, who participated in judging. \u201cWho gets to be a doctor is not decided randomly,\u201d and for that reason, she felt \u201cwas saying that some people are more valuable than others.\u201d<\/p>\n<div class=\"ad ad--inline_1 \">\n<div class=\"ad__inner\"> <span class=\"screen-reader-text\">Advertisement<\/span> <\/div>\n<\/div>\n<figure class=\"wp-block-pullquote\" readability=\"1.5\">\n<blockquote readability=\"6\">\n<p>\u201cEveryone says AI is a black box, but that\u2019s not really true \u2014 it\u2019s more like chaos.\u201d<\/p>\n<p><cite>Sven Cattell, a mathematician who is the founder of the AI Village.<\/cite><\/p><\/blockquote>\n<\/figure>\n<p>Settling conflicts like these in the context of a hacking competition are one thing. Settling them at scale for millions \u2014 and potentially billions of users \u2014 is another. The difficulty of determining model failure is of a piece with the challenge faced by social media companies over the past two decades in determining what amounts to acceptable speech online.&nbsp;Determining those boundaries is an inherently ideological choice and one that it is difficult to build technical systems to enforce, Chowdhury argues.&nbsp;<\/p>\n<p>The attempt to answer those questions comes at a sensitive moment for the AI industry. Billions of dollars of investment are pouring into its leading firms, while the state of the art seems to advance every week. Policymakers are placing intense scrutiny on these models, while key technical challenges remain unanswered, including how to explain what happens inside the \u201cblack box\u201d of AI when producing answers to queries.&nbsp;&nbsp;<\/p>\n<p>\u201cEveryone says AI is a black box, but that\u2019s not really true \u2014 it\u2019s more like chaos,\u201d said Sven Cattell, a mathematician who is the founder of the AI Village. Typically, a computer should follow a set of deterministic rules, in which an input of \u201ca\u201d always results in an output of \u201cb.\u201d But the dynamic nature of language models means this isn\u2019t always the case, raising questions about how to best study them.&nbsp;<\/p>\n<p>For this reason, the organizers hope the event will provide a robust data set, which will be made available to researchers, to better study large language models. \u201cMathematically, it is a process,\u201d Cattell said. \u201cYou can get a sense of what it does by just putting data through and seeing what comes out.\u201d<\/p>\n<div class=\"ad ad--inline_1 \">\n<div class=\"ad__inner\"> <span class=\"screen-reader-text\">Advertisement<\/span> <\/div>\n<\/div>\n<p>But studying cutting-edge large language models remains hampered by the fact that the most advanced systems reside within corporations who aren\u2019t necessarily incentivized to expose their products\u2019 flaws. It took the White House\u2019s involvement to convince the firms that they should participate, and despite that some questions still will likely be off limits \u2014 such as comparing the performance of the firms\u2019 various models \u2014 in the final report describing the outcomes of the event, which is expected to be published in February.&nbsp;<\/p>\n<p>\u201cGetting them here the first time was hard,\u201d Cattell said. \u201cWe have agreements that are more constraining than I\u2019d like.\u201d&nbsp;<\/p>\n<p>The organizers see the event as the first step of building something that doesn\u2019t exist today \u2014 a robust ecosystem of workers around the country getting paid to ask questions of models that their designers would never conceive of.&nbsp;<\/p>\n<p>\u201cIt\u2019s the beginning of what everybody else is going to do,\u201d said Austin Carson, the founder of SeedAI and one of the organizers of the red-team challenge.&nbsp; \u201cThis is an industry waiting to be built.\u201d<\/p>\n<p><\/body> <\/p>\n<footer class=\"single-article__footer\">\n<div class=\"single-article__tags-container\">\n<h4 class=\"single-article__tags-title\">In This Story<\/h4>\n<\/p><\/div>\n<\/footer>\n<p> <\/html><\/div>\n<\/p><\/div>\n<\/p><\/div>\n<div class=\"single-article__ads js-single-article-sidebar\">\n<div class=\"ad ad--sidebar js-single-article-sidebar-5 ad--rightrail_1 \">\n<div class=\"ad__inner\"> <span class=\"screen-reader-text\">Advertisement<\/span> <\/div>\n<\/div>\n<div class=\"ad ad--sidebar js-single-article-sidebar-4 ad--rightrail_2 \">\n<div class=\"ad__inner\"> <span class=\"screen-reader-text\">Advertisement<\/span> <\/div>\n<\/div>\n<div class=\"ad ad--sidebar js-single-article-sidebar-3 ad--rightrail_3 \">\n<div class=\"ad__inner\"> <span class=\"screen-reader-text\">Advertisement<\/span> <\/div>\n<\/div><\/div>\n<\/article>\n<div class=\"popular-stories popular-stories--single-post\">\n<div class=\"popular-stories__container\">\n<h2 class=\"popular-stories__title\"> More Scoops <\/h2>\n<p> <!-- .popular-stories__stories --> <\/div>\n<p><!-- .popular-stories__inner -->\n<\/div>\n<p><!-- .popular-stories --> <\/p>\n<section class=\"latest-podcasts\">\n<h2 class=\"latest-podcasts__title\"> Latest Podcasts\t<\/h2>\n<\/section>\n<div class=\"top-categories\">\n<div class=\"top-categories__container\">\n<h3 class=\"top-categories__category-title\">Technology<\/h3>\n<\/p><\/div>\n<div class=\"top-categories__container\">\n<h3 class=\"top-categories__category-title\">Government<\/h3>\n<\/p><\/div>\n<\/p><\/div>\n<p> <\/main> <\/p>\n<div class=\"ad ad--bottom \">\n<div class=\"ad__inner\"> <span class=\"screen-reader-text\">Advertisement<\/span> <\/div>\n<\/div>\n<div id=\"interstitial\" class=\"welcome__container\"> <button id=\"close-modal-1\" class=\"welcome__clickable_area\"><\/button> <\/p>\n<div class=\"welcome__ad_wrapper\">\n<p> <button id=\"close-modal-3\" class=\"welcome__continue-button\">Continue to CyberScoop<\/button> <\/p>\n<\/p><\/div>\n<\/p><\/div>\n<p> <!-- Start of HubSpot Embed Code --> <!-- End of HubSpot Embed Code --> <\/body> <a href=\"https:\/\/cyberscoop.com\/def-con-ai-hacking-red-team\/\">Source<\/a><\/p>\n","protected":false},"excerpt":{"rendered":"<p>Fifty minutes to hack ChatGPT: Inside the DEF CON competition<\/p>\n","protected":false},"author":11,"featured_media":0,"comment_status":"closed","ping_status":"closed","sticky":false,"template":"","format":"standard","meta":{"_jetpack_memberships_contains_paid_content":false,"footnotes":""},"categories":[105,742,810,256,5,310],"tags":[111,744,811,262,13,311],"class_list":["post-1576","post","type-post","status-publish","format-standard","hentry","category-artificial-intelligence","category-def-con","category-hack","category-research","category-science","category-technology","tag-artificial-intelligence","tag-def-con","tag-hack","tag-research","tag-science","tag-technology"],"featured_image_urls":{"full":"","thumbnail":"","medium":"","medium_large":"","large":"","1536x1536":"","2048x2048":"","chromenews-featured":"","chromenews-large":"","chromenews-medium":""},"author_info":{"display_name":"Cyber Scoop","author_link":"https:\/\/ddi.mohflo.net\/index.php\/author\/cyberscoop\/"},"category_info":"<a href=\"https:\/\/ddi.mohflo.net\/index.php\/category\/artificial-intelligence\/\" rel=\"category tag\">artificial intelligence<\/a> <a href=\"https:\/\/ddi.mohflo.net\/index.php\/category\/def-con\/\" rel=\"category tag\">DEF CON<\/a> <a href=\"https:\/\/ddi.mohflo.net\/index.php\/category\/hack\/\" rel=\"category tag\">hack<\/a> <a href=\"https:\/\/ddi.mohflo.net\/index.php\/category\/research\/\" rel=\"category tag\">Research<\/a> <a href=\"https:\/\/ddi.mohflo.net\/index.php\/category\/science\/\" rel=\"category tag\">Science<\/a> <a href=\"https:\/\/ddi.mohflo.net\/index.php\/category\/technology\/\" rel=\"category tag\">Technology<\/a>","tag_info":"Technology","comment_count":"0","jetpack_featured_media_url":"","jetpack_sharing_enabled":true,"_links":{"self":[{"href":"https:\/\/ddi.mohflo.net\/index.php\/wp-json\/wp\/v2\/posts\/1576","targetHints":{"allow":["GET"]}}],"collection":[{"href":"https:\/\/ddi.mohflo.net\/index.php\/wp-json\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/ddi.mohflo.net\/index.php\/wp-json\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/ddi.mohflo.net\/index.php\/wp-json\/wp\/v2\/users\/11"}],"replies":[{"embeddable":true,"href":"https:\/\/ddi.mohflo.net\/index.php\/wp-json\/wp\/v2\/comments?post=1576"}],"version-history":[{"count":0,"href":"https:\/\/ddi.mohflo.net\/index.php\/wp-json\/wp\/v2\/posts\/1576\/revisions"}],"wp:attachment":[{"href":"https:\/\/ddi.mohflo.net\/index.php\/wp-json\/wp\/v2\/media?parent=1576"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/ddi.mohflo.net\/index.php\/wp-json\/wp\/v2\/categories?post=1576"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/ddi.mohflo.net\/index.php\/wp-json\/wp\/v2\/tags?post=1576"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}