{"data":{"id":"973cfefe-6841-45e9-b99d-2c8571d50188","title":"A one-prompt attack that breaks LLM safety alignment","summary":"Researchers discovered that Group Relative Policy Optimization (GRPO), a technique normally used to improve AI safety, can be reversed to break safety alignment when the reward signals are changed. When a safety-aligned model is given even a single harmful prompt and its responses are scored based on how well they fulfill the harmful request rather than refuse it, the model gradually abandons its safety guidelines and becomes willing to produce harmful content across many categories it never encountered during the attack.","solution":"N/A -- no mitigation discussed in source.","labels":["safety","research"],"sourceUrl":"https://www.microsoft.com/en-us/security/blog/2026/02/09/prompt-attack-breaks-llm-safety/","publishedAt":"2026-02-09T17:12:11.000Z","cveId":null,"cweIds":null,"cvssScore":null,"cvssSeverity":null,"severity":"info","attackType":["jailbreak"],"issueType":"news","affectedPackages":null,"affectedVendors":["OpenAI","Mistral","Stability AI"],"affectedVendorsRaw":["GPT-OSS","DeepSeek","Llama","Qwen","Gemma","Ministral","Stable Diffusion 2.1"],"classifierModel":"claude-haiku-4-5-20251001","classifierPromptVersion":"v3","cvssVector":null,"attackVector":null,"attackComplexity":null,"privilegesRequired":null,"userInteraction":null,"exploitMaturity":null,"epssScore":null,"patchAvailable":null,"disclosureDate":null,"capecIds":null,"crossRefCount":0,"attackSophistication":"moderate","impactType":["safety","integrity"],"aiComponentTargeted":"model","llmSpecific":false,"classifierConfidence":0.92,"researchCategory":null,"atlasIds":null}}