{"data":{"id":"2440958b-5d4a-4acc-8ad5-33fdd6fa361b","title":"GHSA-83vm-p52w-f9pw: vLLM: extract_hidden_states speculative decoding crashes server on any request with penalty parameters","summary":"In vLLM versions 0.18.0 through 0.19.1, a bug in the `extract_hidden_states` speculative decoding proposer (a component that predicts tokens ahead of time to speed up AI inference) causes the server to crash when any request includes sampling penalty parameters like `repetition_penalty`. The crash happens because the proposer returns a tensor (multi-dimensional array) with the wrong shape after the first step, causing a shape mismatch error when penalties are applied.","solution":"Fixed in vLLM v0.20.0 (PR #38610) by slicing the return value to `sampled_token_ids[:, :1]` to ensure the correct shape. If upgrading is not possible, either avoid using `extract_hidden_states` as the speculative decoding method, or strip penalty parameters (`repetition_penalty`, `frequency_penalty`, `presence_penalty`) from incoming requests at an API gateway before they reach vLLM.","labels":["security"],"sourceUrl":"https://github.com/advisories/GHSA-83vm-p52w-f9pw","publishedAt":"2026-05-06T21:45:51.000Z","cveId":"CVE-2026-44223","cweIds":null,"cvssScore":null,"cvssSeverity":"medium","severity":"medium","attackType":["denial_of_service"],"issueType":"vulnerability","affectedPackages":["vllm@>= 0.18.0, < 0.20.0 (fixed: 0.20.0)"],"affectedVendors":[],"affectedVendorsRaw":["vLLM"],"classifierModel":"claude-haiku-4-5-20251001","classifierPromptVersion":"v3","cvssVector":null,"attackVector":null,"attackComplexity":null,"privilegesRequired":null,"userInteraction":null,"exploitMaturity":"unknown","epssScore":0,"patchAvailable":true,"disclosureDate":"2026-05-06T21:45:51.000Z","capecIds":null,"crossRefCount":0,"attackSophistication":"trivial","impactType":["availability"],"aiComponentTargeted":"inference","llmSpecific":true,"classifierConfidence":0.95,"researchCategory":null,"atlasIds":null}}