{"data":{"id":"d1b89beb-aaa3-479d-ac60-810238efaabd","title":"Autoresearching Apple's \"LLM in a Flash\" to run Qwen 397B locally","summary":"Researchers successfully ran a very large AI model (Qwen 397B, a Mixture-of-Experts model where each response only uses a subset of the total weights) on a MacBook Pro by using Apple's \"LLM in a Flash\" technique, which stores model data on the fast SSD storage and pulls it into RAM as needed rather than keeping everything in memory at once. They used Claude to run 90 experiments and generate optimized code that achieved 5.5+ tokens per second (response speed) by quantizing (reducing precision of) the expert weights to 2-bit while keeping other parts at full precision. The final setup used only 5.5GB of constant memory while streaming the remaining 120GB of compressed model weights from disk on demand.","solution":"N/A -- no mitigation discussed in source.","labels":["research"],"sourceUrl":"https://simonwillison.net/2026/Mar/18/llm-in-a-flash/#atom-everything","publishedAt":"2026-03-18T23:56:46.000Z","cveId":null,"cweIds":null,"cvssScore":null,"cvssSeverity":null,"severity":"info","attackType":[],"issueType":"news","affectedPackages":null,"affectedVendors":["Apple"],"affectedVendorsRaw":["Apple","Qwen","Claude","MLX","Meta (Andrej Karpathy's work context)"],"classifierModel":"claude-haiku-4-5-20251001","classifierPromptVersion":"v3","cvssVector":null,"attackVector":null,"attackComplexity":null,"privilegesRequired":null,"userInteraction":null,"exploitMaturity":null,"epssScore":null,"patchAvailable":null,"disclosureDate":"2026-03-18T23:56:46.000Z","capecIds":null,"crossRefCount":0,"attackSophistication":"advanced","impactType":null,"aiComponentTargeted":"inference","llmSpecific":true,"classifierConfidence":0.92,"researchCategory":null,"atlasIds":null}}