{
  "title": "LLM Model Comparison for Enterprise Use Cases (2026)",
  "version": "Q1 2026 v2",
  "lastUpdated": "2026-02-18",
  "license": "CC BY 4.0 (https://creativecommons.org/licenses/by/4.0/)",
  "publisher": "Salt Technologies AI (https://www.salttechno.ai)",
  "methodology": "This dataset combines three categories of data. (1) Specifications and pricing: sourced directly from official provider documentation and API pricing pages as of February 2026. Pricing reflects pay-as-you-go API rates in USD; volume discounts, committed-use pricing, and prompt caching discounts are excluded. Open-source model pricing reflects median costs across major inference providers (Together AI, Groq, Fireworks AI, DeepInfra). (2) Benchmark scores: MMLU, HumanEval, MATH, and MT-Bench scores are taken from the original model papers, provider-published technical reports, or verified third-party evaluations (LMSYS Chatbot Arena, Stanford HELM, Artificial Analysis). Where multiple evaluations exist, we use the official provider-reported score. Null values indicate that the provider has not published a verified score for that benchmark. Note: as the industry transitions to newer evaluation suites (MMLU Pro, SWE-bench, GPQA), traditional benchmark comparisons across model generations may reflect different evaluation conditions. (3) Latency and throughput: time-to-first-token (TTFT) and throughput (tokens per second) are measured using standardized prompts (500-token input, 200-token output) against each provider's production API endpoint from US-East regions. Measurements represent the median of 100 sequential requests during off-peak hours. Self-hosted and inference-provider latency varies by hardware and provider; ranges shown reflect typical deployments on H100/H200 GPUs. API feature flags (function calling, JSON mode, streaming, fine-tuning) reflect documented GA features as of the dataset date.",
  "schema": {
    "model": {
      "type": "string",
      "description": "Official model name as listed by the provider"
    },
    "provider": {
      "type": "string",
      "description": "Company that develops and/or hosts the model"
    },
    "parametersBillions": {
      "type": "string",
      "description": "Model parameter count in billions; \"Undisclosed\" for closed-source"
    },
    "contextWindow": {
      "type": "string",
      "description": "Maximum supported input context in tokens"
    },
    "trainingCutoff": {
      "type": "string",
      "description": "Date of training data cutoff"
    },
    "inputCostPer1M": {
      "type": "number",
      "unit": "USD",
      "description": "Cost per 1M input tokens (pay-as-you-go)"
    },
    "outputCostPer1M": {
      "type": "number",
      "unit": "USD",
      "description": "Cost per 1M output tokens (pay-as-you-go)"
    },
    "pricingNote": {
      "type": "string",
      "description": "Additional pricing context"
    },
    "openSource": {
      "type": "boolean",
      "description": "Whether model weights are publicly available"
    },
    "multimodal": {
      "type": "boolean",
      "description": "Whether the model accepts image/audio/video input"
    },
    "functionCalling": {
      "type": "boolean",
      "description": "Whether structured function/tool calling is supported"
    },
    "jsonMode": {
      "type": "boolean",
      "description": "Whether guaranteed JSON output formatting is supported"
    },
    "streaming": {
      "type": "boolean",
      "description": "Whether streaming token-by-token responses are supported"
    },
    "fineTuning": {
      "type": "boolean",
      "description": "Whether a fine-tuning API is available for this model"
    },
    "enterpriseReady": {
      "type": "boolean",
      "description": "Whether the provider offers enterprise SLAs, SOC2, BAAs"
    },
    "mmluScore": {
      "type": "number | null",
      "range": "0-100",
      "description": "MMLU benchmark (broad knowledge and reasoning)"
    },
    "humanEvalScore": {
      "type": "number | null",
      "range": "0-100",
      "description": "HumanEval benchmark (code generation accuracy)"
    },
    "mathScore": {
      "type": "number | null",
      "range": "0-100",
      "description": "MATH benchmark (mathematical problem solving)"
    },
    "mtBenchScore": {
      "type": "number | null",
      "range": "0-10",
      "description": "MT-Bench (multi-turn instruction following)"
    },
    "latencyTTFTMs": {
      "type": "string",
      "description": "Time to first token (median of 100 requests from US-East)"
    },
    "throughputTPS": {
      "type": "string",
      "description": "Output throughput in tokens per second"
    },
    "bestFor": {
      "type": "string",
      "description": "Recommended enterprise use cases"
    }
  },
  "recordCount": 16,
  "records": [
    {
      "model": "GPT-4.1",
      "provider": "OpenAI",
      "parametersBillions": "Undisclosed",
      "contextWindow": "1M",
      "trainingCutoff": "Jun 2024",
      "inputCostPer1M": 2,
      "outputCostPer1M": 8,
      "pricingNote": "Pay-as-you-go API; prompt caching at $0.50/1M input",
      "openSource": false,
      "multimodal": true,
      "functionCalling": true,
      "jsonMode": true,
      "streaming": true,
      "fineTuning": true,
      "enterpriseReady": true,
      "mmluScore": 86.5,
      "humanEvalScore": 90.2,
      "mathScore": 80.4,
      "mtBenchScore": 9.2,
      "latencyTTFTMs": "~400ms",
      "throughputTPS": "80-190",
      "bestFor": "General-purpose enterprise AI, long-context tasks, tool use, code generation"
    },
    {
      "model": "GPT-4.1 mini",
      "provider": "OpenAI",
      "parametersBillions": "Undisclosed",
      "contextWindow": "1M",
      "trainingCutoff": "Jun 2024",
      "inputCostPer1M": 0.4,
      "outputCostPer1M": 1.6,
      "pricingNote": "Pay-as-you-go API; prompt caching at $0.10/1M input",
      "openSource": false,
      "multimodal": true,
      "functionCalling": true,
      "jsonMode": true,
      "streaming": true,
      "fineTuning": true,
      "enterpriseReady": true,
      "mmluScore": 83.5,
      "humanEvalScore": 87.5,
      "mathScore": 72,
      "mtBenchScore": 8.8,
      "latencyTTFTMs": "~200ms",
      "throughputTPS": "120-180",
      "bestFor": "High-volume chatbots, classification, summarization, cost-sensitive production workloads"
    },
    {
      "model": "o4-mini",
      "provider": "OpenAI",
      "parametersBillions": "Undisclosed",
      "contextWindow": "200K",
      "trainingCutoff": "Jun 2024",
      "inputCostPer1M": 1.1,
      "outputCostPer1M": 4.4,
      "pricingNote": "Reasoning model with extended thinking; cached input at $0.275/1M",
      "openSource": false,
      "multimodal": true,
      "functionCalling": true,
      "jsonMode": true,
      "streaming": true,
      "fineTuning": false,
      "enterpriseReady": true,
      "mmluScore": 83.2,
      "humanEvalScore": 93.4,
      "mathScore": 96.7,
      "mtBenchScore": null,
      "latencyTTFTMs": "~2-10s",
      "throughputTPS": "30-60",
      "bestFor": "Complex reasoning, math, coding, visual tasks, cost-efficient reasoning workloads"
    },
    {
      "model": "o3",
      "provider": "OpenAI",
      "parametersBillions": "Undisclosed",
      "contextWindow": "200K",
      "trainingCutoff": "Jun 2024",
      "inputCostPer1M": 2,
      "outputCostPer1M": 8,
      "pricingNote": "Most powerful reasoning model; 80% price reduction since launch; cached input at $0.50/1M",
      "openSource": false,
      "multimodal": true,
      "functionCalling": true,
      "jsonMode": true,
      "streaming": true,
      "fineTuning": false,
      "enterpriseReady": true,
      "mmluScore": 87.5,
      "humanEvalScore": 95.2,
      "mathScore": 96.7,
      "mtBenchScore": null,
      "latencyTTFTMs": "~3-15s",
      "throughputTPS": "20-50",
      "bestFor": "Hardest reasoning tasks, agentic workflows, science, mission-critical accuracy"
    },
    {
      "model": "Claude Sonnet 4.5",
      "provider": "Anthropic",
      "parametersBillions": "Undisclosed",
      "contextWindow": "200K",
      "trainingCutoff": "Apr 2025",
      "inputCostPer1M": 3,
      "outputCostPer1M": 15,
      "pricingNote": "Pay-as-you-go API; prompt caching available; 66% cheaper than previous gen",
      "openSource": false,
      "multimodal": true,
      "functionCalling": true,
      "jsonMode": true,
      "streaming": true,
      "fineTuning": false,
      "enterpriseReady": true,
      "mmluScore": 89,
      "humanEvalScore": 93,
      "mathScore": 78.5,
      "mtBenchScore": 9.2,
      "latencyTTFTMs": "~400ms",
      "throughputTPS": "70-90",
      "bestFor": "Complex reasoning, long-document analysis, code review, nuanced conversation"
    },
    {
      "model": "Claude Haiku 4.5",
      "provider": "Anthropic",
      "parametersBillions": "Undisclosed",
      "contextWindow": "200K",
      "trainingCutoff": "Apr 2025",
      "inputCostPer1M": 1,
      "outputCostPer1M": 5,
      "pricingNote": "Pay-as-you-go API; prompt caching available; extended thinking supported",
      "openSource": false,
      "multimodal": true,
      "functionCalling": true,
      "jsonMode": true,
      "streaming": true,
      "fineTuning": false,
      "enterpriseReady": true,
      "mmluScore": 80,
      "humanEvalScore": 89.5,
      "mathScore": 72,
      "mtBenchScore": 8.6,
      "latencyTTFTMs": "~200ms",
      "throughputTPS": "120-150",
      "bestFor": "Fast customer support, multi-agent systems, real-time classification, high-throughput tasks"
    },
    {
      "model": "Claude Opus 4.5",
      "provider": "Anthropic",
      "parametersBillions": "Undisclosed",
      "contextWindow": "200K",
      "trainingCutoff": "Apr 2025",
      "inputCostPer1M": 5,
      "outputCostPer1M": 25,
      "pricingNote": "Pay-as-you-go API; highest-capability Anthropic model",
      "openSource": false,
      "multimodal": true,
      "functionCalling": true,
      "jsonMode": true,
      "streaming": true,
      "fineTuning": false,
      "enterpriseReady": true,
      "mmluScore": 89.5,
      "humanEvalScore": 91,
      "mathScore": 76,
      "mtBenchScore": 9.3,
      "latencyTTFTMs": "~600ms",
      "throughputTPS": "40-60",
      "bestFor": "Mission-critical accuracy, nuanced analysis, complex writing, regulated industries"
    },
    {
      "model": "Gemini 2.5 Pro",
      "provider": "Google",
      "parametersBillions": "Undisclosed",
      "contextWindow": "1M",
      "trainingCutoff": "Jan 2025",
      "inputCostPer1M": 1.25,
      "outputCostPer1M": 10,
      "pricingNote": "Pay-as-you-go API; tiered pricing above 200K context ($2.50/$15.00)",
      "openSource": false,
      "multimodal": true,
      "functionCalling": true,
      "jsonMode": true,
      "streaming": true,
      "fineTuning": true,
      "enterpriseReady": true,
      "mmluScore": 87.2,
      "humanEvalScore": 84,
      "mathScore": 78,
      "mtBenchScore": 9,
      "latencyTTFTMs": "~500ms",
      "throughputTPS": "60-80",
      "bestFor": "Long-context RAG, document processing, video/audio analysis, agentic applications"
    },
    {
      "model": "Gemini 2.5 Flash",
      "provider": "Google",
      "parametersBillions": "Undisclosed",
      "contextWindow": "1M",
      "trainingCutoff": "Jan 2025",
      "inputCostPer1M": 0.3,
      "outputCostPer1M": 2.5,
      "pricingNote": "Pay-as-you-go API; free tier available; hybrid reasoning with thinking budgets",
      "openSource": false,
      "multimodal": true,
      "functionCalling": true,
      "jsonMode": true,
      "streaming": true,
      "fineTuning": false,
      "enterpriseReady": true,
      "mmluScore": 83.6,
      "humanEvalScore": 82,
      "mathScore": 73.1,
      "mtBenchScore": 8.6,
      "latencyTTFTMs": "~150ms",
      "throughputTPS": "150-200",
      "bestFor": "Cost-efficient production workloads, large context tasks, multimodal processing"
    },
    {
      "model": "Llama 4 Scout",
      "provider": "Meta",
      "parametersBillions": "17B active (16 experts)",
      "contextWindow": "10M",
      "trainingCutoff": "Dec 2024",
      "inputCostPer1M": 0.11,
      "outputCostPer1M": 0.34,
      "pricingNote": "Open source; pricing via Groq/DeepInfra. Fits on a single H100 GPU",
      "openSource": true,
      "multimodal": true,
      "functionCalling": true,
      "jsonMode": true,
      "streaming": true,
      "fineTuning": true,
      "enterpriseReady": false,
      "mmluScore": 79.6,
      "humanEvalScore": 82,
      "mathScore": 70.5,
      "mtBenchScore": 8.3,
      "latencyTTFTMs": "~200-600ms",
      "throughputTPS": "100-600",
      "bestFor": "Massive context (10M tokens), multimodal, on-premises deployment, cost optimization"
    },
    {
      "model": "Llama 4 Maverick",
      "provider": "Meta",
      "parametersBillions": "17B active (128 experts)",
      "contextWindow": "10M",
      "trainingCutoff": "Dec 2024",
      "inputCostPer1M": 0.2,
      "outputCostPer1M": 0.6,
      "pricingNote": "Open source; pricing via Groq/DeepInfra/Together AI",
      "openSource": true,
      "multimodal": true,
      "functionCalling": true,
      "jsonMode": true,
      "streaming": true,
      "fineTuning": true,
      "enterpriseReady": false,
      "mmluScore": 85.5,
      "humanEvalScore": 88,
      "mathScore": 78.5,
      "mtBenchScore": 8.7,
      "latencyTTFTMs": "~300-1000ms",
      "throughputTPS": "50-560",
      "bestFor": "Best open-source all-around performance, data sovereignty, custom fine-tuning"
    },
    {
      "model": "DeepSeek V3",
      "provider": "DeepSeek",
      "parametersBillions": "671 (37B active)",
      "contextWindow": "128K",
      "trainingCutoff": "Dec 2024",
      "inputCostPer1M": 0.25,
      "outputCostPer1M": 1.1,
      "pricingNote": "Open source (MIT); pricing via DeepSeek API and inference providers",
      "openSource": true,
      "multimodal": false,
      "functionCalling": true,
      "jsonMode": true,
      "streaming": true,
      "fineTuning": true,
      "enterpriseReady": false,
      "mmluScore": 88.5,
      "humanEvalScore": 82.6,
      "mathScore": 90.2,
      "mtBenchScore": 8.8,
      "latencyTTFTMs": "~300-1000ms",
      "throughputTPS": "50-100",
      "bestFor": "Cost-efficient reasoning, math-heavy tasks, code generation, open-source GPT-4 alternative"
    },
    {
      "model": "DeepSeek R1",
      "provider": "DeepSeek",
      "parametersBillions": "671 (37B active)",
      "contextWindow": "128K",
      "trainingCutoff": "Dec 2024",
      "inputCostPer1M": 0.55,
      "outputCostPer1M": 2.19,
      "pricingNote": "Open source (MIT); reasoning model with chain-of-thought",
      "openSource": true,
      "multimodal": false,
      "functionCalling": false,
      "jsonMode": false,
      "streaming": true,
      "fineTuning": false,
      "enterpriseReady": false,
      "mmluScore": 90.8,
      "humanEvalScore": 85.3,
      "mathScore": 97.3,
      "mtBenchScore": null,
      "latencyTTFTMs": "~2-15s",
      "throughputTPS": "20-50",
      "bestFor": "Advanced reasoning, mathematical proofs, scientific analysis, research tasks"
    },
    {
      "model": "Mistral Large 3",
      "provider": "Mistral AI",
      "parametersBillions": "675 (41B active)",
      "contextWindow": "256K",
      "trainingCutoff": "Jun 2025",
      "inputCostPer1M": 0.5,
      "outputCostPer1M": 1.5,
      "pricingNote": "Open source (Apache 2.0); EU-hosted option; MoE architecture",
      "openSource": true,
      "multimodal": true,
      "functionCalling": true,
      "jsonMode": true,
      "streaming": true,
      "fineTuning": true,
      "enterpriseReady": true,
      "mmluScore": 85.5,
      "humanEvalScore": 90.2,
      "mathScore": 83.5,
      "mtBenchScore": 8.5,
      "latencyTTFTMs": "~350ms",
      "throughputTPS": "60-80",
      "bestFor": "European data residency, multilingual enterprise, coding, open-source frontier model"
    },
    {
      "model": "Mistral Small 3.2",
      "provider": "Mistral AI",
      "parametersBillions": "24",
      "contextWindow": "128K",
      "trainingCutoff": "Mar 2025",
      "inputCostPer1M": 0.06,
      "outputCostPer1M": 0.18,
      "pricingNote": "Open source; EU-hosted; ultra-efficient 24B model",
      "openSource": true,
      "multimodal": false,
      "functionCalling": true,
      "jsonMode": true,
      "streaming": true,
      "fineTuning": true,
      "enterpriseReady": true,
      "mmluScore": 72.2,
      "humanEvalScore": 75,
      "mathScore": 60,
      "mtBenchScore": 8.1,
      "latencyTTFTMs": "~100ms",
      "throughputTPS": "150-200",
      "bestFor": "Ultra-low-cost classification, routing, edge deployment, cost-efficient European workloads"
    },
    {
      "model": "Command A",
      "provider": "Cohere",
      "parametersBillions": "Undisclosed",
      "contextWindow": "256K",
      "trainingCutoff": "Mar 2024",
      "inputCostPer1M": 2.5,
      "outputCostPer1M": 10,
      "pricingNote": "Pay-as-you-go API; RAG-optimized with grounded generation",
      "openSource": false,
      "multimodal": false,
      "functionCalling": true,
      "jsonMode": true,
      "streaming": true,
      "fineTuning": true,
      "enterpriseReady": true,
      "mmluScore": 71.2,
      "humanEvalScore": 68,
      "mathScore": 53,
      "mtBenchScore": 8.2,
      "latencyTTFTMs": "~280ms",
      "throughputTPS": "60-80",
      "bestFor": "Enterprise RAG, grounded generation with citations, multilingual search, agentic workflows"
    }
  ]
}