first commit

2026-04-26 21:52:23 +03:00
commit 880f412e2c
2662 changed files with 866266 additions and 0 deletions
--- a/plugins/semanticcache/changelog.md
+++ b/plugins/semanticcache/changelog.md
--- a/plugins/semanticcache/config_unmarshal_test.go
+++ b/plugins/semanticcache/config_unmarshal_test.go
@@ -0,0 +1,193 @@
+package semanticcache
+
+import (
+	"encoding/json"
+	"testing"
+	"time"
+
+	bifrost "github.com/maximhq/bifrost/core"
+)
+
+// TestUnmarshalJSON_DefaultCacheKey checks that the "default_cache_key" JSON
+// field is decoded into Config.DefaultCacheKey, and that the field stays the
+// empty string when the key is absent from the payload.
+func TestUnmarshalJSON_DefaultCacheKey(t *testing.T) {
+	type testCase struct {
+		name    string
+		payload string
+		wantKey string
+	}
+
+	cases := []testCase{
+		{name: "set", payload: `{"dimension": 1536, "default_cache_key": "my-cache-key"}`, wantKey: "my-cache-key"},
+		{name: "omitted", payload: `{"dimension": 1536}`, wantKey: ""},
+	}
+
+	for i := range cases {
+		c := cases[i]
+		t.Run(c.name, func(t *testing.T) {
+			var cfg Config
+			err := json.Unmarshal([]byte(c.payload), &cfg)
+			if err != nil {
+				t.Fatalf("Failed to unmarshal: %v", err)
+			}
+			if got := cfg.DefaultCacheKey; got != c.wantKey {
+				t.Errorf("Expected DefaultCacheKey %q, got %q", c.wantKey, got)
+			}
+		})
+	}
+}
+
+// TestUnmarshalJSON_AllFields decodes a payload that sets every option checked
+// below and verifies that each one lands on the matching Config field,
+// including the "10m" duration string for TTL and the three optional booleans
+// that are stored as pointers.
+func TestUnmarshalJSON_AllFields(t *testing.T) {
+	input := `{
+		"provider": "openai",
+		"embedding_model": "text-embedding-3-small",
+		"cleanup_on_shutdown": true,
+		"dimension": 1536,
+		"ttl": "10m",
+		"threshold": 0.9,
+		"vector_store_namespace": "my-ns",
+		"default_cache_key": "global-key",
+		"conversation_history_threshold": 5,
+		"cache_by_model": false,
+		"cache_by_provider": false,
+		"exclude_system_prompt": true
+	}`
+
+	var config Config
+	if err := json.Unmarshal([]byte(input), &config); err != nil {
+		t.Fatalf("Failed to unmarshal: %v", err)
+	}
+
+	// checkBool reports a failure unless got is non-nil and points at want.
+	// It prints the pointed-to value rather than the pointer, because %v on a
+	// non-nil *bool shows its address instead of true/false.
+	checkBool := func(field string, got *bool, want bool) {
+		t.Helper()
+		switch {
+		case got == nil:
+			t.Errorf("%s: expected %v, got <nil>", field, want)
+		case *got != want:
+			t.Errorf("%s: expected %v, got %v", field, want, *got)
+		}
+	}
+
+	if config.Provider != "openai" {
+		t.Errorf("Provider: expected %q, got %q", "openai", config.Provider)
+	}
+	if config.EmbeddingModel != "text-embedding-3-small" {
+		t.Errorf("EmbeddingModel: expected %q, got %q", "text-embedding-3-small", config.EmbeddingModel)
+	}
+	if !config.CleanUpOnShutdown {
+		t.Error("CleanUpOnShutdown: expected true")
+	}
+	if config.Dimension != 1536 {
+		t.Errorf("Dimension: expected 1536, got %d", config.Dimension)
+	}
+	if config.TTL != 10*time.Minute {
+		t.Errorf("TTL: expected 10m, got %v", config.TTL)
+	}
+	if config.Threshold != 0.9 {
+		t.Errorf("Threshold: expected 0.9, got %f", config.Threshold)
+	}
+	if config.VectorStoreNamespace != "my-ns" {
+		t.Errorf("VectorStoreNamespace: expected %q, got %q", "my-ns", config.VectorStoreNamespace)
+	}
+	if config.DefaultCacheKey != "global-key" {
+		t.Errorf("DefaultCacheKey: expected %q, got %q", "global-key", config.DefaultCacheKey)
+	}
+	if config.ConversationHistoryThreshold != 5 {
+		t.Errorf("ConversationHistoryThreshold: expected 5, got %d", config.ConversationHistoryThreshold)
+	}
+
+	// The optional booleans are pointers, so an explicit false in the payload
+	// must decode to a non-nil pointer to false rather than to nil.
+	checkBool("CacheByModel", config.CacheByModel, false)
+	checkBool("CacheByProvider", config.CacheByProvider, false)
+	checkBool("ExcludeSystemPrompt", config.ExcludeSystemPrompt, true)
+}
+
+// TestUnmarshalJSON_TTLFormats verifies the encodings of the "ttl" field that
+// the test cases cover: a duration string ("5m"), a bare JSON number that is
+// expected to be read as seconds, and an absent key, which must leave TTL at
+// zero.
+func TestUnmarshalJSON_TTLFormats(t *testing.T) {
+	cases := []struct {
+		name    string
+		payload string
+		wantTTL time.Duration
+	}{
+		{"duration string", `{"dimension": 1536, "ttl": "5m"}`, 5 * time.Minute},
+		{"numeric seconds", `{"dimension": 1536, "ttl": 300}`, 300 * time.Second},
+		{"omitted", `{"dimension": 1536}`, 0},
+	}
+
+	for _, c := range cases {
+		t.Run(c.name, func(t *testing.T) {
+			var cfg Config
+			if err := json.Unmarshal([]byte(c.payload), &cfg); err != nil {
+				t.Fatalf("Failed to unmarshal: %v", err)
+			}
+			if got := cfg.TTL; got != c.wantTTL {
+				t.Errorf("Expected TTL %v, got %v", c.wantTTL, got)
+			}
+		})
+	}
+}
+
+// TestUnmarshalJSON_BoolPointerFields checks the three optional boolean
+// settings that Config stores as *bool: an explicit true or false in the
+// payload must decode to a pointer to that value, and an absent key must leave
+// the pointer nil so that "unset" stays distinguishable from false.
+func TestUnmarshalJSON_BoolPointerFields(t *testing.T) {
+	tests := []struct {
+		name               string
+		json               string
+		expectCacheByModel *bool
+		expectCacheByProv  *bool
+		expectExcludeSys   *bool
+	}{
+		{
+			name:               "all set to true",
+			json:               `{"dimension": 1536, "cache_by_model": true, "cache_by_provider": true, "exclude_system_prompt": true}`,
+			expectCacheByModel: bifrost.Ptr(true),
+			expectCacheByProv:  bifrost.Ptr(true),
+			expectExcludeSys:   bifrost.Ptr(true),
+		},
+		{
+			name:               "all set to false",
+			json:               `{"dimension": 1536, "cache_by_model": false, "cache_by_provider": false, "exclude_system_prompt": false}`,
+			expectCacheByModel: bifrost.Ptr(false),
+			expectCacheByProv:  bifrost.Ptr(false),
+			expectExcludeSys:   bifrost.Ptr(false),
+		},
+		{
+			// The nil expectations are spelled out to make the "unset" case explicit.
+			name:               "all omitted",
+			json:               `{"dimension": 1536}`,
+			expectCacheByModel: nil,
+			expectCacheByProv:  nil,
+			expectExcludeSys:   nil,
+		},
+	}
+
+	for _, tc := range tests {
+		t.Run(tc.name, func(t *testing.T) {
+			var config Config
+			if err := json.Unmarshal([]byte(tc.json), &config); err != nil {
+				t.Fatalf("Failed to unmarshal: %v", err)
+			}
+			assertBoolPtr(t, "CacheByModel", config.CacheByModel, tc.expectCacheByModel)
+			assertBoolPtr(t, "CacheByProvider", config.CacheByProvider, tc.expectCacheByProv)
+			assertBoolPtr(t, "ExcludeSystemPrompt", config.ExcludeSystemPrompt, tc.expectExcludeSys)
+		})
+	}
+}
+
+// assertBoolPtr fails the test, without stopping it, unless got and want are
+// both nil or both point at the same boolean value. field names the value
+// under test in the failure message.
+func assertBoolPtr(t *testing.T, field string, got, want *bool) {
+	t.Helper()
+
+	// show renders a pointer for failure output: <nil> for a nil pointer,
+	// otherwise the boolean it points at. Formatting the *bool itself with %v
+	// would print its address, which says nothing about the decoded value.
+	show := func(p *bool) any {
+		if p == nil {
+			return nil
+		}
+		return *p
+	}
+
+	if got == nil && want == nil {
+		return
+	}
+	// Short-circuit evaluation keeps the dereferences safe: they are reached
+	// only when neither pointer is nil.
+	if got == nil || want == nil || *got != *want {
+		t.Errorf("%s: expected %v, got %v", field, show(want), show(got))
+	}
+}
--- a/plugins/semanticcache/go.mod
+++ b/plugins/semanticcache/go.mod
@@ -0,0 +1,161 @@
+module github.com/maximhq/bifrost/plugins/semanticcache
+
+go 1.26.2
+
+require (
+	github.com/cespare/xxhash/v2 v2.3.0
+	github.com/google/uuid v1.6.0
+	github.com/maximhq/bifrost/core v1.5.4
+	github.com/maximhq/bifrost/framework v1.3.4
+	github.com/maximhq/bifrost/plugins/mocker v1.5.3
+)
+
+require (
+	cel.dev/expr v0.25.1 // indirect
+	cloud.google.com/go v0.123.0 // indirect
+	cloud.google.com/go/auth v0.18.2 // indirect
+	cloud.google.com/go/auth/oauth2adapt v0.2.8 // indirect
+	cloud.google.com/go/compute/metadata v0.9.0 // indirect
+	cloud.google.com/go/iam v1.5.3 // indirect
+	cloud.google.com/go/monitoring v1.24.3 // indirect
+	cloud.google.com/go/storage v1.61.3 // indirect
+	github.com/Azure/azure-sdk-for-go/sdk/azcore v1.20.0 // indirect
+	github.com/Azure/azure-sdk-for-go/sdk/azidentity v1.13.1 // indirect
+	github.com/Azure/azure-sdk-for-go/sdk/internal v1.11.2 // indirect
+	github.com/AzureAD/microsoft-authentication-library-for-go v1.6.0 // indirect
+	github.com/GoogleCloudPlatform/opentelemetry-operations-go/detectors/gcp v1.31.0 // indirect
+	github.com/GoogleCloudPlatform/opentelemetry-operations-go/exporter/metric v0.55.0 // indirect
+	github.com/GoogleCloudPlatform/opentelemetry-operations-go/internal/resourcemapping v0.55.0 // indirect
+	github.com/andybalholm/brotli v1.2.0 // indirect
+	github.com/apapsch/go-jsonmerge/v2 v2.0.0 // indirect
+	github.com/aws/aws-sdk-go-v2 v1.41.5 // indirect
+	github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.7.8 // indirect
+	github.com/aws/aws-sdk-go-v2/config v1.32.11 // indirect
+	github.com/aws/aws-sdk-go-v2/credentials v1.19.14 // indirect
+	github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.18.21 // indirect
+	github.com/aws/aws-sdk-go-v2/internal/configsources v1.4.21 // indirect
+	github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.7.21 // indirect
+	github.com/aws/aws-sdk-go-v2/internal/ini v1.8.5 // indirect
+	github.com/aws/aws-sdk-go-v2/internal/v4a v1.4.22 // indirect
+	github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.13.7 // indirect
+	github.com/aws/aws-sdk-go-v2/service/internal/checksum v1.9.13 // indirect
+	github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.13.21 // indirect
+	github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.19.21 // indirect
+	github.com/aws/aws-sdk-go-v2/service/s3 v1.97.3 // indirect
+	github.com/aws/aws-sdk-go-v2/service/signin v1.0.9 // indirect
+	github.com/aws/aws-sdk-go-v2/service/sso v1.30.15 // indirect
+	github.com/aws/aws-sdk-go-v2/service/ssooidc v1.35.19 // indirect
+	github.com/aws/aws-sdk-go-v2/service/sts v1.41.10 // indirect
+	github.com/aws/smithy-go v1.24.2 // indirect
+	github.com/bahlo/generic-list-go v0.2.0 // indirect
+	github.com/buger/jsonparser v1.1.2 // indirect
+	github.com/bytedance/gopkg v0.1.3 // indirect
+	github.com/bytedance/sonic v1.15.0 // indirect
+	github.com/bytedance/sonic/loader v0.5.0 // indirect
+	github.com/cloudwego/base64x v0.1.6 // indirect
+	github.com/cncf/xds/go v0.0.0-20251210132809-ee656c7534f5 // indirect
+	github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect
+	github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f // indirect
+	github.com/envoyproxy/go-control-plane/envoy v1.36.0 // indirect
+	github.com/envoyproxy/protoc-gen-validate v1.3.0 // indirect
+	github.com/felixge/httpsnoop v1.0.4 // indirect
+	github.com/go-jose/go-jose/v4 v4.1.4 // indirect
+	github.com/go-logr/logr v1.4.3 // indirect
+	github.com/go-logr/stdr v1.2.2 // indirect
+	github.com/go-openapi/analysis v0.24.2 // indirect
+	github.com/go-openapi/errors v0.22.5 // indirect
+	github.com/go-openapi/jsonpointer v0.22.4 // indirect
+	github.com/go-openapi/jsonreference v0.21.4 // indirect
+	github.com/go-openapi/loads v0.23.2 // indirect
+	github.com/go-openapi/runtime v0.29.2 // indirect
+	github.com/go-openapi/spec v0.22.2 // indirect
+	github.com/go-openapi/strfmt v0.25.0 // indirect
+	github.com/go-openapi/swag v0.25.4 // indirect
+	github.com/go-openapi/swag/cmdutils v0.25.4 // indirect
+	github.com/go-openapi/swag/conv v0.25.4 // indirect
+	github.com/go-openapi/swag/fileutils v0.25.4 // indirect
+	github.com/go-openapi/swag/jsonname v0.25.4 // indirect
+	github.com/go-openapi/swag/jsonutils v0.25.4 // indirect
+	github.com/go-openapi/swag/loading v0.25.4 // indirect
+	github.com/go-openapi/swag/mangling v0.25.4 // indirect
+	github.com/go-openapi/swag/netutils v0.25.4 // indirect
+	github.com/go-openapi/swag/stringutils v0.25.4 // indirect
+	github.com/go-openapi/swag/typeutils v0.25.4 // indirect
+	github.com/go-openapi/swag/yamlutils v0.25.4 // indirect
+	github.com/go-openapi/validate v0.25.1 // indirect
+	github.com/go-viper/mapstructure/v2 v2.4.0 // indirect
+	github.com/golang-jwt/jwt/v5 v5.3.0 // indirect
+	github.com/google/s2a-go v0.1.9 // indirect
+	github.com/googleapis/enterprise-certificate-proxy v0.3.14 // indirect
+	github.com/googleapis/gax-go/v2 v2.19.0 // indirect
+	github.com/invopop/jsonschema v0.13.0 // indirect
+	github.com/jackc/pgpassfile v1.0.0 // indirect
+	github.com/jackc/pgservicefile v0.0.0-20240606120523-5a60cdf6a761 // indirect
+	github.com/jackc/pgx/v5 v5.9.1 // indirect
+	github.com/jackc/puddle/v2 v2.2.2 // indirect
+	github.com/jaswdr/faker/v2 v2.8.0 // indirect
+	github.com/jinzhu/inflection v1.0.0 // indirect
+	github.com/jinzhu/now v1.1.5 // indirect
+	github.com/klauspost/compress v1.18.2 // indirect
+	github.com/klauspost/cpuid/v2 v2.3.0 // indirect
+	github.com/kylelemons/godebug v1.1.0 // indirect
+	github.com/mailru/easyjson v0.9.1 // indirect
+	github.com/mark3labs/mcp-go v0.43.2 // indirect
+	github.com/mattn/go-colorable v0.1.14 // indirect
+	github.com/mattn/go-isatty v0.0.20 // indirect
+	github.com/mattn/go-sqlite3 v1.14.32 // indirect
+	github.com/oapi-codegen/runtime v1.1.1 // indirect
+	github.com/oklog/ulid v1.3.1 // indirect
+	github.com/pinecone-io/go-pinecone/v5 v5.3.0 // indirect
+	github.com/pkg/browser v0.0.0-20240102092130-5ac0b6a4141c // indirect
+	github.com/pkg/errors v0.9.1 // indirect
+	github.com/planetscale/vtprotobuf v0.6.1-0.20240319094008-0393e58bdf10 // indirect
+	github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect
+	github.com/qdrant/go-client v1.16.2 // indirect
+	github.com/redis/go-redis/v9 v9.17.2 // indirect
+	github.com/rs/zerolog v1.34.0 // indirect
+	github.com/spf13/cast v1.10.0 // indirect
+	github.com/spiffe/go-spiffe/v2 v2.6.0 // indirect
+	github.com/stretchr/testify v1.11.1 // indirect
+	github.com/tidwall/gjson v1.18.0 // indirect
+	github.com/tidwall/match v1.1.1 // indirect
+	github.com/tidwall/pretty v1.2.0 // indirect
+	github.com/tidwall/sjson v1.2.5 // indirect
+	github.com/twitchyliquid64/golang-asm v0.15.1 // indirect
+	github.com/valyala/bytebufferpool v1.0.0 // indirect
+	github.com/valyala/fasthttp v1.68.0 // indirect
+	github.com/weaviate/weaviate v1.36.5 // indirect
+	github.com/weaviate/weaviate-go-client/v5 v5.7.1 // indirect
+	github.com/wk8/go-ordered-map/v2 v2.1.8 // indirect
+	github.com/yosida95/uritemplate/v3 v3.0.2 // indirect
+	go.mongodb.org/mongo-driver v1.17.6 // indirect
+	go.opentelemetry.io/auto/sdk v1.2.1 // indirect
+	go.opentelemetry.io/contrib/detectors/gcp v1.40.0 // indirect
+	go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.63.0 // indirect
+	go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.63.0 // indirect
+	go.opentelemetry.io/otel v1.43.0 // indirect
+	go.opentelemetry.io/otel/metric v1.43.0 // indirect
+	go.opentelemetry.io/otel/sdk v1.43.0 // indirect
+	go.opentelemetry.io/otel/sdk/metric v1.43.0 // indirect
+	go.opentelemetry.io/otel/trace v1.43.0 // indirect
+	go.starlark.net v0.0.0-20260102030733-3fee463870c9 // indirect
+	go.yaml.in/yaml/v3 v3.0.4 // indirect
+	golang.org/x/arch v0.23.0 // indirect
+	golang.org/x/crypto v0.49.0 // indirect
+	golang.org/x/net v0.52.0 // indirect
+	golang.org/x/oauth2 v0.36.0 // indirect
+	golang.org/x/sync v0.20.0 // indirect
+	golang.org/x/sys v0.42.0 // indirect
+	golang.org/x/text v0.35.0 // indirect
+	golang.org/x/time v0.15.0 // indirect
+	google.golang.org/api v0.274.0 // indirect
+	google.golang.org/genproto v0.0.0-20260316180232-0b37fe3546d5 // indirect
+	google.golang.org/genproto/googleapis/api v0.0.0-20260401024825-9d38bb4040a9 // indirect
+	google.golang.org/genproto/googleapis/rpc v0.0.0-20260401024825-9d38bb4040a9 // indirect
+	google.golang.org/grpc v1.80.0 // indirect
+	google.golang.org/protobuf v1.36.11 // indirect
+	gopkg.in/yaml.v3 v3.0.1 // indirect
+	gorm.io/driver/postgres v1.6.0 // indirect
+	gorm.io/driver/sqlite v1.6.0 // indirect
+	gorm.io/gorm v1.31.1 // indirect
+)
--- a/plugins/semanticcache/go.sum
+++ b/plugins/semanticcache/go.sum
@@ -0,0 +1,393 @@
+cel.dev/expr v0.25.1 h1:1KrZg61W6TWSxuNZ37Xy49ps13NUovb66QLprthtwi4=
+cel.dev/expr v0.25.1/go.mod h1:hrXvqGP6G6gyx8UAHSHJ5RGk//1Oj5nXQ2NI02Nrsg4=
+cloud.google.com/go v0.123.0 h1:2NAUJwPR47q+E35uaJeYoNhuNEM9kM8SjgRgdeOJUSE=
+cloud.google.com/go v0.123.0/go.mod h1:xBoMV08QcqUGuPW65Qfm1o9Y4zKZBpGS+7bImXLTAZU=
+cloud.google.com/go/auth v0.18.2 h1:+Nbt5Ev0xEqxlNjd6c+yYUeosQ5TtEUaNcN/3FozlaM=
+cloud.google.com/go/auth v0.18.2/go.mod h1:xD+oY7gcahcu7G2SG2DsBerfFxgPAJz17zz2joOFF3M=
+cloud.google.com/go/auth/oauth2adapt v0.2.8 h1:keo8NaayQZ6wimpNSmW5OPc283g65QNIiLpZnkHRbnc=
+cloud.google.com/go/auth/oauth2adapt v0.2.8/go.mod h1:XQ9y31RkqZCcwJWNSx2Xvric3RrU88hAYYbjDWYDL+c=
+cloud.google.com/go/compute/metadata v0.9.0 h1:pDUj4QMoPejqq20dK0Pg2N4yG9zIkYGdBtwLoEkH9Zs=
+cloud.google.com/go/compute/metadata v0.9.0/go.mod h1:E0bWwX5wTnLPedCKqk3pJmVgCBSM6qQI1yTBdEb3C10=
+cloud.google.com/go/iam v1.5.3 h1:+vMINPiDF2ognBJ97ABAYYwRgsaqxPbQDlMnbHMjolc=
+cloud.google.com/go/iam v1.5.3/go.mod h1:MR3v9oLkZCTlaqljW6Eb2d3HGDGK5/bDv93jhfISFvU=
+cloud.google.com/go/logging v1.13.2 h1:qqlHCBvieJT9Cdq4QqYx1KPadCQ2noD4FK02eNqHAjA=
+cloud.google.com/go/logging v1.13.2/go.mod h1:zaybliM3yun1J8mU2dVQ1/qDzjbOqEijZCn6hSBtKak=
+cloud.google.com/go/longrunning v0.8.0 h1:LiKK77J3bx5gDLi4SMViHixjD2ohlkwBi+mKA7EhfW8=
+cloud.google.com/go/longrunning v0.8.0/go.mod h1:UmErU2Onzi+fKDg2gR7dusz11Pe26aknR4kHmJJqIfk=
+cloud.google.com/go/monitoring v1.24.3 h1:dde+gMNc0UhPZD1Azu6at2e79bfdztVDS5lvhOdsgaE=
+cloud.google.com/go/monitoring v1.24.3/go.mod h1:nYP6W0tm3N9H/bOw8am7t62YTzZY+zUeQ+Bi6+2eonI=
+cloud.google.com/go/storage v1.61.3 h1:VS//ZfBuPGDvakfD9xyPW1RGF1Vy3BWUoVZXgW1KMOg=
+cloud.google.com/go/storage v1.61.3/go.mod h1:JtqK8BBB7TWv0HVGHubtUdzYYrakOQIsMLffZ2Z/HWk=
+cloud.google.com/go/trace v1.11.7 h1:kDNDX8JkaAG3R2nq1lIdkb7FCSi1rCmsEtKVsty7p+U=
+cloud.google.com/go/trace v1.11.7/go.mod h1:TNn9d5V3fQVf6s4SCveVMIBS2LJUqo73GACmq/Tky0s=
+github.com/Azure/azure-sdk-for-go/sdk/azcore v1.20.0 h1:JXg2dwJUmPB9JmtVmdEB16APJ7jurfbY5jnfXpJoRMc=
+github.com/Azure/azure-sdk-for-go/sdk/azcore v1.20.0/go.mod h1:YD5h/ldMsG0XiIw7PdyNhLxaM317eFh5yNLccNfGdyw=
+github.com/Azure/azure-sdk-for-go/sdk/azidentity v1.13.1 h1:Hk5QBxZQC1jb2Fwj6mpzme37xbCDdNTxU7O9eb5+LB4=
+github.com/Azure/azure-sdk-for-go/sdk/azidentity v1.13.1/go.mod h1:IYus9qsFobWIc2YVwe/WPjcnyCkPKtnHAqUYeebc8z0=
+github.com/Azure/azure-sdk-for-go/sdk/azidentity/cache v0.3.2 h1:yz1bePFlP5Vws5+8ez6T3HWXPmwOK7Yvq8QxDBD3SKY=
+github.com/Azure/azure-sdk-for-go/sdk/azidentity/cache v0.3.2/go.mod h1:Pa9ZNPuoNu/GztvBSKk9J1cDJW6vk/n0zLtV4mgd8N8=
+github.com/Azure/azure-sdk-for-go/sdk/internal v1.11.2 h1:9iefClla7iYpfYWdzPCRDozdmndjTm8DXdpCzPajMgA=
+github.com/Azure/azure-sdk-for-go/sdk/internal v1.11.2/go.mod h1:XtLgD3ZD34DAaVIIAyG3objl5DynM3CQ/vMcbBNJZGI=
+github.com/AzureAD/microsoft-authentication-extensions-for-go/cache v0.1.1 h1:WJTmL004Abzc5wDB5VtZG2PJk5ndYDgVacGqfirKxjM=
+github.com/AzureAD/microsoft-authentication-extensions-for-go/cache v0.1.1/go.mod h1:tCcJZ0uHAmvjsVYzEFivsRTN00oz5BEsRgQHu5JZ9WE=
+github.com/AzureAD/microsoft-authentication-library-for-go v1.6.0 h1:XRzhVemXdgvJqCH0sFfrBUTnUJSBrBf7++ypk+twtRs=
+github.com/AzureAD/microsoft-authentication-library-for-go v1.6.0/go.mod h1:HKpQxkWaGLJ+D/5H8QRpyQXA1eKjxkFlOMwck5+33Jk=
+github.com/GoogleCloudPlatform/opentelemetry-operations-go/detectors/gcp v1.31.0 h1:DHa2U07rk8syqvCge0QIGMCE1WxGj9njT44GH7zNJLQ=
+github.com/GoogleCloudPlatform/opentelemetry-operations-go/detectors/gcp v1.31.0/go.mod h1:P4WPRUkOhJC13W//jWpyfJNDAIpvRbAUIYLX/4jtlE0=
+github.com/GoogleCloudPlatform/opentelemetry-operations-go/exporter/metric v0.55.0 h1:UnDZ/zFfG1JhH/DqxIZYU/1CUAlTUScoXD/LcM2Ykk8=
+github.com/GoogleCloudPlatform/opentelemetry-operations-go/exporter/metric v0.55.0/go.mod h1:IA1C1U7jO/ENqm/vhi7V9YYpBsp+IMyqNrEN94N7tVc=
+github.com/GoogleCloudPlatform/opentelemetry-operations-go/internal/cloudmock v0.55.0 h1:7t/qx5Ost0s0wbA/VDrByOooURhp+ikYwv20i9Y07TQ=
+github.com/GoogleCloudPlatform/opentelemetry-operations-go/internal/cloudmock v0.55.0/go.mod h1:vB2GH9GAYYJTO3mEn8oYwzEdhlayZIdQz6zdzgUIRvA=
+github.com/GoogleCloudPlatform/opentelemetry-operations-go/internal/resourcemapping v0.55.0 h1:0s6TxfCu2KHkkZPnBfsQ2y5qia0jl3MMrmBhu3nCOYk=
+github.com/GoogleCloudPlatform/opentelemetry-operations-go/internal/resourcemapping v0.55.0/go.mod h1:Mf6O40IAyB9zR/1J8nGDDPirZQQPbYJni8Yisy7NTMc=
+github.com/RaveNoX/go-jsoncommentstrip v1.0.0/go.mod h1:78ihd09MekBnJnxpICcwzCMzGrKSKYe4AqU6PDYYpjk=
+github.com/andybalholm/brotli v1.2.0 h1:ukwgCxwYrmACq68yiUqwIWnGY0cTPox/M94sVwToPjQ=
+github.com/andybalholm/brotli v1.2.0/go.mod h1:rzTDkvFWvIrjDXZHkuS16NPggd91W3kUSvPlQ1pLaKY=
+github.com/apapsch/go-jsonmerge/v2 v2.0.0 h1:axGnT1gRIfimI7gJifB699GoE/oq+F2MU7Dml6nw9rQ=
+github.com/apapsch/go-jsonmerge/v2 v2.0.0/go.mod h1:lvDnEdqiQrp0O42VQGgmlKpxL1AP2+08jFMw88y4klk=
+github.com/aws/aws-sdk-go-v2 v1.41.5 h1:dj5kopbwUsVUVFgO4Fi5BIT3t4WyqIDjGKCangnV/yY=
+github.com/aws/aws-sdk-go-v2 v1.41.5/go.mod h1:mwsPRE8ceUUpiTgF7QmQIJ7lgsKUPQOUl3o72QBrE1o=
+github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.7.8 h1:eBMB84YGghSocM7PsjmmPffTa+1FBUeNvGvFou6V/4o=
+github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.7.8/go.mod h1:lyw7GFp3qENLh7kwzf7iMzAxDn+NzjXEAGjKS2UOKqI=
+github.com/aws/aws-sdk-go-v2/config v1.32.11 h1:ftxI5sgz8jZkckuUHXfC/wMUc8u3fG1vQS0plr2F2Zs=
+github.com/aws/aws-sdk-go-v2/config v1.32.11/go.mod h1:twF11+6ps9aNRKEDimksp923o44w/Thk9+8YIlzWMmo=
+github.com/aws/aws-sdk-go-v2/credentials v1.19.14 h1:n+UcGWAIZHkXzYt87uMFBv/l8THYELoX6gVcUvgl6fI=
+github.com/aws/aws-sdk-go-v2/credentials v1.19.14/go.mod h1:cJKuyWB59Mqi0jM3nFYQRmnHVQIcgoxjEMAbLkpr62w=
+github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.18.21 h1:NUS3K4BTDArQqNu2ih7yeDLaS3bmHD0YndtA6UP884g=
+github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.18.21/go.mod h1:YWNWJQNjKigKY1RHVJCuupeWDrrHjRqHm0N9rdrWzYI=
+github.com/aws/aws-sdk-go-v2/internal/configsources v1.4.21 h1:Rgg6wvjjtX8bNHcvi9OnXWwcE0a2vGpbwmtICOsvcf4=
+github.com/aws/aws-sdk-go-v2/internal/configsources v1.4.21/go.mod h1:A/kJFst/nm//cyqonihbdpQZwiUhhzpqTsdbhDdRF9c=
+github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.7.21 h1:PEgGVtPoB6NTpPrBgqSE5hE/o47Ij9qk/SEZFbUOe9A=
+github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.7.21/go.mod h1:p+hz+PRAYlY3zcpJhPwXlLC4C+kqn70WIHwnzAfs6ps=
+github.com/aws/aws-sdk-go-v2/internal/ini v1.8.5 h1:clHU5fm//kWS1C2HgtgWxfQbFbx4b6rx+5jzhgX9HrI=
+github.com/aws/aws-sdk-go-v2/internal/ini v1.8.5/go.mod h1:O3h0IK87yXci+kg6flUKzJnWeziQUKciKrLjcatSNcY=
+github.com/aws/aws-sdk-go-v2/internal/v4a v1.4.22 h1:rWyie/PxDRIdhNf4DzRk0lvjVOqFJuNnO8WwaIRVxzQ=
+github.com/aws/aws-sdk-go-v2/internal/v4a v1.4.22/go.mod h1:zd/JsJ4P7oGfUhXn1VyLqaRZwPmZwg44Jf2dS84Dm3Y=
+github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.13.7 h1:5EniKhLZe4xzL7a+fU3C2tfUN4nWIqlLesfrjkuPFTY=
+github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.13.7/go.mod h1:x0nZssQ3qZSnIcePWLvcoFisRXJzcTVvYpAAdYX8+GI=
+github.com/aws/aws-sdk-go-v2/service/internal/checksum v1.9.13 h1:JRaIgADQS/U6uXDqlPiefP32yXTda7Kqfx+LgspooZM=
+github.com/aws/aws-sdk-go-v2/service/internal/checksum v1.9.13/go.mod h1:CEuVn5WqOMilYl+tbccq8+N2ieCy0gVn3OtRb0vBNNM=
+github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.13.21 h1:c31//R3xgIJMSC8S6hEVq+38DcvUlgFY0FM6mSI5oto=
+github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.13.21/go.mod h1:r6+pf23ouCB718FUxaqzZdbpYFyDtehyZcmP5KL9FkA=
+github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.19.21 h1:ZlvrNcHSFFWURB8avufQq9gFsheUgjVD9536obIknfM=
+github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.19.21/go.mod h1:cv3TNhVrssKR0O/xxLJVRfd2oazSnZnkUeTf6ctUwfQ=
+github.com/aws/aws-sdk-go-v2/service/s3 v1.97.3 h1:HwxWTbTrIHm5qY+CAEur0s/figc3qwvLWsNkF4RPToo=
+github.com/aws/aws-sdk-go-v2/service/s3 v1.97.3/go.mod h1:uoA43SdFwacedBfSgfFSjjCvYe8aYBS7EnU5GZ/YKMM=
+github.com/aws/aws-sdk-go-v2/service/signin v1.0.9 h1:QKZH0S178gCmFEgst8hN0mCX1KxLgHBKKY/CLqwP8lg=
+github.com/aws/aws-sdk-go-v2/service/signin v1.0.9/go.mod h1:7yuQJoT+OoH8aqIxw9vwF+8KpvLZ8AWmvmUWHsGQZvI=
+github.com/aws/aws-sdk-go-v2/service/sso v1.30.15 h1:lFd1+ZSEYJZYvv9d6kXzhkZu07si3f+GQ1AaYwa2LUM=
+github.com/aws/aws-sdk-go-v2/service/sso v1.30.15/go.mod h1:WSvS1NLr7JaPunCXqpJnWk1Bjo7IxzZXrZi1QQCkuqM=
+github.com/aws/aws-sdk-go-v2/service/ssooidc v1.35.19 h1:dzztQ1YmfPrxdrOiuZRMF6fuOwWlWpD2StNLTceKpys=
+github.com/aws/aws-sdk-go-v2/service/ssooidc v1.35.19/go.mod h1:YO8TrYtFdl5w/4vmjL8zaBSsiNp3w0L1FfKVKenZT7w=
+github.com/aws/aws-sdk-go-v2/service/sts v1.41.10 h1:p8ogvvLugcR/zLBXTXrTkj0RYBUdErbMnAFFp12Lm/U=
+github.com/aws/aws-sdk-go-v2/service/sts v1.41.10/go.mod h1:60dv0eZJfeVXfbT1tFJinbHrDfSJ2GZl4Q//OSSNAVw=
+github.com/aws/smithy-go v1.24.2 h1:FzA3bu/nt/vDvmnkg+R8Xl46gmzEDam6mZ1hzmwXFng=
+github.com/aws/smithy-go v1.24.2/go.mod h1:YE2RhdIuDbA5E5bTdciG9KrW3+TiEONeUWCqxX9i1Fc=
+github.com/bahlo/generic-list-go v0.2.0 h1:5sz/EEAK+ls5wF+NeqDpk5+iNdMDXrh3z3nPnH1Wvgk=
+github.com/bahlo/generic-list-go v0.2.0/go.mod h1:2KvAjgMlE5NNynlg/5iLrrCCZ2+5xWbdbCW3pNTGyYg=
+github.com/bmatcuk/doublestar v1.1.1/go.mod h1:UD6OnuiIn0yFxxA2le/rnRU1G4RaI4UvFv1sNto9p6w=
+github.com/bsm/ginkgo/v2 v2.12.0 h1:Ny8MWAHyOepLGlLKYmXG4IEkioBysk6GpaRTLC8zwWs=
+github.com/bsm/ginkgo/v2 v2.12.0/go.mod h1:SwYbGRRDovPVboqFv0tPTcG1sN61LM1Z4ARdbAV9g4c=
+github.com/bsm/gomega v1.27.10 h1:yeMWxP2pV2fG3FgAODIY8EiRE3dy0aeFYt4l7wh6yKA=
+github.com/bsm/gomega v1.27.10/go.mod h1:JyEr/xRbxbtgWNi8tIEVPUYZ5Dzef52k01W3YH0H+O0=
+github.com/buger/jsonparser v1.1.2 h1:frqHqw7otoVbk5M8LlE/L7HTnIq2v9RX6EJ48i9AxJk=
+github.com/buger/jsonparser v1.1.2/go.mod h1:6RYKKt7H4d4+iWqouImQ9R2FZql3VbhNgx27UK13J/0=
+github.com/bytedance/gopkg v0.1.3 h1:TPBSwH8RsouGCBcMBktLt1AymVo2TVsBVCY4b6TnZ/M=
+github.com/bytedance/gopkg v0.1.3/go.mod h1:576VvJ+eJgyCzdjS+c4+77QF3p7ubbtiKARP3TxducM=
+github.com/bytedance/sonic v1.15.0 h1:/PXeWFaR5ElNcVE84U0dOHjiMHQOwNIx3K4ymzh/uSE=
+github.com/bytedance/sonic v1.15.0/go.mod h1:tFkWrPz0/CUCLEF4ri4UkHekCIcdnkqXw9VduqpJh0k=
+github.com/bytedance/sonic/loader v0.5.0 h1:gXH3KVnatgY7loH5/TkeVyXPfESoqSBSBEiDd5VjlgE=
+github.com/bytedance/sonic/loader v0.5.0/go.mod h1:AR4NYCk5DdzZizZ5djGqQ92eEhCCcdf5x77udYiSJRo=
+github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs=
+github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs=
+github.com/cloudwego/base64x v0.1.6 h1:t11wG9AECkCDk5fMSoxmufanudBtJ+/HemLstXDLI2M=
+github.com/cloudwego/base64x v0.1.6/go.mod h1:OFcloc187FXDaYHvrNIjxSe8ncn0OOM8gEHfghB2IPU=
+github.com/cncf/xds/go v0.0.0-20251210132809-ee656c7534f5 h1:6xNmx7iTtyBRev0+D/Tv1FZd4SCg8axKApyNyRsAt/w=
+github.com/cncf/xds/go v0.0.0-20251210132809-ee656c7534f5/go.mod h1:KdCmV+x/BuvyMxRnYBlmVaq4OLiKW6iRQfvC62cvdkI=
+github.com/coreos/go-systemd/v22 v22.5.0/go.mod h1:Y58oyj3AT4RCenI/lSvhwexgC+NSVTIJ3seZv2GcEnc=
+github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
+github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
+github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1VwoXQT9A3Wy9MM3WgvqSxFWenqJduM=
+github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
+github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f h1:lO4WD4F/rVNCu3HqELle0jiPLLBs70cWOduZpkS1E78=
+github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f/go.mod h1:cuUVRXasLTGF7a8hSLbxyZXjz+1KgoB3wDUb6vlszIc=
+github.com/envoyproxy/go-control-plane v0.14.0 h1:hbG2kr4RuFj222B6+7T83thSPqLjwBIfQawTkC++2HA=
+github.com/envoyproxy/go-control-plane v0.14.0/go.mod h1:NcS5X47pLl/hfqxU70yPwL9ZMkUlwlKxtAohpi2wBEU=
+github.com/envoyproxy/go-control-plane/envoy v1.36.0 h1:yg/JjO5E7ubRyKX3m07GF3reDNEnfOboJ0QySbH736g=
+github.com/envoyproxy/go-control-plane/envoy v1.36.0/go.mod h1:ty89S1YCCVruQAm9OtKeEkQLTb+Lkz0k8v9W0Oxsv98=
+github.com/envoyproxy/go-control-plane/ratelimit v0.1.0 h1:/G9QYbddjL25KvtKTv3an9lx6VBE2cnb8wp1vEGNYGI=
+github.com/envoyproxy/go-control-plane/ratelimit v0.1.0/go.mod h1:Wk+tMFAFbCXaJPzVVHnPgRKdUdwW/KdbRt94AzgRee4=
+github.com/envoyproxy/protoc-gen-validate v1.3.0 h1:TvGH1wof4H33rezVKWSpqKz5NXWg5VPuZ0uONDT6eb4=
+github.com/envoyproxy/protoc-gen-validate v1.3.0/go.mod h1:HvYl7zwPa5mffgyeTUHA9zHIH36nmrm7oCbo4YKoSWA=
+github.com/fasthttp/websocket v1.5.12 h1:e4RGPpWW2HTbL3zV0Y/t7g0ub294LkiuXXUuTOUInlE=
+github.com/fasthttp/websocket v1.5.12/go.mod h1:I+liyL7/4moHojiOgUOIKEWm9EIxHqxZChS+aMFltyg=
+github.com/felixge/httpsnoop v1.0.4 h1:NFTV2Zj1bL4mc9sqWACXbQFVBBg2W3GPvqp8/ESS2Wg=
+github.com/felixge/httpsnoop v1.0.4/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U=
+github.com/frankban/quicktest v1.14.6 h1:7Xjx+VpznH+oBnejlPUj8oUpdxnVs4f8XU8WnHkI4W8=
+github.com/frankban/quicktest v1.14.6/go.mod h1:4ptaffx2x8+WTWXmUCuVU6aPUX1/Mz7zb5vbUoiM6w0=
+github.com/go-jose/go-jose/v4 v4.1.4 h1:moDMcTHmvE6Groj34emNPLs/qtYXRVcd6S7NHbHz3kA=
+github.com/go-jose/go-jose/v4 v4.1.4/go.mod h1:x4oUasVrzR7071A4TnHLGSPpNOm2a21K9Kf04k1rs08=
+github.com/go-logr/logr v1.2.2/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A=
+github.com/go-logr/logr v1.4.3 h1:CjnDlHq8ikf6E492q6eKboGOC0T8CDaOvkHCIg8idEI=
+github.com/go-logr/logr v1.4.3/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY=
+github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag=
+github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE=
+github.com/go-openapi/analysis v0.24.2 h1:6p7WXEuKy1llDgOH8FooVeO+Uq2za9qoAOq4ZN08B50=
+github.com/go-openapi/analysis v0.24.2/go.mod h1:x27OOHKANE0lutg2ml4kzYLoHGMKgRm1Cj2ijVOjJuE=
+github.com/go-openapi/errors v0.22.5 h1:Yfv4O/PRYpNF3BNmVkEizcHb3uLVVsrDt3LNdgAKRY4=
+github.com/go-openapi/errors v0.22.5/go.mod h1:z9S8ASTUqx7+CP1Q8dD8ewGH/1JWFFLX/2PmAYNQLgk=
+github.com/go-openapi/jsonpointer v0.22.4 h1:dZtK82WlNpVLDW2jlA1YCiVJFVqkED1MegOUy9kR5T4=
+github.com/go-openapi/jsonpointer v0.22.4/go.mod h1:elX9+UgznpFhgBuaMQ7iu4lvvX1nvNsesQ3oxmYTw80=
+github.com/go-openapi/jsonreference v0.21.4 h1:24qaE2y9bx/q3uRK/qN+TDwbok1NhbSmGjjySRCHtC8=
+github.com/go-openapi/jsonreference v0.21.4/go.mod h1:rIENPTjDbLpzQmQWCj5kKj3ZlmEh+EFVbz3RTUh30/4=
+github.com/go-openapi/loads v0.23.2 h1:rJXAcP7g1+lWyBHC7iTY+WAF0rprtM+pm8Jxv1uQJp4=
+github.com/go-openapi/loads v0.23.2/go.mod h1:IEVw1GfRt/P2Pplkelxzj9BYFajiWOtY2nHZNj4UnWY=
+github.com/go-openapi/runtime v0.29.2 h1:UmwSGWNmWQqKm1c2MGgXVpC2FTGwPDQeUsBMufc5Yj0=
+github.com/go-openapi/runtime v0.29.2/go.mod h1:biq5kJXRJKBJxTDJXAa00DOTa/anflQPhT0/wmjuy+0=
+github.com/go-openapi/spec v0.22.2 h1:KEU4Fb+Lp1qg0V4MxrSCPv403ZjBl8Lx1a83gIPU8Qc=
+github.com/go-openapi/spec v0.22.2/go.mod h1:iIImLODL2loCh3Vnox8TY2YWYJZjMAKYyLH2Mu8lOZs=
+github.com/go-openapi/strfmt v0.25.0 h1:7R0RX7mbKLa9EYCTHRcCuIPcaqlyQiWNPTXwClK0saQ=
+github.com/go-openapi/strfmt v0.25.0/go.mod h1:nNXct7OzbwrMY9+5tLX4I21pzcmE6ccMGXl3jFdPfn8=
+github.com/go-openapi/swag v0.25.4 h1:OyUPUFYDPDBMkqyxOTkqDYFnrhuhi9NR6QVUvIochMU=
+github.com/go-openapi/swag v0.25.4/go.mod h1:zNfJ9WZABGHCFg2RnY0S4IOkAcVTzJ6z2Bi+Q4i6qFQ=
+github.com/go-openapi/swag/cmdutils v0.25.4 h1:8rYhB5n6WawR192/BfUu2iVlxqVR9aRgGJP6WaBoW+4=
+github.com/go-openapi/swag/cmdutils v0.25.4/go.mod h1:pdae/AFo6WxLl5L0rq87eRzVPm/XRHM3MoYgRMvG4A0=
+github.com/go-openapi/swag/conv v0.25.4 h1:/Dd7p0LZXczgUcC/Ikm1+YqVzkEeCc9LnOWjfkpkfe4=
+github.com/go-openapi/swag/conv v0.25.4/go.mod h1:3LXfie/lwoAv0NHoEuY1hjoFAYkvlqI/Bn5EQDD3PPU=
+github.com/go-openapi/swag/fileutils v0.25.4 h1:2oI0XNW5y6UWZTC7vAxC8hmsK/tOkWXHJQH4lKjqw+Y=
+github.com/go-openapi/swag/fileutils v0.25.4/go.mod h1:cdOT/PKbwcysVQ9Tpr0q20lQKH7MGhOEb6EwmHOirUk=
+github.com/go-openapi/swag/jsonname v0.25.4 h1:bZH0+MsS03MbnwBXYhuTttMOqk+5KcQ9869Vye1bNHI=
+github.com/go-openapi/swag/jsonname v0.25.4/go.mod h1:GPVEk9CWVhNvWhZgrnvRA6utbAltopbKwDu8mXNUMag=
+github.com/go-openapi/swag/jsonutils v0.25.4 h1:VSchfbGhD4UTf4vCdR2F4TLBdLwHyUDTd1/q4i+jGZA=
+github.com/go-openapi/swag/jsonutils v0.25.4/go.mod h1:7OYGXpvVFPn4PpaSdPHJBtF0iGnbEaTk8AvBkoWnaAY=
+github.com/go-openapi/swag/jsonutils/fixtures_test v0.25.4 h1:IACsSvBhiNJwlDix7wq39SS2Fh7lUOCJRmx/4SN4sVo=
+github.com/go-openapi/swag/jsonutils/fixtures_test v0.25.4/go.mod h1:Mt0Ost9l3cUzVv4OEZG+WSeoHwjWLnarzMePNDAOBiM=
+github.com/go-openapi/swag/loading v0.25.4 h1:jN4MvLj0X6yhCDduRsxDDw1aHe+ZWoLjW+9ZQWIKn2s=
+github.com/go-openapi/swag/loading v0.25.4/go.mod h1:rpUM1ZiyEP9+mNLIQUdMiD7dCETXvkkC30z53i+ftTE=
+github.com/go-openapi/swag/mangling v0.25.4 h1:2b9kBJk9JvPgxr36V23FxJLdwBrpijI26Bx5JH4Hp48=
+github.com/go-openapi/swag/mangling v0.25.4/go.mod h1:6dxwu6QyORHpIIApsdZgb6wBk/DPU15MdyYj/ikn0Hg=
+github.com/go-openapi/swag/netutils v0.25.4 h1:Gqe6K71bGRb3ZQLusdI8p/y1KLgV4M/k+/HzVSqT8H0=
+github.com/go-openapi/swag/netutils v0.25.4/go.mod h1:m2W8dtdaoX7oj9rEttLyTeEFFEBvnAx9qHd5nJEBzYg=
+github.com/go-openapi/swag/stringutils v0.25.4 h1:O6dU1Rd8bej4HPA3/CLPciNBBDwZj9HiEpdVsb8B5A8=
+github.com/go-openapi/swag/stringutils v0.25.4/go.mod h1:GTsRvhJW5xM5gkgiFe0fV3PUlFm0dr8vki6/VSRaZK0=
+github.com/go-openapi/swag/typeutils v0.25.4 h1:1/fbZOUN472NTc39zpa+YGHn3jzHWhv42wAJSN91wRw=
+github.com/go-openapi/swag/typeutils v0.25.4/go.mod h1:Ou7g//Wx8tTLS9vG0UmzfCsjZjKhpjxayRKTHXf2pTE=
+github.com/go-openapi/swag/yamlutils v0.25.4 h1:6jdaeSItEUb7ioS9lFoCZ65Cne1/RZtPBZ9A56h92Sw=
+github.com/go-openapi/swag/yamlutils v0.25.4/go.mod h1:MNzq1ulQu+yd8Kl7wPOut/YHAAU/H6hL91fF+E2RFwc=
+github.com/go-openapi/testify/enable/yaml/v2 v2.0.2 h1:0+Y41Pz1NkbTHz8NngxTuAXxEodtNSI1WG1c/m5Akw4=
+github.com/go-openapi/testify/enable/yaml/v2 v2.0.2/go.mod h1:kme83333GCtJQHXQ8UKX3IBZu6z8T5Dvy5+CW3NLUUg=
+github.com/go-openapi/testify/v2 v2.0.2 h1:X999g3jeLcoY8qctY/c/Z8iBHTbwLz7R2WXd6Ub6wls=
+github.com/go-openapi/testify/v2 v2.0.2/go.mod h1:HCPmvFFnheKK2BuwSA0TbbdxJ3I16pjwMkYkP4Ywn54=
+github.com/go-openapi/validate v0.25.1 h1:sSACUI6Jcnbo5IWqbYHgjibrhhmt3vR6lCzKZnmAgBw=
+github.com/go-openapi/validate v0.25.1/go.mod h1:RMVyVFYte0gbSTaZ0N4KmTn6u/kClvAFp+mAVfS/DQc=
+github.com/go-viper/mapstructure/v2 v2.4.0 h1:EBsztssimR/CONLSZZ04E8qAkxNYq4Qp9LvH92wZUgs=
+github.com/go-viper/mapstructure/v2 v2.4.0/go.mod h1:oJDH3BJKyqBA2TXFhDsKDGDTlndYOZ6rGS0BRZIxGhM=
+github.com/godbus/dbus/v5 v5.0.4/go.mod h1:xhWf0FNVPg57R7Z0UbKHbJfkEywrmjJnf7w5xrFpKfA=
+github.com/golang-jwt/jwt/v5 v5.3.0 h1:pv4AsKCKKZuqlgs5sUmn4x8UlGa0kEVt/puTpKx9vvo=
+github.com/golang-jwt/jwt/v5 v5.3.0/go.mod h1:fxCRLWMO43lRc8nhHWY6LGqRcf+1gQWArsqaEUEa5bE=
+github.com/golang/protobuf v1.5.4 h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek=
+github.com/golang/protobuf v1.5.4/go.mod h1:lnTiLA8Wa4RWRcIUkrtSVa5nRhsEGBg48fD6rSs7xps=
+github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8=
+github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU=
+github.com/google/martian/v3 v3.3.3 h1:DIhPTQrbPkgs2yJYdXU/eNACCG5DVQjySNRNlflZ9Fc=
+github.com/google/martian/v3 v3.3.3/go.mod h1:iEPrYcgCF7jA9OtScMFQyAlZZ4YXTKEtJ1E6RWzmBA0=
+github.com/google/s2a-go v0.1.9 h1:LGD7gtMgezd8a/Xak7mEWL0PjoTQFvpRudN895yqKW0=
+github.com/google/s2a-go v0.1.9/go.mod h1:YA0Ei2ZQL3acow2O62kdp9UlnvMmU7kA6Eutn0dXayM=
+github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
+github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
+github.com/googleapis/enterprise-certificate-proxy v0.3.14 h1:yh8ncqsbUY4shRD5dA6RlzjJaT4hi3kII+zYw8wmLb8=
+github.com/googleapis/enterprise-certificate-proxy v0.3.14/go.mod h1:vqVt9yG9480NtzREnTlmGSBmFrA+bzb0yl0TxoBQXOg=
+github.com/googleapis/gax-go/v2 v2.19.0 h1:fYQaUOiGwll0cGj7jmHT/0nPlcrZDFPrZRhTsoCr8hE=
+github.com/googleapis/gax-go/v2 v2.19.0/go.mod h1:w2ROXVdfGEVFXzmlciUU4EdjHgWvB5h2n6x/8XSTTJA=
+github.com/hajimehoshi/go-mp3 v0.3.4 h1:NUP7pBYH8OguP4diaTZ9wJbUbk3tC0KlfzsEpWmYj68=
+github.com/hajimehoshi/go-mp3 v0.3.4/go.mod h1:fRtZraRFcWb0pu7ok0LqyFhCUrPeMsGRSVop0eemFmo=
+github.com/invopop/jsonschema v0.13.0 h1:KvpoAJWEjR3uD9Kbm2HWJmqsEaHt8lBUpd0qHcIi21E=
+github.com/invopop/jsonschema v0.13.0/go.mod h1:ffZ5Km5SWWRAIN6wbDXItl95euhFz2uON45H2qjYt+0=
+github.com/jackc/pgpassfile v1.0.0 h1:/6Hmqy13Ss2zCq62VdNG8tM1wchn8zjSGOBJ6icpsIM=
+github.com/jackc/pgpassfile v1.0.0/go.mod h1:CEx0iS5ambNFdcRtxPj5JhEz+xB6uRky5eyVu/W2HEg=
+github.com/jackc/pgservicefile v0.0.0-20240606120523-5a60cdf6a761 h1:iCEnooe7UlwOQYpKFhBabPMi4aNAfoODPEFNiAnClxo=
+github.com/jackc/pgservicefile v0.0.0-20240606120523-5a60cdf6a761/go.mod h1:5TJZWKEWniPve33vlWYSoGYefn3gLQRzjfDlhSJ9ZKM=
+github.com/jackc/pgx/v5 v5.9.1 h1:uwrxJXBnx76nyISkhr33kQLlUqjv7et7b9FjCen/tdc=
+github.com/jackc/pgx/v5 v5.9.1/go.mod h1:mal1tBGAFfLHvZzaYh77YS/eC6IX9OWbRV1QIIM0Jn4=
+github.com/jackc/puddle/v2 v2.2.2 h1:PR8nw+E/1w0GLuRFSmiioY6UooMp6KJv0/61nB7icHo=
+github.com/jackc/puddle/v2 v2.2.2/go.mod h1:vriiEXHvEE654aYKXXjOvZM39qJ0q+azkZFrfEOc3H4=
+github.com/jaswdr/faker/v2 v2.8.0 h1:3AxdXW9U7dJmWckh/P0YgRbNlCcVsTyrUNUnLVP9b3Q=
+github.com/jaswdr/faker/v2 v2.8.0/go.mod h1:jZq+qzNQr8/P+5fHd9t3txe2GNPnthrTfohtnJ7B+68=
+github.com/jinzhu/inflection v1.0.0 h1:K317FqzuhWc8YvSVlFMCCUb36O/S9MCKRDI7QkRKD/E=
+github.com/jinzhu/inflection v1.0.0/go.mod h1:h+uFLlag+Qp1Va5pdKtLDYj+kHp5pxUVkryuEj+Srlc=
+github.com/jinzhu/now v1.1.5 h1:/o9tlHleP7gOFmsnYNz3RGnqzefHA47wQpKrrdTIwXQ=
+github.com/jinzhu/now v1.1.5/go.mod h1:d3SSVoowX0Lcu0IBviAWJpolVfI5UJVZZ7cO71lE/z8=
+github.com/juju/gnuflag v0.0.0-20171113085948-2ce1bb71843d/go.mod h1:2PavIy+JPciBPrBUjwbNvtwB6RQlve+hkpll6QSNmOE=
+github.com/keybase/go-keychain v0.0.1 h1:way+bWYa6lDppZoZcgMbYsvC7GxljxrskdNInRtuthU=
+github.com/keybase/go-keychain v0.0.1/go.mod h1:PdEILRW3i9D8JcdM+FmY6RwkHGnhHxXwkPPMeUgOK1k=
+github.com/klauspost/compress v1.18.2 h1:iiPHWW0YrcFgpBYhsA6D1+fqHssJscY/Tm/y2Uqnapk=
+github.com/klauspost/compress v1.18.2/go.mod h1:R0h/fSBs8DE4ENlcrlib3PsXS61voFxhIs2DeRhCvJ4=
+github.com/klauspost/cpuid/v2 v2.3.0 h1:S4CRMLnYUhGeDFDqkGriYKdfoFlDnMtqTiI/sFzhA9Y=
+github.com/klauspost/cpuid/v2 v2.3.0/go.mod h1:hqwkgyIinND0mEev00jJYCxPNVRVXFQeu1XKlok6oO0=
+github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE=
+github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk=
+github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY=
+github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE=
+github.com/kylelemons/godebug v1.1.0 h1:RPNrshWIDI6G2gRW9EHilWtl7Z6Sb1BR0xunSBf0SNc=
+github.com/kylelemons/godebug v1.1.0/go.mod h1:9/0rRGxNHcop5bhtWyNeEfOS8JIWk580+fNqagV/RAw=
+github.com/mailru/easyjson v0.9.1 h1:LbtsOm5WAswyWbvTEOqhypdPeZzHavpZx96/n553mR8=
+github.com/mailru/easyjson v0.9.1/go.mod h1:1+xMtQp2MRNVL/V1bOzuP3aP8VNwRW55fQUto+XFtTU=
+github.com/mark3labs/mcp-go v0.43.2 h1:21PUSlWWiSbUPQwXIJ5WKlETixpFpq+WBpbMGDSVy/I=
+github.com/mark3labs/mcp-go v0.43.2/go.mod h1:YnJfOL382MIWDx1kMY+2zsRHU/q78dBg9aFb8W6Thdw=
+github.com/mattn/go-colorable v0.1.13/go.mod h1:7S9/ev0klgBDR4GtXTXX8a3vIGJpMovkB8vQcUbaXHg=
+github.com/mattn/go-colorable v0.1.14 h1:9A9LHSqF/7dyVVX6g0U9cwm9pG3kP9gSzcuIPHPsaIE=
+github.com/mattn/go-colorable v0.1.14/go.mod h1:6LmQG8QLFO4G5z1gPvYEzlUgJ2wF+stgPZH1UqBm1s8=
+github.com/mattn/go-isatty v0.0.16/go.mod h1:kYGgaQfpe5nmfYZH+SKPsOc2e4SrIfOl2e/yFXSvRLM=
+github.com/mattn/go-isatty v0.0.19/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y=
+github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY=
+github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y=
+github.com/mattn/go-sqlite3 v1.14.32 h1:JD12Ag3oLy1zQA+BNn74xRgaBbdhbNIDYvQUEuuErjs=
+github.com/mattn/go-sqlite3 v1.14.32/go.mod h1:Uh1q+B4BYcTPb+yiD3kU8Ct7aC0hY9fxUwlHK0RXw+Y=
+github.com/maximhq/bifrost/core v1.5.4 h1:hf0BhoHVVpY1EQ4FkyRzW4IBYjrolxdZV0ucgWfHhcE=
+github.com/maximhq/bifrost/core v1.5.4/go.mod h1:z1/vOalbDAD7v7sYbXQsqR+2qIFP0jKOSIStw6Q4P4U=
+github.com/maximhq/bifrost/framework v1.3.4 h1:nZPv1FYry1njexZ0Hb6CZQXybwRFKGMTRyGWz2HGcio=
+github.com/maximhq/bifrost/framework v1.3.4/go.mod h1:e0defDjWWFi6c2Zs3AOkMcRbYzjww4sjkyZtARrP4Zk=
+github.com/maximhq/bifrost/plugins/mocker v1.5.3 h1:PuQShiJS6jbI1S0XAnwtB9dfiYC+TSbxbjJ1FWOb2aE=
+github.com/maximhq/bifrost/plugins/mocker v1.5.3/go.mod h1:Ob9R3faldCd1EnTfuPqkLK4CbjA1nLe4e2/Onf/Kk7E=
+github.com/oapi-codegen/runtime v1.1.1 h1:EXLHh0DXIJnWhdRPN2w4MXAzFyE4CskzhNLUmtpMYro=
+github.com/oapi-codegen/runtime v1.1.1/go.mod h1:SK9X900oXmPWilYR5/WKPzt3Kqxn/uS/+lbpREv+eCg=
+github.com/oklog/ulid v1.3.1 h1:EGfNDEx6MqHz8B3uNV6QAib1UR2Lm97sHi3ocA6ESJ4=
+github.com/oklog/ulid v1.3.1/go.mod h1:CirwcVhetQ6Lv90oh/F+FBtV6XMibvdAFo93nm5qn4U=
+github.com/pinecone-io/go-pinecone/v5 v5.3.0 h1:0YQlEtmXGWK/I8ztkOVM6PuBYgFJZhjSdb0ddU+bHPE=
+github.com/pinecone-io/go-pinecone/v5 v5.3.0/go.mod h1:6Fg85fcyvMUQFf9KW7zniN81kelSYvsjF+KPLdc1MGA=
+github.com/pkg/browser v0.0.0-20240102092130-5ac0b6a4141c h1:+mdjkGKdHQG3305AYmdv1U2eRNDiU2ErMBj1gwrq8eQ=
+github.com/pkg/browser v0.0.0-20240102092130-5ac0b6a4141c/go.mod h1:7rwL4CYBLnjLxUqIJNnCWiEdr3bn6IUYi15bNlnbCCU=
+github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4=
+github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
+github.com/planetscale/vtprotobuf v0.6.1-0.20240319094008-0393e58bdf10 h1:GFCKgmp0tecUJ0sJuv4pzYCqS9+RGSn52M3FUwPs+uo=
+github.com/planetscale/vtprotobuf v0.6.1-0.20240319094008-0393e58bdf10/go.mod h1:t/avpk3KcrXxUnYOhZhMXJlSEyie6gQbtLq5NM3loB8=
+github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
+github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U=
+github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
+github.com/qdrant/go-client v1.16.2 h1:UUMJJfvXTByhwhH1DwWdbkhZ2cTdvSqVkXSIfBrVWSg=
+github.com/qdrant/go-client v1.16.2/go.mod h1:I+EL3h4HRoRTeHtbfOd/4kDXwCukZfkd41j/9wryGkw=
+github.com/redis/go-redis/v9 v9.17.2 h1:P2EGsA4qVIM3Pp+aPocCJ7DguDHhqrXNhVcEp4ViluI=
+github.com/redis/go-redis/v9 v9.17.2/go.mod h1:u410H11HMLoB+TP67dz8rL9s6QW2j76l0//kSOd3370=
+github.com/rogpeppe/go-internal v1.14.1 h1:UQB4HGPB6osV0SQTLymcB4TgvyWu6ZyliaW0tI/otEQ=
+github.com/rogpeppe/go-internal v1.14.1/go.mod h1:MaRKkUm5W0goXpeCfT7UZI6fk/L7L7so1lCWt35ZSgc=
+github.com/rs/xid v1.6.0/go.mod h1:7XoLgs4eV+QndskICGsho+ADou8ySMSjJKDIan90Nz0=
+github.com/rs/zerolog v1.34.0 h1:k43nTLIwcTVQAncfCw4KZ2VY6ukYoZaBPNOE8txlOeY=
+github.com/rs/zerolog v1.34.0/go.mod h1:bJsvje4Z08ROH4Nhs5iH600c3IkWhwp44iRc54W6wYQ=
+github.com/savsgio/gotils v0.0.0-20250408102913-196191ec6287 h1:qIQ0tWF9vxGtkJa24bR+2i53WBCz1nW/Pc47oVYauC4=
+github.com/savsgio/gotils v0.0.0-20250408102913-196191ec6287/go.mod h1:sM7Mt7uEoCeFSCBM+qBrqvEo+/9vdmj19wzp3yzUhmg=
+github.com/spf13/cast v1.10.0 h1:h2x0u2shc1QuLHfxi+cTJvs30+ZAHOGRic8uyGTDWxY=
+github.com/spf13/cast v1.10.0/go.mod h1:jNfB8QC9IA6ZuY2ZjDp0KtFO2LZZlg4S/7bzP6qqeHo=
+github.com/spiffe/go-spiffe/v2 v2.6.0 h1:l+DolpxNWYgruGQVV0xsfeya3CsC7m8iBzDnMpsbLuo=
+github.com/spiffe/go-spiffe/v2 v2.6.0/go.mod h1:gm2SeUoMZEtpnzPNs2Csc0D/gX33k1xIx7lEzqblHEs=
+github.com/spkg/bom v0.0.0-20160624110644-59b7046e48ad/go.mod h1:qLr4V1qq6nMqFKkMo8ZTx3f+BZEkzsRUY10Xsm2mwU0=
+github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
+github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw=
+github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo=
+github.com/stretchr/objx v0.5.2/go.mod h1:FRsXN1f5AsAjCGJKqEizvkpNtU+EGNCLh3NxZ/8L+MA=
+github.com/stretchr/objx v0.5.3 h1:jmXUvGomnU1o3W/V5h2VEradbpJDwGrzugQQvL0POH4=
+github.com/stretchr/objx v0.5.3/go.mod h1:rDQraq+vQZU7Fde9LOZLr8Tax6zZvy4kuNKF+QYS+U0=
+github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
+github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
+github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
+github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU=
+github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo=
+github.com/stretchr/testify v1.10.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
+github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U=
+github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U=
+github.com/tidwall/gjson v1.14.2/go.mod h1:/wbyibRr2FHMks5tjHJ5F8dMZh3AcwJEMf5vlfC0lxk=
+github.com/tidwall/gjson v1.18.0 h1:FIDeeyB800efLX89e5a8Y0BNH+LOngJyGrIWxG2FKQY=
+github.com/tidwall/gjson v1.18.0/go.mod h1:/wbyibRr2FHMks5tjHJ5F8dMZh3AcwJEMf5vlfC0lxk=
+github.com/tidwall/match v1.1.1 h1:+Ho715JplO36QYgwN9PGYNhgZvoUSc9X2c80KVTi+GA=
+github.com/tidwall/match v1.1.1/go.mod h1:eRSPERbgtNPcGhD8UCthc6PmLEQXEWd3PRB5JTxsfmM=
+github.com/tidwall/pretty v1.2.0 h1:RWIZEg2iJ8/g6fDDYzMpobmaoGh5OLl4AXtGUGPcqCs=
+github.com/tidwall/pretty v1.2.0/go.mod h1:ITEVvHYasfjBbM0u2Pg8T2nJnzm8xPwvNhhsoaGGjNU=
+github.com/tidwall/sjson v1.2.5 h1:kLy8mja+1c9jlljvWTlSazM7cKDRfJuR/bOJhcY5NcY=
+github.com/tidwall/sjson v1.2.5/go.mod h1:Fvgq9kS/6ociJEDnK0Fk1cpYF4FIW6ZF7LAe+6jwd28=
+github.com/twitchyliquid64/golang-asm v0.15.1 h1:SU5vSMR7hnwNxj24w34ZyCi/FmDZTkS4MhqMhdFk5YI=
+github.com/twitchyliquid64/golang-asm v0.15.1/go.mod h1:a1lVb/DtPvCB8fslRZhAngC2+aY1QWCk3Cedj/Gdt08=
+github.com/valyala/bytebufferpool v1.0.0 h1:GqA5TC/0021Y/b9FG4Oi9Mr3q7XYx6KllzawFIhcdPw=
+github.com/valyala/bytebufferpool v1.0.0/go.mod h1:6bBcMArwyJ5K/AmCkWv1jt77kVWyCJ6HpOuEn7z0Csc=
+github.com/valyala/fasthttp v1.68.0 h1:v12Nx16iepr8r9ySOwqI+5RBJ/DqTxhOy1HrHoDFnok=
+github.com/valyala/fasthttp v1.68.0/go.mod h1:5EXiRfYQAoiO/khu4oU9VISC/eVY6JqmSpPJoHCKsz4=
+github.com/weaviate/weaviate v1.36.5 h1:lCiuEfQ08+5wK0DkTCUBb6ayNep9QpBH6JJhmZaRfzk=
+github.com/weaviate/weaviate v1.36.5/go.mod h1:ljzrgEmGKn3CRzDdcxvhmBUUZIcghwIYd1Lmn54f3Z8=
+github.com/weaviate/weaviate-go-client/v5 v5.7.1 h1:vEMxh486QqRqWaq58UEe/TiTbGbo9T5x7ZPFd5QENvQ=
+github.com/weaviate/weaviate-go-client/v5 v5.7.1/go.mod h1:T/JDErjN074GrnYIa0AgK1TGUGP/6A/8vqXNPlv4c6E=
+github.com/wk8/go-ordered-map/v2 v2.1.8 h1:5h/BUHu93oj4gIdvHHHGsScSTMijfx5PeYkE/fJgbpc=
+github.com/wk8/go-ordered-map/v2 v2.1.8/go.mod h1:5nJHM5DyteebpVlHnWMV0rPz6Zp7+xBAnxjb1X5vnTw=
+github.com/xyproto/randomstring v1.0.5 h1:YtlWPoRdgMu3NZtP45drfy1GKoojuR7hmRcnhZqKjWU=
+github.com/xyproto/randomstring v1.0.5/go.mod h1:rgmS5DeNXLivK7YprL0pY+lTuhNQW3iGxZ18UQApw/E=
+github.com/yosida95/uritemplate/v3 v3.0.2 h1:Ed3Oyj9yrmi9087+NczuL5BwkIc4wvTb5zIM+UJPGz4=
+github.com/yosida95/uritemplate/v3 v3.0.2/go.mod h1:ILOh0sOhIJR3+L/8afwt/kE++YT040gmv5BQTMR2HP4=
+go.mongodb.org/mongo-driver v1.17.6 h1:87JUG1wZfWsr6rIz3ZmpH90rL5tea7O3IHuSwHUpsss=
+go.mongodb.org/mongo-driver v1.17.6/go.mod h1:Hy04i7O2kC4RS06ZrhPRqj/u4DTYkFDAAccj+rVKqgQ=
+go.opentelemetry.io/auto/sdk v1.2.1 h1:jXsnJ4Lmnqd11kwkBV2LgLoFMZKizbCi5fNZ/ipaZ64=
+go.opentelemetry.io/auto/sdk v1.2.1/go.mod h1:KRTj+aOaElaLi+wW1kO/DZRXwkF4C5xPbEe3ZiIhN7Y=
+go.opentelemetry.io/contrib/detectors/gcp v1.40.0 h1:Awaf8gmW99tZTOWqkLCOl6aw1/rxAWVlHsHIZ3fT2sA=
+go.opentelemetry.io/contrib/detectors/gcp v1.40.0/go.mod h1:99OY9ZCqyLkzJLTh5XhECpLRSxcZl+ZDKBEO+jMBFR4=
+go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.63.0 h1:YH4g8lQroajqUwWbq/tr2QX1JFmEXaDLgG+ew9bLMWo=
+go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.63.0/go.mod h1:fvPi2qXDqFs8M4B4fmJhE92TyQs9Ydjlg3RvfUp+NbQ=
+go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.63.0 h1:RbKq8BG0FI8OiXhBfcRtqqHcZcka+gU3cskNuf05R18=
+go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.63.0/go.mod h1:h06DGIukJOevXaj/xrNjhi/2098RZzcLTbc0jDAUbsg=
+go.opentelemetry.io/otel v1.43.0 h1:mYIM03dnh5zfN7HautFE4ieIig9amkNANT+xcVxAj9I=
+go.opentelemetry.io/otel v1.43.0/go.mod h1:JuG+u74mvjvcm8vj8pI5XiHy1zDeoCS2LB1spIq7Ay0=
+go.opentelemetry.io/otel/exporters/stdout/stdoutmetric v1.40.0 h1:ZrPRak/kS4xI3AVXy8F7pipuDXmDsrO8Lg+yQjBLjw0=
+go.opentelemetry.io/otel/exporters/stdout/stdoutmetric v1.40.0/go.mod h1:3y6kQCWztq6hyW8Z9YxQDDm0Je9AJoFar2G0yDcmhRk=
+go.opentelemetry.io/otel/metric v1.43.0 h1:d7638QeInOnuwOONPp4JAOGfbCEpYb+K6DVWvdxGzgM=
+go.opentelemetry.io/otel/metric v1.43.0/go.mod h1:RDnPtIxvqlgO8GRW18W6Z/4P462ldprJtfxHxyKd2PY=
+go.opentelemetry.io/otel/sdk v1.43.0 h1:pi5mE86i5rTeLXqoF/hhiBtUNcrAGHLKQdhg4h4V9Dg=
+go.opentelemetry.io/otel/sdk v1.43.0/go.mod h1:P+IkVU3iWukmiit/Yf9AWvpyRDlUeBaRg6Y+C58QHzg=
+go.opentelemetry.io/otel/sdk/metric v1.43.0 h1:S88dyqXjJkuBNLeMcVPRFXpRw2fuwdvfCGLEo89fDkw=
+go.opentelemetry.io/otel/sdk/metric v1.43.0/go.mod h1:C/RJtwSEJ5hzTiUz5pXF1kILHStzb9zFlIEe85bhj6A=
+go.opentelemetry.io/otel/trace v1.43.0 h1:BkNrHpup+4k4w+ZZ86CZoHHEkohws8AY+WTX09nk+3A=
+go.opentelemetry.io/otel/trace v1.43.0/go.mod h1:/QJhyVBUUswCphDVxq+8mld+AvhXZLhe+8WVFxiFff0=
+go.starlark.net v0.0.0-20260102030733-3fee463870c9 h1:nV1OyvU+0CYrp5eKfQ3rD03TpFYYhH08z31NK1HmtTk=
+go.starlark.net v0.0.0-20260102030733-3fee463870c9/go.mod h1:YKMCv9b1WrfWmeqdV5MAuEHWsu5iC+fe6kYl2sQjdI8=
+go.yaml.in/yaml/v3 v3.0.4 h1:tfq32ie2Jv2UxXFdLJdh3jXuOzWiL1fo0bu/FbuKpbc=
+go.yaml.in/yaml/v3 v3.0.4/go.mod h1:DhzuOOF2ATzADvBadXxruRBLzYTpT36CKvDb3+aBEFg=
+golang.org/x/arch v0.23.0 h1:lKF64A2jF6Zd8L0knGltUnegD62JMFBiCPBmQpToHhg=
+golang.org/x/arch v0.23.0/go.mod h1:dNHoOeKiyja7GTvF9NJS1l3Z2yntpQNzgrjh1cU103A=
+golang.org/x/crypto v0.49.0 h1:+Ng2ULVvLHnJ/ZFEq4KdcDd/cfjrrjjNSXNzxg0Y4U4=
+golang.org/x/crypto v0.49.0/go.mod h1:ErX4dUh2UM+CFYiXZRTcMpEcN8b/1gxEuv3nODoYtCA=
+golang.org/x/net v0.52.0 h1:He/TN1l0e4mmR3QqHMT2Xab3Aj3L9qjbhRm78/6jrW0=
+golang.org/x/net v0.52.0/go.mod h1:R1MAz7uMZxVMualyPXb+VaqGSa3LIaUqk0eEt3w36Sw=
+golang.org/x/oauth2 v0.36.0 h1:peZ/1z27fi9hUOFCAZaHyrpWG5lwe0RJEEEeH0ThlIs=
+golang.org/x/oauth2 v0.36.0/go.mod h1:YDBUJMTkDnJS+A4BP4eZBjCqtokkg1hODuPjwiGPO7Q=
+golang.org/x/sync v0.20.0 h1:e0PTpb7pjO8GAtTs2dQ6jYa5BWYlMuX047Dco/pItO4=
+golang.org/x/sync v0.20.0/go.mod h1:9xrNwdLfx4jkKbNva9FpL6vEN7evnE43NNNJQ2LF3+0=
+golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/sys v0.1.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/sys v0.12.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/sys v0.42.0 h1:omrd2nAlyT5ESRdCLYdm3+fMfNFE/+Rf4bDIQImRJeo=
+golang.org/x/sys v0.42.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw=
+golang.org/x/text v0.35.0 h1:JOVx6vVDFokkpaq1AEptVzLTpDe9KGpj5tR4/X+ybL8=
+golang.org/x/text v0.35.0/go.mod h1:khi/HExzZJ2pGnjenulevKNX1W67CUy0AsXcNubPGCA=
+golang.org/x/time v0.15.0 h1:bbrp8t3bGUeFOx08pvsMYRTCVSMk89u4tKbNOZbp88U=
+golang.org/x/time v0.15.0/go.mod h1:Y4YMaQmXwGQZoFaVFk4YpCt4FLQMYKZe9oeV/f4MSno=
+gonum.org/v1/gonum v0.17.0 h1:VbpOemQlsSMrYmn7T2OUvQ4dqxQXU+ouZFQsZOx50z4=
+gonum.org/v1/gonum v0.17.0/go.mod h1:El3tOrEuMpv2UdMrbNlKEh9vd86bmQ6vqIcDwxEOc1E=
+google.golang.org/api v0.274.0 h1:aYhycS5QQCwxHLwfEHRRLf9yNsfvp1JadKKWBE54RFA=
+google.golang.org/api v0.274.0/go.mod h1:JbAt7mF+XVmWu6xNP8/+CTiGH30ofmCmk9nM8d8fHew=
+google.golang.org/genproto v0.0.0-20260316180232-0b37fe3546d5 h1:JNfk58HZ8lfmXbYK2vx/UvsqIL59TzByCxPIX4TDmsE=
+google.golang.org/genproto v0.0.0-20260316180232-0b37fe3546d5/go.mod h1:x5julN69+ED4PcFk/XWayw35O0lf/nGa4aNgODCmNmw=
+google.golang.org/genproto/googleapis/api v0.0.0-20260401024825-9d38bb4040a9 h1:VPWxll4HlMw1Vs/qXtN7BvhZqsS9cdAittCNvVENElA=
+google.golang.org/genproto/googleapis/api v0.0.0-20260401024825-9d38bb4040a9/go.mod h1:7QBABkRtR8z+TEnmXTqIqwJLlzrZKVfAUm7tY3yGv0M=
+google.golang.org/genproto/googleapis/rpc v0.0.0-20260401024825-9d38bb4040a9 h1:m8qni9SQFH0tJc1X0vmnpw/0t+AImlSvp30sEupozUg=
+google.golang.org/genproto/googleapis/rpc v0.0.0-20260401024825-9d38bb4040a9/go.mod h1:4Hqkh8ycfw05ld/3BWL7rJOSfebL2Q+DVDeRgYgxUU8=
+google.golang.org/grpc v1.80.0 h1:Xr6m2WmWZLETvUNvIUmeD5OAagMw3FiKmMlTdViWsHM=
+google.golang.org/grpc v1.80.0/go.mod h1:ho/dLnxwi3EDJA4Zghp7k2Ec1+c2jqup0bFkw07bwF4=
+google.golang.org/protobuf v1.36.11 h1:fV6ZwhNocDyBLK0dj+fg8ektcVegBBuEolpbTQyBNVE=
+google.golang.org/protobuf v1.36.11/go.mod h1:HTf+CrKn2C3g5S8VImy6tdcUvCska2kB7j23XfzDpco=
+gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
+gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk=
+gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q=
+gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
+gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
+gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
+gorm.io/driver/postgres v1.6.0 h1:2dxzU8xJ+ivvqTRph34QX+WrRaJlmfyPqXmoGVjMBa4=
+gorm.io/driver/postgres v1.6.0/go.mod h1:vUw0mrGgrTK+uPHEhAdV4sfFELrByKVGnaVRkXDhtWo=
+gorm.io/driver/sqlite v1.6.0 h1:WHRRrIiulaPiPFmDcod6prc4l2VGVWHz80KspNsxSfQ=
+gorm.io/driver/sqlite v1.6.0/go.mod h1:AO9V1qIQddBESngQUKWL9yoH93HIeA1X6V633rBwyT8=
+gorm.io/gorm v1.31.1 h1:7CA8FTFz/gRfgqgpeKIBcervUn3xSyPUmr6B2WXJ7kg=
+gorm.io/gorm v1.31.1/go.mod h1:XyQVbO2k6YkOis7C2437jSit3SsDK72s7n7rsSHd+Gs=
--- a/plugins/semanticcache/main.go
+++ b/plugins/semanticcache/main.go
@@ -0,0 +1,871 @@
+// Package semanticcache provides a semantic caching plugin for Bifrost.
+// This plugin caches responses using both direct hash matching (xxhash) and semantic similarity search (embeddings).
+// It supports configurable caching behavior via the VectorStore abstraction, with TTL management and streaming response handling.
+package semanticcache
+
+import (
+	"context"
+	"encoding/json"
+	"fmt"
+	"strconv"
+	"sync"
+	"time"
+
+	"github.com/google/uuid"
+
+	bifrost "github.com/maximhq/bifrost/core"
+	"github.com/maximhq/bifrost/core/schemas"
+	"github.com/maximhq/bifrost/framework"
+	"github.com/maximhq/bifrost/framework/vectorstore"
+)
+
+// Config contains configuration for the semantic cache plugin.
+// The VectorStore abstraction handles the underlying storage implementation and its defaults.
+// Only specify values you want to override from the semantic cache defaults.
+//
+// When decoded from JSON (see UnmarshalJSON), "ttl" may be given either as a duration
+// string or as a number of seconds; every other field is decoded according to its json tag.
+type Config struct {
+	// Embedding model settings - REQUIRED for semantic caching
+	Provider       schemas.ModelProvider `json:"provider"`
+	Keys           []schemas.Key         `json:"keys"`
+	EmbeddingModel string                `json:"embedding_model,omitempty"` // Model to use for generating embeddings (optional)
+
+	// Plugin behavior settings
+	CleanUpOnShutdown    bool          `json:"cleanup_on_shutdown,omitempty"`    // Clean up cache on shutdown (default: false)
+	TTL                  time.Duration `json:"ttl,omitempty"`                    // Time-to-live for cached responses (default: 5min); JSON accepts a duration string or a number of seconds
+	Threshold            float64       `json:"threshold,omitempty"`              // Cosine similarity threshold for semantic matching (default: 0.8)
+	VectorStoreNamespace string        `json:"vector_store_namespace,omitempty"` // Namespace for vector store (optional)
+	Dimension            int           `json:"dimension"`                        // Dimension for vector store
+
+	// Advanced caching behavior
+	// The three *bool options are pointers so that "not set" (nil) can be told apart from an explicit false.
+	DefaultCacheKey              string `json:"default_cache_key,omitempty"`              // Default cache key used when no per-request key is provided (optional, caching is disabled when empty and no per-request key is set)
+	ConversationHistoryThreshold int    `json:"conversation_history_threshold,omitempty"` // Skip caching for requests with more than this number of messages in the conversation history (default: 3)
+	CacheByModel                 *bool  `json:"cache_by_model,omitempty"`                 // Include model in cache key (default: true)
+	CacheByProvider              *bool  `json:"cache_by_provider,omitempty"`              // Include provider in cache key (default: true)
+	ExcludeSystemPrompt          *bool  `json:"exclude_system_prompt,omitempty"`          // Exclude system prompt in cache key (default: false)
+}
+
+// UnmarshalJSON implements json.Unmarshaler for the semantic cache Config.
+//
+// Every field except "ttl" is decoded as-is and always overwrites the receiver's value.
+// "ttl" may be given either as a duration string accepted by time.ParseDuration
+// (e.g. "1m", "1h") or as a JSON number of seconds (fractions allowed). An omitted or
+// null "ttl" leaves c.TTL untouched.
+//
+// It returns an error when data cannot be decoded into the config shape, when a string
+// TTL is not a valid duration, or when "ttl" has any other JSON type (bool, array, object).
+func (c *Config) UnmarshalJSON(data []byte) error {
+	// TempConfig mirrors Config but has no UnmarshalJSON method, so decoding into it
+	// does not recurse back into this function. TTL is left untyped for custom parsing.
+	type TempConfig struct {
+		Provider                     string        `json:"provider"`
+		Keys                         []schemas.Key `json:"keys"`
+		EmbeddingModel               string        `json:"embedding_model,omitempty"`
+		CleanUpOnShutdown            bool          `json:"cleanup_on_shutdown,omitempty"`
+		Dimension                    int           `json:"dimension"`
+		TTL                          interface{}   `json:"ttl,omitempty"`
+		Threshold                    float64       `json:"threshold,omitempty"`
+		VectorStoreNamespace         string        `json:"vector_store_namespace,omitempty"`
+		DefaultCacheKey              string        `json:"default_cache_key,omitempty"`
+		ConversationHistoryThreshold int           `json:"conversation_history_threshold,omitempty"`
+		CacheByModel                 *bool         `json:"cache_by_model,omitempty"`
+		CacheByProvider              *bool         `json:"cache_by_provider,omitempty"`
+		ExcludeSystemPrompt          *bool         `json:"exclude_system_prompt,omitempty"`
+	}
+
+	var temp TempConfig
+	if err := json.Unmarshal(data, &temp); err != nil {
+		return fmt.Errorf("failed to unmarshal config: %w", err)
+	}
+
+	// Copy the fields that need no conversion beyond the provider's string type.
+	c.Provider = schemas.ModelProvider(temp.Provider)
+	c.Keys = temp.Keys
+	c.EmbeddingModel = temp.EmbeddingModel
+	c.CleanUpOnShutdown = temp.CleanUpOnShutdown
+	c.Dimension = temp.Dimension
+	c.CacheByModel = temp.CacheByModel
+	c.CacheByProvider = temp.CacheByProvider
+	c.VectorStoreNamespace = temp.VectorStoreNamespace
+	c.ConversationHistoryThreshold = temp.ConversationHistoryThreshold
+	c.Threshold = temp.Threshold
+	c.DefaultCacheKey = temp.DefaultCacheKey
+	c.ExcludeSystemPrompt = temp.ExcludeSystemPrompt
+
+	// Resolve the TTL from whichever JSON form was supplied.
+	switch v := temp.TTL.(type) {
+	case nil:
+		// "ttl" omitted or null: keep whatever c.TTL already holds.
+	case string:
+		// Duration string such as "1m" or "1h".
+		duration, err := time.ParseDuration(v)
+		if err != nil {
+			return fmt.Errorf("failed to parse TTL duration string '%s': %w", v, err)
+		}
+		c.TTL = duration
+	case float64:
+		// encoding/json stores every JSON number as float64 when the target is
+		// interface{}, so this is the branch numeric TTLs take (an `int` case can
+		// never match). The number is interpreted as seconds.
+		c.TTL = time.Duration(v * float64(time.Second))
+	default:
+		// Booleans, arrays and objects land here. The original lenient parse of the
+		// value's printed form is kept so the outcome and error text are unchanged.
+		seconds, err := strconv.ParseFloat(fmt.Sprintf("%v", v), 64)
+		if err != nil {
+			return fmt.Errorf("unsupported TTL type: %T (value: %v)", v, v)
+		}
+		c.TTL = time.Duration(seconds * float64(time.Second))
+	}
+
+	return nil
+}
+
+// StreamChunk represents a single chunk from a streaming response,
+// as collected in StreamAccumulator.Chunks.
+type StreamChunk struct {
+	Timestamp    time.Time                // When chunk was received
+	Response     *schemas.BifrostResponse // The actual response chunk
+	FinishReason *string                  // Finish reason, if this is the final chunk
+}
+
+// StreamAccumulator manages accumulation of streaming chunks for caching.
+// It embeds a sync.Mutex, so values must be shared by pointer and never copied.
+type StreamAccumulator struct {
+	RequestID      string                 // The request ID
+	StorageID      string                 // The final cache entry ID
+	Chunks         []*StreamChunk         // All chunks for this stream
+	IsComplete     bool                   // Whether the stream is complete
+	HasError       bool                   // Whether any chunk in the stream had an error
+	FinalTimestamp time.Time              // When the stream completed
+	Embedding      []float32              // Embedding for the original request
+	Metadata       map[string]interface{} // Metadata for caching
+	TTL            time.Duration          // TTL for this cache entry
+	mu             sync.Mutex             // Protects chunk operations
+}
+
+// Plugin implements the schemas.LLMPlugin interface for semantic caching.
+// It caches responses using a two-tier approach: direct hash matching for exact requests
+// and semantic similarity search for related content. The plugin supports configurable caching behavior
+// via the VectorStore abstraction, including TTL management and streaming response handling.
+//
+// Fields:
+//   - store: VectorStore instance for semantic cache operations
+//   - config: Plugin configuration including semantic cache and caching settings
+//   - logger: Logger instance for plugin operations
+//   - client: Bifrost client owned by the plugin (NOTE(review): presumably used to request embeddings; confirm in Init)
+//   - streamAccumulators: stream accumulators keyed by request ID
+//   - waitGroup: WaitGroup held by the plugin (NOTE(review): presumably tracks background work to wait for on shutdown; confirm)
+//
+// The struct contains a sync.Map and a sync.WaitGroup, so it must be used through a pointer and not copied.
+type Plugin struct {
+	store              vectorstore.VectorStore
+	config             *Config
+	logger             schemas.Logger
+	client             *bifrost.Bifrost
+	streamAccumulators sync.Map // Track stream accumulators by request ID
+	waitGroup          sync.WaitGroup
+}
+
+// Plugin constants: the plugin's identifiers, its operation timeouts, and the default
+// values that the Config field comments refer to.
+// NOTE(review): where each timeout is applied is not visible in this part of the file.
+const (
+	PluginName                          string        = "semantic_cache"
+	DefaultVectorStoreNamespace         string        = "BifrostSemanticCachePlugin"
+	PluginLoggerPrefix                  string        = "[Semantic Cache]"
+	CacheConnectionTimeout              time.Duration = 5 * time.Second
+	CreateNamespaceTimeout              time.Duration = 30 * time.Second
+	CacheSetTimeout                     time.Duration = 30 * time.Second
+	DefaultCacheTTL                     time.Duration = 5 * time.Minute // Matches the documented Config.TTL default
+	DefaultCacheThreshold               float64       = 0.8             // Matches the documented Config.Threshold default
+	DefaultConversationHistoryThreshold int           = 3               // Matches the documented Config.ConversationHistoryThreshold default
+)
+
+// SelectFields names a subset of the properties declared in VectorStoreProperties
+// ("params_hash" and "from_bifrost_semantic_cache_plugin" are not included).
+// NOTE(review): presumably the fields requested back from the vector store on lookup; confirm at the call sites.
+var SelectFields = []string{"request_hash", "response", "stream_chunks", "expires_at", "cache_key", "provider", "model"}
+
+// VectorStoreProperties maps each property name stored with a cache entry to its
+// vector-store data type and a human-readable description.
+// NOTE(review): presumably handed to the vector store when the namespace is created; confirm in Init.
+var VectorStoreProperties = map[string]vectorstore.VectorStoreProperties{
+	"request_hash": {
+		DataType:    vectorstore.VectorStorePropertyTypeString,
+		Description: "The hash of the request",
+	},
+	"response": {
+		DataType:    vectorstore.VectorStorePropertyTypeString,
+		Description: "The response from the provider",
+	},
+	"stream_chunks": {
+		DataType:    vectorstore.VectorStorePropertyTypeStringArray,
+		Description: "The stream chunks from the provider",
+	},
+	"expires_at": {
+		DataType:    vectorstore.VectorStorePropertyTypeInteger,
+		Description: "The expiration time of the cache entry",
+	},
+	"cache_key": {
+		DataType:    vectorstore.VectorStorePropertyTypeString,
+		Description: "The cache key from the request",
+	},
+	"provider": {
+		DataType:    vectorstore.VectorStorePropertyTypeString,
+		Description: "The provider used for the request",
+	},
+	"model": {
+		DataType:    vectorstore.VectorStorePropertyTypeString,
+		Description: "The model used for the request",
+	},
+	"params_hash": {
+		DataType:    vectorstore.VectorStorePropertyTypeString,
+		Description: "The hash of the parameters used for the request",
+	},
+	// Marker property identifying entries written by this plugin.
+	"from_bifrost_semantic_cache_plugin": {
+		DataType:    vectorstore.VectorStorePropertyTypeBoolean,
+		Description: "Whether the cache entry was created by the BifrostSemanticCachePlugin",
+	},
+}
+
+// PluginAccount is a minimal account that exposes exactly one provider and its keys.
+// NOTE(review): the method set looks like the account interface Bifrost expects; confirm where it is constructed.
+type PluginAccount struct {
+	provider schemas.ModelProvider
+	keys     []schemas.Key
+}
+
+// GetConfiguredProviders reports the single provider this account was built for.
+// The returned error is always nil.
+func (pa *PluginAccount) GetConfiguredProviders() ([]schemas.ModelProvider, error) {
+	configured := []schemas.ModelProvider{pa.provider}
+	return configured, nil
+}
+
+// GetKeysForProvider returns the account's keys. Both ctx and providerKey are ignored,
+// so the same keys come back whichever provider is asked for. The returned error is always nil.
+func (pa *PluginAccount) GetKeysForProvider(ctx context.Context, providerKey schemas.ModelProvider) ([]schemas.Key, error) {
+	accountKeys := pa.keys
+	return accountKeys, nil
+}
+
+// GetConfigForProvider returns a fresh provider config populated with the schemas
+// package defaults for networking and for concurrency/buffer size. providerKey is
+// ignored and the returned error is always nil.
+func (pa *PluginAccount) GetConfigForProvider(providerKey schemas.ModelProvider) (*schemas.ProviderConfig, error) {
+	cfg := new(schemas.ProviderConfig)
+	cfg.NetworkConfig = schemas.DefaultNetworkConfig
+	cfg.ConcurrencyAndBufferSize = schemas.DefaultConcurrencyAndBufferSize
+	return cfg, nil
+}
+
+// Dependencies is a list of dependencies that the plugin requires.
+var Dependencies []framework.FrameworkDependency = []framework.FrameworkDependency{framework.FrameworkDependencyVectorStore}
+
+// ProvidersWithEmbeddingSupport is the set of providers that support embedding operations,
+// stored as a map so membership can be tested with a direct lookup (absent providers read as false).
+// Providers not in this map will return UnsupportedOperationError for embedding requests.
+var ProvidersWithEmbeddingSupport = map[schemas.ModelProvider]bool{
+	schemas.OpenAI:      true,
+	schemas.Azure:       true,
+	schemas.Bedrock:     true,
+	schemas.Cohere:      true,
+	schemas.Gemini:      true,
+	schemas.Vertex:      true,
+	schemas.Mistral:     true,
+	schemas.Ollama:      true,
+	schemas.Nebius:      true,
+	schemas.HuggingFace: true,
+	schemas.SGL:         true,
+}
+
+// Context keys understood by the plugin. The exported keys are set by callers on a
+// request's context to control caching for that request; the unexported keys carry
+// the plugin's own per-request state.
+const (
+	CacheKey          schemas.BifrostContextKey = "semantic_cache_key"        // To set the cache key for a request - required unless Config.DefaultCacheKey is set (caching is disabled when neither is present)
+	CacheTTLKey       schemas.BifrostContextKey = "semantic_cache_ttl"        // To explicitly set the TTL for a request
+	CacheThresholdKey schemas.BifrostContextKey = "semantic_cache_threshold"  // To explicitly set the threshold for a request
+	CacheTypeKey      schemas.BifrostContextKey = "semantic_cache_cache_type" // To explicitly set the cache type for a request
+	CacheNoStoreKey   schemas.BifrostContextKey = "semantic_cache_no_store"   // To explicitly disable storing the response in the cache
+
+	// context keys for internal usage
+	requestIDKey              schemas.BifrostContextKey = "semantic_cache_request_id"
+	requestStorageIDKey       schemas.BifrostContextKey = "semantic_cache_request_storage_id"
+	requestHashKey            schemas.BifrostContextKey = "semantic_cache_request_hash"
+	requestEmbeddingKey       schemas.BifrostContextKey = "semantic_cache_embedding"
+	requestEmbeddingTokensKey schemas.BifrostContextKey = "semantic_cache_embedding_tokens"
+	requestParamsHashKey      schemas.BifrostContextKey = "semantic_cache_params_hash"
+	requestModelKey           schemas.BifrostContextKey = "semantic_cache_model"
+	requestProviderKey        schemas.BifrostContextKey = "semantic_cache_provider"
+	isCacheHitKey             schemas.BifrostContextKey = "semantic_cache_is_cache_hit"
+	cacheHitTypeKey           schemas.BifrostContextKey = "semantic_cache_cache_hit_type"
+)
+
+// CacheType selects which lookup strategy a request is restricted to when it is
+// set on the context under CacheTypeKey. When no cache type is set, both
+// strategies are attempted.
+type CacheType string
+
+// Supported cache types.
+const (
+	// CacheTypeDirect restricts the request to direct (hash-based) lookup.
+	CacheTypeDirect   CacheType = "direct"
+	// CacheTypeSemantic restricts the request to semantic similarity search.
+	CacheTypeSemantic CacheType = "semantic"
+)
+
+// Init creates a new semantic cache plugin instance with the provided configuration.
+// It uses the VectorStore abstraction for cache operations and returns a configured plugin.
+//
+// The VectorStore handles the underlying storage implementation and its defaults.
+// The plugin only sets defaults for its own behavior (namespace, TTL, threshold,
+// conversation history threshold, cache-by-model/provider). These defaults are
+// written into the passed config, so the caller's Config value is modified in place.
+//
+// Operating mode is chosen from the config:
+//   - no provider and dimension == 1: direct-only mode, no embedding client is created
+//   - provider or keys missing: falls back to direct search only (logged as a warning)
+//   - otherwise: an internal Bifrost client is created for embedding generation
+//
+// Parameters:
+//   - ctx: Context used to initialize the embedding client and to bound namespace creation
+//   - config: Semantic cache and plugin configuration (must not be nil)
+//   - logger: Logger instance for the plugin
+//   - store: VectorStore instance for cache operations (must not be nil)
+//
+// Returns:
+//   - schemas.LLMPlugin: A configured semantic cache plugin instance
+//   - error: Any error that occurred during plugin initialization
+func Init(ctx context.Context, config *Config, logger schemas.Logger, store vectorstore.VectorStore) (schemas.LLMPlugin, error) {
+	if config == nil {
+		return nil, fmt.Errorf("config is required")
+	}
+	if store == nil {
+		return nil, fmt.Errorf("store is required")
+	}
+	// Set plugin-specific defaults
+	if config.VectorStoreNamespace == "" {
+		logger.Debug(PluginLoggerPrefix + " Vector store namespace is not set, using default of " + DefaultVectorStoreNamespace)
+		config.VectorStoreNamespace = DefaultVectorStoreNamespace
+	}
+	if config.TTL == 0 {
+		logger.Debug(PluginLoggerPrefix + " TTL is not set, using default of 5 minutes")
+		config.TTL = DefaultCacheTTL
+	}
+	if config.Threshold == 0 {
+		logger.Debug(PluginLoggerPrefix + " Threshold is not set, using default of " + strconv.FormatFloat(DefaultCacheThreshold, 'f', -1, 64))
+		config.Threshold = DefaultCacheThreshold
+	}
+	if config.ConversationHistoryThreshold == 0 {
+		logger.Debug(PluginLoggerPrefix + " Conversation history threshold is not set, using default of " + strconv.Itoa(DefaultConversationHistoryThreshold))
+		config.ConversationHistoryThreshold = DefaultConversationHistoryThreshold
+	}
+
+	// Set cache behavior defaults
+	if config.CacheByModel == nil {
+		config.CacheByModel = bifrost.Ptr(true)
+	}
+	if config.CacheByProvider == nil {
+		config.CacheByProvider = bifrost.Ptr(true)
+	}
+
+	plugin := &Plugin{
+		store:     store,
+		config:    config,
+		logger:    logger,
+		waitGroup: sync.WaitGroup{},
+	}
+
+	switch {
+	case config.Provider == "" && config.Dimension == 1:
+		logger.Info(PluginLoggerPrefix + " Starting in direct-only mode (dimension=1, no embedding provider)")
+	case config.Provider == "" || len(config.Keys) == 0:
+		logger.Warn(PluginLoggerPrefix + " Incomplete semantic mode config: missing provider or keys, falling back to direct search only")
+	default:
+		// Validate that the provider supports embeddings
+		if bifrost.IsStandardProvider(config.Provider) && !ProvidersWithEmbeddingSupport[config.Provider] {
+			return nil, fmt.Errorf("provider '%s' does not support embedding operations required for semantic cache. Supported providers: openai, azure, bedrock, cohere, gemini, vertex, mistral, ollama, nebius, huggingface, sgl. Note: custom providers based on embedding-capable providers are also supported", config.Provider)
+		}
+
+		// Named embeddingClient (not bifrost) so the bifrost package is not shadowed.
+		embeddingClient, err := bifrost.Init(ctx, schemas.BifrostConfig{
+			Logger: logger,
+			Account: &PluginAccount{
+				provider: config.Provider,
+				keys:     config.Keys,
+			},
+		})
+		if err != nil {
+			return nil, fmt.Errorf("failed to initialize bifrost for semantic cache: %w", err)
+		}
+
+		plugin.client = embeddingClient
+	}
+
+	createCtx, cancel := context.WithTimeout(ctx, CreateNamespaceTimeout)
+	defer cancel()
+	if err := store.CreateNamespace(createCtx, config.VectorStoreNamespace, config.Dimension, VectorStoreProperties); err != nil {
+		// The plugin is discarded on this path, so nobody would ever call Cleanup
+		// on it: shut the embedding client down here instead of leaking it.
+		if plugin.client != nil {
+			plugin.client.Shutdown()
+		}
+		return nil, fmt.Errorf("failed to create namespace for semantic cache: %w", err)
+	}
+
+	return plugin, nil
+}
+
+// GetName returns the plugin's name, the package-level PluginName constant.
+//
+// Returns:
+//   - string: The plugin name for semantic cache
+func (plugin *Plugin) GetName() string {
+	return PluginName
+}
+
+// HTTPTransportPreHook is not used for this plugin: it ignores its arguments and
+// always returns a nil response and a nil error.
+func (plugin *Plugin) HTTPTransportPreHook(ctx *schemas.BifrostContext, req *schemas.HTTPRequest) (*schemas.HTTPResponse, error) {
+	return nil, nil
+}
+
+// HTTPTransportPostHook is not used for this plugin: it ignores its arguments and
+// always returns nil.
+func (plugin *Plugin) HTTPTransportPostHook(ctx *schemas.BifrostContext, req *schemas.HTTPRequest, resp *schemas.HTTPResponse) error {
+	return nil
+}
+
+// HTTPTransportStreamChunkHook passes through streaming chunks unchanged: the
+// chunk it receives is returned as-is together with a nil error.
+func (plugin *Plugin) HTTPTransportStreamChunkHook(ctx *schemas.BifrostContext, req *schemas.HTTPRequest, chunk *schemas.BifrostStreamChunk) (*schemas.BifrostStreamChunk, error) {
+	return chunk, nil
+}
+
+// clearRequestScopedContext removes every internal, request-scoped semantic cache
+// value from ctx so that state left by an earlier request on a reused context
+// cannot leak into the current one.
+func (plugin *Plugin) clearRequestScopedContext(ctx *schemas.BifrostContext) {
+	requestScopedKeys := [...]schemas.BifrostContextKey{
+		requestIDKey,
+		requestStorageIDKey,
+		requestHashKey,
+		requestParamsHashKey,
+		requestModelKey,
+		requestProviderKey,
+		requestEmbeddingKey,
+		requestEmbeddingTokensKey,
+		isCacheHitKey,
+		cacheHitTypeKey,
+	}
+	for _, key := range requestScopedKeys {
+		ctx.ClearValue(key)
+	}
+}
+
+// PreLLMHook is called before a request is processed by Bifrost.
+// It performs a two-stage cache lookup: first direct hash matching, then semantic similarity search.
+// Uses UUID-based keys for entries stored in the VectorStore.
+//
+// Parameters:
+//   - ctx: Pointer to the schemas.BifrostContext
+//   - req: The incoming Bifrost request
+//
+// Returns:
+//   - *schemas.BifrostRequest: The original request
+//   - *schemas.BifrostResponse: Cached response if found, nil otherwise
+//   - error: Any error that occurred during cache lookup
+func (plugin *Plugin) PreLLMHook(ctx *schemas.BifrostContext, req *schemas.BifrostRequest) (*schemas.BifrostRequest, *schemas.LLMPluginShortCircuit, error) {
+	provider, model, _ := req.GetRequestFields()
+	// Get the cache key from the context
+	var cacheKey string
+	var ok bool
+
+	cacheKey, ok = ctx.Value(CacheKey).(string)
+	if !ok || cacheKey == "" {
+		if plugin.config.DefaultCacheKey != "" {
+			cacheKey = plugin.config.DefaultCacheKey
+			plugin.logger.Debug(PluginLoggerPrefix + " Using default cache key: " + cacheKey)
+		} else {
+			plugin.logger.Debug(PluginLoggerPrefix + " No cache key found in context, continuing without caching")
+			return req, nil, nil
+		}
+	}
+
+	// Clear request-scoped semantic cache state up front in case the context is reused.
+	plugin.clearRequestScopedContext(ctx)
+
+	if !isSemanticCacheSupportedRequestType(req.RequestType) {
+		plugin.logger.Debug(PluginLoggerPrefix + " Skipping caching for unsupported request type: " + string(req.RequestType))
+		return req, nil, nil
+	}
+
+	if plugin.isConversationHistoryThresholdExceeded(req) {
+		plugin.logger.Debug(PluginLoggerPrefix + " Skipping caching for request with conversation history threshold exceeded")
+		return req, nil, nil
+	}
+
+	// Generate UUID for this request
+	requestID := uuid.New().String()
+
+	// Store request ID, model, and provider in context for PostLLMHook
+	ctx.SetValue(requestIDKey, requestID)
+	ctx.SetValue(requestModelKey, model)
+	ctx.SetValue(requestProviderKey, provider)
+
+	performDirectSearch, performSemanticSearch := true, true
+	if ctx.Value(CacheTypeKey) != nil {
+		cacheTypeVal, ok := ctx.Value(CacheTypeKey).(CacheType)
+		if !ok {
+			plugin.logger.Warn(PluginLoggerPrefix + " Cache type is not a CacheType, using all available cache types")
+		} else {
+			performDirectSearch = cacheTypeVal == CacheTypeDirect
+			performSemanticSearch = cacheTypeVal == CacheTypeSemantic
+		}
+	}
+
+	if performDirectSearch {
+		shortCircuit, err := plugin.performDirectSearch(ctx, req, cacheKey)
+		if err != nil {
+			plugin.logger.Warn(PluginLoggerPrefix + " Direct search failed: " + err.Error() + " (" + describeRequestShape(req) + ")")
+			// Don't return - continue to semantic search fallback
+			shortCircuit = nil // Ensure we don't use an invalid shortCircuit
+		}
+
+		if shortCircuit != nil {
+			return req, shortCircuit, nil
+		}
+	}
+
+	if performSemanticSearch && plugin.client != nil {
+		if req.EmbeddingRequest != nil || req.TranscriptionRequest != nil {
+			plugin.logger.Debug(PluginLoggerPrefix + " Skipping semantic search for embedding/transcription input")
+			// For vector stores that require vectors, set a zero vector placeholder
+			// This allows direct hash matching to work without the overhead of generating embeddings
+			if plugin.store.RequiresVectors() && plugin.config.Dimension > 0 {
+				zeroVector := make([]float32, plugin.config.Dimension)
+				ctx.SetValue(requestEmbeddingKey, zeroVector)
+				plugin.logger.Debug(PluginLoggerPrefix + " Using zero vector placeholder for embedding/transcription request storage")
+			}
+			return req, nil, nil
+		}
+
+		// Try semantic search as fallback
+		shortCircuit, err := plugin.performSemanticSearch(ctx, req, cacheKey)
+		if err != nil {
+			plugin.logger.Debug(PluginLoggerPrefix + " Semantic search skipped: " + err.Error() + " (" + describeRequestShape(req) + ")")
+			return req, nil, nil
+		}
+
+		if shortCircuit != nil {
+			return req, shortCircuit, nil
+		}
+	} else if !performSemanticSearch && plugin.store.RequiresVectors() && plugin.client != nil {
+		// Vector store requires vectors but we're in direct-only mode
+		// Generate embeddings for storage purposes (not for searching)
+		if req.EmbeddingRequest != nil || req.TranscriptionRequest != nil {
+			plugin.logger.Debug(PluginLoggerPrefix + " Skipping embedding generation for embedding/transcription input")
+			// For vector stores that require vectors, set a zero vector placeholder
+			// This allows direct hash matching to work without the overhead of generating embeddings
+			if plugin.config.Dimension > 0 {
+				zeroVector := make([]float32, plugin.config.Dimension)
+				ctx.SetValue(requestEmbeddingKey, zeroVector)
+				plugin.logger.Debug(PluginLoggerPrefix + " Using zero vector placeholder for embedding/transcription request storage")
+			}
+			return req, nil, nil
+		}
+
+		// Use zero vector for direct-only cache type to prevent semantic search matches
+		// This preserves cache type isolation - direct-only entries won't be found by semantic search
+		if plugin.config.Dimension > 0 {
+			zeroVector := make([]float32, plugin.config.Dimension)
+			ctx.SetValue(requestEmbeddingKey, zeroVector)
+			plugin.logger.Debug(PluginLoggerPrefix + " Using zero vector for direct-only cache storage (preserves isolation)")
+		}
+	}
+
+	return req, nil, nil
+}
+
+// PostLLMHook is called after a response is received from a provider.
+// It caches responses in the VectorStore using a unified metadata structure built from
+// provider, model, params hash, request hash, cache key and TTL. Handles both single and
+// streaming responses.
+//
+// Nothing is cached, and the inputs are returned as-is, when any of the following holds:
+//   - the provider returned an error (bifrostErr is non-nil)
+//   - the context is flagged as large payload or large response mode
+//   - the response was itself served from the cache
+//   - CacheNoStoreKey is set to true on the context
+//   - no cache key is available (neither on the context nor as configured default)
+//   - the request ID, hash (when needed), provider or model prepared by PreLLMHook is missing
+//
+// Otherwise the function:
+// 1. Picks the storage ID: the one prepared during lookup if present, else the request ID
+// 2. Decides, from the requested cache type, whether the hash and/or embedding are stored
+// 3. Fills CacheDebug on the response's extra fields when an embedding token count is on
+//    the context (non-streaming responses and final stream chunks only)
+// 4. Stores the entry in the VectorStore asynchronously (non-blocking)
+//
+// The VectorStore write runs in a separate goroutine tracked by the plugin's wait group and
+// bounded by CacheSetTimeout, so it never blocks the response. Failures are logged only.
+//
+// Parameters:
+//   - ctx: Pointer to the schemas.BifrostContext containing the request hash and ID
+//   - res: The response from the provider to be cached
+//   - bifrostErr: The error from the provider, if any
+//
+// Returns:
+//   - *schemas.BifrostResponse: The same response (its CacheDebug extra field may have been populated)
+//   - *schemas.BifrostError: The original error when one was passed in, nil otherwise
+//   - error: Always nil; caching problems are logged, not returned
+func (plugin *Plugin) PostLLMHook(ctx *schemas.BifrostContext, res *schemas.BifrostResponse, bifrostErr *schemas.BifrostError) (*schemas.BifrostResponse, *schemas.BifrostError, error) {
+	// Failed provider calls are never cached
+	if bifrostErr != nil {
+		return res, bifrostErr, nil
+	}
+
+	// Skip caching for large payloads — body is too large to materialize for cache storage
+	if isLargePayload, ok := ctx.Value(schemas.BifrostContextKeyLargePayloadMode).(bool); ok && isLargePayload {
+		plugin.logger.Debug(PluginLoggerPrefix + " Skipping semantic cache for large payload request")
+		return res, nil, nil
+	}
+	if isLargeResponse, ok := ctx.Value(schemas.BifrostContextKeyLargeResponseMode).(bool); ok && isLargeResponse {
+		plugin.logger.Debug(PluginLoggerPrefix + " Skipping semantic cache for large payload response")
+		return res, nil, nil
+	}
+
+	// A response that came out of the cache must not be written back into it
+	isCacheHit := ctx.Value(isCacheHitKey)
+	if isCacheHit != nil {
+		isCacheHitValue, ok := isCacheHit.(bool)
+		if ok && isCacheHitValue {
+			return res, nil, nil
+		}
+	}
+
+	// Check if caching is explicitly disabled
+	noStore := ctx.Value(CacheNoStoreKey)
+	if noStore != nil {
+		noStoreValue, ok := noStore.(bool)
+		if ok && noStoreValue {
+			plugin.logger.Debug(PluginLoggerPrefix + " Caching is explicitly disabled for this request, continuing without caching")
+			return res, nil, nil
+		}
+	}
+
+	// Get the cache key from context, falling back to the configured default
+	cacheKey, ok := ctx.Value(CacheKey).(string)
+	if !ok || cacheKey == "" {
+		if plugin.config.DefaultCacheKey != "" {
+			cacheKey = plugin.config.DefaultCacheKey
+		} else {
+			return res, nil, nil
+		}
+	}
+
+	// Get the request ID from context; its absence means PreLLMHook did not
+	// prepare this request for caching
+	requestID, ok := ctx.Value(requestIDKey).(string)
+	if !ok {
+		return res, nil, nil
+	}
+	storageID := requestID
+	// When direct lookup prepared a deterministic storage ID, reuse it here so
+	// default-mode traffic warms the GetChunk fast path instead of only the
+	// legacy search path.
+	if v, ok := ctx.Value(requestStorageIDKey).(string); ok && v != "" {
+		storageID = v
+	}
+	// Check cache type to optimize embedding handling
+	var embedding []float32
+	var hash string
+	var shouldStoreEmbeddings = true
+	var shouldStoreHash = true
+
+	if ctx.Value(CacheTypeKey) != nil {
+		cacheTypeVal, ok := ctx.Value(CacheTypeKey).(CacheType)
+		if ok {
+			if cacheTypeVal == CacheTypeDirect {
+				// For direct-only caching, skip embedding operations entirely
+				// unless the vector store requires vectors for all entries
+				if plugin.store.RequiresVectors() {
+					// Vector stores like Qdrant and Pinecone require vectors for all entries
+					// Keep embeddings enabled for storage, but lookups will still use direct hash matching
+					plugin.logger.Debug(PluginLoggerPrefix + " Vector store requires vectors, keeping embedding generation enabled for storage")
+				} else {
+					shouldStoreEmbeddings = false
+					plugin.logger.Debug(PluginLoggerPrefix + " Skipping embedding operations for direct-only cache type")
+				}
+			} else if cacheTypeVal == CacheTypeSemantic {
+				shouldStoreHash = false
+				plugin.logger.Debug(PluginLoggerPrefix + " Skipping hash operations for semantic cache type")
+			}
+		}
+	}
+
+	if shouldStoreHash {
+		// Get the hash from context
+		hash, ok = ctx.Value(requestHashKey).(string)
+		if !ok {
+			plugin.logger.Warn(PluginLoggerPrefix + " Hash is not a string. Continuing without caching")
+			return res, nil, nil
+		}
+	}
+
+	extraFields := res.GetExtraFields()
+	requestType := extraFields.RequestType
+
+	// Get embedding from context if available and needed
+	// For embedding/transcription requests, we still need to retrieve the zero vector placeholder
+	// if the vector store requires vectors for all entries
+	isEmbeddingOrTranscription := requestType == schemas.EmbeddingRequest || requestType == schemas.TranscriptionRequest
+	needsEmbedding := shouldStoreEmbeddings && !isEmbeddingOrTranscription
+	needsZeroVector := isEmbeddingOrTranscription && plugin.store.RequiresVectors()
+
+	if needsEmbedding || needsZeroVector {
+		embeddingValue := ctx.Value(requestEmbeddingKey)
+		if embeddingValue != nil {
+			embedding, ok = embeddingValue.([]float32)
+			if !ok {
+				plugin.logger.Warn(PluginLoggerPrefix + " Embedding is not a []float32, continuing without caching")
+				return res, nil, nil
+			}
+		}
+		// Note: embedding can be nil for direct cache hits or when semantic search is disabled
+		// This is fine - we can still cache using direct hash matching (unless store requires vectors)
+	}
+
+	// Get the provider from context
+	provider, ok := ctx.Value(requestProviderKey).(schemas.ModelProvider)
+	if !ok {
+		plugin.logger.Warn(PluginLoggerPrefix + " Provider is not a schemas.ModelProvider, continuing without caching")
+		return res, nil, nil
+	}
+
+	// Get the model from context
+	model, ok := ctx.Value(requestModelKey).(string)
+	if !ok {
+		plugin.logger.Warn(PluginLoggerPrefix + " Model is not a string, continuing without caching")
+		return res, nil, nil
+	}
+
+	isFinalChunk := bifrost.IsFinalChunk(ctx)
+
+	// Get the embedding input token count from context (absent when no embedding was generated)
+	inputTokens, ok := ctx.Value(requestEmbeddingTokensKey).(int)
+	if ok {
+		isStreamRequest := bifrost.IsStreamRequestType(requestType)
+
+		// Attach cache debug info once per response: always for non-streaming
+		// responses, and only on the final chunk for streams
+		if !isStreamRequest || (isStreamRequest && isFinalChunk) {
+			if extraFields.CacheDebug == nil {
+				extraFields.CacheDebug = &schemas.BifrostCacheDebug{}
+			}
+			extraFields.CacheDebug.CacheHit = false
+			extraFields.CacheDebug.ProviderUsed = bifrost.Ptr(string(plugin.config.Provider))
+			extraFields.CacheDebug.ModelUsed = bifrost.Ptr(plugin.config.EmbeddingModel)
+			extraFields.CacheDebug.InputTokens = &inputTokens
+		}
+	}
+
+	// Start from the configured TTL; a valid per-request override replaces it
+	cacheTTL := plugin.config.TTL
+
+	ttlValue := ctx.Value(CacheTTLKey)
+	if ttlValue != nil {
+		// Get the request TTL from the context
+		ttl, ok := ttlValue.(time.Duration)
+		if !ok {
+			plugin.logger.Warn(PluginLoggerPrefix + " TTL is not a time.Duration, using default TTL")
+		} else {
+			cacheTTL = ttl
+		}
+	}
+
+	// Get metadata from context BEFORE goroutine to avoid race conditions
+	// when the same context is reused across multiple requests
+	paramsHash, _ := ctx.Value(requestParamsHashKey).(string)
+
+	// Cache everything in a unified VectorEntry asynchronously to avoid blocking the response
+	plugin.waitGroup.Add(1)
+	go func() {
+		defer plugin.waitGroup.Done()
+		// Create a background context with timeout for the cache operation
+		cacheCtx, cancel := context.WithTimeout(context.Background(), CacheSetTimeout)
+		defer cancel()
+
+		// Build unified metadata with provider, model, and all params
+		unifiedMetadata := plugin.buildUnifiedMetadata(provider, model, paramsHash, hash, cacheKey, cacheTTL)
+
+		// Handle streaming vs non-streaming responses
+		// Pass nil for embedding if we're in direct-only mode to optimize storage
+		embeddingToStore := embedding
+		if !shouldStoreEmbeddings {
+			embeddingToStore = nil
+		}
+
+		if bifrost.IsStreamRequestType(requestType) {
+			if err := plugin.addStreamingResponse(cacheCtx, requestID, storageID, res, bifrostErr, embeddingToStore, unifiedMetadata, cacheTTL, isFinalChunk); err != nil {
+				plugin.logger.Warn("%s Failed to cache streaming response: %v", PluginLoggerPrefix, err)
+			}
+		} else {
+			if err := plugin.addSingleResponse(cacheCtx, storageID, res, embeddingToStore, unifiedMetadata, cacheTTL); err != nil {
+				plugin.logger.Warn("%s Failed to cache single response: %v", PluginLoggerPrefix, err)
+			}
+		}
+	}()
+
+	return res, nil, nil
+}
+
+// WaitForPendingOperations blocks until every goroutine registered on the plugin's
+// wait group has finished, i.e. until the asynchronous cache writes started by
+// PostLLMHook are done.
+// This is useful in tests to ensure cache entries are stored before checking for cache hits.
+func (plugin *Plugin) WaitForPendingOperations() {
+	plugin.waitGroup.Wait()
+}
+
+// Cleanup performs cleanup operations for the semantic cache plugin.
+//
+// The function performs the following operations:
+// 1. Waits for all pending asynchronous cache writes to finish
+// 2. Cleans up old stream accumulators
+// 3. Shuts down the internal Bifrost client used for embeddings, if one was created
+// 4. If CleanUpOnShutdown is enabled, deletes every entry marked with
+//    from_bifrost_semantic_cache_plugin = true from the plugin's namespace and then
+//    deletes the namespace itself
+//
+// Steps 1-3 always run; step 4 is skipped when CleanUpOnShutdown is false.
+// Individual entries that fail to delete are logged as warnings and do not abort
+// the cleanup. Entry and namespace deletion share one CacheSetTimeout deadline.
+//
+// This method should be called when shutting down the application to ensure
+// proper resource cleanup if configured to do so.
+//
+// Returns:
+//   - error: Set when the bulk delete call or the namespace deletion fails
+func (plugin *Plugin) Cleanup() error {
+	plugin.waitGroup.Wait()
+
+	// Clean up old stream accumulators first
+	plugin.cleanupOldStreamAccumulators()
+
+	// Shutdown the internal Bifrost client used for embeddings
+	if plugin.client != nil {
+		plugin.client.Shutdown()
+	}
+
+	// Only clean up cache entries if configured to do so
+	if !plugin.config.CleanUpOnShutdown {
+		plugin.logger.Debug(PluginLoggerPrefix + " Cleanup on shutdown is disabled, skipping cache cleanup")
+		return nil
+	}
+
+	// Clean up all cache entries created by this plugin
+	ctx, cancel := context.WithTimeout(context.Background(), CacheSetTimeout)
+	defer cancel()
+
+	plugin.logger.Debug(PluginLoggerPrefix + " Starting cleanup of cache entries...")
+
+	// Match only entries written by this plugin
+	queries := []vectorstore.Query{
+		{
+			Field:    "from_bifrost_semantic_cache_plugin",
+			Operator: vectorstore.QueryOperatorEqual,
+			Value:    true,
+		},
+	}
+
+	results, err := plugin.store.DeleteAll(ctx, plugin.config.VectorStoreNamespace, queries)
+	if err != nil {
+		return fmt.Errorf("failed to delete cache entries: %w", err)
+	}
+
+	failedDeletes := 0
+	for _, result := range results {
+		if result.Status == vectorstore.DeleteStatusError {
+			failedDeletes++
+			plugin.logger.Warn("%s Failed to delete cache entry: %s", PluginLoggerPrefix, result.Error)
+		}
+	}
+	// Only claim full success when no per-entry deletion failed.
+	if failedDeletes == 0 {
+		plugin.logger.Info("%s Cleanup completed - deleted all cache entries", PluginLoggerPrefix)
+	} else {
+		plugin.logger.Warn("%s Cleanup completed - %d cache entries could not be deleted", PluginLoggerPrefix, failedDeletes)
+	}
+
+	if err := plugin.store.DeleteNamespace(ctx, plugin.config.VectorStoreNamespace); err != nil {
+		return fmt.Errorf("failed to delete namespace: %w", err)
+	}
+
+	return nil
+}
+
+// Public Methods for External Use
+
+// ClearCacheForKey deletes cache entries for a specific cache key.
+// It issues a single bulk delete for all entries in the plugin's namespace whose
+// cache_key equals cacheKey and that are marked as written by this plugin.
+// The call is bounded by CacheSetTimeout.
+//
+// Entries that individually fail to delete are logged as warnings (with both the
+// entry ID and the cache key) but do not make the call fail.
+//
+// Parameters:
+//   - cacheKey: The specific cache key to delete
+//
+// Returns:
+//   - error: Set only when the bulk delete call itself fails
+func (plugin *Plugin) ClearCacheForKey(cacheKey string) error {
+	// Delete all entries with "cache_key" equal to the given cacheKey
+	queries := []vectorstore.Query{
+		{
+			Field:    "cache_key",
+			Operator: vectorstore.QueryOperatorEqual,
+			Value:    cacheKey,
+		},
+		{
+			Field:    "from_bifrost_semantic_cache_plugin",
+			Operator: vectorstore.QueryOperatorEqual,
+			Value:    true,
+		},
+	}
+
+	ctx, cancel := context.WithTimeout(context.Background(), CacheSetTimeout)
+	defer cancel()
+	results, err := plugin.store.DeleteAll(ctx, plugin.config.VectorStoreNamespace, queries)
+	if err != nil {
+		plugin.logger.Warn("%s Failed to delete cache entries for key '%s': %v", PluginLoggerPrefix, cacheKey, err)
+		return err
+	}
+
+	failedDeletes := 0
+	for _, result := range results {
+		if result.Status == vectorstore.DeleteStatusError {
+			failedDeletes++
+			// result.ID identifies the entry, not the cache key, so report both.
+			plugin.logger.Warn("%s Failed to delete cache entry %s for key '%s': %s", PluginLoggerPrefix, result.ID, cacheKey, result.Error)
+		}
+	}
+
+	if failedDeletes > 0 {
+		plugin.logger.Warn("%s %d cache entries for key '%s' could not be deleted", PluginLoggerPrefix, failedDeletes, cacheKey)
+		return nil
+	}
+
+	plugin.logger.Debug(fmt.Sprintf("%s Deleted all cache entries for key %s", PluginLoggerPrefix, cacheKey))
+
+	return nil
+}
+
+// ClearCacheForRequestID deletes the single cache entry whose ID equals requestID
+// from the plugin's namespace. The call is bounded by CacheSetTimeout.
+//
+// NOTE(review): PostLLMHook stores an entry under the ID found in
+// requestStorageIDKey when one is set, which may differ from the request UUID —
+// confirm callers pass the ID the entry was actually stored under.
+//
+// Parameters:
+//   - requestID: The ID of the cache entry to delete
+//
+// Returns:
+//   - error: The store's error when the deletion fails, nil otherwise
+func (plugin *Plugin) ClearCacheForRequestID(requestID string) error {
+	deleteCtx, cancel := context.WithTimeout(context.Background(), CacheSetTimeout)
+	defer cancel()
+
+	err := plugin.store.Delete(deleteCtx, plugin.config.VectorStoreNamespace, requestID)
+	if err != nil {
+		plugin.logger.Warn("%s Failed to delete cache entry: %v", PluginLoggerPrefix, err)
+		return err
+	}
+
+	plugin.logger.Debug(PluginLoggerPrefix + " Deleted cache entry for key " + requestID)
+	return nil
+}
--- a/plugins/semanticcache/plugin_cache_type_test.go
+++ b/plugins/semanticcache/plugin_cache_type_test.go
@@ -0,0 +1,924 @@
+package semanticcache
+
+import (
+	"context"
+	"errors"
+	"sync"
+	"testing"
+	"time"
+
+	bifrost "github.com/maximhq/bifrost/core"
+	"github.com/maximhq/bifrost/core/schemas"
+	"github.com/maximhq/bifrost/framework/vectorstore"
+)
+
+// TestCacheTypeDirectOnly tests that CacheTypeKey set to "direct" only performs direct hash matching.
+// The same chat request is sent twice under one cache key with CacheTypeDirect: the first call must
+// not be a cache hit, the second must be reported as a "direct" hit.
+func TestCacheTypeDirectOnly(t *testing.T) {
+	setup := NewTestSetup(t)
+	defer setup.Cleanup()
+
+	// First, cache a response using CacheTypeDirect so it is stored under the deterministic ID
+	ctx1 := CreateContextWithCacheKeyAndType("test-cache-type-direct", CacheTypeDirect)
+	testRequest := CreateBasicChatRequest("What is Bifrost?", 0.7, 50)
+
+	t.Log("Making first request to populate cache...")
+	response1, err1 := setup.Client.ChatCompletionRequest(ctx1, testRequest)
+	if err1 != nil {
+		return // Test will be skipped by retry function
+	}
+	AssertNoCacheHit(t, &schemas.BifrostResponse{ChatResponse: response1})
+
+	WaitForCache(setup.Plugin)
+
+	// Now test with CacheTypeKey set to direct only
+	ctx2 := CreateContextWithCacheKeyAndType("test-cache-type-direct", CacheTypeDirect)
+
+	t.Log("Making second request with CacheTypeKey=direct...")
+	response2, err2 := setup.Client.ChatCompletionRequest(ctx2, testRequest)
+	if err2 != nil {
+		// Guard the nested Error field the same way the sibling tests in this file do,
+		// so a failure is reported instead of panicking on a nil dereference
+		if err2.Error != nil {
+			t.Fatalf("Second request failed: %v", err2.Error.Message)
+		}
+		t.Fatalf("Second request failed: %v", err2)
+	}
+
+	// Should be a cache hit from direct search
+	AssertCacheHit(t, &schemas.BifrostResponse{ChatResponse: response2}, "direct")
+
+	t.Log("✅ CacheTypeKey=direct correctly performs only direct hash matching")
+}
+
+// TestCacheTypeSemanticOnly tests that CacheTypeKey set to "semantic" only performs semantic search
+func TestCacheTypeSemanticOnly(t *testing.T) {
+	setup := NewTestSetup(t)
+	defer setup.Cleanup()
+
+	// First, cache a response using normal behavior
+	ctx1 := CreateContextWithCacheKey("test-cache-type-semantic")
+	testRequest := CreateBasicChatRequest("Explain machine learning concepts", 0.7, 50)
+
+	t.Log("Making first request to populate cache...")
+	response1, err1 := setup.Client.ChatCompletionRequest(ctx1, testRequest)
+	if err1 != nil {
+		return // Test will be skipped by retry function
+	}
+	AssertNoCacheHit(t, &schemas.BifrostResponse{ChatResponse: response1})
+
+	WaitForCache(setup.Plugin)
+
+	// Test with slightly different wording that should match semantically but not directly
+	similarRequest := CreateBasicChatRequest("Can you explain concepts in machine learning", 0.7, 50)
+
+	// Try with semantic-only search
+	ctx2 := CreateContextWithCacheKeyAndType("test-cache-type-semantic", CacheTypeSemantic)
+
+	t.Log("Making second request with similar content and CacheTypeKey=semantic...")
+	response2, err2 := setup.Client.ChatCompletionRequest(ctx2, similarRequest)
+	if err2 != nil {
+		if err2.Error != nil {
+			t.Fatalf("Second request failed: %v", err2.Error.Message)
+		} else {
+			t.Fatalf("Second request failed: %v", err2)
+		}
+	}
+
+	// This might be a cache hit if semantic similarity is high enough
+	// The test validates that semantic search is attempted
+	if response2.ExtraFields.CacheDebug != nil && response2.ExtraFields.CacheDebug.CacheHit {
+		AssertCacheHit(t, &schemas.BifrostResponse{ChatResponse: response2}, "semantic")
+		t.Log("✅ CacheTypeKey=semantic correctly found semantic match")
+	} else {
+		t.Log("ℹ️  No semantic match found (threshold may be too high for these similar phrases)")
+		AssertNoCacheHit(t, &schemas.BifrostResponse{ChatResponse: response2})
+	}
+
+	t.Log("✅ CacheTypeKey=semantic correctly performs only semantic search")
+}
+
+// TestCacheTypeDirectWithSemanticFallback tests the default behavior (both direct and semantic)
+func TestCacheTypeDirectWithSemanticFallback(t *testing.T) {
+	setup := NewTestSetup(t)
+	defer setup.Cleanup()
+
+	// Cache a response first
+	ctx1 := CreateContextWithCacheKey("test-cache-type-fallback")
+	testRequest := CreateBasicChatRequest("Define artificial intelligence", 0.7, 50)
+
+	t.Log("Making first request to populate cache...")
+	response1, err1 := setup.Client.ChatCompletionRequest(ctx1, testRequest)
+	if err1 != nil {
+		return // Test will be skipped by retry function
+	}
+	AssertNoCacheHit(t, &schemas.BifrostResponse{ChatResponse: response1})
+
+	WaitForCache(setup.Plugin)
+
+	// Test exact match (should hit direct cache)
+	ctx2 := CreateContextWithCacheKey("test-cache-type-fallback")
+
+	t.Log("Making second identical request (should hit direct cache)...")
+	response2, err2 := setup.Client.ChatCompletionRequest(ctx2, testRequest)
+	if err2 != nil {
+		if err2.Error != nil {
+			t.Fatalf("Second request failed: %v", err2.Error.Message)
+		} else {
+			t.Fatalf("Second request failed: %v", err2)
+		}
+	}
+	AssertCacheHit(t, &schemas.BifrostResponse{ChatResponse: response2}, "direct")
+
+	// Test similar request (should potentially hit semantic cache)
+	similarRequest := CreateBasicChatRequest("What is artificial intelligence", 0.7, 50)
+
+	t.Log("Making third similar request (should attempt semantic match)...")
+	response3, err3 := setup.Client.ChatCompletionRequest(ctx2, similarRequest)
+	if err3 != nil {
+		t.Fatalf("Third request failed: %v", err3)
+	}
+
+	// May or may not be a cache hit depending on semantic similarity
+	if response3.ExtraFields.CacheDebug != nil && response3.ExtraFields.CacheDebug.CacheHit {
+		AssertCacheHit(t, &schemas.BifrostResponse{ChatResponse: response3}, "semantic")
+		t.Log("✅ Default behavior correctly found semantic match")
+	} else {
+		t.Log("ℹ️  No semantic match found (normal for different wording)")
+		AssertNoCacheHit(t, &schemas.BifrostResponse{ChatResponse: response3})
+	}
+
+	t.Log("✅ Default behavior correctly attempts both direct and semantic search")
+}
+
+// TestCacheTypeInvalidValue sends a single request whose context carries the CacheTypeKey value
+// "invalid_type" and asserts that the response is not reported as a cache hit.
+func TestCacheTypeInvalidValue(t *testing.T) {
+	setup := NewTestSetup(t)
+	defer setup.Cleanup()
+
+	// Context with a cache key plus an invalid cache type value
+	invalidCtx := CreateContextWithCacheKey("test-invalid-cache-type")
+	invalidCtx = invalidCtx.WithValue(CacheTypeKey, "invalid_type")
+
+	chatRequest := CreateBasicChatRequest("Test invalid cache type", 0.7, 50)
+
+	t.Log("Making request with invalid CacheTypeKey value...")
+	chatResponse, requestErr := setup.Client.ChatCompletionRequest(invalidCtx, chatRequest)
+	if requestErr != nil {
+		return // Test will be skipped by retry function
+	}
+
+	// Should fall back to default behavior (both direct and semantic)
+	wrapped := &schemas.BifrostResponse{ChatResponse: chatResponse}
+	AssertNoCacheHit(t, wrapped)
+
+	t.Log("✅ Invalid CacheTypeKey value falls back to default behavior")
+}
+
+// TestCacheTypeWithEmbeddingRequests tests CacheTypeKey behavior with embedding requests
+func TestCacheTypeWithEmbeddingRequests(t *testing.T) {
+	setup := NewTestSetup(t)
+	defer setup.Cleanup()
+
+	embeddingRequest := CreateEmbeddingRequest([]string{"Test embedding with cache type"})
+
+	// Cache first request
+	ctx1 := CreateContextWithCacheKey("test-embedding-cache-type")
+	t.Log("Making first embedding request...")
+	response1, err1 := setup.Client.EmbeddingRequest(ctx1, embeddingRequest)
+	if err1 != nil {
+		return // Test will be skipped by retry function
+	}
+	AssertNoCacheHit(t, &schemas.BifrostResponse{EmbeddingResponse: response1})
+
+	WaitForCache(setup.Plugin)
+
+	// Test with direct-only cache type
+	ctx2 := CreateContextWithCacheKeyAndType("test-embedding-cache-type", CacheTypeDirect)
+	t.Log("Making second embedding request with CacheTypeKey=direct...")
+	response2, err2 := setup.Client.EmbeddingRequest(ctx2, embeddingRequest)
+	if err2 != nil {
+		if err2.Error != nil {
+			t.Fatalf("Second request failed: %v", err2.Error.Message)
+		} else {
+			t.Fatalf("Second request failed: %v", err2)
+		}
+	}
+	AssertCacheHit(t, &schemas.BifrostResponse{EmbeddingResponse: response2}, "direct")
+
+	// Test with semantic-only cache type (should not find semantic match for embeddings)
+	ctx3 := CreateContextWithCacheKeyAndType("test-embedding-cache-type", CacheTypeSemantic)
+	t.Log("Making third embedding request with CacheTypeKey=semantic...")
+	response3, err3 := setup.Client.EmbeddingRequest(ctx3, embeddingRequest)
+	if err3 != nil {
+		t.Fatalf("Third request failed: %v", err3)
+	}
+	// Semantic search should be skipped for embedding requests
+	AssertNoCacheHit(t, &schemas.BifrostResponse{EmbeddingResponse: response3})
+
+	t.Log("✅ CacheTypeKey works correctly with embedding requests")
+}
+
+// TestCacheTypePerformanceCharacteristics tests that different cache types have expected performance
+func TestCacheTypePerformanceCharacteristics(t *testing.T) {
+	setup := NewTestSetup(t)
+	defer setup.Cleanup()
+
+	testRequest := CreateBasicChatRequest("Performance test for cache types", 0.7, 50)
+
+	// Cache first request using CacheTypeDirect so it is stored under the deterministic ID
+	ctx1 := CreateContextWithCacheKeyAndType("test-cache-performance", CacheTypeDirect)
+	t.Log("Making first request to populate cache...")
+	response1, err1 := setup.Client.ChatCompletionRequest(ctx1, testRequest)
+	if err1 != nil {
+		return // Test will be skipped by retry function
+	}
+	AssertNoCacheHit(t, &schemas.BifrostResponse{ChatResponse: response1})
+
+	WaitForCache(setup.Plugin)
+
+	// Test direct-only performance
+	ctx2 := CreateContextWithCacheKeyAndType("test-cache-performance", CacheTypeDirect)
+	start2 := time.Now()
+	response2, err2 := setup.Client.ChatCompletionRequest(ctx2, testRequest)
+	duration2 := time.Since(start2)
+	if err2 != nil {
+		t.Fatalf("Direct cache request failed: %v", err2)
+	}
+	AssertCacheHit(t, &schemas.BifrostResponse{ChatResponse: response2}, "direct")
+
+	t.Logf("Direct cache lookup took: %v", duration2)
+
+	// Test default behavior (both direct and semantic) performance
+	ctx3 := CreateContextWithCacheKey("test-cache-performance")
+	start3 := time.Now()
+	response3, err3 := setup.Client.ChatCompletionRequest(ctx3, testRequest)
+	duration3 := time.Since(start3)
+	if err3 != nil {
+		t.Fatalf("Default cache request failed: %v", err3)
+	}
+	AssertCacheHit(t, &schemas.BifrostResponse{ChatResponse: response3}, "direct")
+
+	t.Logf("Default cache lookup took: %v", duration3)
+
+	// Both should be fast since they hit direct cache
+	// Direct-only might be slightly faster as it doesn't need to prepare for semantic fallback
+	t.Log("✅ Cache type performance characteristics validated")
+}
+
+// directFastPathStore is an in-memory vector store stub for the direct-lookup tests. It keeps
+// entries in a map and records how GetChunk, GetAll and Add were called so tests can assert on
+// which store operations the plugin used.
+type directFastPathStore struct {
+	// Entry storage: Add writes chunks and appends to addIDs; GetChunk reads chunks.
+	chunks map[string]vectorstore.SearchResult
+	addIDs []string
+
+	// GetChunk bookkeeping: call count and the most recent ID requested.
+	getChunkCalls  int
+	lastGetChunkID string
+
+	// GetAll bookkeeping: call count, the most recent context received, and an
+	// optional error that GetAll returns instead of ErrNotSupported when set.
+	getAllCalls   int
+	lastGetAllCtx context.Context
+	getAllErr     error
+}
+
+// newDirectFastPathStore returns a stub store with an initialised, empty chunk map.
+func newDirectFastPathStore() *directFastPathStore {
+	store := new(directFastPathStore)
+	store.chunks = map[string]vectorstore.SearchResult{}
+	return store
+}
+
+// Ping always succeeds for the stub store.
+func (s *directFastPathStore) Ping(_ context.Context) error {
+	return nil
+}
+
+// CreateNamespace is a no-op that always succeeds; the stub ignores all of its arguments.
+func (s *directFastPathStore) CreateNamespace(_ context.Context, _ string, _ int, _ map[string]vectorstore.VectorStoreProperties) error {
+	return nil
+}
+
+// DeleteNamespace is a no-op that always succeeds.
+func (s *directFastPathStore) DeleteNamespace(_ context.Context, _ string) error {
+	return nil
+}
+
+// GetChunk records the call (count and requested ID) and returns the entry stored under id,
+// or vectorstore.ErrNotFound with a zero SearchResult when no such entry exists. The context
+// and namespace are ignored.
+func (s *directFastPathStore) GetChunk(_ context.Context, _ string, id string) (vectorstore.SearchResult, error) {
+	s.getChunkCalls++
+	s.lastGetChunkID = id
+
+	if entry, found := s.chunks[id]; found {
+		return entry, nil
+	}
+	return vectorstore.SearchResult{}, vectorstore.ErrNotFound
+}
+
+// GetChunks is not implemented by the stub and always returns vectorstore.ErrNotSupported.
+func (s *directFastPathStore) GetChunks(_ context.Context, _ string, _ []string) ([]vectorstore.SearchResult, error) {
+	return nil, vectorstore.ErrNotSupported
+}
+
+// GetAll records the call (count and received context) and never returns results: it fails with
+// the injected getAllErr when one is set, and with vectorstore.ErrNotSupported otherwise.
+func (s *directFastPathStore) GetAll(ctx context.Context, namespace string, queries []vectorstore.Query, selectFields []string, cursor *string, limit int64) ([]vectorstore.SearchResult, *string, error) {
+	s.getAllCalls++
+	s.lastGetAllCtx = ctx
+
+	failure := s.getAllErr
+	if failure == nil {
+		failure = vectorstore.ErrNotSupported
+	}
+	return nil, nil, failure
+}
+
+// GetNearest is not implemented by the stub and always returns vectorstore.ErrNotSupported.
+func (s *directFastPathStore) GetNearest(_ context.Context, _ string, _ []float32, _ []vectorstore.Query, _ []string, _ float64, _ int64) ([]vectorstore.SearchResult, error) {
+	return nil, vectorstore.ErrNotSupported
+}
+
+// RequiresVectors always reports false for the stub store.
+func (s *directFastPathStore) RequiresVectors() bool {
+	return false
+}
+
+// Add appends id to addIDs and stores (or overwrites) an entry under id whose Properties are the
+// given metadata. The context, namespace and embedding are ignored. It always returns nil.
+func (s *directFastPathStore) Add(_ context.Context, _ string, id string, _ []float32, metadata map[string]interface{}) error {
+	s.addIDs = append(s.addIDs, id)
+
+	entry := vectorstore.SearchResult{ID: id, Properties: metadata}
+	s.chunks[id] = entry
+
+	return nil
+}
+
+// Delete is a no-op that always succeeds; it does not remove anything from chunks.
+func (s *directFastPathStore) Delete(_ context.Context, _ string, _ string) error {
+	return nil
+}
+
+// DeleteAll is not implemented by the stub and always returns vectorstore.ErrNotSupported.
+func (s *directFastPathStore) DeleteAll(_ context.Context, _ string, _ []vectorstore.Query) ([]vectorstore.DeleteResult, error) {
+	return nil, vectorstore.ErrNotSupported
+}
+
+// Close is a no-op that always succeeds.
+func (s *directFastPathStore) Close(_ context.Context, _ string) error {
+	return nil
+}
+
+// newCrossProviderChatRequest builds a BifrostRequest of the given request type whose chat request
+// targets the given provider and model and contains a single user message with prompt as its text.
+func newCrossProviderChatRequest(provider schemas.ModelProvider, model string, requestType schemas.RequestType, prompt string) *schemas.BifrostRequest {
+	userMessage := schemas.ChatMessage{
+		Role:    schemas.ChatMessageRoleUser,
+		Content: &schemas.ChatMessageContent{ContentStr: bifrost.Ptr(prompt)},
+	}
+
+	chatRequest := &schemas.BifrostChatRequest{
+		Provider: provider,
+		Model:    model,
+		Input:    []schemas.ChatMessage{userMessage},
+	}
+
+	return &schemas.BifrostRequest{
+		RequestType: requestType,
+		ChatRequest: chatRequest,
+	}
+}
+
+// TestDirectCacheHitPreservesCachedProviderMetadataAcrossProviders verifies that, with CacheByProvider
+// and CacheByModel disabled, a direct cache hit for a request addressed to a different provider and
+// model returns the provider/model metadata of the cached response, while cache_debug reports the
+// provider and model that were actually requested.
+func TestDirectCacheHitPreservesCachedProviderMetadataAcrossProviders(t *testing.T) {
+	logger := bifrost.NewDefaultLogger(schemas.LogLevelDebug)
+	store := newDirectFastPathStore()
+	config := getDefaultTestConfig()
+	// Disable provider/model scoping so the second request can match the entry seeded by the first
+	config.CacheByProvider = bifrost.Ptr(false)
+	config.CacheByModel = bifrost.Ptr(false)
+	config.ConversationHistoryThreshold = DefaultConversationHistoryThreshold
+	plugin := &Plugin{
+		store:  store,
+		config: config,
+		logger: logger,
+	}
+
+	const cacheKey = "cross-provider-direct-single"
+	const prompt = "Explain green threading in Go in one short sentence."
+
+	// Seed phase: run the pre-hook for an OpenAI request, which must miss on the empty store
+	seedCtx := CreateContextWithCacheKeyAndType(cacheKey, CacheTypeDirect)
+	seedReq := newCrossProviderChatRequest(schemas.OpenAI, "gpt-5.2", schemas.ChatCompletionRequest, prompt)
+
+	_, shortCircuit, err := plugin.PreLLMHook(seedCtx, seedReq)
+	if err != nil {
+		t.Fatalf("seed PreLLMHook failed: %v", err)
+	}
+	if shortCircuit != nil {
+		t.Fatal("expected seed request to miss cache")
+	}
+
+	// Response handed to the post-hook on the seed context, tagged with OpenAI / gpt-5.2 metadata
+	seedResponse := &schemas.BifrostResponse{
+		ChatResponse: &schemas.BifrostChatResponse{
+			ID: "cross-provider-direct-single",
+			Choices: []schemas.BifrostResponseChoice{
+				{
+					ChatNonStreamResponseChoice: &schemas.ChatNonStreamResponseChoice{
+						Message: &schemas.ChatMessage{
+							Role: schemas.ChatMessageRoleAssistant,
+							Content: &schemas.ChatMessageContent{
+								ContentStr: bifrost.Ptr("Go schedules lightweight goroutines in user space onto a smaller pool of OS threads."),
+							},
+						},
+					},
+				},
+			},
+			ExtraFields: schemas.BifrostResponseExtraFields{
+				Provider:               schemas.OpenAI,
+				OriginalModelRequested: "gpt-5.2",
+				ResolvedModelUsed:      "gpt-5.2",
+				RequestType:            schemas.ChatCompletionRequest,
+			},
+		},
+	}
+
+	if _, _, err = plugin.PostLLMHook(seedCtx, seedResponse, nil); err != nil {
+		t.Fatalf("seed PostLLMHook failed: %v", err)
+	}
+	plugin.WaitForPendingOperations()
+
+	// Hit phase: same cache key and prompt, but addressed to Anthropic / claude-sonnet-4-6
+	hitCtx := CreateContextWithCacheKeyAndType(cacheKey, CacheTypeDirect)
+	hitReq := newCrossProviderChatRequest(schemas.Anthropic, "claude-sonnet-4-6", schemas.ChatCompletionRequest, prompt)
+
+	_, shortCircuit, err = plugin.PreLLMHook(hitCtx, hitReq)
+	if err != nil {
+		t.Fatalf("hit PreLLMHook failed: %v", err)
+	}
+	if shortCircuit == nil || shortCircuit.Response == nil || shortCircuit.Response.ChatResponse == nil {
+		t.Fatal("expected cross-provider direct cache hit to return a response")
+	}
+
+	// The returned metadata must describe the cached (OpenAI) response, not the requesting provider
+	extraFields := shortCircuit.Response.ChatResponse.ExtraFields
+	if extraFields.Provider != schemas.OpenAI {
+		t.Fatalf("expected cached provider %q, got %q", schemas.OpenAI, extraFields.Provider)
+	}
+	if extraFields.OriginalModelRequested != "gpt-5.2" {
+		t.Fatalf("expected OriginalModelRequested %q, got %q", "gpt-5.2", extraFields.OriginalModelRequested)
+	}
+	if extraFields.ResolvedModelUsed != "gpt-5.2" {
+		t.Fatalf("expected ResolvedModelUsed %q, got %q", "gpt-5.2", extraFields.ResolvedModelUsed)
+	}
+	// cache_debug must mark a direct hit and carry the provider/model of the hit request
+	if extraFields.CacheDebug == nil {
+		t.Fatal("expected cache_debug on cache hit")
+	}
+	if !extraFields.CacheDebug.CacheHit {
+		t.Fatal("expected cache hit to be marked in cache_debug")
+	}
+	if extraFields.CacheDebug.HitType == nil || *extraFields.CacheDebug.HitType != string(CacheTypeDirect) {
+		t.Fatalf("expected hit_type %q, got %v", CacheTypeDirect, extraFields.CacheDebug.HitType)
+	}
+	if extraFields.CacheDebug.RequestedProvider == nil || *extraFields.CacheDebug.RequestedProvider != string(schemas.Anthropic) {
+		t.Fatalf("expected requested_provider %q, got %v", schemas.Anthropic, extraFields.CacheDebug.RequestedProvider)
+	}
+	if extraFields.CacheDebug.RequestedModel == nil || *extraFields.CacheDebug.RequestedModel != "claude-sonnet-4-6" {
+		t.Fatalf("expected requested_model %q, got %v", "claude-sonnet-4-6", extraFields.CacheDebug.RequestedModel)
+	}
+}
+
+// TestStreamingDirectCacheHitPreservesCachedProviderMetadataAcrossProviders is the streaming variant
+// of the cross-provider direct-hit test: two stream chunks tagged with OpenAI metadata are fed
+// through PostLLMHook, then a stream request addressed to Anthropic must be short-circuited with a
+// stream of the same number of chunks, each keeping the cached provider/model metadata. The final
+// chunk must carry cache_debug describing a direct hit and the requested provider and model.
+func TestStreamingDirectCacheHitPreservesCachedProviderMetadataAcrossProviders(t *testing.T) {
+	logger := bifrost.NewDefaultLogger(schemas.LogLevelDebug)
+	store := newDirectFastPathStore()
+	config := getDefaultTestConfig()
+	// Disable provider/model scoping so the second request can match the entry seeded by the first
+	config.CacheByProvider = bifrost.Ptr(false)
+	config.CacheByModel = bifrost.Ptr(false)
+	config.ConversationHistoryThreshold = DefaultConversationHistoryThreshold
+	plugin := &Plugin{
+		store:  store,
+		config: config,
+		logger: logger,
+	}
+
+	const cacheKey = "cross-provider-direct-stream"
+	const prompt = "Explain green threading in Go in one short sentence."
+
+	// Seed phase: run the pre-hook for an OpenAI stream request, which must miss on the empty store
+	seedCtx := CreateContextWithCacheKeyAndType(cacheKey, CacheTypeDirect)
+	seedReq := newCrossProviderChatRequest(schemas.OpenAI, "gpt-5.2", schemas.ChatCompletionStreamRequest, prompt)
+
+	_, shortCircuit, err := plugin.PreLLMHook(seedCtx, seedReq)
+	if err != nil {
+		t.Fatalf("seed PreLLMHook failed: %v", err)
+	}
+	if shortCircuit != nil {
+		t.Fatal("expected seed request to miss cache")
+	}
+
+	// Two stream chunks; only the last one carries a finish reason and the stream-end flag
+	chunks := []struct {
+		content      string
+		chunkIndex   int
+		finishReason *string
+		streamEnd    bool
+	}{
+		{content: "Go schedules lightweight goroutines", chunkIndex: 0, finishReason: nil, streamEnd: false},
+		{content: " onto a smaller pool of OS threads.", chunkIndex: 1, finishReason: bifrost.Ptr("stop"), streamEnd: true},
+	}
+
+	for _, chunk := range chunks {
+		// Set the stream-end indicator on the seed context before handing each chunk to the post-hook
+		seedCtx.SetValue(schemas.BifrostContextKeyStreamEndIndicator, chunk.streamEnd)
+		chunkResponse := &schemas.BifrostResponse{
+			ChatResponse: &schemas.BifrostChatResponse{
+				ID: "cross-provider-direct-stream",
+				Choices: []schemas.BifrostResponseChoice{
+					{
+						Index:        chunk.chunkIndex,
+						FinishReason: chunk.finishReason,
+						ChatStreamResponseChoice: &schemas.ChatStreamResponseChoice{
+							Delta: &schemas.ChatStreamResponseChoiceDelta{
+								Content: bifrost.Ptr(chunk.content),
+							},
+						},
+					},
+				},
+				ExtraFields: schemas.BifrostResponseExtraFields{
+					Provider:               schemas.OpenAI,
+					OriginalModelRequested: "gpt-5.2",
+					ResolvedModelUsed:      "gpt-5.2",
+					RequestType:            schemas.ChatCompletionStreamRequest,
+					ChunkIndex:             chunk.chunkIndex,
+				},
+			},
+		}
+
+		if _, _, err = plugin.PostLLMHook(seedCtx, chunkResponse, nil); err != nil {
+			t.Fatalf("seed PostLLMHook failed for chunk %d: %v", chunk.chunkIndex, err)
+		}
+		plugin.WaitForPendingOperations()
+	}
+
+	// Hit phase: same cache key and prompt, but addressed to Anthropic / claude-sonnet-4-6
+	hitCtx := CreateContextWithCacheKeyAndType(cacheKey, CacheTypeDirect)
+	hitReq := newCrossProviderChatRequest(schemas.Anthropic, "claude-sonnet-4-6", schemas.ChatCompletionStreamRequest, prompt)
+
+	_, shortCircuit, err = plugin.PreLLMHook(hitCtx, hitReq)
+	if err != nil {
+		t.Fatalf("hit PreLLMHook failed: %v", err)
+	}
+	if shortCircuit == nil || shortCircuit.Stream == nil {
+		t.Fatal("expected cross-provider streaming direct cache hit to return a stream")
+	}
+
+	// Drain the replayed stream, checking metadata on every chunk and cache_debug on the last one
+	chunkCount := 0
+	for chunk := range shortCircuit.Stream {
+		if chunk.BifrostChatResponse == nil {
+			t.Fatal("expected cached chat stream chunk")
+		}
+
+		extraFields := chunk.BifrostChatResponse.ExtraFields
+		if extraFields.Provider != schemas.OpenAI {
+			t.Fatalf("expected cached provider %q on chunk %d, got %q", schemas.OpenAI, chunkCount, extraFields.Provider)
+		}
+		if extraFields.OriginalModelRequested != "gpt-5.2" {
+			t.Fatalf("expected OriginalModelRequested %q on chunk %d, got %q", "gpt-5.2", chunkCount, extraFields.OriginalModelRequested)
+		}
+		if extraFields.ResolvedModelUsed != "gpt-5.2" {
+			t.Fatalf("expected ResolvedModelUsed %q on chunk %d, got %q", "gpt-5.2", chunkCount, extraFields.ResolvedModelUsed)
+		}
+		// cache_debug is only asserted on the final chunk of the replayed stream
+		if chunkCount == len(chunks)-1 {
+			if extraFields.CacheDebug == nil || !extraFields.CacheDebug.CacheHit {
+				t.Fatal("expected final cached stream chunk to include cache_debug cache_hit=true")
+			}
+			if extraFields.CacheDebug.HitType == nil || *extraFields.CacheDebug.HitType != string(CacheTypeDirect) {
+				t.Fatalf("expected final stream hit_type %q, got %v", CacheTypeDirect, extraFields.CacheDebug.HitType)
+			}
+			if extraFields.CacheDebug.RequestedProvider == nil || *extraFields.CacheDebug.RequestedProvider != string(schemas.Anthropic) {
+				t.Fatalf("expected final stream requested_provider %q, got %v", schemas.Anthropic, extraFields.CacheDebug.RequestedProvider)
+			}
+			if extraFields.CacheDebug.RequestedModel == nil || *extraFields.CacheDebug.RequestedModel != "claude-sonnet-4-6" {
+				t.Fatalf("expected final stream requested_model %q, got %v", "claude-sonnet-4-6", extraFields.CacheDebug.RequestedModel)
+			}
+		}
+
+		chunkCount++
+	}
+
+	// The replayed stream must contain exactly as many chunks as were seeded
+	if chunkCount != len(chunks) {
+		t.Fatalf("expected %d cached stream chunks, got %d", len(chunks), chunkCount)
+	}
+}
+
+func TestCacheTypeDirectUsesChunkLookup(t *testing.T) {
+	logger := bifrost.NewDefaultLogger(schemas.LogLevelDebug)
+	store := newDirectFastPathStore()
+	plugin := &Plugin{
+		store:  store,
+		config: getDefaultTestConfig(),
+		logger: logger,
+	}
+
+	req := &schemas.BifrostRequest{
+		RequestType: schemas.ChatCompletionRequest,
+		ChatRequest: CreateBasicChatRequest("What is Bifrost?", 0.7, 50),
+	}
+
+	ctx := CreateContextWithCacheKeyAndType("chunk-fast-path", CacheTypeDirect)
+	directID, err := plugin.prepareDirectCacheLookup(ctx, req, "chunk-fast-path")
+	if err != nil {
+		t.Fatalf("prepareDirectCacheLookup failed: %v", err)
+	}
+
+	cachedContent := "cached response"
+	cachedResponse := &schemas.BifrostResponse{
+		ChatResponse: &schemas.BifrostChatResponse{
+			Choices: []schemas.BifrostResponseChoice{
+				{
+					ChatNonStreamResponseChoice: &schemas.ChatNonStreamResponseChoice{
+						Message: &schemas.ChatMessage{
+							Role: schemas.ChatMessageRoleAssistant,
+							Content: &schemas.ChatMessageContent{
+								ContentStr: &cachedContent,
+							},
+						},
+					},
+				},
+			},
+		},
+	}
+	responseJSON, err := schemas.MarshalDeeplySorted(cachedResponse)
+	if err != nil {
+		t.Fatalf("failed to marshal cached response: %v", err)
+	}
+
+	store.chunks[directID] = vectorstore.SearchResult{
+		ID: directID,
+		Properties: map[string]interface{}{
+			"response":   string(responseJSON),
+			"expires_at": time.Now().Add(time.Minute).Unix(),
+		},
+	}
+
+	shortCircuit, err := plugin.performDirectChunkLookup(ctx, req, "chunk-fast-path")
+	if err != nil {
+		t.Fatalf("performDirectChunkLookup failed: %v", err)
+	}
+	if shortCircuit == nil || shortCircuit.Response == nil || shortCircuit.Response.ChatResponse == nil {
+		t.Fatal("expected direct chunk lookup to return cached response")
+	}
+	if store.getChunkCalls != 1 {
+		t.Fatalf("expected one GetChunk call, got %d", store.getChunkCalls)
+	}
+	if store.getAllCalls != 0 {
+		t.Fatalf("expected no GetAll calls, got %d", store.getAllCalls)
+	}
+	if store.lastGetChunkID != directID {
+		t.Fatalf("expected GetChunk to use %q, got %q", directID, store.lastGetChunkID)
+	}
+}
+
+func TestDefaultDirectSearchSetsStorageIDForDeterministicWrites(t *testing.T) {
+	logger := bifrost.NewDefaultLogger(schemas.LogLevelDebug)
+	store := newDirectFastPathStore()
+	plugin := &Plugin{
+		store:  store,
+		config: getDefaultTestConfig(),
+		logger: logger,
+	}
+
+	req := &schemas.BifrostRequest{
+		RequestType: schemas.ChatCompletionRequest,
+		ChatRequest: CreateBasicChatRequest("What is Bifrost?", 0.7, 50),
+	}
+
+	ctx := CreateContextWithCacheKey("default-mode")
+	_, err := plugin.performDirectSearch(ctx, req, "default-mode")
+	if err != nil && !errors.Is(err, vectorstore.ErrNotSupported) {
+		t.Fatalf("performDirectSearch failed: %v", err)
+	}
+
+	storageID, _ := ctx.Value(requestStorageIDKey).(string)
+	if storageID == "" {
+		t.Fatal("expected default direct search to set requestStorageIDKey")
+	}
+	if store.getChunkCalls != 1 {
+		t.Fatalf("expected one GetChunk call, got %d", store.getChunkCalls)
+	}
+}
+
+func TestPreLLMHookClearsStaleStorageIDOnReusedContext(t *testing.T) {
+	logger := bifrost.NewDefaultLogger(schemas.LogLevelDebug)
+	store := newDirectFastPathStore()
+	config := getDefaultTestConfig()
+	config.ConversationHistoryThreshold = 3
+	plugin := &Plugin{
+		store:  store,
+		config: config,
+		logger: logger,
+	}
+
+	req := &schemas.BifrostRequest{
+		RequestType: schemas.ChatCompletionRequest,
+		ChatRequest: CreateBasicChatRequest("What is Bifrost?", 0.7, 50),
+	}
+
+	ctx := CreateContextWithCacheKey("reused-context")
+	ctx.SetValue(requestStorageIDKey, "stale-storage-id")
+
+	if _, _, err := plugin.PreLLMHook(ctx, req); err != nil {
+		t.Fatalf("PreLLMHook failed: %v", err)
+	}
+
+	storageID, _ := ctx.Value(requestStorageIDKey).(string)
+	if storageID == "" {
+		t.Fatal("expected PreLLMHook to replace stale requestStorageIDKey with a deterministic id")
+	}
+	if storageID == "stale-storage-id" {
+		t.Fatal("expected PreLLMHook to clear stale requestStorageIDKey before setting a deterministic id")
+	}
+}
+
+func TestCacheTypeDirectStoresDeterministicID(t *testing.T) {
+	logger := bifrost.NewDefaultLogger(schemas.LogLevelDebug)
+	store := newDirectFastPathStore()
+	config := getDefaultTestConfig()
+	plugin := &Plugin{
+		store:  store,
+		config: config,
+		logger: logger,
+	}
+
+	req := &schemas.BifrostRequest{
+		RequestType: schemas.ChatCompletionRequest,
+		ChatRequest: CreateBasicChatRequest("What is Bifrost?", 0.7, 50),
+	}
+	ctx := CreateContextWithCacheKeyAndType("deterministic-store", CacheTypeDirect)
+	ctx.SetValue(requestIDKey, "request-uuid")
+	ctx.SetValue(requestProviderKey, schemas.OpenAI)
+	ctx.SetValue(requestModelKey, req.ChatRequest.Model)
+
+	directID, err := plugin.prepareDirectCacheLookup(ctx, req, "deterministic-store")
+	if err != nil {
+		t.Fatalf("prepareDirectCacheLookup failed: %v", err)
+	}
+	ctx.SetValue(requestStorageIDKey, directID)
+
+	content := "stored response"
+	response := &schemas.BifrostResponse{
+		ChatResponse: &schemas.BifrostChatResponse{
+			Choices: []schemas.BifrostResponseChoice{
+				{
+					ChatNonStreamResponseChoice: &schemas.ChatNonStreamResponseChoice{
+						Message: &schemas.ChatMessage{
+							Role: schemas.ChatMessageRoleAssistant,
+							Content: &schemas.ChatMessageContent{
+								ContentStr: &content,
+							},
+						},
+					},
+				},
+			},
+		},
+	}
+	response.ChatResponse.ExtraFields.RequestType = schemas.ChatCompletionRequest
+
+	if _, _, err := plugin.PostLLMHook(ctx, response, nil); err != nil {
+		t.Fatalf("PostLLMHook failed: %v", err)
+	}
+
+	plugin.WaitForPendingOperations()
+
+	if len(store.addIDs) != 1 {
+		t.Fatalf("expected one store.Add call, got %d", len(store.addIDs))
+	}
+	if store.addIDs[0] != directID {
+		t.Fatalf("expected deterministic storage id %q, got %q", directID, store.addIDs[0])
+	}
+	if store.addIDs[0] == "request-uuid" {
+		t.Fatal("expected storage id to differ from request UUID")
+	}
+}
+
+func TestPostLLMHookUsesDeterministicStorageIDOutsideDirectMode(t *testing.T) {
+	logger := bifrost.NewDefaultLogger(schemas.LogLevelDebug)
+	store := newDirectFastPathStore()
+	plugin := &Plugin{
+		store:  store,
+		config: getDefaultTestConfig(),
+		logger: logger,
+	}
+
+	content := "stored response"
+	response := &schemas.BifrostResponse{
+		ChatResponse: &schemas.BifrostChatResponse{
+			Choices: []schemas.BifrostResponseChoice{
+				{
+					ChatNonStreamResponseChoice: &schemas.ChatNonStreamResponseChoice{
+						Message: &schemas.ChatMessage{
+							Role: schemas.ChatMessageRoleAssistant,
+							Content: &schemas.ChatMessageContent{
+								ContentStr: &content,
+							},
+						},
+					},
+				},
+			},
+		},
+	}
+	response.ChatResponse.ExtraFields.RequestType = schemas.ChatCompletionRequest
+
+	ctx := CreateContextWithCacheKey("default-mode-store")
+	ctx.SetValue(requestIDKey, "request-uuid")
+	ctx.SetValue(requestProviderKey, schemas.OpenAI)
+	ctx.SetValue(requestModelKey, "openai/gpt-4o-mini")
+	ctx.SetValue(requestHashKey, "request-hash")
+	ctx.SetValue(requestParamsHashKey, "params-hash")
+
+	directID := plugin.generateDirectCacheID(schemas.OpenAI, "openai/gpt-4o-mini", "default-mode-store", "request-hash", "params-hash")
+	ctx.SetValue(requestStorageIDKey, directID)
+
+	if _, _, err := plugin.PostLLMHook(ctx, response, nil); err != nil {
+		t.Fatalf("PostLLMHook failed: %v", err)
+	}
+
+	plugin.WaitForPendingOperations()
+
+	if len(store.addIDs) != 1 {
+		t.Fatalf("expected one store.Add call, got %d", len(store.addIDs))
+	}
+	if store.addIDs[0] != directID {
+		t.Fatalf("expected PostLLMHook to use deterministic storage id outside direct mode, got %q", store.addIDs[0])
+	}
+}
+
+func TestPerformDirectSearchDisablesScanFallbackForLegacyLookup(t *testing.T) {
+	logger := bifrost.NewDefaultLogger(schemas.LogLevelDebug)
+	store := newDirectFastPathStore()
+	plugin := &Plugin{
+		store:  store,
+		config: getDefaultTestConfig(),
+		logger: logger,
+	}
+
+	req := &schemas.BifrostRequest{
+		RequestType: schemas.ChatCompletionRequest,
+		ChatRequest: CreateBasicChatRequest("What is Bifrost?", 0.7, 50),
+	}
+
+	ctx := CreateContextWithCacheKey("legacy-no-scan")
+	_, err := plugin.performDirectSearch(ctx, req, "legacy-no-scan")
+	if err != nil && !errors.Is(err, vectorstore.ErrNotSupported) {
+		t.Fatalf("performDirectSearch failed: %v", err)
+	}
+
+	if store.getAllCalls != 1 {
+		t.Fatalf("expected one legacy GetAll call, got %d", store.getAllCalls)
+	}
+	if !vectorstore.IsScanFallbackDisabled(store.lastGetAllCtx) {
+		t.Fatal("expected legacy direct lookup to disable scan fallback")
+	}
+}
+
+func TestPerformLegacyDirectSearchTreatsQuerySyntaxErrorAsMiss(t *testing.T) {
+	logger := bifrost.NewDefaultLogger(schemas.LogLevelDebug)
+	store := newDirectFastPathStore()
+	store.getAllErr = vectorstore.ErrQuerySyntax
+	plugin := &Plugin{
+		store:  store,
+		config: getDefaultTestConfig(),
+		logger: logger,
+	}
+
+	req := &schemas.BifrostRequest{
+		RequestType: schemas.ChatCompletionRequest,
+		ChatRequest: CreateBasicChatRequest("What is Bifrost?", 0.7, 50),
+	}
+
+	ctx := CreateContextWithCacheKey("legacy-query-syntax")
+	_, err := plugin.prepareDirectCacheLookup(ctx, req, "legacy-query-syntax")
+	if err != nil {
+		t.Fatalf("prepareDirectCacheLookup failed: %v", err)
+	}
+
+	shortCircuit, err := plugin.performLegacyDirectSearch(ctx, req, "legacy-query-syntax")
+	if err != nil {
+		t.Fatalf("performLegacyDirectSearch failed: %v", err)
+	}
+	if shortCircuit != nil {
+		t.Fatal("expected query syntax incompatibility to be treated as a miss")
+	}
+	if store.getAllCalls != 1 {
+		t.Fatalf("expected one legacy GetAll call, got %d", store.getAllCalls)
+	}
+}
+
+// TestGetOrCreateStreamAccumulatorUsesSingleAccumulatorPerRequest calls getOrCreateStreamAccumulator
+// from several goroutines with the same request ID and checks that every caller receives the same
+// accumulator instance, that this instance is the one stored in plugin.streamAccumulators, and that
+// it carries the storage ID and TTL that were passed in.
+func TestGetOrCreateStreamAccumulatorUsesSingleAccumulatorPerRequest(t *testing.T) {
+	const (
+		workers   = 8
+		requestID = "stream-request"
+		storageID = "stream-storage"
+	)
+
+	plugin := &Plugin{logger: bifrost.NewDefaultLogger(schemas.LogLevelDebug)}
+
+	embedding := []float32{1, 2, 3}
+	metadata := map[string]interface{}{"cache_key": "stream-cache"}
+	ttl := time.Minute
+
+	// Buffered to the worker count so no goroutine blocks on send before the channel is drained
+	results := make(chan *StreamAccumulator, workers)
+
+	var wg sync.WaitGroup
+	for i := 0; i < workers; i++ {
+		wg.Add(1)
+		go func() {
+			defer wg.Done()
+			results <- plugin.getOrCreateStreamAccumulator(requestID, storageID, embedding, metadata, ttl)
+		}()
+	}
+
+	wg.Wait()
+	close(results)
+
+	// The first accumulator received becomes the reference every other result must equal
+	var first *StreamAccumulator
+	for got := range results {
+		switch {
+		case got == nil:
+			t.Fatal("expected accumulator")
+		case first == nil:
+			first = got
+		case got != first:
+			t.Fatal("expected all callers to receive the same accumulator instance")
+		}
+	}
+
+	stored, found := plugin.streamAccumulators.Load(requestID)
+	if !found {
+		t.Fatal("expected accumulator to be stored")
+	}
+	if stored.(*StreamAccumulator) != first {
+		t.Fatal("expected stored accumulator to match returned accumulator")
+	}
+	if gotID := first.StorageID; gotID != storageID {
+		t.Fatalf("expected storage id %q, got %q", storageID, gotID)
+	}
+	if gotTTL := first.TTL; gotTTL != ttl {
+		t.Fatalf("expected ttl %v, got %v", ttl, gotTTL)
+	}
+}
--- a/plugins/semanticcache/plugin_conversation_config_test.go
+++ b/plugins/semanticcache/plugin_conversation_config_test.go
@@ -0,0 +1,454 @@
+package semanticcache
+
+import (
+	"strconv"
+	"testing"
+
+	bifrost "github.com/maximhq/bifrost/core"
+	"github.com/maximhq/bifrost/core/schemas"
+)
+
+// TestConversationHistoryThresholdBasic exercises a conversation history
+// threshold of 2: a two-message conversation must be cached and replayed as a
+// direct hit, while a five-message conversation must never produce a hit.
+func TestConversationHistoryThresholdBasic(t *testing.T) {
+	setup := CreateTestSetupWithConversationThreshold(t, 2)
+	defer setup.Cleanup()
+
+	ctx := CreateContextWithCacheKey("test-conversation-threshold-basic")
+
+	// Case 1: exactly 2 messages, i.e. at the threshold, so caching is expected.
+	atThreshold := CreateConversationRequest(
+		BuildConversationHistory("", []string{"Hello", "Hi there!"}),
+		0.7, 50,
+	)
+
+	t.Log("Testing conversation with exactly 2 messages (at threshold)...")
+	freshResp, freshErr := setup.Client.ChatCompletionRequest(ctx, atThreshold)
+	if freshErr != nil {
+		// NOTE(review): a plain return reports the test as passed; presumably
+		// the retry helper has already marked it skipped — confirm.
+		return
+	}
+	AssertNoCacheHit(t, &schemas.BifrostResponse{ChatResponse: freshResp})
+
+	WaitForCache(setup.Plugin)
+
+	// Replaying the identical request must now be a direct hit.
+	cachedResp, cachedErr := setup.Client.ChatCompletionRequest(ctx, atThreshold)
+	if cachedErr != nil {
+		if cachedErr.Error != nil {
+			t.Fatalf("Second request failed: %v", cachedErr.Error.Message)
+		}
+		t.Fatalf("Second request failed: %v", cachedErr)
+	}
+	AssertCacheHit(t, &schemas.BifrostResponse{ChatResponse: cachedResp}, "direct")
+
+	// Case 2: two user/assistant pairs plus one extra user message gives
+	// 5 messages, which exceeds the threshold of 2, so nothing may be cached.
+	history := BuildConversationHistory("",
+		[]string{"Hello", "Hi there!"},
+		[]string{"How are you?", "I'm doing well!"},
+	)
+	overThreshold := CreateConversationRequest(AddUserMessage(history, "What's the weather?"), 0.7, 50)
+
+	t.Log("Testing conversation with 5 messages (exceeds threshold)...")
+	overResp, overErr := setup.Client.ChatCompletionRequest(ctx, overThreshold)
+	if overErr != nil {
+		return // see NOTE(review) on the first request
+	}
+	AssertNoCacheHit(t, &schemas.BifrostResponse{ChatResponse: overResp})
+
+	WaitForCache(setup.Plugin)
+
+	// A replay must still miss, proving the first attempt was not stored.
+	t.Log("Verifying conversation exceeding threshold was not cached...")
+	replayResp, replayErr := setup.Client.ChatCompletionRequest(ctx, overThreshold)
+	if replayErr != nil {
+		return // see NOTE(review) on the first request
+	}
+	AssertNoCacheHit(t, &schemas.BifrostResponse{ChatResponse: replayResp})
+
+	t.Log("✅ Conversation history threshold works correctly")
+}
+
+// TestConversationHistoryThresholdWithSystemPrompt verifies that, with a
+// threshold of 3, a conversation made of one system message plus two
+// user/assistant pairs (5 messages) never yields a cache hit, on either the
+// first attempt or a replay. The setup helper presumably leaves
+// ExcludeSystemPrompt at false, so the system message counts — TODO confirm.
+func TestConversationHistoryThresholdWithSystemPrompt(t *testing.T) {
+	setup := CreateTestSetupWithConversationThreshold(t, 3)
+	defer setup.Cleanup()
+
+	ctx := CreateContextWithCacheKey("test-threshold-system-prompt")
+
+	// 1 system message + 2 pairs = 5 messages, which is above the threshold.
+	request := CreateConversationRequest(
+		BuildConversationHistory(
+			"You are a helpful assistant",
+			[]string{"Hello", "Hi there!"},
+			[]string{"How are you?", "I'm doing well!"},
+		),
+		0.7, 50,
+	)
+
+	t.Log("Testing conversation with system prompt (5 total messages > 3 threshold)...")
+
+	// Send the same request twice, giving the cache time to settle in between;
+	// neither attempt may be reported as a hit.
+	for attempt := range 2 {
+		if attempt > 0 {
+			WaitForCache(setup.Plugin)
+		}
+		response, err := setup.Client.ChatCompletionRequest(ctx, request)
+		if err != nil {
+			// NOTE(review): silent return; presumably the retry helper has
+			// already marked the test skipped — confirm.
+			return
+		}
+		AssertNoCacheHit(t, &schemas.BifrostResponse{ChatResponse: response})
+	}
+
+	t.Log("✅ Conversation threshold correctly counts system messages")
+}
+
+// TestConversationHistoryThresholdWithExcludeSystemPrompt checks the boundary
+// case where threshold=3 and ExcludeSystemPrompt=true are combined: a
+// conversation of one system message plus three other messages must be cached
+// and replayed as a direct hit.
+func TestConversationHistoryThresholdWithExcludeSystemPrompt(t *testing.T) {
+	setup := CreateTestSetupWithThresholdAndExcludeSystem(t, 3, true)
+	defer setup.Cleanup()
+
+	ctx := CreateContextWithCacheKey("test-threshold-exclude-system")
+
+	// Expected message accounting:
+	//   - system prompt: excluded from the count
+	//   - "Hello"/"Hi": user + assistant = 2 messages
+	//   - "Thanks"/"":  user only = 1 message (per the original author,
+	//     BuildConversationHistory skips empty assistant entries)
+	// That leaves 3 counted messages, and 3 <= 3 should allow caching.
+	history := BuildConversationHistory(
+		"You are helpful",
+		[]string{"Hello", "Hi"},
+		[]string{"Thanks", ""},
+	)
+	request := CreateConversationRequest(history, 0.7, 50)
+
+	t.Log("Testing threshold with ExcludeSystemPrompt=true (3 non-system messages = at threshold)...")
+
+	freshResp, freshErr := setup.Client.ChatCompletionRequest(ctx, request)
+	if freshErr != nil {
+		// NOTE(review): silent return; presumably the retry helper has already
+		// marked the test skipped — confirm.
+		return
+	}
+	AssertNoCacheHit(t, &schemas.BifrostResponse{ChatResponse: freshResp})
+
+	WaitForCache(setup.Plugin)
+
+	// The replay must be served from the direct cache.
+	cachedResp, cachedErr := setup.Client.ChatCompletionRequest(ctx, request)
+	if cachedErr != nil {
+		if cachedErr.Error != nil {
+			t.Fatalf("Second request failed: %v", cachedErr.Error.Message)
+		}
+		t.Fatalf("Second request failed: %v", cachedErr)
+	}
+	AssertCacheHit(t, &schemas.BifrostResponse{ChatResponse: cachedResp}, "direct")
+
+	t.Log("✅ Conversation threshold respects ExcludeSystemPrompt setting")
+}
+
+// TestConversationHistoryThresholdDifferentValues tests different threshold values
+func TestConversationHistoryThresholdDifferentValues(t *testing.T) {
+	testCases := []struct {
+		name        string
+		threshold   int
+		messages    int
+		shouldCache bool
+	}{
+		{"Threshold 1, 1 message", 1, 1, true},
+		{"Threshold 1, 2 messages", 1, 2, false},
+		{"Threshold 5, 4 messages", 5, 4, true},
+		{"Threshold 5, 6 messages", 5, 6, false},
+	}
+
+	for _, tc := range testCases {
+		t.Run(tc.name, func(t *testing.T) {
+			setup := CreateTestSetupWithConversationThreshold(t, tc.threshold)
+			defer setup.Cleanup()
+
+			ctx := CreateContextWithCacheKey("test-threshold-" + tc.name)
+
+			// Build conversation with specified number of messages
+			var conversation []schemas.ChatMessage
+			for i := 0; i < tc.messages; i++ {
+				role := schemas.ChatMessageRoleUser
+				if i%2 == 1 {
+					role = schemas.ChatMessageRoleAssistant
+				}
+				message := schemas.ChatMessage{
+					Role: role,
+					Content: &schemas.ChatMessageContent{
+						ContentStr: bifrost.Ptr("Message " + strconv.Itoa(i+1)),
+					},
+				}
+				conversation = append(conversation, message)
+			}
+
+			request := CreateConversationRequest(conversation, 0.7, 50)
+
+			response1, err1 := setup.Client.ChatCompletionRequest(ctx, request)
+			if err1 != nil {
+				return // Test will be skipped by retry function
+			}
+			AssertNoCacheHit(t, &schemas.BifrostResponse{ChatResponse: response1}) // Always fresh first time
+
+			WaitForCache(setup.Plugin)
+
+			response2, err2 := setup.Client.ChatCompletionRequest(ctx, request)
+			if err2 != nil {
+				return // Test will be skipped by retry function
+			}
+
+			if tc.shouldCache {
+				AssertCacheHit(t, &schemas.BifrostResponse{ChatResponse: response2}, "direct")
+			} else {
+				AssertNoCacheHit(t, &schemas.BifrostResponse{ChatResponse: response2})
+			}
+		})
+	}
+
+	t.Log("✅ Different conversation threshold values work correctly")
+}
+
+// TestExcludeSystemPromptBasic verifies that with ExcludeSystemPrompt=true two
+// conversations that differ only in their system prompt resolve to the same
+// direct cache entry.
+func TestExcludeSystemPromptBasic(t *testing.T) {
+	setup := CreateTestSetupWithExcludeSystemPrompt(t, true)
+	defer setup.Cleanup()
+
+	ctx := CreateContextWithCacheKey("test-exclude-system-basic")
+
+	// Identical user/assistant exchange; only the system prompt differs.
+	exchange := []string{"What is AI?", "AI is artificial intelligence."}
+	seedRequest := CreateConversationRequest(
+		BuildConversationHistory("You are a helpful assistant", exchange), 0.7, 50,
+	)
+	probeRequest := CreateConversationRequest(
+		BuildConversationHistory("You are a technical expert", exchange), 0.7, 50,
+	)
+
+	t.Log("Caching conversation with system prompt 1...")
+	seedResp, seedErr := setup.Client.ChatCompletionRequest(ctx, seedRequest)
+	if seedErr != nil {
+		// NOTE(review): silent return; presumably the retry helper has already
+		// marked the test skipped — confirm.
+		return
+	}
+	AssertNoCacheHit(t, &schemas.BifrostResponse{ChatResponse: seedResp})
+
+	WaitForCache(setup.Plugin)
+
+	t.Log("Testing conversation with different system prompt (should hit cache due to ExcludeSystemPrompt=true)...")
+	probeResp, probeErr := setup.Client.ChatCompletionRequest(ctx, probeRequest)
+	if probeErr != nil {
+		if probeErr.Error != nil {
+			t.Fatalf("Second request failed: %v", probeErr.Error.Message)
+		}
+		t.Fatalf("Second request failed: %v", probeErr)
+	}
+	// A direct hit is expected because the system prompt is excluded from the cache key.
+	AssertCacheHit(t, &schemas.BifrostResponse{ChatResponse: probeResp}, "direct")
+
+	t.Log("✅ ExcludeSystemPrompt=true correctly ignores system prompts in cache keys")
+}
+
+// TestExcludeSystemPromptComparison contrasts ExcludeSystemPrompt=false with
+// ExcludeSystemPrompt=true using the same pair of requests (identical
+// user/assistant turns, different system prompts):
+//
+//   - false: the second request must not be a direct hit; a semantic hit or
+//     no hit at all are both accepted.
+//   - true: the second request must be a direct hit.
+func TestExcludeSystemPromptComparison(t *testing.T) {
+	// Phase 1: ExcludeSystemPrompt = false (default)
+	setup1 := CreateTestSetupWithExcludeSystemPrompt(t, false)
+	defer setup1.Cleanup()
+
+	ctx1 := CreateContextWithCacheKey("test-exclude-system-false")
+
+	exchange := []string{"Hello", "Hi there!"}
+	request1 := CreateConversationRequest(BuildConversationHistory("You are helpful", exchange), 0.7, 50)
+	request2 := CreateConversationRequest(BuildConversationHistory("You are an expert", exchange), 0.7, 50)
+
+	t.Log("Testing ExcludeSystemPrompt=false...")
+	response1, err1 := setup1.Client.ChatCompletionRequest(ctx1, request1)
+	if err1 != nil {
+		// NOTE(review): silent return; presumably the retry helper has already
+		// marked the test skipped — confirm.
+		return
+	}
+	AssertNoCacheHit(t, &schemas.BifrostResponse{ChatResponse: response1})
+
+	WaitForCache(setup1.Plugin)
+
+	response2, err2 := setup1.Client.ChatCompletionRequest(ctx1, request2)
+	if err2 != nil {
+		if err2.Error != nil {
+			t.Fatalf("Second request failed: %v", err2.Error.Message)
+		}
+		t.Fatalf("Second request failed: %v", err2)
+	}
+	// A direct hit would mean the system prompt was ignored; a semantic hit is
+	// tolerated because the remaining content is very similar.
+	switch debug := response2.ExtraFields.CacheDebug; {
+	case debug == nil || !debug.CacheHit:
+		t.Log("✅ No cache hit (system prompts create different cache keys)")
+	case debug.HitType != nil && *debug.HitType == "semantic":
+		t.Log("✅ Found semantic cache match (expected with similar content)")
+	default:
+		t.Error("❌ Unexpected direct cache hit with different system prompts")
+	}
+
+	// Phase 2: ExcludeSystemPrompt = true
+	setup2 := CreateTestSetupWithExcludeSystemPrompt(t, true)
+	defer setup2.Cleanup()
+
+	ctx2 := CreateContextWithCacheKey("test-exclude-system-true")
+
+	t.Log("Testing ExcludeSystemPrompt=true...")
+	response3, err3 := setup2.Client.ChatCompletionRequest(ctx2, request1)
+	if err3 != nil {
+		return // see NOTE(review) on the first request
+	}
+	AssertNoCacheHit(t, &schemas.BifrostResponse{ChatResponse: response3})
+
+	WaitForCache(setup2.Plugin)
+
+	response4, err4 := setup2.Client.ChatCompletionRequest(ctx2, request2)
+	if err4 != nil {
+		// Report the provider message when available, matching how the
+		// second request's failure is reported above.
+		if err4.Error != nil {
+			t.Fatalf("Fourth request failed: %v", err4.Error.Message)
+		}
+		t.Fatalf("Fourth request failed: %v", err4)
+	}
+	// A direct hit is expected because system prompts are excluded from the cache key.
+	AssertCacheHit(t, &schemas.BifrostResponse{ChatResponse: response4}, "direct")
+
+	t.Log("✅ ExcludeSystemPrompt true vs false comparison works correctly")
+}
+
+// TestExcludeSystemPromptWithMultipleSystemMessages verifies that with
+// ExcludeSystemPrompt=true two conversations with the same user/assistant
+// turns but two different system messages each still share one direct cache
+// entry.
+func TestExcludeSystemPromptWithMultipleSystemMessages(t *testing.T) {
+	setup := CreateTestSetupWithExcludeSystemPrompt(t, true)
+	defer setup.Cleanup()
+
+	ctx := CreateContextWithCacheKey("test-multiple-system-messages")
+
+	// conversationWith builds: system, system, user "Hello", assistant "Hi!".
+	// Only the two system messages vary between the conversations under test.
+	conversationWith := func(firstSystem, secondSystem string) []schemas.ChatMessage {
+		text := func(s string) *schemas.ChatMessageContent {
+			return &schemas.ChatMessageContent{ContentStr: bifrost.Ptr(s)}
+		}
+		return []schemas.ChatMessage{
+			{Role: schemas.ChatMessageRoleSystem, Content: text(firstSystem)},
+			{Role: schemas.ChatMessageRoleSystem, Content: text(secondSystem)},
+			{Role: schemas.ChatMessageRoleUser, Content: text("Hello")},
+			{Role: schemas.ChatMessageRoleAssistant, Content: text("Hi!")},
+		}
+	}
+
+	seedRequest := CreateConversationRequest(conversationWith("You are helpful", "Be concise"), 0.7, 50)
+	probeRequest := CreateConversationRequest(conversationWith("You are an expert", "Be detailed"), 0.7, 50)
+
+	t.Log("Caching conversation with multiple system messages...")
+	seedResp, seedErr := setup.Client.ChatCompletionRequest(ctx, seedRequest)
+	if seedErr != nil {
+		// NOTE(review): silent return; presumably the retry helper has already
+		// marked the test skipped — confirm.
+		return
+	}
+	AssertNoCacheHit(t, &schemas.BifrostResponse{ChatResponse: seedResp})
+
+	WaitForCache(setup.Plugin)
+
+	t.Log("Testing conversation with different multiple system messages...")
+	probeResp, probeErr := setup.Client.ChatCompletionRequest(ctx, probeRequest)
+	if probeErr != nil {
+		if probeErr.Error != nil {
+			t.Fatalf("Second request failed: %v", probeErr.Error.Message)
+		}
+		t.Fatalf("Second request failed: %v", probeErr)
+	}
+	// A direct hit is expected because every system message is excluded.
+	AssertCacheHit(t, &schemas.BifrostResponse{ChatResponse: probeResp}, "direct")
+
+	t.Log("✅ ExcludeSystemPrompt works with multiple system messages")
+}
+
+// TestExcludeSystemPromptWithNoSystemMessages verifies that enabling
+// ExcludeSystemPrompt does not disturb caching of a conversation that has no
+// system message: the replay of the same request must be a direct hit.
+func TestExcludeSystemPromptWithNoSystemMessages(t *testing.T) {
+	setup := CreateTestSetupWithExcludeSystemPrompt(t, true)
+	defer setup.Cleanup()
+
+	ctx := CreateContextWithCacheKey("test-no-system-messages")
+
+	// One user turn and one assistant turn; no system message at all.
+	userTurn := schemas.ChatMessage{
+		Role:    schemas.ChatMessageRoleUser,
+		Content: &schemas.ChatMessageContent{ContentStr: bifrost.Ptr("Hello")},
+	}
+	assistantTurn := schemas.ChatMessage{
+		Role:    schemas.ChatMessageRoleAssistant,
+		Content: &schemas.ChatMessageContent{ContentStr: bifrost.Ptr("Hi there!")},
+	}
+	request := CreateConversationRequest([]schemas.ChatMessage{userTurn, assistantTurn}, 0.7, 50)
+
+	t.Log("Testing conversation with no system messages...")
+	freshResp, freshErr := setup.Client.ChatCompletionRequest(ctx, request)
+	if freshErr != nil {
+		// NOTE(review): silent return; presumably the retry helper has already
+		// marked the test skipped — confirm.
+		return
+	}
+	AssertNoCacheHit(t, &schemas.BifrostResponse{ChatResponse: freshResp})
+
+	WaitForCache(setup.Plugin)
+
+	// The replay should be cached as usual.
+	cachedResp, cachedErr := setup.Client.ChatCompletionRequest(ctx, request)
+	if cachedErr != nil {
+		if cachedErr.Error != nil {
+			t.Fatalf("Second request failed: %v", cachedErr.Error.Message)
+		}
+		t.Fatalf("Second request failed: %v", cachedErr)
+	}
+	AssertCacheHit(t, &schemas.BifrostResponse{ChatResponse: cachedResp}, "direct")
+
+	t.Log("✅ ExcludeSystemPrompt works correctly when no system messages present")
+}
--- a/plugins/semanticcache/plugin_core_test.go
+++ b/plugins/semanticcache/plugin_core_test.go
@@ -0,0 +1,601 @@
+package semanticcache
+
+import (
+	"context"
+	"strings"
+	"testing"
+	"time"
+
+	bifrost "github.com/maximhq/bifrost/core"
+	"github.com/maximhq/bifrost/core/schemas"
+	"github.com/maximhq/bifrost/framework/vectorstore"
+)
+
+// TestSemanticCacheBasicFunctionality sends one chat request twice and checks
+// that the second response is a direct cache hit, is faster than the first by
+// at least 1.5x, has the same content, and reports the request's provider.
+func TestSemanticCacheBasicFunctionality(t *testing.T) {
+	setup := NewTestSetup(t)
+	defer setup.Cleanup()
+
+	ctx := CreateContextWithCacheKey("test-basic-value")
+	testRequest := CreateBasicChatRequest("What is Bifrost? Answer in one short sentence.", 0.7, 50)
+
+	// Cold path: the request is expected to reach the provider and be stored.
+	t.Log("Making first request (should go to OpenAI and be cached)...")
+	coldStart := time.Now()
+	coldResp, coldErr := setup.Client.ChatCompletionRequest(ctx, testRequest)
+	coldElapsed := time.Since(coldStart)
+
+	if coldErr != nil {
+		// NOTE(review): silent return; presumably the retry helper has already
+		// marked the test skipped — confirm.
+		return
+	}
+	if coldResp == nil || len(coldResp.Choices) == 0 || coldResp.Choices[0].Message.Content.ContentStr == nil {
+		t.Fatal("First response is invalid")
+	}
+	t.Logf("First request completed in %v", coldElapsed)
+	t.Logf("Response: %s", *coldResp.Choices[0].Message.Content.ContentStr)
+
+	// Give the cache write time to land before replaying.
+	WaitForCache(setup.Plugin)
+
+	// Warm path: the identical request should now come from the cache.
+	t.Log("Making second identical request (should be served from cache)...")
+	warmStart := time.Now()
+	warmResp, warmErr := setup.Client.ChatCompletionRequest(ctx, testRequest)
+	warmElapsed := time.Since(warmStart)
+
+	if warmErr != nil {
+		if warmErr.Error != nil {
+			t.Fatalf("Second request failed: %v", warmErr.Error.Message)
+		}
+		t.Fatalf("Second request failed: %v", warmErr)
+	}
+	if warmResp == nil || len(warmResp.Choices) == 0 || warmResp.Choices[0].Message.Content.ContentStr == nil {
+		t.Fatal("Second response is invalid")
+	}
+	t.Logf("Second request completed in %v", warmElapsed)
+	t.Logf("Response: %s", *warmResp.Choices[0].Message.Content.ContentStr)
+
+	AssertCacheHit(t, &schemas.BifrostResponse{ChatResponse: warmResp}, string(CacheTypeDirect))
+
+	// Timing comparison between the provider round trip and the cache lookup.
+	t.Logf("Performance Summary:")
+	t.Logf("First request (OpenAI):  %v", coldElapsed)
+	t.Logf("Second request (Cache):  %v", warmElapsed)
+
+	if warmElapsed < coldElapsed {
+		speedup := float64(coldElapsed) / float64(warmElapsed)
+		t.Logf("Cache speedup: %.2fx faster", speedup)
+
+		// Require the cached path to be at least 1.5x faster.
+		if speedup < 1.5 {
+			t.Errorf("Cache speedup is less than 1.5x: got %.2fx", speedup)
+		}
+	} else {
+		t.Errorf("Cache request took longer than original request: cache=%v, original=%v", warmElapsed, coldElapsed)
+	}
+
+	// The cached content must match what the provider originally returned.
+	coldContent := *coldResp.Choices[0].Message.Content.ContentStr
+	warmContent := *warmResp.Choices[0].Message.Content.ContentStr
+	if coldContent != warmContent {
+		t.Errorf("Response content differs between cached and original:\nOriginal: %s\nCached:   %s", coldContent, warmContent)
+	}
+
+	// The cached response must still carry the provider of the request.
+	if got, want := warmResp.ExtraFields.Provider, testRequest.Provider; got != want {
+		t.Errorf("Provider mismatch in cached response: expected %s, got %s",
+			want, got)
+	}
+
+	t.Log("✅ Basic semantic caching test completed successfully!")
+}
+
+// TestSemanticSearch tests the semantic similarity search functionality
+func TestSemanticSearch(t *testing.T) {
+	setup := NewTestSetup(t)
+	defer setup.Cleanup()
+
+	// Lower threshold for more flexible matching
+	setup.Config.Threshold = 0.5
+
+	ctx := CreateContextWithCacheKey("semantic-test-value")
+
+	// First request - this will be cached
+	firstRequest := CreateBasicChatRequest(
+		"What is machine learning? Explain briefly.",
+		0.0, // Use 0 temperature for consistent results
+		50,
+	)
+
+	t.Log("Making first request (should go to OpenAI and be cached)...")
+	start1 := time.Now()
+	response1, err1 := setup.Client.ChatCompletionRequest(ctx, firstRequest)
+	duration1 := time.Since(start1)
+
+	if err1 != nil {
+		return // Test will be skipped by retry function
+	}
+
+	if response1 == nil || len(response1.Choices) == 0 || response1.Choices[0].Message.Content.ContentStr == nil {
+		t.Fatal("First response is invalid")
+	}
+
+	t.Logf("First request completed in %v", duration1)
+	t.Logf("Response: %s", *response1.Choices[0].Message.Content.ContentStr)
+
+	// Wait for cache to be written (async PostLLMHook needs time to complete)
+	WaitForCache(setup.Plugin)
+
+	// Second request - very similar text to test semantic matching
+	secondRequest := CreateBasicChatRequest(
+		"What is machine learning? Explain it briefly.",
+		0.0, // Use 0 temperature for consistent results
+		50,
+	)
+
+	t.Log("Making semantically similar request (should be served from semantic cache)...")
+	start2 := time.Now()
+	response2, err2 := setup.Client.ChatCompletionRequest(ctx, secondRequest)
+	duration2 := time.Since(start2)
+
+	if err2 != nil {
+		if err2.Error != nil {
+			t.Fatalf("Second request failed: %v", err2.Error.Message)
+		} else {
+			t.Fatalf("Second request failed: %v", err2)
+		}
+	}
+
+	if response2 == nil || len(response2.Choices) == 0 || response2.Choices[0].Message.Content.ContentStr == nil {
+		t.Fatal("Second response is invalid")
+	}
+
+	t.Logf("Second request completed in %v", duration2)
+	t.Logf("Response: %s", *response2.Choices[0].Message.Content.ContentStr)
+
+	// Check if second request was served from semantic cache
+	semanticMatch := false
+
+	if response2.ExtraFields.CacheDebug != nil && response2.ExtraFields.CacheDebug.CacheHit {
+		if response2.ExtraFields.CacheDebug.HitType != nil && *response2.ExtraFields.CacheDebug.HitType == string(CacheTypeSemantic) {
+			semanticMatch = true
+
+			threshold := 0.0
+			similarity := 0.0
+
+			if response2.ExtraFields.CacheDebug.Threshold != nil {
+				threshold = *response2.ExtraFields.CacheDebug.Threshold
+			}
+			if response2.ExtraFields.CacheDebug.Similarity != nil {
+				similarity = *response2.ExtraFields.CacheDebug.Similarity
+			}
+
+			t.Logf("✅ Second request was served from semantic cache! Cache threshold: %f, Cache similarity: %f", threshold, similarity)
+		}
+	}
+
+	if !semanticMatch {
+		t.Error("Semantic match expected but not found")
+		return
+	}
+
+	// Performance comparison
+	t.Logf("Semantic Cache Performance:")
+	t.Logf("First request (OpenAI):     %v", duration1)
+	t.Logf("Second request (Semantic):  %v", duration2)
+
+	if duration2 < duration1 {
+		speedup := float64(duration1) / float64(duration2)
+		t.Logf("Semantic cache speedup: %.2fx faster", speedup)
+	}
+
+	t.Log("✅ Semantic search test completed successfully!")
+}
+
+// TestToFloat32Embedding checks that toFloat32Embedding keeps the element
+// count and that each output element equals float32 of the matching input.
+func TestToFloat32Embedding(t *testing.T) {
+	input := []float64{0.12345678901234568, -0.875, 1.5}
+
+	got := toFloat32Embedding(input)
+
+	if gotLen, wantLen := len(got), len(input); gotLen != wantLen {
+		t.Fatalf("expected %d elements, got %d", wantLen, gotLen)
+	}
+
+	for i := range input {
+		if want := float32(input[i]); got[i] != want {
+			t.Fatalf("expected element %d to be %v, got %v", i, want, got[i])
+		}
+	}
+}
+
+// TestFlattenToFloat32Embedding checks that flattenToFloat32Embedding
+// concatenates the rows in order as float32 values, with an empty row
+// contributing nothing.
+func TestFlattenToFloat32Embedding(t *testing.T) {
+	rows := [][]float64{
+		{0.25, 0.5},
+		{-0.75},
+		{},
+		{1.25, 2.5},
+	}
+	want := []float32{0.25, 0.5, -0.75, 1.25, 2.5}
+
+	got := flattenToFloat32Embedding(rows)
+
+	if gotLen, wantLen := len(got), len(want); gotLen != wantLen {
+		t.Fatalf("expected %d elements, got %d", wantLen, gotLen)
+	}
+
+	for i, expected := range want {
+		if got[i] != expected {
+			t.Fatalf("expected element %d to be %v, got %v", i, expected, got[i])
+		}
+	}
+}
+
+// TestDirectVsSemanticSearch tests the difference between direct hash matching and semantic search
+func TestDirectVsSemanticSearch(t *testing.T) {
+	setup := NewTestSetup(t)
+	defer setup.Cleanup()
+
+	// Lower threshold for more flexible semantic matching
+	setup.Config.Threshold = 0.2
+
+	ctx := CreateContextWithCacheKey("direct-vs-semantic-test")
+
+	// Test Case 1: Exact same request (should use direct hash matching)
+	t.Log("=== Test Case 1: Exact Same Request (Direct Hash Match) ===")
+
+	exactRequest := CreateBasicChatRequest(
+		"What is artificial intelligence?",
+		0.1,
+		100,
+	)
+
+	t.Log("Making first request...")
+	_, err1 := setup.Client.ChatCompletionRequest(ctx, exactRequest)
+	if err1 != nil {
+		return // Test will be skipped by retry function
+	}
+
+	WaitForCache(setup.Plugin)
+
+	t.Log("Making exact same request (should hit direct cache)...")
+	response2, err2 := setup.Client.ChatCompletionRequest(ctx, exactRequest)
+	if err2 != nil {
+		if err2.Error != nil {
+			t.Fatalf("Second request failed: %v", err2.Error.Message)
+		} else {
+			t.Fatalf("Second request failed: %v", err2)
+		}
+	}
+
+	// Should be a direct cache hit
+	AssertCacheHit(t, &schemas.BifrostResponse{ChatResponse: response2}, string(CacheTypeDirect))
+
+	// Test Case 2: Similar but different request (should use semantic search)
+	t.Log("\n=== Test Case 2: Semantically Similar Request ===")
+
+	semanticRequest := CreateBasicChatRequest(
+		"Can you explain what AI is?", // Similar but different wording
+		0.1,                           // Same parameters
+		100,
+	)
+
+	t.Log("Making semantically similar request...")
+	response3, err3 := setup.Client.ChatCompletionRequest(ctx, semanticRequest)
+	if err3 != nil {
+		t.Fatalf("Third request failed: %v", err3)
+	}
+
+	semanticMatch := false
+
+	// Check if it was served from cache and what type
+	if response3.ExtraFields.CacheDebug != nil && response3.ExtraFields.CacheDebug.CacheHit {
+		if response3.ExtraFields.CacheDebug.HitType != nil && *response3.ExtraFields.CacheDebug.HitType == string(CacheTypeSemantic) {
+			semanticMatch = true
+
+			threshold := 0.0
+			similarity := 0.0
+
+			if response3.ExtraFields.CacheDebug.Threshold != nil {
+				threshold = *response3.ExtraFields.CacheDebug.Threshold
+			}
+			if response3.ExtraFields.CacheDebug.Similarity != nil {
+				similarity = *response3.ExtraFields.CacheDebug.Similarity
+			}
+
+			t.Logf("✅ Third request was served from semantic cache! Cache threshold: %f, Cache similarity: %f", threshold, similarity)
+		}
+	}
+
+	if !semanticMatch {
+		t.Error("Semantic match expected but not found")
+		return
+	}
+
+	t.Log("✅ Direct vs semantic search test completed!")
+}
+
+// TestNoCacheScenarios tests scenarios where caching should NOT occur
+func TestNoCacheScenarios(t *testing.T) {
+	setup := NewTestSetup(t)
+	defer setup.Cleanup()
+
+	ctx := CreateContextWithCacheKey("no-cache-test")
+
+	// Test Case 1: Different parameters should NOT cache hit
+	t.Log("=== Test Case 1: Different Parameters ===")
+
+	basePrompt := "What is the capital of France?"
+
+	// First request
+	request1 := CreateBasicChatRequest(basePrompt, 0.1, 50)
+	_, err1 := setup.Client.ChatCompletionRequest(ctx, request1)
+	if err1 != nil {
+		return // Test will be skipped by retry function
+	}
+
+	WaitForCache(setup.Plugin)
+
+	// Second request with different temperature
+	request2 := CreateBasicChatRequest(basePrompt, 0.9, 50) // Different temperature
+	response2, err2 := setup.Client.ChatCompletionRequest(ctx, request2)
+	if err2 != nil {
+		return // Test will be skipped by retry function
+	}
+
+	// Should NOT be cached
+	AssertNoCacheHit(t, &schemas.BifrostResponse{ChatResponse: response2})
+
+	// Test Case 2: Different max_tokens should NOT cache hit
+	t.Log("\n=== Test Case 2: Different MaxTokens ===")
+
+	request3 := CreateBasicChatRequest(basePrompt, 0.1, 200) // Different max_tokens
+	response3, err3 := setup.Client.ChatCompletionRequest(ctx, request3)
+	if err3 != nil {
+		return // Test will be skipped by retry function
+	}
+
+	// Should NOT be cached
+	AssertNoCacheHit(t, &schemas.BifrostResponse{ChatResponse: response3})
+
+	t.Log("✅ No cache scenarios test completed!")
+}
+
+// TestCacheConfiguration builds a test setup from several Config variants
+// (high threshold, low threshold, custom TTL) and checks that a request and
+// its replay both complete; the replay failing is fatal. Cache-hit status is
+// not asserted.
+func TestCacheConfiguration(t *testing.T) {
+	// baseConfig returns a fresh Config holding the fields shared by every
+	// case; each case sets its own Threshold (and TTL) on top of it.
+	baseConfig := func() *Config {
+		return &Config{
+			Provider:       schemas.OpenAI,
+			EmbeddingModel: "text-embedding-3-small",
+			Dimension:      1536,
+			Keys: []schemas.Key{
+				{Value: *schemas.NewEnvVar("env.OPENAI_API_KEY"), Models: schemas.WhiteList{"*"}, Weight: 1.0},
+			},
+		}
+	}
+
+	highThreshold := baseConfig()
+	highThreshold.Threshold = 0.95 // very high threshold
+
+	lowThreshold := baseConfig()
+	lowThreshold.Threshold = 0.1 // very low threshold
+
+	customTTL := baseConfig()
+	customTTL.Threshold = 0.8
+	customTTL.TTL = 1 * time.Hour
+
+	// NOTE(review): expectedBehavior is set on every case but is not read
+	// anywhere in this test.
+	tests := []struct {
+		name             string
+		config           *Config
+		expectedBehavior string
+	}{
+		{name: "High Threshold", config: highThreshold, expectedBehavior: "strict_matching"},
+		{name: "Low Threshold", config: lowThreshold, expectedBehavior: "loose_matching"},
+		{name: "Custom TTL", config: customTTL, expectedBehavior: "custom_ttl"},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			setup := NewTestSetupWithConfig(t, tt.config)
+			defer setup.Cleanup()
+
+			ctx := CreateContextWithCacheKey("config-test-" + tt.name)
+
+			// Smoke test: one request, then the same request again.
+			testRequest := CreateBasicChatRequest("Test configuration: "+tt.name, 0.5, 50)
+
+			if _, err := setup.Client.ChatCompletionRequest(ctx, testRequest); err != nil {
+				// NOTE(review): silent return; presumably the retry helper has
+				// already marked the test skipped — confirm.
+				return
+			}
+
+			WaitForCache(setup.Plugin)
+
+			if _, err := setup.Client.ChatCompletionRequest(ctx, testRequest); err != nil {
+				if err.Error != nil {
+					t.Fatalf("Second request failed: %v", err.Error.Message)
+				}
+				t.Fatalf("Second request failed: %v", err)
+			}
+
+			t.Logf("✅ Configuration test '%s' completed", tt.name)
+		})
+	}
+}
+
+// MockUnsupportedStore is a stateless vector-store stub for tests. Ping,
+// DeleteNamespace and Close succeed, RequiresVectors reports false, and every
+// other operation returns vectorstore.ErrNotSupported.
+type MockUnsupportedStore struct{}
+
+// Ping always succeeds.
+func (s *MockUnsupportedStore) Ping(ctx context.Context) error {
+	return nil
+}
+
+// CreateNamespace is not supported by this stub.
+func (s *MockUnsupportedStore) CreateNamespace(ctx context.Context, namespace string, dimension int, properties map[string]vectorstore.VectorStoreProperties) error {
+	return vectorstore.ErrNotSupported
+}
+
+// DeleteNamespace always succeeds without doing anything.
+func (s *MockUnsupportedStore) DeleteNamespace(ctx context.Context, namespace string) error {
+	return nil
+}
+
+// GetChunk is not supported; it returns a zero SearchResult with the error.
+func (s *MockUnsupportedStore) GetChunk(ctx context.Context, namespace string, id string) (vectorstore.SearchResult, error) {
+	return vectorstore.SearchResult{}, vectorstore.ErrNotSupported
+}
+
+// GetChunks is not supported by this stub.
+func (s *MockUnsupportedStore) GetChunks(ctx context.Context, namespace string, ids []string) ([]vectorstore.SearchResult, error) {
+	return nil, vectorstore.ErrNotSupported
+}
+
+// GetAll is not supported; no results and no cursor are returned.
+func (s *MockUnsupportedStore) GetAll(ctx context.Context, namespace string, queries []vectorstore.Query, selectFields []string, cursor *string, limit int64) ([]vectorstore.SearchResult, *string, error) {
+	return nil, nil, vectorstore.ErrNotSupported
+}
+
+// GetNearest is not supported by this stub.
+func (s *MockUnsupportedStore) GetNearest(ctx context.Context, namespace string, vector []float32, queries []vectorstore.Query, selectFields []string, threshold float64, limit int64) ([]vectorstore.SearchResult, error) {
+	return nil, vectorstore.ErrNotSupported
+}
+
+// RequiresVectors always reports false.
+func (s *MockUnsupportedStore) RequiresVectors() bool {
+	return false
+}
+
+// Add is not supported by this stub.
+func (s *MockUnsupportedStore) Add(ctx context.Context, namespace string, id string, embedding []float32, metadata map[string]any) error {
+	return vectorstore.ErrNotSupported
+}
+
+// Delete is not supported by this stub.
+func (s *MockUnsupportedStore) Delete(ctx context.Context, namespace string, id string) error {
+	return vectorstore.ErrNotSupported
+}
+
+// DeleteAll is not supported by this stub.
+func (s *MockUnsupportedStore) DeleteAll(ctx context.Context, namespace string, queries []vectorstore.Query) ([]vectorstore.DeleteResult, error) {
+	return nil, vectorstore.ErrNotSupported
+}
+
+// SearchSemanticCache is not supported by this stub.
+func (s *MockUnsupportedStore) SearchSemanticCache(ctx context.Context, queryEmbedding []float32, metadata map[string]any, threshold float64, limit int64) ([]vectorstore.SearchResult, error) {
+	return nil, vectorstore.ErrNotSupported
+}
+
+// AddSemanticCache is not supported by this stub.
+func (s *MockUnsupportedStore) AddSemanticCache(ctx context.Context, key string, embedding []float32, metadata map[string]any, ttl time.Duration) error {
+	return vectorstore.ErrNotSupported
+}
+
+// EnsureSemanticIndex is not supported by this stub.
+func (s *MockUnsupportedStore) EnsureSemanticIndex(ctx context.Context, keyPrefix string, embeddingDim int, metadataFields []string) error {
+	return vectorstore.ErrNotSupported
+}
+
+// Close always succeeds without doing anything.
+func (s *MockUnsupportedStore) Close(ctx context.Context, namespace string) error {
+	return nil
+}
+
+// TestInvalidProviderRejection tests that providers without embedding support are rejected during initialization
+func TestInvalidProviderRejection(t *testing.T) {
+	ctx := schemas.NewBifrostContext(context.Background(), schemas.NoDeadline)
+	logger := bifrost.NewDefaultLogger(schemas.LogLevelDebug)
+
+	// Create a mock vector store for testing
+	mockStore := &MockUnsupportedStore{}
+
+	// Test each provider that doesn't support embeddings
+	unsupportedProviders := []schemas.ModelProvider{
+		schemas.Anthropic,
+		schemas.Cerebras,
+		schemas.Groq,
+		schemas.OpenRouter,
+		schemas.Parasail,
+		schemas.Perplexity,
+		schemas.Replicate,
+		schemas.XAI,
+		schemas.Elevenlabs,
+	}
+
+	for _, provider := range unsupportedProviders {
+		t.Run(string(provider), func(t *testing.T) {
+			config := &Config{
+				Provider:          provider,
+				EmbeddingModel:    "some-model",
+				Dimension:         1536,
+				Threshold:         0.8,
+				CleanUpOnShutdown: false,
+				Keys: []schemas.Key{
+					{
+						Value:  *schemas.NewEnvVar("env.TEST_API_KEY"),
+						Models: schemas.WhiteList{"*"},
+						Weight: 1.0,
+					},
+				},
+			}
+
+			_, err := Init(ctx, config, logger, mockStore)
+			if err == nil {
+				t.Errorf("Expected error for provider '%s' but got none", provider)
+			}
+
+			expectedErrSubstring := "does not support embedding operations"
+			if err != nil && !strings.Contains(err.Error(), expectedErrSubstring) {
+				t.Errorf("Expected error message to contain '%s', but got: %v", expectedErrSubstring, err)
+			}
+		})
+	}
+}
+
+// TestValidProviderAccepted checks that Init does not reject OpenAI for lacking
+// embedding support. Any other Init error is tolerated; with the stub store the
+// call is expected to fail later, at namespace creation.
+func TestValidProviderAccepted(t *testing.T) {
+	ctx := schemas.NewBifrostContext(context.Background(), schemas.NoDeadline)
+	logger := bifrost.NewDefaultLogger(schemas.LogLevelDebug)
+
+	// Stub store standing in for a real vector store.
+	store := &MockUnsupportedStore{}
+
+	// OpenAI is the supported provider under test.
+	cfg := &Config{
+		Provider:          schemas.OpenAI,
+		EmbeddingModel:    "text-embedding-3-small",
+		Dimension:         1536,
+		Threshold:         0.8,
+		CleanUpOnShutdown: false,
+		Keys: []schemas.Key{
+			{
+				Value:  *schemas.NewEnvVar("env.OPENAI_API_KEY"),
+				Models: schemas.WhiteList{"*"},
+				Weight: 1.0,
+			},
+		},
+	}
+
+	_, err := Init(ctx, cfg, logger, store)
+	if err == nil {
+		return
+	}
+
+	// Only a provider-validation failure counts as a test failure here.
+	if strings.Contains(err.Error(), "does not support embedding operations") {
+		t.Errorf("Valid provider OpenAI should not be rejected for embedding support, but got: %v", err)
+	}
+}
--- a/plugins/semanticcache/plugin_cross_cache_test.go
+++ b/plugins/semanticcache/plugin_cross_cache_test.go
@@ -0,0 +1,327 @@
+package semanticcache
+
+import (
+	"testing"
+
+	"github.com/maximhq/bifrost/core/schemas"
+)
+
+// TestCrossCacheTypeAccessibility verifies that an entry written with the default
+// cache behavior can afterwards be read back through a direct-only lookup and
+// through a semantic-only lookup under the same cache key.
+func TestCrossCacheTypeAccessibility(t *testing.T) {
+	setup := NewTestSetup(t)
+	defer setup.Cleanup()
+
+	testRequest := CreateBasicChatRequest("What is artificial intelligence?", 0.7, 100)
+
+	// Test 1: Cache with default behavior (both direct + semantic)
+	ctx1 := CreateContextWithCacheKey("test-cross-cache-access")
+	t.Log("Caching with default behavior (both direct + semantic)...")
+	response1, err1 := setup.Client.ChatCompletionRequest(ctx1, testRequest)
+	if err1 != nil {
+		// NOTE(review): this returns without calling t.Skip, so the test is
+		// reported as passed when the first request fails; confirm that is intended.
+		return // Test will be skipped by retry function
+	}
+	AssertNoCacheHit(t, &schemas.BifrostResponse{ChatResponse: response1})
+
+	WaitForCache(setup.Plugin)
+
+	// Test 2: Retrieve with direct-only cache type
+	ctx2 := CreateContextWithCacheKeyAndType("test-cross-cache-access", CacheTypeDirect)
+	t.Log("Retrieving with CacheTypeKey=direct...")
+	response2, err2 := setup.Client.ChatCompletionRequest(ctx2, testRequest)
+	if err2 != nil {
+		if err2.Error != nil {
+			t.Fatalf("Second request failed: %v", err2.Error.Message)
+		}
+		t.Fatalf("Second request failed: %v", err2)
+	}
+	AssertCacheHit(t, &schemas.BifrostResponse{ChatResponse: response2}, "direct") // Should find direct match
+
+	// Test 3: Retrieve with semantic-only cache type
+	ctx3 := CreateContextWithCacheKeyAndType("test-cross-cache-access", CacheTypeSemantic)
+	t.Log("Retrieving with CacheTypeKey=semantic...")
+	response3, err3 := setup.Client.ChatCompletionRequest(ctx3, testRequest)
+	if err3 != nil {
+		// Report the nested message when present; formatting the outer error
+		// pointer with %v shows the nested Error field only as an address.
+		if err3.Error != nil {
+			t.Fatalf("Third request failed: %v", err3.Error.Message)
+		}
+		t.Fatalf("Third request failed: %v", err3)
+	}
+	AssertCacheHit(t, &schemas.BifrostResponse{ChatResponse: response3}, "semantic") // Should find semantic match
+
+	t.Log("✅ Entries cached with default behavior are accessible via both cache types")
+}
+
+// TestCacheTypeIsolation caches a request with the direct-only cache type and
+// then checks that a semantic-only lookup misses while direct-only and default
+// lookups both report a direct hit.
+func TestCacheTypeIsolation(t *testing.T) {
+	setup := NewTestSetup(t)
+	defer setup.Cleanup()
+
+	testRequest := CreateBasicChatRequest("Define blockchain technology", 0.7, 100)
+
+	// Clear cache to start fresh
+	clearTestKeysWithStore(t, setup.Store)
+
+	// Test 1: Cache with direct-only
+	ctx1 := CreateContextWithCacheKeyAndType("test-cache-isolation", CacheTypeDirect)
+	t.Log("Caching with CacheTypeKey=direct only...")
+	response1, err1 := setup.Client.ChatCompletionRequest(ctx1, testRequest)
+	if err1 != nil {
+		// NOTE(review): this returns without calling t.Skip, so the test is
+		// reported as passed when the request fails; confirm that is intended.
+		return // Test will be skipped by retry function
+	}
+	AssertNoCacheHit(t, &schemas.BifrostResponse{ChatResponse: response1}) // Fresh request
+
+	WaitForCache(setup.Plugin)
+
+	// Test 2: Try to retrieve with semantic-only (should miss because no semantic entry)
+	ctx2 := CreateContextWithCacheKeyAndType("test-cache-isolation", CacheTypeSemantic)
+	t.Log("Retrieving same request with CacheTypeKey=semantic (should miss)...")
+	response2, err2 := setup.Client.ChatCompletionRequest(ctx2, testRequest)
+	if err2 != nil {
+		return // Test will be skipped by retry function
+	}
+	AssertNoCacheHit(t, &schemas.BifrostResponse{ChatResponse: response2}) // Should miss - no semantic cache entry
+
+	WaitForCache(setup.Plugin)
+
+	// Test 3: Retrieve with direct-only (should hit)
+	t.Log("Retrieving with CacheTypeKey=direct (should hit)...")
+	response3, err3 := setup.Client.ChatCompletionRequest(ctx1, testRequest)
+	if err3 != nil {
+		// Report the nested message when present; formatting the outer error
+		// pointer with %v shows the nested Error field only as an address.
+		if err3.Error != nil {
+			t.Fatalf("Third request failed: %v", err3.Error.Message)
+		}
+		t.Fatalf("Third request failed: %v", err3)
+	}
+	AssertCacheHit(t, &schemas.BifrostResponse{ChatResponse: response3}, "direct") // Should hit direct cache
+
+	// Test 4: Default behavior (should find the direct cache)
+	ctx4 := CreateContextWithCacheKey("test-cache-isolation")
+	t.Log("Retrieving with default behavior (should find direct cache)...")
+	response4, err4 := setup.Client.ChatCompletionRequest(ctx4, testRequest)
+	if err4 != nil {
+		if err4.Error != nil {
+			t.Fatalf("Fourth request failed: %v", err4.Error.Message)
+		}
+		t.Fatalf("Fourth request failed: %v", err4)
+	}
+	AssertCacheHit(t, &schemas.BifrostResponse{ChatResponse: response4}, "direct") // Should find existing direct cache
+
+	t.Log("✅ Cache type isolation works correctly")
+}
+
+// TestCacheTypeFallbackBehavior caches one prompt with the default behavior and
+// then sends reworded prompts under the same cache key: direct-only must miss,
+// while semantic-only and default lookups may either hit semantically or miss.
+func TestCacheTypeFallbackBehavior(t *testing.T) {
+	setup := NewTestSetup(t)
+	defer setup.Cleanup()
+
+	// Cache an entry with default behavior
+	originalRequest := CreateBasicChatRequest("Explain machine learning", 0.7, 100)
+	ctx1 := CreateContextWithCacheKey("test-fallback-behavior")
+
+	t.Log("Caching with default behavior...")
+	response1, err1 := setup.Client.ChatCompletionRequest(ctx1, originalRequest)
+	if err1 != nil {
+		// NOTE(review): this returns without calling t.Skip, so the test is
+		// reported as passed when the request fails; confirm that is intended.
+		return // Test will be skipped by retry function
+	}
+	AssertNoCacheHit(t, &schemas.BifrostResponse{ChatResponse: response1})
+
+	WaitForCache(setup.Plugin)
+
+	// Test similar request with direct-only (should miss direct, no fallback, but should cache response)
+	similarRequest := CreateBasicChatRequest("Explain machine learning concepts", 0.7, 100)
+	ctx2 := CreateContextWithCacheKeyAndType("test-fallback-behavior", CacheTypeDirect)
+
+	t.Log("Testing similar request with CacheTypeKey=direct (should miss, make request, cache without embeddings)...")
+	response2, err2 := setup.Client.ChatCompletionRequest(ctx2, similarRequest)
+	if err2 != nil {
+		return // Test will be skipped by retry function
+	}
+	AssertNoCacheHit(t, &schemas.BifrostResponse{ChatResponse: response2}) // Should miss - no direct match, no semantic search
+
+	WaitForCache(setup.Plugin) // Let the response get cached
+
+	// Test same similar request with semantic-only (should hit original entry)
+	ctx3 := CreateContextWithCacheKeyAndType("test-fallback-behavior", CacheTypeSemantic)
+
+	t.Log("Testing similar request with CacheTypeKey=semantic (should find semantic match from step 1)...")
+	response3, err3 := setup.Client.ChatCompletionRequest(ctx3, similarRequest)
+	if err3 != nil {
+		// Report the nested message when present; formatting the outer error
+		// pointer with %v shows the nested Error field only as an address.
+		if err3.Error != nil {
+			t.Fatalf("Third request failed: %v", err3.Error.Message)
+		}
+		t.Fatalf("Third request failed: %v", err3)
+	}
+
+	// Both outcomes are accepted: a hit must be semantic, otherwise it must be a clean miss.
+	if response3.ExtraFields.CacheDebug != nil && response3.ExtraFields.CacheDebug.CacheHit {
+		AssertCacheHit(t, &schemas.BifrostResponse{ChatResponse: response3}, "semantic")
+		t.Log("✅ Semantic search found similar entry from step 1")
+	} else {
+		AssertNoCacheHit(t, &schemas.BifrostResponse{ChatResponse: response3})
+		t.Log("ℹ️  No semantic match found (threshold may be too high or semantic similarity low)")
+	}
+
+	// Test a different similar request with default behavior (try both, fallback to semantic)
+	// Use a slightly different request to avoid hitting the cached response from step 2
+	differentSimilarRequest := CreateBasicChatRequest("Explain the basics of machine learning", 0.7, 100)
+	ctx4 := CreateContextWithCacheKey("test-fallback-behavior")
+
+	t.Log("Testing different similar request with default behavior (direct miss -> semantic fallback)...")
+	response4, err4 := setup.Client.ChatCompletionRequest(ctx4, differentSimilarRequest)
+	if err4 != nil {
+		if err4.Error != nil {
+			t.Fatalf("Fourth request failed: %v", err4.Error.Message)
+		}
+		t.Fatalf("Fourth request failed: %v", err4)
+	}
+
+	// Should try direct first (miss), then semantic (might hit)
+	if response4.ExtraFields.CacheDebug != nil && response4.ExtraFields.CacheDebug.CacheHit {
+		AssertCacheHit(t, &schemas.BifrostResponse{ChatResponse: response4}, "semantic")
+		t.Log("✅ Default behavior found semantic fallback")
+	} else {
+		AssertNoCacheHit(t, &schemas.BifrostResponse{ChatResponse: response4})
+		t.Log("ℹ️  No fallback match found")
+	}
+
+	t.Log("✅ Cache type fallback behavior verified")
+}
+
+// TestMultipleCacheEntriesPriority caches one request with the default behavior
+// and checks that a repeat returns identical content as a direct hit, that a
+// direct-only lookup hits as direct, and that a semantic-only lookup hits as semantic.
+func TestMultipleCacheEntriesPriority(t *testing.T) {
+	setup := NewTestSetup(t)
+	defer setup.Cleanup()
+
+	testRequest := CreateBasicChatRequest("What is deep learning?", 0.7, 100)
+
+	// Create cache entry with default behavior first
+	ctx1 := CreateContextWithCacheKey("test-cache-priority")
+	t.Log("Creating cache entry with default behavior...")
+	response1, err1 := setup.Client.ChatCompletionRequest(ctx1, testRequest)
+	if err1 != nil {
+		// NOTE(review): this returns without calling t.Skip, so the test is
+		// reported as passed when the request fails; confirm that is intended.
+		return // Test will be skipped by retry function
+	}
+	// Fail cleanly instead of panicking on the ContentStr dereference below.
+	if response1 == nil || len(response1.Choices) == 0 || response1.Choices[0].Message.Content.ContentStr == nil {
+		t.Fatal("First response is invalid")
+	}
+	AssertNoCacheHit(t, &schemas.BifrostResponse{ChatResponse: response1})
+	originalContent := *response1.Choices[0].Message.Content.ContentStr
+
+	WaitForCache(setup.Plugin)
+
+	// Verify it hits cache with default behavior
+	t.Log("Verifying cache hit with default behavior...")
+	response2, err2 := setup.Client.ChatCompletionRequest(ctx1, testRequest)
+	if err2 != nil {
+		if err2.Error != nil {
+			t.Fatalf("Second request failed: %v", err2.Error.Message)
+		}
+		t.Fatalf("Second request failed: %v", err2)
+	}
+	if response2 == nil || len(response2.Choices) == 0 || response2.Choices[0].Message.Content.ContentStr == nil {
+		t.Fatal("Second response is invalid")
+	}
+	AssertCacheHit(t, &schemas.BifrostResponse{ChatResponse: response2}, "direct") // Should hit direct cache
+	cachedContent := *response2.Choices[0].Message.Content.ContentStr
+
+	// Verify content is the same
+	if originalContent != cachedContent {
+		t.Errorf("Cache content mismatch:\nOriginal: %s\nCached: %s", originalContent, cachedContent)
+	}
+
+	// Test with direct-only access
+	ctx2 := CreateContextWithCacheKeyAndType("test-cache-priority", CacheTypeDirect)
+	t.Log("Accessing with CacheTypeKey=direct...")
+	response3, err3 := setup.Client.ChatCompletionRequest(ctx2, testRequest)
+	if err3 != nil {
+		// Report the nested message when present; formatting the outer error
+		// pointer with %v shows the nested Error field only as an address.
+		if err3.Error != nil {
+			t.Fatalf("Third request failed: %v", err3.Error.Message)
+		}
+		t.Fatalf("Third request failed: %v", err3)
+	}
+	AssertCacheHit(t, &schemas.BifrostResponse{ChatResponse: response3}, "direct") // Should find direct cache
+
+	// Test with semantic-only access
+	ctx3 := CreateContextWithCacheKeyAndType("test-cache-priority", CacheTypeSemantic)
+	t.Log("Accessing with CacheTypeKey=semantic...")
+	response4, err4 := setup.Client.ChatCompletionRequest(ctx3, testRequest)
+	if err4 != nil {
+		if err4.Error != nil {
+			t.Fatalf("Fourth request failed: %v", err4.Error.Message)
+		}
+		t.Fatalf("Fourth request failed: %v", err4)
+	}
+	AssertCacheHit(t, &schemas.BifrostResponse{ChatResponse: response4}, "semantic") // Should find semantic cache
+
+	t.Log("✅ Multiple cache entries accessible correctly")
+}
+
+// TestCrossCacheTypeWithDifferentParameters tests cache type behavior with parameter variations
+func TestCrossCacheTypeWithDifferentParameters(t *testing.T) {
+	setup := NewTestSetup(t)
+	defer setup.Cleanup()
+
+	baseMessage := "Explain quantum computing"
+
+	// Cache with specific parameters
+	request1 := CreateBasicChatRequest(baseMessage, 0.7, 100)
+	ctx1 := CreateContextWithCacheKey("test-cross-cache-params")
+
+	t.Log("Caching with temp=0.7, max_tokens=100...")
+	response1, err1 := setup.Client.ChatCompletionRequest(ctx1, request1)
+	if err1 != nil {
+		return // Test will be skipped by retry function
+	}
+	AssertNoCacheHit(t, &schemas.BifrostResponse{ChatResponse: response1})
+
+	WaitForCache(setup.Plugin)
+
+	// Test same parameters with direct-only
+	ctx2 := CreateContextWithCacheKeyAndType("test-cross-cache-params", CacheTypeDirect)
+	t.Log("Retrieving same parameters with CacheTypeKey=direct...")
+	response2, err2 := setup.Client.ChatCompletionRequest(ctx2, request1)
+	if err2 != nil {
+		if err2.Error != nil {
+			t.Fatalf("Second request failed: %v", err2.Error.Message)
+		} else {
+			t.Fatalf("Second request failed: %v", err2)
+		}
+	}
+	AssertCacheHit(t, &schemas.BifrostResponse{ChatResponse: response2}, "direct") // Should hit
+
+	// Test different parameters - should miss
+	request3 := CreateBasicChatRequest(baseMessage, 0.5, 200) // Different temp and tokens
+	t.Log("Testing different parameters (should miss)...")
+	response3, err3 := setup.Client.ChatCompletionRequest(ctx2, request3)
+	if err3 != nil {
+		return // Test will be skipped by retry function
+	}
+	AssertNoCacheHit(t, &schemas.BifrostResponse{ChatResponse: response3}) // Should miss due to different params
+
+	// Test semantic search with different parameters
+	ctx4 := CreateContextWithCacheKeyAndType("test-cross-cache-params", CacheTypeSemantic)
+	similarRequest := CreateBasicChatRequest("Can you explain quantum computing", 0.5, 200)
+
+	t.Log("Testing semantic search with different params and similar message...")
+	response4, err4 := setup.Client.ChatCompletionRequest(ctx4, similarRequest)
+	if err4 != nil {
+		return // Test will be skipped by retry function
+	}
+	// Should miss semantic search due to different parameters (params_hash different)
+	AssertNoCacheHit(t, &schemas.BifrostResponse{ChatResponse: response4})
+
+	t.Log("✅ Cross-cache-type parameter handling works correctly")
+}
+
+// TestCacheTypeErrorHandling tests error scenarios with cache types
+func TestCacheTypeErrorHandling(t *testing.T) {
+	setup := NewTestSetup(t)
+	defer setup.Cleanup()
+
+	testRequest := CreateBasicChatRequest("Test error handling", 0.7, 50)
+
+	// Test invalid cache type (should fallback to default)
+	ctx1 := CreateContextWithCacheKey("test-cache-error-handling")
+	ctx1 = ctx1.WithValue(CacheTypeKey, "invalid_cache_type")
+
+	t.Log("Testing invalid cache type (should fallback to default behavior)...")
+	response1, err1 := setup.Client.ChatCompletionRequest(ctx1, testRequest)
+	if err1 != nil {
+		return // Test will be skipped by retry function
+	}
+	AssertNoCacheHit(t, &schemas.BifrostResponse{ChatResponse: response1}) // Should work with fallback behavior
+
+	WaitForCache(setup.Plugin)
+
+	// Test nil cache type (should use default)
+	ctx2 := CreateContextWithCacheKey("test-cache-error-handling")
+	ctx2 = ctx2.WithValue(CacheTypeKey, nil)
+
+	t.Log("Testing nil cache type (should use default behavior)...")
+	response2, err2 := setup.Client.ChatCompletionRequest(ctx2, testRequest)
+	if err2 != nil {
+		if err2.Error != nil {
+			t.Fatalf("Second request failed: %v", err2.Error.Message)
+		} else {
+			t.Fatalf("Second request failed: %v", err2)
+		}
+	}
+	AssertCacheHit(t, &schemas.BifrostResponse{ChatResponse: response2}, "direct") // Should find cached entry from first request
+
+	t.Log("✅ Cache type error handling works correctly")
+}
--- a/plugins/semanticcache/plugin_default_cache_key_test.go
+++ b/plugins/semanticcache/plugin_default_cache_key_test.go
@@ -0,0 +1,133 @@
+package semanticcache
+
+import (
+	"context"
+	"testing"
+
+	"github.com/maximhq/bifrost/core/schemas"
+)
+
+// TestDefaultCacheKey_CachesWithoutPerRequestKey configures a DefaultCacheKey and
+// sends the same request twice on contexts that carry no cache key, expecting a
+// miss on the first call and a direct cache hit on the second.
+func TestDefaultCacheKey_CachesWithoutPerRequestKey(t *testing.T) {
+	cfg := getDefaultTestConfig()
+	cfg.DefaultCacheKey = "test-default-key"
+
+	setup := NewTestSetupWithConfig(t, cfg)
+	defer setup.Cleanup()
+
+	// Neither context below carries a per-request cache key.
+	firstCtx := schemas.NewBifrostContext(context.Background(), schemas.NoDeadline)
+
+	request := CreateBasicChatRequest("What is Bifrost? Answer in one short sentence.", 0.7, 50)
+
+	t.Log("Making first request without per-request cache key (should use default and be cached)...")
+	firstResp, firstErr := setup.Client.ChatCompletionRequest(firstCtx, request)
+	if firstErr != nil {
+		// Returns without failing; per the original note the retry function
+		// is expected to take care of skipping.
+		return
+	}
+
+	firstRespUsable := firstResp != nil && len(firstResp.Choices) > 0 && firstResp.Choices[0].Message.Content.ContentStr != nil
+	if !firstRespUsable {
+		t.Fatal("First response is invalid")
+	}
+
+	// Nothing is cached yet, so the first call must be a miss.
+	AssertNoCacheHit(t, &schemas.BifrostResponse{ChatResponse: firstResp})
+
+	WaitForCache(setup.Plugin)
+
+	t.Log("Making second identical request without per-request cache key (should hit cache)...")
+	secondCtx := schemas.NewBifrostContext(context.Background(), schemas.NoDeadline)
+	secondResp, secondErr := setup.Client.ChatCompletionRequest(secondCtx, request)
+	if secondErr != nil {
+		if secondErr.Error != nil {
+			t.Fatalf("Second request failed: %v", secondErr.Error.Message)
+		}
+		t.Fatalf("Second request failed: %v", secondErr)
+	}
+
+	AssertCacheHit(t, &schemas.BifrostResponse{ChatResponse: secondResp}, string(CacheTypeDirect))
+	t.Log("Default cache key correctly enabled caching without per-request key")
+}
+
+// TestDefaultCacheKey_PerRequestKeyOverridesDefault populates the cache through
+// the configured DefaultCacheKey, confirms the entry is hit without a
+// per-request key, and then expects a miss when the same request carries a
+// different explicit cache key.
+func TestDefaultCacheKey_PerRequestKeyOverridesDefault(t *testing.T) {
+	cfg := getDefaultTestConfig()
+	cfg.DefaultCacheKey = "test-default-key"
+
+	setup := NewTestSetupWithConfig(t, cfg)
+	defer setup.Cleanup()
+
+	request := CreateBasicChatRequest("What is the capital of France?", 0.5, 50)
+
+	// Seed the cache: no per-request key, so the default key applies.
+	seedCtx := schemas.NewBifrostContext(context.Background(), schemas.NoDeadline)
+	if _, seedErr := setup.Client.ChatCompletionRequest(seedCtx, request); seedErr != nil {
+		// Returns without failing; per the original note the retry function
+		// is expected to take care of skipping.
+		return
+	}
+
+	WaitForCache(setup.Plugin)
+
+	// Confirm the seeded entry is reachable through the default key.
+	verifyCtx := schemas.NewBifrostContext(context.Background(), schemas.NoDeadline)
+	verifyResp, verifyErr := setup.Client.ChatCompletionRequest(verifyCtx, request)
+	if verifyErr != nil {
+		if verifyErr.Error != nil {
+			t.Fatalf("Default-key verification request failed: %v", verifyErr.Error.Message)
+		}
+		t.Fatalf("Default-key verification request failed: %v", verifyErr)
+	}
+	AssertCacheHit(t, &schemas.BifrostResponse{ChatResponse: verifyResp}, string(CacheTypeDirect))
+
+	// Identical request under a different explicit key: must miss.
+	overrideCtx := CreateContextWithCacheKey("override-key")
+	overrideResp, overrideErr := setup.Client.ChatCompletionRequest(overrideCtx, request)
+	if overrideErr != nil {
+		if overrideErr.Error != nil {
+			t.Fatalf("Second request failed: %v", overrideErr.Error.Message)
+		}
+		t.Fatalf("Second request failed: %v", overrideErr)
+	}
+
+	AssertNoCacheHit(t, &schemas.BifrostResponse{ChatResponse: overrideResp})
+	t.Log("Per-request cache key correctly overrides default (different namespace = cache miss)")
+}
+
+// TestDefaultCacheKey_EmptyDefault_NoCaching leaves DefaultCacheKey at its zero
+// value and sends the same request twice on contexts without a cache key,
+// expecting both responses to be cache misses.
+func TestDefaultCacheKey_EmptyDefault_NoCaching(t *testing.T) {
+	// The config is used as returned; DefaultCacheKey is deliberately not set.
+	cfg := getDefaultTestConfig()
+
+	setup := NewTestSetupWithConfig(t, cfg)
+	defer setup.Cleanup()
+
+	firstCtx := schemas.NewBifrostContext(context.Background(), schemas.NoDeadline)
+
+	request := CreateBasicChatRequest("What is deep learning", 0.7, 50)
+
+	t.Log("Making first request without any cache key and no default (should not cache)...")
+	firstResp, firstErr := setup.Client.ChatCompletionRequest(firstCtx, request)
+	if firstErr != nil {
+		// Returns without failing; per the original note the retry function
+		// is expected to take care of skipping.
+		return
+	}
+
+	AssertNoCacheHit(t, &schemas.BifrostResponse{ChatResponse: firstResp})
+
+	WaitForCache(setup.Plugin)
+
+	t.Log("Making second identical request (should still not cache)...")
+	secondCtx := schemas.NewBifrostContext(context.Background(), schemas.NoDeadline)
+	secondResp, secondErr := setup.Client.ChatCompletionRequest(secondCtx, request)
+	if secondErr != nil {
+		if secondErr.Error != nil {
+			t.Fatalf("Second request failed: %v", secondErr.Error.Message)
+		}
+		t.Fatalf("Second request failed: %v", secondErr)
+	}
+
+	AssertNoCacheHit(t, &schemas.BifrostResponse{ChatResponse: secondResp})
+	t.Log("Empty default cache key correctly preserves opt-in behavior")
+}
--- a/plugins/semanticcache/plugin_edge_cases_test.go
+++ b/plugins/semanticcache/plugin_edge_cases_test.go
@@ -0,0 +1,622 @@
+package semanticcache
+
+import (
+	"context"
+	"strings"
+	"testing"
+
+	bifrost "github.com/maximhq/bifrost/core"
+	"github.com/maximhq/bifrost/core/schemas"
+)
+
+// TestParameterVariations tests that different parameters don't cache hit inappropriately
+func TestParameterVariations(t *testing.T) {
+	setup := NewTestSetup(t)
+	defer setup.Cleanup()
+
+	basePrompt := "What is the capital of France?"
+
+	tests := []struct {
+		name        string
+		request1    *schemas.BifrostChatRequest
+		request2    *schemas.BifrostChatRequest
+		shouldCache bool
+	}{
+		{
+			name:        "Same Parameters",
+			request1:    CreateBasicChatRequest(basePrompt, 0.5, 50),
+			request2:    CreateBasicChatRequest(basePrompt, 0.5, 50),
+			shouldCache: true,
+		},
+		{
+			name:        "Different Temperature",
+			request1:    CreateBasicChatRequest(basePrompt, 0.1, 50),
+			request2:    CreateBasicChatRequest(basePrompt, 0.9, 50),
+			shouldCache: false,
+		},
+		{
+			name:        "Different MaxTokens",
+			request1:    CreateBasicChatRequest(basePrompt, 0.5, 50),
+			request2:    CreateBasicChatRequest(basePrompt, 0.5, 200),
+			shouldCache: false,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			// Create a fresh context for each subtest to avoid context pollution
+			ctx := CreateContextWithCacheKey("param-variations-test")
+
+			// Clear cache for this subtest
+			clearTestKeysWithStore(t, setup.Store)
+
+			// Make first request
+			_, err1 := setup.Client.ChatCompletionRequest(ctx, tt.request1)
+			if err1 != nil {
+				return // Test will be skipped by retry function
+			}
+
+			WaitForCache(setup.Plugin)
+
+			// Make second request
+			response2, err2 := setup.Client.ChatCompletionRequest(ctx, tt.request2)
+			if err2 != nil {
+				if err2.Error != nil {
+					t.Fatalf("Second request failed: %v", err2.Error.Message)
+				} else {
+					t.Fatalf("Second request failed: %v", err2)
+				}
+			}
+
+			// Check cache behavior
+			if tt.shouldCache {
+				AssertCacheHit(t, &schemas.BifrostResponse{ChatResponse: response2}, string(CacheTypeDirect))
+			} else {
+				AssertNoCacheHit(t, &schemas.BifrostResponse{ChatResponse: response2})
+			}
+		})
+	}
+}
+
+// TestToolVariations sends one weather prompt under a single cache key with
+// three tool configurations (none, one tool, a differently named tool) and
+// checks that only a repeat of the identical tool configuration hits the cache.
+func TestToolVariations(t *testing.T) {
+	setup := NewTestSetup(t)
+	defer setup.Cleanup()
+
+	ctx := CreateContextWithCacheKey("tool-variations-test")
+
+	// newWeatherRequest builds the shared chat request; the three variants
+	// differ only in the tools attached (nil leaves Params.Tools unset).
+	newWeatherRequest := func(tools []schemas.ChatTool) *schemas.BifrostChatRequest {
+		return &schemas.BifrostChatRequest{
+			Provider: schemas.OpenAI,
+			Model:    "gpt-4o-mini",
+			Input: []schemas.ChatMessage{
+				{
+					Role: schemas.ChatMessageRoleUser,
+					Content: &schemas.ChatMessageContent{
+						ContentStr: bifrost.Ptr("What's the weather like today?"),
+					},
+				},
+			},
+			Params: &schemas.ChatParameters{
+				MaxCompletionTokens: bifrost.Ptr(100),
+				Temperature:         bifrost.Ptr(0.5),
+				Tools:               tools,
+			},
+		}
+	}
+
+	// Base request without tools
+	baseRequest := newWeatherRequest(nil)
+
+	// Request with tools
+	requestWithTools := newWeatherRequest([]schemas.ChatTool{
+		{
+			Type: schemas.ChatToolTypeFunction,
+			Function: &schemas.ChatToolFunction{
+				Name:        "get_weather",
+				Description: bifrost.Ptr("Get the current weather"),
+				Parameters: &schemas.ToolFunctionParameters{
+					Type: "object",
+					Properties: schemas.NewOrderedMapFromPairs(
+						schemas.KV("location", map[string]interface{}{
+							"type":        "string",
+							"description": "The city and state",
+						}),
+					),
+				},
+				Strict: bifrost.Ptr(false),
+			},
+		},
+	})
+
+	// Request with different tools
+	requestWithDifferentTools := newWeatherRequest([]schemas.ChatTool{
+		{
+			Type: schemas.ChatToolTypeFunction,
+			Function: &schemas.ChatToolFunction{
+				Name:        "get_current_weather",
+				Description: bifrost.Ptr("Get current weather information"),
+				Parameters: &schemas.ToolFunctionParameters{
+					Type: "object",
+					Properties: schemas.NewOrderedMapFromPairs(
+						schemas.KV("city", map[string]interface{}{ // Different parameter name
+							"type":        "string",
+							"description": "The city name",
+						}),
+					),
+				},
+				Strict: bifrost.Ptr(false),
+			},
+		},
+	})
+
+	// Test 1: Request without tools
+	t.Log("Making request without tools...")
+	_, err1 := setup.Client.ChatCompletionRequest(ctx, baseRequest)
+	if err1 != nil {
+		// Report the nested message when present; formatting the outer error
+		// pointer with %v shows the nested Error field only as an address.
+		if err1.Error != nil {
+			t.Fatalf("Request without tools failed: %v", err1.Error.Message)
+		}
+		t.Fatalf("Request without tools failed: %v", err1)
+	}
+
+	WaitForCache(setup.Plugin)
+
+	// Test 2: Request with tools (should NOT cache hit)
+	t.Log("Making request with tools...")
+	response2, err2 := setup.Client.ChatCompletionRequest(ctx, requestWithTools)
+	if err2 != nil {
+		// NOTE(review): this returns without calling t.Skip, so the test is
+		// reported as passed when the request fails; confirm that is intended.
+		return // Test will be skipped by retry function
+	}
+
+	AssertNoCacheHit(t, &schemas.BifrostResponse{ChatResponse: response2})
+
+	WaitForCache(setup.Plugin)
+
+	// Test 3: Same request with tools (should cache hit)
+	t.Log("Making same request with tools again...")
+	response3, err3 := setup.Client.ChatCompletionRequest(ctx, requestWithTools)
+	if err3 != nil {
+		if err3.Error != nil {
+			t.Fatalf("Second request with tools failed: %v", err3.Error.Message)
+		}
+		t.Fatalf("Second request with tools failed: %v", err3)
+	}
+
+	AssertCacheHit(t, &schemas.BifrostResponse{ChatResponse: response3}, "")
+
+	// Test 4: Request with different tools (should NOT cache hit)
+	t.Log("Making request with different tools...")
+	response4, err4 := setup.Client.ChatCompletionRequest(ctx, requestWithDifferentTools)
+	if err4 != nil {
+		return // Test will be skipped by retry function
+	}
+
+	AssertNoCacheHit(t, &schemas.BifrostResponse{ChatResponse: response4})
+
+	t.Log("✅ Tool variations test completed!")
+}
+
+// TestContentVariations checks that repeating an identical request yields a
+// direct cache hit for several content shapes: text with one image URL, text
+// with two image URLs, a long repeated-text prompt, and a multi-turn
+// conversation.
+//
+// Every subtest sends its request twice under the same cache key. When the
+// first request fails, the subtest is reported as skipped rather than passed,
+// since the cache-hit assertion cannot be evaluated in that case. A failure of
+// the second request fails the subtest.
+func TestContentVariations(t *testing.T) {
+	setup := NewTestSetup(t)
+	defer setup.Cleanup()
+
+	tests := []struct {
+		name    string
+		request *schemas.BifrostChatRequest
+	}{
+		{
+			// One text block followed by a single image URL block.
+			name: "Image URL Content",
+			request: &schemas.BifrostChatRequest{
+				Provider: schemas.OpenAI,
+				Model:    "gpt-4o-mini",
+				Input: []schemas.ChatMessage{
+					{
+						Role: schemas.ChatMessageRoleUser,
+						Content: &schemas.ChatMessageContent{
+							ContentBlocks: []schemas.ChatContentBlock{
+								{
+									Type: schemas.ChatContentBlockTypeText,
+									Text: bifrost.Ptr("Analyze this image"),
+								},
+								{
+									Type: schemas.ChatContentBlockTypeImage,
+									ImageURLStruct: &schemas.ChatInputImage{
+										URL: "https://pub-cdead89c2f004d8f963fd34010c479d0.r2.dev/Gfp-wisconsin-madison-the-nature-boardwalk.jpg",
+									},
+								},
+							},
+						},
+					},
+				},
+				Params: &schemas.ChatParameters{
+					MaxCompletionTokens: bifrost.Ptr(200),
+					Temperature:         bifrost.Ptr(0.3),
+				},
+			},
+		},
+		{
+			// One text block followed by two image URL blocks.
+			name: "Multiple Images",
+			request: &schemas.BifrostChatRequest{
+				Provider: schemas.OpenAI,
+				Model:    "gpt-4o-mini",
+				Input: []schemas.ChatMessage{
+					{
+						Role: schemas.ChatMessageRoleUser,
+						Content: &schemas.ChatMessageContent{
+							ContentBlocks: []schemas.ChatContentBlock{
+								{
+									Type: schemas.ChatContentBlockTypeText,
+									Text: bifrost.Ptr("Compare these images"),
+								},
+								{
+									Type: schemas.ChatContentBlockTypeImage,
+									ImageURLStruct: &schemas.ChatInputImage{
+										URL: "https://pub-cdead89c2f004d8f963fd34010c479d0.r2.dev/Gfp-wisconsin-madison-the-nature-boardwalk.jpg",
+									},
+								},
+								{
+									Type: schemas.ChatContentBlockTypeImage,
+									ImageURLStruct: &schemas.ChatInputImage{
+										URL: "https://upload.wikimedia.org/wikipedia/commons/b/b5/Scenery_.jpg",
+									},
+								},
+							},
+						},
+					},
+				},
+				Params: &schemas.ChatParameters{
+					MaxCompletionTokens: bifrost.Ptr(200),
+					Temperature:         bifrost.Ptr(0.3),
+				},
+			},
+		},
+		{
+			// A plain-text prompt made of the same sentence repeated 100 times.
+			name: "Very Long Content",
+			request: &schemas.BifrostChatRequest{
+				Provider: schemas.OpenAI,
+				Model:    "gpt-4o-mini",
+				Input: []schemas.ChatMessage{
+					{
+						Role: schemas.ChatMessageRoleUser,
+						Content: &schemas.ChatMessageContent{
+							ContentStr: bifrost.Ptr(strings.Repeat("This is a very long prompt. ", 100)),
+						},
+					},
+				},
+				Params: &schemas.ChatParameters{
+					MaxCompletionTokens: bifrost.Ptr(50),
+					Temperature:         bifrost.Ptr(0.2),
+				},
+			},
+		},
+		{
+			// User / assistant / user exchange sent as one request.
+			name: "Multi-turn Conversation",
+			request: &schemas.BifrostChatRequest{
+				Provider: schemas.OpenAI,
+				Model:    "gpt-4o-mini",
+				Input: []schemas.ChatMessage{
+					{
+						Role: schemas.ChatMessageRoleUser,
+						Content: &schemas.ChatMessageContent{
+							ContentStr: bifrost.Ptr("What is AI?"),
+						},
+					},
+					{
+						Role: schemas.ChatMessageRoleAssistant,
+						Content: &schemas.ChatMessageContent{
+							ContentStr: bifrost.Ptr("AI stands for Artificial Intelligence..."),
+						},
+					},
+					{
+						Role: schemas.ChatMessageRoleUser,
+						Content: &schemas.ChatMessageContent{
+							ContentStr: bifrost.Ptr("Can you give me examples?"),
+						},
+					},
+				},
+				Params: &schemas.ChatParameters{
+					MaxCompletionTokens: bifrost.Ptr(150),
+					Temperature:         bifrost.Ptr(0.5),
+				},
+			},
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			t.Logf("Testing content variation: %s", tt.name)
+
+			// Create a fresh context for each subtest to avoid context pollution
+			ctx := CreateContextWithCacheKey("content-variations-test")
+
+			// Make first request. A failure here is reported as a skip instead
+			// of a silent pass, so a run that never exercised the cache is
+			// visible in the test output.
+			_, err1 := setup.Client.ChatCompletionRequest(ctx, tt.request)
+			if err1 != nil {
+				t.Skipf("⚠️  First %s request failed: %v", tt.name, err1)
+			}
+
+			WaitForCache(setup.Plugin)
+
+			// Make second identical request
+			response2, err2 := setup.Client.ChatCompletionRequest(ctx, tt.request)
+			if err2 != nil {
+				t.Fatalf("Second %s request failed: %v", tt.name, err2)
+			}
+
+			// Should be cached
+			AssertCacheHit(t, &schemas.BifrostResponse{ChatResponse: response2}, string(CacheTypeDirect))
+			t.Logf("✅ %s content variation successful", tt.name)
+		})
+	}
+}
+
+// TestBoundaryParameterValues sends chat requests whose parameters sit at
+// extreme values and logs whether each call succeeded or failed. It makes no
+// assertions: a request error is logged, not treated as a test failure.
+func TestBoundaryParameterValues(t *testing.T) {
+	setup := NewTestSetup(t)
+	defer setup.Cleanup()
+
+	// newRequest builds a gpt-4o-mini request with a single user message
+	// holding prompt, using the supplied parameter set.
+	newRequest := func(prompt string, params *schemas.ChatParameters) *schemas.BifrostChatRequest {
+		return &schemas.BifrostChatRequest{
+			Provider: schemas.OpenAI,
+			Model:    "gpt-4o-mini",
+			Input: []schemas.ChatMessage{
+				{
+					Role: schemas.ChatMessageRoleUser,
+					Content: &schemas.ChatMessageContent{
+						ContentStr: bifrost.Ptr(prompt),
+					},
+				},
+			},
+			Params: params,
+		}
+	}
+
+	cases := []struct {
+		name    string
+		request *schemas.BifrostChatRequest
+	}{
+		{
+			name: "Maximum Parameter Values",
+			request: newRequest("Test max parameters", &schemas.ChatParameters{
+				MaxCompletionTokens: bifrost.Ptr(4096),
+				PresencePenalty:     bifrost.Ptr(2.0),
+				FrequencyPenalty:    bifrost.Ptr(2.0),
+				Temperature:         bifrost.Ptr(2.0),
+				TopP:                bifrost.Ptr(1.0),
+			}),
+		},
+		{
+			name: "Minimum Parameter Values",
+			request: newRequest("Test min parameters", &schemas.ChatParameters{
+				MaxCompletionTokens: bifrost.Ptr(1),
+				PresencePenalty:     bifrost.Ptr(-2.0),
+				FrequencyPenalty:    bifrost.Ptr(-2.0),
+				Temperature:         bifrost.Ptr(0.0),
+				TopP:                bifrost.Ptr(0.01),
+			}),
+		},
+		{
+			name: "Edge Case Parameters",
+			request: newRequest("Test edge case parameters", &schemas.ChatParameters{
+				MaxCompletionTokens: bifrost.Ptr(1),
+				User:                bifrost.Ptr("test-user-id-12345"),
+				Temperature:         bifrost.Ptr(0.0),
+				TopP:                bifrost.Ptr(0.1),
+			}),
+		},
+	}
+
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			t.Logf("Testing boundary parameters: %s", tc.name)
+
+			// Each subtest builds its own context with the same cache key value.
+			ctx := CreateContextWithCacheKey("boundary-params-test")
+
+			// Either outcome is acceptable here; the result is only logged.
+			if _, err := setup.Client.ChatCompletionRequest(ctx, tc.request); err != nil {
+				t.Logf("⚠️  %s request failed (may be expected): %v", tc.name, err)
+				return
+			}
+			t.Logf("✅ %s handled gracefully", tc.name)
+		})
+	}
+}
+
+// TestSemanticSimilarityEdgeCases sends pairs of prompts under one cache key
+// and inspects whether the second prompt is answered by a semantic cache hit.
+//
+// Only an unexpected semantic match (a pair marked shouldMatch=false that
+// still hits) fails a subtest. A pair that is expected to match but does not
+// is logged for information only. If the first request of a pair fails, the
+// subtest is reported as skipped because the pair cannot be evaluated.
+func TestSemanticSimilarityEdgeCases(t *testing.T) {
+	setup := NewTestSetup(t)
+	defer setup.Cleanup()
+
+	// NOTE(review): the threshold is assigned after NewTestSetup has returned;
+	// confirm the plugin reads it from this Config value at request time.
+	setup.Config.Threshold = 0.9
+
+	// Test case: Similar questions with different wording
+	similarTests := []struct {
+		prompt1     string
+		prompt2     string
+		shouldMatch bool
+		description string
+	}{
+		{
+			prompt1:     "What is machine learning?",
+			prompt2:     "Can you explain machine learning?",
+			shouldMatch: true,
+			description: "Similar questions about ML",
+		},
+		{
+			prompt1:     "How does AI work?",
+			prompt2:     "Explain artificial intelligence",
+			shouldMatch: true,
+			description: "AI-related questions",
+		},
+		{
+			prompt1:     "What is the weather today?",
+			prompt2:     "What do you know about bifrost?",
+			shouldMatch: false,
+			description: "Completely different topics",
+		},
+		{
+			prompt1:     "Hello, how are you?",
+			prompt2:     "Hi, how are you doing?",
+			shouldMatch: true,
+			description: "Similar greetings",
+		},
+	}
+
+	for i, test := range similarTests {
+		t.Run(test.description, func(t *testing.T) {
+			// Create a fresh context for each subtest to avoid context pollution
+			ctx := CreateContextWithCacheKey("semantic-edge-test")
+
+			// Clear cache for this subtest
+			clearTestKeysWithStore(t, setup.Store)
+
+			// Make first request. Without it there is nothing to compare the
+			// second prompt against, so report a skip instead of a silent pass.
+			request1 := CreateBasicChatRequest(test.prompt1, 0.1, 50)
+			if _, err1 := setup.Client.ChatCompletionRequest(ctx, request1); err1 != nil {
+				t.Skipf("First request failed, cannot evaluate semantic matching: %v", err1)
+			}
+
+			// Wait for cache to be written
+			WaitForCache(setup.Plugin)
+
+			// Make second request with similar content
+			request2 := CreateBasicChatRequest(test.prompt2, 0.1, 50) // Same parameters
+			response2, err2 := setup.Client.ChatCompletionRequest(ctx, request2)
+			if err2 != nil {
+				if err2.Error != nil {
+					t.Fatalf("Second request failed: %v", err2.Error.Message)
+				}
+				t.Fatalf("Second request failed: %v", err2)
+			}
+			// Guard the field accesses below against a nil response.
+			if response2 == nil {
+				t.Fatal("Second request returned a nil response")
+			}
+
+			// Threshold and similarity stay 0 unless a semantic hit reports them.
+			var cacheThresholdFloat float64
+			var cacheSimilarityFloat float64
+
+			// Check if semantic matching occurred
+			semanticMatch := false
+			debug := response2.ExtraFields.CacheDebug
+			if debug != nil && debug.CacheHit && debug.HitType != nil && *debug.HitType == string(CacheTypeSemantic) {
+				semanticMatch = true
+
+				if debug.Threshold != nil {
+					cacheThresholdFloat = *debug.Threshold
+				}
+				if debug.Similarity != nil {
+					cacheSimilarityFloat = *debug.Similarity
+				}
+			}
+
+			switch {
+			case test.shouldMatch && semanticMatch:
+				t.Logf("✅ Test %d: Semantic match found as expected for '%s'", i+1, test.description)
+			case test.shouldMatch:
+				// A missing match on a "similar" pair is informational only.
+				t.Logf("ℹ️  Test %d: No semantic match found for '%s', check with threshold: %f and found similarity: %f", i+1, test.description, cacheThresholdFloat, cacheSimilarityFloat)
+			case semanticMatch:
+				t.Errorf("❌ Test %d: Unexpected semantic match for different topics: '%s', check with threshold: %f and found similarity: %f", i+1, test.description, cacheThresholdFloat, cacheSimilarityFloat)
+			default:
+				t.Logf("✅ Test %d: Correctly no semantic match for different topics: '%s'", i+1, test.description)
+			}
+		})
+	}
+}
+
+// TestErrorHandlingEdgeCases covers two cache-key situations and expects both
+// to complete without a cache hit: a context carrying no cache key at all, and
+// a context whose cache key value is an int instead of a string.
+func TestErrorHandlingEdgeCases(t *testing.T) {
+	setup := NewTestSetup(t)
+	defer setup.Cleanup()
+
+	// The same request is reused by both subtests.
+	sharedRequest := CreateBasicChatRequest("Test error handling scenarios", 0.5, 50)
+
+	// No cache key on the context: the request must succeed and not hit the cache.
+	t.Run("Request without cache key", func(t *testing.T) {
+		bareCtx := schemas.NewBifrostContext(context.Background(), schemas.NoDeadline)
+
+		resp, reqErr := setup.Client.ChatCompletionRequest(bareCtx, sharedRequest)
+		if reqErr != nil {
+			t.Errorf("Request without cache key failed: %v", reqErr)
+			return
+		}
+
+		AssertNoCacheHit(t, &schemas.BifrostResponse{ChatResponse: resp})
+		t.Log("✅ Request without cache key correctly bypassed cache")
+	})
+
+	// Cache key present but of the wrong type.
+	t.Run("Request with invalid cache key type", func(t *testing.T) {
+		// Send the request once under a valid string key first, so that a
+		// later cache hit would be possible if the int key were accepted.
+		seedCtx := CreateContextWithCacheKey("error-handling-test")
+		if _, seedErr := setup.Client.ChatCompletionRequest(seedCtx, sharedRequest); seedErr != nil {
+			t.Fatalf("First request with valid cache key failed: %v", seedErr)
+		}
+
+		WaitForCache(setup.Plugin)
+
+		// Repeat the request with an int stored under CacheKey.
+		baseCtx := schemas.NewBifrostContext(context.Background(), schemas.NoDeadline)
+		intKeyCtx := baseCtx.WithValue(CacheKey, 12345)
+
+		resp, reqErr := setup.Client.ChatCompletionRequest(intKeyCtx, sharedRequest)
+		if reqErr != nil {
+			t.Errorf("Request with invalid cache key type failed: %v", reqErr)
+			return
+		}
+
+		AssertNoCacheHit(t, &schemas.BifrostResponse{ChatResponse: resp})
+		t.Log("✅ Request with invalid cache key type correctly bypassed cache")
+	})
+
+	t.Log("✅ Error handling edge cases completed!")
+}
--- a/plugins/semanticcache/plugin_embedding_test.go
+++ b/plugins/semanticcache/plugin_embedding_test.go
@@ -0,0 +1,174 @@
+package semanticcache
+
+import (
+	"testing"
+	"time"
+
+	"github.com/maximhq/bifrost/core/schemas"
+)
+
+// TestEmbeddingRequestsCaching sends the same embedding request twice under
+// one cache key and expects the second response to be a "direct" cache hit
+// containing the same number of embeddings as the first.
+//
+// If the first request fails the test is reported as skipped, since the cache
+// assertion cannot be evaluated. A slower second request is only logged.
+func TestEmbeddingRequestsCaching(t *testing.T) {
+	setup := NewTestSetup(t)
+	defer setup.Cleanup()
+
+	ctx := CreateContextWithCacheKey("test-embedding-cache")
+
+	// Create embedding request
+	embeddingRequest := CreateEmbeddingRequest([]string{
+		"What is machine learning?",
+		"Explain artificial intelligence in simple terms.",
+	})
+
+	t.Log("Making first embedding request (should go to OpenAI and be cached)...")
+
+	// Make first request (will go to OpenAI and be cached)
+	start1 := time.Now()
+	response1, err1 := setup.Client.EmbeddingRequest(ctx, embeddingRequest)
+	duration1 := time.Since(start1)
+
+	// Report a skip rather than returning silently, so a run that never
+	// reached the cache assertions does not show up as a pass.
+	if err1 != nil {
+		t.Skipf("First embedding request failed, cannot evaluate caching: %v", err1)
+	}
+
+	if response1 == nil || len(response1.Data) == 0 {
+		t.Fatal("First embedding response is invalid")
+	}
+
+	t.Logf("First embedding request completed in %v", duration1)
+	t.Logf("Response contains %d embeddings", len(response1.Data))
+
+	// Wait for cache to be written
+	WaitForCache(setup.Plugin)
+
+	t.Log("Making second identical embedding request (should be served from cache)...")
+
+	// Make second identical request (should be cached)
+	start2 := time.Now()
+	response2, err2 := setup.Client.EmbeddingRequest(ctx, embeddingRequest)
+	duration2 := time.Since(start2)
+
+	if err2 != nil {
+		t.Fatalf("Second embedding request failed: %v", err2)
+	}
+
+	if response2 == nil || len(response2.Data) == 0 {
+		t.Fatal("Second embedding response is invalid")
+	}
+
+	// Verify cache hit
+	AssertCacheHit(t, &schemas.BifrostResponse{EmbeddingResponse: response2}, "direct")
+
+	t.Logf("Second embedding request completed in %v", duration2)
+
+	// Timing is informational only: a slower cached call is logged, not failed.
+	if duration2 >= duration1 {
+		t.Log("⚠️  Cache doesn't seem faster, but this could be due to test environment")
+	}
+
+	// Only the number of embeddings is compared, not their values.
+	if len(response1.Data) != len(response2.Data) {
+		t.Errorf("Response lengths differ: %d vs %d", len(response1.Data), len(response2.Data))
+	}
+
+	t.Log("✅ Embedding requests properly cached using direct hash matching")
+}
+
+// TestEmbeddingRequestsNoCacheWithoutCacheKey sends one embedding request on a
+// context created with an empty cache key string and expects the response not
+// to be a cache hit.
+func TestEmbeddingRequestsNoCacheWithoutCacheKey(t *testing.T) {
+	setup := NewTestSetup(t)
+	defer setup.Cleanup()
+
+	req := CreateEmbeddingRequest([]string{"Test embedding without cache key"})
+
+	// The context is built with "" as the cache key value.
+	// NOTE(review): presumably the plugin treats an empty key like a missing
+	// one — confirm against CreateContextWithCacheKey and the plugin.
+	emptyKeyCtx := CreateContextWithCacheKey("")
+
+	t.Log("Making embedding request without cache key...")
+
+	resp, reqErr := setup.Client.EmbeddingRequest(emptyKeyCtx, req)
+	if reqErr != nil {
+		t.Fatalf("Embedding request failed: %v", reqErr)
+	}
+
+	// The response must not be reported as served from the cache.
+	AssertNoCacheHit(t, &schemas.BifrostResponse{EmbeddingResponse: resp})
+
+	t.Log("✅ Embedding requests without cache key are properly not cached")
+}
+
+// TestEmbeddingRequestsDifferentTexts sends two embedding requests with
+// different input texts under the same cache key and expects neither response
+// to be a cache hit.
+//
+// If either request fails the test is reported as skipped, since the
+// assertion for that request cannot be evaluated.
+func TestEmbeddingRequestsDifferentTexts(t *testing.T) {
+	setup := NewTestSetup(t)
+	defer setup.Cleanup()
+
+	ctx := CreateContextWithCacheKey("test-embedding-different")
+
+	// Create two different embedding requests
+	request1 := CreateEmbeddingRequest([]string{"First set of texts"})
+	request2 := CreateEmbeddingRequest([]string{"Second set of texts"})
+
+	t.Log("Making first embedding request...")
+	response1, err1 := setup.Client.EmbeddingRequest(ctx, request1)
+	if err1 != nil {
+		// Skip explicitly so the run is not counted as a pass.
+		t.Skipf("First embedding request failed: %v", err1)
+	}
+	AssertNoCacheHit(t, &schemas.BifrostResponse{EmbeddingResponse: response1})
+
+	WaitForCache(setup.Plugin)
+
+	t.Log("Making second different embedding request...")
+	response2, err2 := setup.Client.EmbeddingRequest(ctx, request2)
+	if err2 != nil {
+		t.Skipf("Second embedding request failed: %v", err2)
+	}
+	// Should not be a cache hit since texts are different
+	AssertNoCacheHit(t, &schemas.BifrostResponse{EmbeddingResponse: response2})
+
+	t.Log("✅ Different embedding texts produce different cache entries")
+}
+
+// TestEmbeddingRequestsCacheExpiration exercises the per-request TTL for
+// embedding requests. With a 5 second TTL on the context it expects a miss on
+// the first request, a "direct" hit on an immediate repeat, and a miss again
+// after sleeping for the TTL plus one second.
+//
+// A failure of the first or third request is reported as a skip, because the
+// corresponding assertion cannot be evaluated. A failure of the second
+// request fails the test.
+func TestEmbeddingRequestsCacheExpiration(t *testing.T) {
+	setup := NewTestSetup(t)
+	defer setup.Cleanup()
+
+	// Set very short TTL for testing
+	shortTTL := 5 * time.Second
+	ctx := CreateContextWithCacheKeyAndTTL("test-embedding-ttl", shortTTL)
+
+	embeddingRequest := CreateEmbeddingRequest([]string{"TTL test embedding"})
+
+	t.Log("Making first embedding request with short TTL...")
+	response1, err1 := setup.Client.EmbeddingRequest(ctx, embeddingRequest)
+	if err1 != nil {
+		// Skip explicitly so the run is not counted as a pass.
+		t.Skipf("First embedding request failed: %v", err1)
+	}
+	AssertNoCacheHit(t, &schemas.BifrostResponse{EmbeddingResponse: response1})
+
+	WaitForCache(setup.Plugin)
+
+	t.Log("Making second request before TTL expiration...")
+	response2, err2 := setup.Client.EmbeddingRequest(ctx, embeddingRequest)
+	if err2 != nil {
+		if err2.Error != nil {
+			t.Fatalf("Second request failed: %v", err2.Error.Message)
+		}
+		t.Fatalf("Second request failed: %v", err2)
+	}
+	AssertCacheHit(t, &schemas.BifrostResponse{EmbeddingResponse: response2}, "direct")
+
+	t.Logf("Waiting for TTL expiration (%v)...", shortTTL)
+	time.Sleep(shortTTL + 1*time.Second) // Wait for TTL to expire
+
+	t.Log("Making third request after TTL expiration...")
+	response3, err3 := setup.Client.EmbeddingRequest(ctx, embeddingRequest)
+	if err3 != nil {
+		// Without this response the expiry itself was never checked.
+		t.Skipf("Third embedding request failed, TTL expiration not verified: %v", err3)
+	}
+	// Should not be a cache hit since TTL expired
+	AssertNoCacheHit(t, &schemas.BifrostResponse{EmbeddingResponse: response3})
+
+	t.Log("✅ Embedding requests properly handle TTL expiration")
+}
--- a/plugins/semanticcache/plugin_image_generation_test.go
+++ b/plugins/semanticcache/plugin_image_generation_test.go
@@ -0,0 +1,427 @@
+package semanticcache
+
+import (
+	"os"
+	"testing"
+	"time"
+
+	"github.com/maximhq/bifrost/core/schemas"
+)
+
+// TestImageGenerationCacheBasicFunctionality issues the same image generation
+// request twice under one cache key. The second response must be a direct
+// cache hit, carry the same number of images as the first, and report the
+// request's provider.
+//
+// The test is skipped in -short mode, when OPENAI_API_KEY is unset, or when
+// the first request fails.
+func TestImageGenerationCacheBasicFunctionality(t *testing.T) {
+	if testing.Short() {
+		t.Skip("skipping integration test in -short mode")
+	}
+	if os.Getenv("OPENAI_API_KEY") == "" {
+		t.Skip("OPENAI_API_KEY not set")
+	}
+	setup := NewTestSetup(t)
+	defer setup.Cleanup()
+
+	ctx := CreateContextWithCacheKey("test-image-gen-value")
+
+	// One request value, sent twice.
+	imageReq := CreateImageGenerationRequest(
+		"A serene Japanese garden with cherry blossoms in spring",
+		"1024x1024",
+		"low",
+	)
+
+	t.Log("Making first image generation request (should go to OpenAI and be cached)...")
+
+	// Initial, timed call.
+	firstStart := time.Now()
+	first, firstErr := setup.Client.ImageGenerationRequest(ctx, imageReq)
+	firstElapsed := time.Since(firstStart)
+
+	if firstErr != nil {
+		t.Skipf("First image generation request failed (may be rate limited): %v", firstErr)
+	}
+
+	if first == nil || len(first.Data) == 0 {
+		t.Fatal("First response is invalid or has no image data")
+	}
+
+	t.Logf("First request completed in %v", firstElapsed)
+	t.Logf("Response: ID=%s, Images=%d", first.ID, len(first.Data))
+
+	// Give the plugin time to store the response.
+	WaitForCache(setup.Plugin)
+
+	t.Log("Making second identical request (should be served from cache)...")
+
+	// Repeat call, timed the same way.
+	secondStart := time.Now()
+	second, secondErr := setup.Client.ImageGenerationRequest(ctx, imageReq)
+	secondElapsed := time.Since(secondStart)
+
+	if secondErr != nil {
+		// Prefer the nested error message when one is available.
+		if secondErr.Error != nil {
+			t.Fatalf("Second request failed: %v", secondErr.Error.Message)
+		}
+		t.Fatalf("Second request failed: %v", secondErr)
+	}
+
+	if second == nil || len(second.Data) == 0 {
+		t.Fatal("Second response is invalid or has no image data")
+	}
+
+	t.Logf("Second request completed in %v", secondElapsed)
+
+	// The repeat must be reported as a direct cache hit.
+	AssertCacheHit(t, &schemas.BifrostResponse{ImageGenerationResponse: second}, string(CacheTypeDirect))
+
+	// Timing report.
+	t.Logf("Performance Summary:")
+	t.Logf("First request (OpenAI):  %v", firstElapsed)
+	t.Logf("Second request (Cache):  %v", secondElapsed)
+
+	// A zero second duration cannot be used as a divisor; report and stop.
+	if secondElapsed == 0 {
+		t.Errorf("Second request duration too small to compute speedup (duration2=0)")
+		return
+	}
+	speedup := float64(firstElapsed) / float64(secondElapsed)
+	if secondElapsed < firstElapsed {
+		t.Logf("Cache speedup: %.2fx faster", speedup)
+	} else {
+		t.Logf("Cache was slower than original: speedup=%.2fx (this can happen due to system load)", speedup)
+		// A slower cache is tolerated unless it is more than 10x slower.
+		if secondElapsed > firstElapsed*10 {
+			t.Errorf("Cache is extremely slow compared to original: cache=%v, original=%v (cache may not be working)", secondElapsed, firstElapsed)
+		}
+	}
+
+	// Same number of images in both responses.
+	if len(second.Data) != len(first.Data) {
+		t.Errorf("Image count differs between cached and original: original=%d, cached=%d",
+			len(first.Data), len(second.Data))
+	}
+
+	// The cached response must still name the request's provider.
+	if second.ExtraFields.Provider != imageReq.Provider {
+		t.Errorf("Provider mismatch in cached response: expected %s, got %s",
+			imageReq.Provider, second.ExtraFields.Provider)
+	}
+
+	t.Log("✅ Basic image generation caching test completed successfully!")
+}
+
+// TestImageGenerationSemanticSearch tests semantic similarity search for image generation
+func TestImageGenerationSemanticSearch(t *testing.T) {
+	if testing.Short() {
+		t.Skip("skipping integration test in -short mode")
+	}
+	if os.Getenv("OPENAI_API_KEY") == "" {
+		t.Skip("OPENAI_API_KEY not set")
+	}
+	// Initialize test with custom threshold
+	config := &Config{
+		Provider:       schemas.OpenAI,
+		EmbeddingModel: "text-embedding-3-small",
+		Dimension:      1536,
+		Threshold:      0.5,
+		Keys: []schemas.Key{
+			{Value: *schemas.NewEnvVar("env.OPENAI_API_KEY"), Models: []string{"*"}, Weight: 1.0},
+		},
+	}
+	setup := NewTestSetupWithConfig(t, config)
+	defer setup.Cleanup()
+
+	ctx := CreateContextWithCacheKey("image-semantic-test-value")
+
+	// First request - this will be cached
+	firstRequest := CreateImageGenerationRequest(
+		"A beautiful sunset over the ocean with golden clouds",
+		"1024x1024",
+		"low",
+	)
+
+	t.Log("Making first image generation request (should go to OpenAI and be cached)...")
+	start1 := time.Now()
+	response1, err1 := setup.Client.ImageGenerationRequest(ctx, firstRequest)
+	duration1 := time.Since(start1)
+
+	if err1 != nil {
+		t.Skipf("First image generation request failed (may be rate limited): %v", err1)
+		return
+	}
+
+	if response1 == nil || len(response1.Data) == 0 {
+		t.Fatal("First response is invalid or has no image data")
+	}
+
+	t.Logf("First request completed in %v", duration1)
+
+	// Wait for cache to be written
+	WaitForCache(setup.Plugin)
+
+	// Second request - very similar text to test semantic matching
+	secondRequest := CreateImageGenerationRequest(
+		"A gorgeous sunset over the sea with orange clouds",
+		"1024x1024",
+		"low",
+	)
+
+	t.Log("Making semantically similar request (should be served from semantic cache)...")
+	start2 := time.Now()
+	response2, err2 := setup.Client.ImageGenerationRequest(ctx, secondRequest)
+	duration2 := time.Since(start2)
+
+	if err2 != nil {
+		if err2.Error != nil {
+			t.Fatalf("Second request failed: %v", err2.Error.Message)
+		} else {
+			t.Fatalf("Second request failed: %v", err2)
+		}
+	}
+
+	if response2 == nil || len(response2.Data) == 0 {
+		t.Fatal("Second response is invalid or has no image data")
+	}
+
+	t.Logf("Second request completed in %v", duration2)
+
+	// Check if second request was served from semantic cache
+	semanticMatch := false
+
+	if response2.ExtraFields.CacheDebug != nil && response2.ExtraFields.CacheDebug.CacheHit {
+		if response2.ExtraFields.CacheDebug.HitType != nil && *response2.ExtraFields.CacheDebug.HitType == string(CacheTypeSemantic) {
+			semanticMatch = true
+
+			threshold := 0.0
+			similarity := 0.0
+
+			if response2.ExtraFields.CacheDebug.Threshold != nil {
+				threshold = *response2.ExtraFields.CacheDebug.Threshold
+			}
+			if response2.ExtraFields.CacheDebug.Similarity != nil {
+				similarity = *response2.ExtraFields.CacheDebug.Similarity
+			}
+
+			t.Logf("✅ Second request was served from semantic cache! Cache threshold: %f, Cache similarity: %f", threshold, similarity)
+		}
+	}
+
+	if !semanticMatch {
+		t.Error("Semantic match expected but not found")
+		return
+	}
+
+	// Performance comparison
+	t.Logf("Semantic Cache Performance:")
+	t.Logf("First request (OpenAI):     %v", duration1)
+	t.Logf("Second request (Semantic):  %v", duration2)
+
+	if duration2 < duration1 {
+		speedup := float64(duration1) / float64(duration2)
+		t.Logf("Semantic cache speedup: %.2fx faster", speedup)
+	} else {
+		slowdown := float64(duration2) / float64(duration1)
+		t.Logf("Semantic cache was slower than original: %.2fx slower (this can happen due to system load)", slowdown)
+		// Only fail if cache is extremely slow (10x+ slower), indicating a real problem
+		if slowdown > 10 {
+			t.Errorf("Semantic cache is extremely slow compared to original: slowdown=%.2fx, cache=%v, original=%v (cache may not be working)", slowdown, duration2, duration1)
+		}
+	}
+
+	t.Log("✅ Image generation semantic search test completed successfully!")
+}
+
+// TestImageGenerationDifferentParameters sends one prompt three times under
+// the same cache key: first at 1024x1024/low, then with a different size, then
+// with a different quality. The second and third responses must not be cache
+// hits.
+//
+// The test is skipped in -short mode, when OPENAI_API_KEY is unset, or when
+// any of the three requests fails.
+func TestImageGenerationDifferentParameters(t *testing.T) {
+	if testing.Short() {
+		t.Skip("skipping integration test in -short mode")
+	}
+	if os.Getenv("OPENAI_API_KEY") == "" {
+		t.Skip("OPENAI_API_KEY not set")
+	}
+	setup := NewTestSetup(t)
+	defer setup.Cleanup()
+
+	ctx := CreateContextWithCacheKey("image-params-test")
+
+	// All three requests use this prompt.
+	prompt := "A cute cat sitting on a windowsill"
+
+	// Baseline: 1024x1024, low quality.
+	baseline := CreateImageGenerationRequest(prompt, "1024x1024", "low")
+
+	t.Log("Making first request with 1024x1024...")
+	if _, baseErr := setup.Client.ImageGenerationRequest(ctx, baseline); baseErr != nil {
+		t.Skipf("First image generation request failed (may be rate limited): %v", baseErr)
+	}
+
+	WaitForCache(setup.Plugin)
+
+	// Variant 1: only the size differs from the baseline.
+	otherSize := CreateImageGenerationRequest(prompt, "1024x1536", "low")
+
+	t.Log("Making second request with different size (1024x1536)...")
+	sizeResp, sizeErr := setup.Client.ImageGenerationRequest(ctx, otherSize)
+	if sizeErr != nil {
+		t.Skipf("Second image generation request failed (may be rate limited): %v", sizeErr)
+	}
+
+	// A different size must not hit the baseline entry.
+	AssertNoCacheHit(t, &schemas.BifrostResponse{ImageGenerationResponse: sizeResp})
+
+	WaitForCache(setup.Plugin)
+
+	// Variant 2: only the quality differs from the baseline.
+	otherQuality := CreateImageGenerationRequest(prompt, "1024x1024", "high")
+
+	t.Log("Making third request with different quality (high)...")
+	qualityResp, qualityErr := setup.Client.ImageGenerationRequest(ctx, otherQuality)
+	if qualityErr != nil {
+		t.Skipf("Third image generation request failed (may be rate limited): %v", qualityErr)
+	}
+
+	// A different quality must not hit the earlier entries.
+	AssertNoCacheHit(t, &schemas.BifrostResponse{ImageGenerationResponse: qualityResp})
+
+	t.Log("✅ Image generation different parameters test completed!")
+}
+
+// TestImageGenerationStreamCaching runs the same streaming image generation
+// request twice under one cache key, collecting every chunk of both streams.
+// It requires the second stream's last chunk to be marked as a cache hit and
+// reports an error if the two streams differ in chunk count.
+//
+// The test is skipped in -short mode, when OPENAI_API_KEY is unset, or when
+// the first streaming request cannot be started.
+func TestImageGenerationStreamCaching(t *testing.T) {
+	if testing.Short() {
+		t.Skip("skipping integration test in -short mode")
+	}
+	if os.Getenv("OPENAI_API_KEY") == "" {
+		t.Skip("OPENAI_API_KEY not set")
+	}
+	setup := NewTestSetup(t)
+	defer setup.Cleanup()
+
+	ctx := CreateContextWithCacheKey("image-stream-test")
+
+	// One request value, streamed twice.
+	streamReq := CreateImageGenerationRequest(
+		"A futuristic city skyline at night with neon lights",
+		"1024x1024",
+		"low",
+	)
+
+	t.Log("Making first streaming image generation request...")
+
+	// First stream: timed from the request until the channel is drained.
+	firstStart := time.Now()
+	firstStream, firstErr := setup.Client.ImageGenerationStreamRequest(ctx, streamReq)
+	if firstErr != nil {
+		t.Skipf("First streaming request failed (may be rate limited): %v", firstErr)
+	}
+
+	var firstChunks []schemas.BifrostImageGenerationStreamResponse
+	for msg := range firstStream {
+		if msg.BifrostError != nil {
+			t.Fatalf("Error in first stream: %v", msg.BifrostError)
+		}
+		if chunk := msg.BifrostImageGenerationStreamResponse; chunk != nil {
+			firstChunks = append(firstChunks, *chunk)
+		}
+	}
+	firstElapsed := time.Since(firstStart)
+
+	if len(firstChunks) == 0 {
+		t.Fatal("First streaming request returned no responses")
+	}
+
+	t.Logf("First streaming request completed in %v with %d chunks", firstElapsed, len(firstChunks))
+
+	// Give the plugin time to store the stream.
+	WaitForCache(setup.Plugin)
+
+	t.Log("Making second identical streaming request (should be served from cache)...")
+
+	// Second stream: same request, collected and timed the same way.
+	secondStart := time.Now()
+	secondStream, secondErr := setup.Client.ImageGenerationStreamRequest(ctx, streamReq)
+	if secondErr != nil {
+		t.Fatalf("Second streaming request failed: %v", secondErr)
+	}
+
+	var secondChunks []schemas.BifrostImageGenerationStreamResponse
+	for msg := range secondStream {
+		if msg.BifrostError != nil {
+			t.Fatalf("Error in second stream: %v", msg.BifrostError)
+		}
+		if chunk := msg.BifrostImageGenerationStreamResponse; chunk != nil {
+			secondChunks = append(secondChunks, *chunk)
+		}
+	}
+	secondElapsed := time.Since(secondStart)
+
+	if len(secondChunks) == 0 {
+		t.Fatal("Second streaming request returned no responses")
+	}
+
+	t.Logf("Second streaming request completed in %v with %d chunks", secondElapsed, len(secondChunks))
+
+	// Both streams are expected to deliver the same number of chunks.
+	if len(firstChunks) != len(secondChunks) {
+		t.Errorf("Stream chunk count mismatch: original=%d, cached=%d", len(firstChunks), len(secondChunks))
+	}
+
+	// The cache verdict is read from the final chunk of the second stream.
+	finalChunk := secondChunks[len(secondChunks)-1]
+	finalDebug := finalChunk.ExtraFields.CacheDebug
+	if finalDebug == nil || !finalDebug.CacheHit {
+		// Dump the per-chunk cache state to help diagnose the miss.
+		for i, chunk := range secondChunks {
+			if chunk.ExtraFields.CacheDebug != nil {
+				t.Logf("Chunk %d: CacheDebug present, CacheHit=%v", i, chunk.ExtraFields.CacheDebug.CacheHit)
+			} else {
+				t.Logf("Chunk %d: No CacheDebug info", i)
+			}
+		}
+		t.Fatal("Second streaming request was not served from cache (CacheDebug not found on last chunk)")
+	}
+
+	// Hit type and cache ID are optional; "unknown" is logged when absent.
+	hitType, cacheID := "unknown", "unknown"
+	if finalDebug.HitType != nil {
+		hitType = *finalDebug.HitType
+	}
+	if finalDebug.CacheID != nil {
+		cacheID = *finalDebug.CacheID
+	}
+	t.Logf("✅ Cache hit confirmed on last chunk: HitType=%s, CacheID=%s", hitType, cacheID)
+
+	// Timing report.
+	t.Logf("Streaming Performance Summary:")
+	t.Logf("First request (OpenAI):  %v", firstElapsed)
+	t.Logf("Second request (Cache):  %v", secondElapsed)
+
+	speedup := float64(firstElapsed) / float64(secondElapsed)
+	if secondElapsed < firstElapsed {
+		t.Logf("Streaming cache speedup: %.2fx faster", speedup)
+	} else {
+		t.Logf("Streaming cache was slower than original: speedup=%.2fx (this can happen due to system load)", speedup)
+		// A slower cache is tolerated unless it is more than 10x slower.
+		if secondElapsed > firstElapsed*10 {
+			t.Errorf("Streaming cache is extremely slow compared to original: cache=%v, original=%v (cache may not be working)", secondElapsed, firstElapsed)
+		}
+	}
+
+	t.Log("✅ Image generation streaming cache test completed successfully!")
+}
--- a/plugins/semanticcache/plugin_integration_test.go
+++ b/plugins/semanticcache/plugin_integration_test.go
@@ -0,0 +1,736 @@
+package semanticcache
+
+import (
+	"context"
+	"strings"
+	"testing"
+	"time"
+
+	"github.com/google/uuid"
+	bifrost "github.com/maximhq/bifrost/core"
+	"github.com/maximhq/bifrost/core/schemas"
+)
+
+// TestSemanticCacheBasicFlow drives the plugin hooks directly through a
+// miss -> store -> hit cycle for a single chat request and verifies that the
+// response returned on the hit carries the same content that was stored.
+func TestSemanticCacheBasicFlow(t *testing.T) {
+	setup := NewTestSetup(t)
+	defer setup.Cleanup()
+
+	// Content of the simulated provider response; the cached copy must match it.
+	const originalContent = "Hello! How can I help you today?"
+
+	ctx := schemas.NewBifrostContext(context.Background(), schemas.NoDeadline)
+	ctx.SetValue(CacheKey, "test-cache-enabled")
+
+	// Test request
+	request := &schemas.BifrostRequest{
+		RequestType: schemas.ChatCompletionRequest,
+		ChatRequest: &schemas.BifrostChatRequest{
+			Provider: schemas.OpenAI,
+			Model:    "gpt-4o-mini",
+			Input: []schemas.ChatMessage{
+				{
+					Role: schemas.ChatMessageRoleUser,
+					Content: &schemas.ChatMessageContent{
+						ContentStr: bifrost.Ptr("Hello, world!"),
+					},
+				},
+			},
+			Params: &schemas.ChatParameters{
+				Temperature:         bifrost.Ptr(0.7),
+				MaxCompletionTokens: bifrost.Ptr(100),
+			},
+		},
+	}
+
+	t.Log("Testing first request (cache miss)...")
+
+	// First request - nothing has been stored yet, so no short-circuit is expected.
+	modifiedReq, shortCircuit, err := setup.Plugin.PreLLMHook(ctx, request)
+	if err != nil {
+		t.Fatalf("PreLLMHook failed: %v", err)
+	}
+	if shortCircuit != nil {
+		t.Fatal("Expected cache miss, but got cache hit")
+	}
+	if modifiedReq == nil {
+		t.Fatal("Modified request is nil")
+	}
+
+	t.Log("✅ Cache miss handled correctly")
+
+	// Simulate a response
+	response := &schemas.BifrostResponse{
+		ChatResponse: &schemas.BifrostChatResponse{
+			ID: uuid.New().String(),
+			Choices: []schemas.BifrostResponseChoice{
+				{
+					Index: 0,
+					ChatNonStreamResponseChoice: &schemas.ChatNonStreamResponseChoice{
+						Message: &schemas.ChatMessage{
+							Role: schemas.ChatMessageRoleAssistant,
+							Content: &schemas.ChatMessageContent{
+								ContentStr: bifrost.Ptr(originalContent),
+							}},
+					},
+				},
+			},
+			ExtraFields: schemas.BifrostResponseExtraFields{
+				Provider:               schemas.OpenAI,
+				OriginalModelRequested: "gpt-4o-mini",
+				RequestType:            schemas.ChatCompletionRequest,
+			},
+		},
+	}
+	t.Logf("Original response content: %s", originalContent)
+
+	// Cache the response
+	t.Log("Caching response...")
+	_, _, err = setup.Plugin.PostLLMHook(ctx, response, nil)
+	if err != nil {
+		t.Fatalf("PostLLMHook failed: %v", err)
+	}
+
+	// Wait for async caching to complete
+	WaitForCache(setup.Plugin)
+	t.Log("✅ Response cached successfully")
+
+	// Second request - should be a cache hit
+	t.Log("Testing second identical request (expecting cache hit)...")
+
+	// Use a fresh context for the second request so nothing written to the
+	// first context is carried over.
+	ctx2 := schemas.NewBifrostContext(context.Background(), schemas.NoDeadline)
+	ctx2.SetValue(CacheKey, "test-cache-enabled")
+
+	modifiedReq2, shortCircuit2, err := setup.Plugin.PreLLMHook(ctx2, request)
+	if err != nil {
+		t.Fatalf("Second PreLLMHook failed: %v", err)
+	}
+	if shortCircuit2 == nil {
+		t.Fatal("expected cache hit on identical request")
+	}
+	if shortCircuit2.Response == nil {
+		t.Fatal("Cache hit but response is nil")
+	}
+	if modifiedReq2 == nil {
+		t.Fatal("Modified request is nil on cache hit")
+	}
+
+	t.Log("✅ Cache hit detected and response returned")
+
+	// Verify the cached response. Every pointer on the path to the content is
+	// checked so a malformed cache entry fails the test with a message instead
+	// of panicking on a nil dereference.
+	cachedChat := shortCircuit2.Response.ChatResponse
+	if cachedChat == nil {
+		t.Fatal("Cached response has no chat response")
+	}
+	if len(cachedChat.Choices) == 0 {
+		t.Fatal("Cached response has no choices")
+	}
+
+	firstChoice := &cachedChat.Choices[0]
+	if firstChoice.ChatNonStreamResponseChoice == nil ||
+		firstChoice.Message == nil ||
+		firstChoice.Message.Content == nil {
+		t.Fatal("Cached response content is empty")
+	}
+
+	cachedContent := firstChoice.Message.Content.ContentStr
+	if cachedContent == nil || *cachedContent == "" {
+		t.Fatal("Cached response content is empty")
+	}
+
+	t.Logf("✅ Cached response content: %s", *cachedContent)
+
+	// Compare original and cached content, ignoring surrounding whitespace.
+	originalContentTrimmed := strings.TrimSpace(originalContent)
+	cachedContentTrimmed := strings.TrimSpace(*cachedContent)
+
+	if originalContentTrimmed != cachedContentTrimmed {
+		t.Fatalf("❌ Content mismatch: original='%s', cached='%s'", originalContentTrimmed, cachedContentTrimmed)
+	}
+
+	t.Log("✅ Content verification passed - original and cached responses match")
+	t.Log("🎉 Basic semantic cache flow test passed!")
+}
+
+// TestSemanticCacheStrictFiltering stores a response for one request and then
+// checks that the same prompt sent with a different temperature, or with a
+// different model, is not served from the cache.
+func TestSemanticCacheStrictFiltering(t *testing.T) {
+	setup := NewTestSetup(t)
+	defer setup.Cleanup()
+
+	// weatherRequest builds the shared weather prompt; only the model and the
+	// sampling temperature vary between the requests in this test.
+	weatherRequest := func(model string, temperature float64) *schemas.BifrostRequest {
+		return &schemas.BifrostRequest{
+			RequestType: schemas.ChatCompletionRequest,
+			ChatRequest: &schemas.BifrostChatRequest{
+				Provider: schemas.OpenAI,
+				Model:    model,
+				Input: []schemas.ChatMessage{
+					{
+						Role: schemas.ChatMessageRoleUser,
+						Content: &schemas.ChatMessageContent{
+							ContentStr: bifrost.Ptr("What is the weather like?"),
+						},
+					},
+				},
+				Params: &schemas.ChatParameters{
+					Temperature:         bifrost.Ptr(temperature),
+					MaxCompletionTokens: bifrost.Ptr(100),
+				},
+			},
+		}
+	}
+
+	seedCtx := schemas.NewBifrostContext(context.Background(), schemas.NoDeadline)
+	seedCtx.SetValue(CacheKey, "test-cache-enabled")
+
+	t.Log("Testing first request with temperature=0.7...")
+
+	// Seed lookup: the cache is empty, so this must miss.
+	_, seedHit, err := setup.Plugin.PreLLMHook(seedCtx, weatherRequest("gpt-4o-mini", 0.7))
+	if err != nil {
+		t.Fatalf("First PreLLMHook failed: %v", err)
+	}
+	if seedHit != nil {
+		t.Fatal("Expected cache miss for first request")
+	}
+
+	// Store a simulated provider response for the seed request.
+	stored := &schemas.BifrostResponse{
+		ChatResponse: &schemas.BifrostChatResponse{
+			ID: uuid.New().String(),
+			Choices: []schemas.BifrostResponseChoice{
+				{
+					ChatNonStreamResponseChoice: &schemas.ChatNonStreamResponseChoice{
+						Message: &schemas.ChatMessage{
+							Role: schemas.ChatMessageRoleAssistant,
+							Content: &schemas.ChatMessageContent{
+								ContentStr: bifrost.Ptr("It's sunny today!"),
+							}},
+					},
+				},
+			},
+			ExtraFields: schemas.BifrostResponseExtraFields{
+				Provider:               schemas.OpenAI,
+				OriginalModelRequested: "gpt-4o-mini",
+				RequestType:            schemas.ChatCompletionRequest,
+			},
+		},
+	}
+
+	if _, _, err = setup.Plugin.PostLLMHook(seedCtx, stored, nil); err != nil {
+		t.Fatalf("PostLLMHook failed: %v", err)
+	}
+
+	WaitForCache(setup.Plugin)
+	t.Log("✅ First response cached")
+
+	// Same prompt and model, lower temperature: expected to miss.
+	t.Log("Testing second request with temperature=0.5 (expecting cache miss)...")
+
+	tempCtx := schemas.NewBifrostContext(context.Background(), schemas.NoDeadline)
+	tempCtx.SetValue(CacheKey, "test-cache-enabled")
+
+	_, tempHit, err := setup.Plugin.PreLLMHook(tempCtx, weatherRequest("gpt-4o-mini", 0.5))
+	if err != nil {
+		t.Fatalf("Second PreLLMHook failed: %v", err)
+	}
+	if tempHit != nil {
+		t.Fatal("Expected cache miss due to different temperature, but got cache hit")
+	}
+
+	t.Log("✅ Strict filtering working - different parameters result in cache miss")
+
+	// Same prompt and temperature, different model: expected to miss.
+	t.Log("Testing third request with different model (expecting cache miss)...")
+
+	modelCtx := schemas.NewBifrostContext(context.Background(), schemas.NoDeadline)
+	modelCtx.SetValue(CacheKey, "test-cache-enabled")
+
+	_, modelHit, err := setup.Plugin.PreLLMHook(modelCtx, weatherRequest("gpt-3.5-turbo", 0.7))
+	if err != nil {
+		t.Fatalf("Third PreLLMHook failed: %v", err)
+	}
+	if modelHit != nil {
+		t.Fatal("Expected cache miss due to different model, but got cache hit")
+	}
+
+	t.Log("✅ Strict filtering working - different model results in cache miss")
+	t.Log("🎉 Strict filtering test passed!")
+}
+
+// TestSemanticCacheStreamingFlow feeds three simulated streaming chunks through
+// PostLLMHook and then checks that an identical streaming request can be
+// replayed from the cache as a stream with at least one chat chunk.
+//
+// If the second lookup misses, the test is reported as skipped rather than
+// passed, so a run that verified nothing is visible in the test output.
+func TestSemanticCacheStreamingFlow(t *testing.T) {
+	setup := NewTestSetup(t)
+	defer setup.Cleanup()
+
+	ctx := schemas.NewBifrostContext(context.Background(), schemas.NoDeadline)
+	ctx.SetValue(CacheKey, "test-cache-enabled")
+
+	request := &schemas.BifrostRequest{
+		RequestType: schemas.ChatCompletionStreamRequest,
+		ChatRequest: &schemas.BifrostChatRequest{
+			Provider: schemas.OpenAI,
+			Model:    "gpt-4o-mini",
+			Input: []schemas.ChatMessage{
+				{
+					Role: schemas.ChatMessageRoleUser,
+					Content: &schemas.ChatMessageContent{
+						ContentStr: bifrost.Ptr("Tell me a short story"),
+					},
+				},
+			},
+			Params: &schemas.ChatParameters{
+				Temperature: bifrost.Ptr(0.8),
+			},
+		},
+	}
+
+	t.Log("Testing streaming request (cache miss)...")
+
+	// First request - should be cache miss
+	_, shortCircuit, err := setup.Plugin.PreLLMHook(ctx, request)
+	if err != nil {
+		t.Fatalf("PreLLMHook failed: %v", err)
+	}
+	if shortCircuit != nil {
+		t.Fatal("Expected cache miss for streaming request")
+	}
+
+	t.Log("✅ Streaming cache miss handled correctly")
+
+	// Simulate streaming response chunks
+	t.Log("Caching streaming response chunks...")
+
+	chunks := []string{
+		"Once upon a time,",
+		" there was a brave",
+		" knight who saved the day.",
+	}
+
+	for i, chunk := range chunks {
+		// Only the final chunk carries a finish reason.
+		var finishReason *string
+		if i == len(chunks)-1 {
+			finishReason = bifrost.Ptr("stop")
+		}
+
+		chunkResponse := &schemas.BifrostResponse{
+			ChatResponse: &schemas.BifrostChatResponse{
+				ID: uuid.New().String(),
+				Choices: []schemas.BifrostResponseChoice{
+					{
+						Index:        i,
+						FinishReason: finishReason,
+						ChatStreamResponseChoice: &schemas.ChatStreamResponseChoice{
+							Delta: &schemas.ChatStreamResponseChoiceDelta{
+								Content: bifrost.Ptr(chunk),
+							},
+						},
+					},
+				},
+				ExtraFields: schemas.BifrostResponseExtraFields{
+					Provider:               schemas.OpenAI,
+					OriginalModelRequested: "gpt-4o-mini",
+					RequestType:            schemas.ChatCompletionStreamRequest,
+					ChunkIndex:             i,
+				},
+			},
+		}
+
+		_, _, err = setup.Plugin.PostLLMHook(ctx, chunkResponse, nil)
+		if err != nil {
+			t.Fatalf("PostLLMHook failed for chunk %d: %v", i, err)
+		}
+	}
+
+	WaitForCache(setup.Plugin)
+	t.Log("✅ Streaming response chunks cached")
+
+	// Test cache retrieval for streaming
+	t.Log("Testing streaming cache retrieval...")
+
+	ctx2 := schemas.NewBifrostContext(context.Background(), schemas.NoDeadline)
+	ctx2.SetValue(CacheKey, "test-cache-enabled")
+
+	_, shortCircuit2, err := setup.Plugin.PreLLMHook(ctx2, request)
+	if err != nil {
+		t.Fatalf("Second PreLLMHook failed: %v", err)
+	}
+
+	if shortCircuit2 == nil {
+		// A miss is tolerated here, but report it as a skip: returning normally
+		// would mark the test as passed without any replay being checked.
+		t.Skip("⚠️ Expected streaming cache hit, but got cache miss - this may be expected with the new unified storage")
+	}
+
+	if shortCircuit2.Stream == nil {
+		t.Fatal("Cache hit but stream is nil")
+	}
+
+	t.Log("✅ Streaming cache hit detected")
+
+	// Drain the cached stream, counting only items that carry a chat response.
+	chunkCount := 0
+	for chunk := range shortCircuit2.Stream {
+		if chunk.BifrostChatResponse == nil {
+			continue
+		}
+		chunkCount++
+		t.Logf("Received cached chunk %d", chunkCount)
+	}
+
+	if chunkCount == 0 {
+		t.Fatal("No chunks received from cached stream")
+	}
+
+	t.Logf("✅ Received %d cached chunks", chunkCount)
+	t.Log("🎉 Streaming cache test passed!")
+}
+
+// TestSemanticCache_NoCacheWhenKeyMissing checks that PreLLMHook does not
+// short-circuit a request whose context carries no cache key.
+func TestSemanticCache_NoCacheWhenKeyMissing(t *testing.T) {
+	t.Log("Testing cache behavior when cache key is missing...")
+
+	setup := NewTestSetup(t)
+	defer setup.Cleanup()
+
+	// CacheKey is deliberately not set on this context.
+	keylessCtx := schemas.NewBifrostContext(context.Background(), schemas.NoDeadline)
+
+	userMessage := schemas.ChatMessage{
+		Role: schemas.ChatMessageRoleUser,
+		Content: &schemas.ChatMessageContent{
+			ContentStr: bifrost.Ptr("Test message"),
+		},
+	}
+	req := &schemas.BifrostRequest{
+		RequestType: schemas.ChatCompletionRequest,
+		ChatRequest: &schemas.BifrostChatRequest{
+			Provider: schemas.OpenAI,
+			Model:    "gpt-4o-mini",
+			Input:    []schemas.ChatMessage{userMessage},
+		},
+	}
+
+	_, hit, err := setup.Plugin.PreLLMHook(keylessCtx, req)
+	if err != nil {
+		t.Fatalf("PreLLMHook failed: %v", err)
+	}
+	if hit != nil {
+		t.Fatal("Expected no caching when cache key is not set, but got cache hit")
+	}
+
+	t.Log("✅ Cache properly disabled when no cache key is set")
+	t.Log("🎉 No cache key test passed!")
+}
+
+// TestSemanticCache_CustomTTLHandling runs a miss-then-store cycle with a
+// one-minute TTL placed in the context under CacheTTLKey. It only checks that
+// both hooks complete without error; entry expiry itself is not asserted.
+func TestSemanticCache_CustomTTLHandling(t *testing.T) {
+	setup := NewTestSetup(t)
+	defer setup.Cleanup()
+
+	// Context carrying both the cache key and a per-request TTL override.
+	ttlCtx := schemas.NewBifrostContext(context.Background(), schemas.NoDeadline)
+	ttlCtx.SetValue(CacheKey, "test-cache-enabled")
+	ttlCtx.SetValue(CacheTTLKey, 1*time.Minute) // Custom TTL
+
+	userMessage := schemas.ChatMessage{
+		Role: schemas.ChatMessageRoleUser,
+		Content: &schemas.ChatMessageContent{
+			ContentStr: bifrost.Ptr("TTL test message"),
+		},
+	}
+	req := &schemas.BifrostRequest{
+		RequestType: schemas.ChatCompletionRequest,
+		ChatRequest: &schemas.BifrostChatRequest{
+			Provider: schemas.OpenAI,
+			Model:    "gpt-4o-mini",
+			Input:    []schemas.ChatMessage{userMessage},
+		},
+	}
+
+	// Nothing is cached yet, so the lookup must miss.
+	_, hit, err := setup.Plugin.PreLLMHook(ttlCtx, req)
+	if err != nil {
+		t.Fatalf("PreLLMHook failed: %v", err)
+	}
+	if hit != nil {
+		t.Fatal("Expected cache miss, but got cache hit")
+	}
+
+	// Hand a simulated provider response to the post hook for storage.
+	assistantReply := &schemas.ChatMessage{
+		Role: "assistant",
+		Content: &schemas.ChatMessageContent{
+			ContentStr: bifrost.Ptr("TTL test response"),
+		},
+	}
+	resp := &schemas.BifrostResponse{
+		ChatResponse: &schemas.BifrostChatResponse{
+			ID: "ttl-test-response",
+			Choices: []schemas.BifrostResponseChoice{
+				{
+					ChatNonStreamResponseChoice: &schemas.ChatNonStreamResponseChoice{
+						Message: assistantReply,
+					},
+				},
+			},
+			ExtraFields: schemas.BifrostResponseExtraFields{
+				Provider:               schemas.OpenAI,
+				OriginalModelRequested: "gpt-4o-mini",
+				RequestType:            schemas.ChatCompletionRequest,
+			},
+		},
+	}
+
+	if _, _, err = setup.Plugin.PostLLMHook(ttlCtx, resp, nil); err != nil {
+		t.Fatalf("PostLLMHook failed: %v", err)
+	}
+
+	WaitForCache(setup.Plugin)
+
+	t.Log("✅ Custom TTL configuration test passed!")
+}
+
+// TestSemanticCache_CustomThresholdHandling checks that a lookup against an
+// empty cache, made with a 0.95 threshold override in the context, completes
+// without error and does not short-circuit.
+func TestSemanticCache_CustomThresholdHandling(t *testing.T) {
+	setup := NewTestSetup(t)
+	defer setup.Cleanup()
+
+	// Context carrying the cache key plus a per-request threshold override.
+	thresholdCtx := schemas.NewBifrostContext(context.Background(), schemas.NoDeadline)
+	thresholdCtx.SetValue(CacheKey, "test-cache-enabled")
+	thresholdCtx.SetValue(CacheThresholdKey, 0.95) // Very high threshold
+
+	userMessage := schemas.ChatMessage{
+		Role: schemas.ChatMessageRoleUser,
+		Content: &schemas.ChatMessageContent{
+			ContentStr: bifrost.Ptr("Threshold test message"),
+		},
+	}
+	req := &schemas.BifrostRequest{
+		RequestType: schemas.ChatCompletionRequest,
+		ChatRequest: &schemas.BifrostChatRequest{
+			Provider: schemas.OpenAI,
+			Model:    "gpt-4o-mini",
+			Input:    []schemas.ChatMessage{userMessage},
+		},
+	}
+
+	// Fully exercising the threshold would need a semantic search against a
+	// stored entry; here only the empty-cache lookup is checked.
+	_, hit, err := setup.Plugin.PreLLMHook(thresholdCtx, req)
+	if err != nil {
+		t.Fatalf("PreLLMHook failed: %v", err)
+	}
+	if hit != nil {
+		t.Fatal("Expected cache miss with high threshold, but got cache hit")
+	}
+
+	t.Log("✅ Custom threshold configuration test passed!")
+}
+
+// TestSemanticCache_ProviderModelCachingFlags sets CacheByProvider and
+// CacheByModel to false on the test config, stores a response for an OpenAI
+// request, then looks up the same prompt under a different provider and model.
+// The result of that second lookup is only logged, not asserted.
+func TestSemanticCache_ProviderModelCachingFlags(t *testing.T) {
+	setup := NewTestSetup(t)
+	defer setup.Cleanup()
+
+	// Turn off provider- and model-scoped caching on the test config.
+	setup.Config.CacheByProvider = bifrost.Ptr(false)
+	setup.Config.CacheByModel = bifrost.Ptr(false)
+
+	openAICtx := schemas.NewBifrostContext(context.Background(), schemas.NoDeadline)
+	openAICtx.SetValue(CacheKey, "test-cache-enabled")
+
+	openAIReq := &schemas.BifrostRequest{
+		RequestType: schemas.ChatCompletionRequest,
+		ChatRequest: &schemas.BifrostChatRequest{
+			Provider: schemas.OpenAI,
+			Model:    "gpt-4o-mini",
+			Input: []schemas.ChatMessage{
+				{
+					Role: schemas.ChatMessageRoleUser,
+					Content: &schemas.ChatMessageContent{
+						ContentStr: bifrost.Ptr("Provider model flags test"),
+					},
+				},
+			},
+		},
+	}
+
+	// Initial OpenAI lookup: the cache is empty, so this must miss.
+	_, openAIHit, err := setup.Plugin.PreLLMHook(openAICtx, openAIReq)
+	if err != nil {
+		t.Fatalf("PreLLMHook failed: %v", err)
+	}
+	if openAIHit != nil {
+		t.Fatal("Expected cache miss, but got cache hit")
+	}
+
+	// Store a simulated response for the OpenAI request.
+	stored := &schemas.BifrostResponse{
+		ChatResponse: &schemas.BifrostChatResponse{
+			ID: "provider-model-test",
+			Choices: []schemas.BifrostResponseChoice{
+				{
+					ChatNonStreamResponseChoice: &schemas.ChatNonStreamResponseChoice{
+						Message: &schemas.ChatMessage{
+							Role: "assistant",
+							Content: &schemas.ChatMessageContent{
+								ContentStr: bifrost.Ptr("Provider model test response"),
+							},
+						},
+					},
+				},
+			},
+			ExtraFields: schemas.BifrostResponseExtraFields{
+				Provider:               schemas.OpenAI,
+				OriginalModelRequested: "gpt-4o-mini",
+				RequestType:            schemas.ChatCompletionRequest,
+			},
+		},
+	}
+
+	if _, _, err = setup.Plugin.PostLLMHook(openAICtx, stored, nil); err != nil {
+		t.Fatalf("PostLLMHook failed: %v", err)
+	}
+
+	WaitForCache(setup.Plugin)
+
+	// Same prompt under another provider and model; it may hit because
+	// provider and model scoping were switched off above.
+	anthropicReq := &schemas.BifrostRequest{
+		RequestType: schemas.ChatCompletionRequest,
+		ChatRequest: &schemas.BifrostChatRequest{
+			Provider: schemas.Anthropic, // Different provider
+			Model:    "claude-3-haiku",  // Different model
+			Input: []schemas.ChatMessage{
+				{
+					Role: schemas.ChatMessageRoleUser,
+					Content: &schemas.ChatMessageContent{
+						ContentStr: bifrost.Ptr("Provider model flags test"), // Same content
+					},
+				},
+			},
+		},
+	}
+
+	anthropicCtx := schemas.NewBifrostContext(context.Background(), schemas.NoDeadline)
+	anthropicCtx.SetValue(CacheKey, "test-cache-enabled")
+
+	_, anthropicHit, err := setup.Plugin.PreLLMHook(anthropicCtx, anthropicReq)
+	if err != nil {
+		t.Fatalf("Second PreLLMHook failed: %v", err)
+	}
+
+	// Whether this hits depends on how the cache hash is generated, so the
+	// outcome is recorded rather than asserted.
+	t.Logf("Cache behavior with disabled provider/model flags: hit=%v", anthropicHit != nil)
+
+	t.Log("✅ Provider/model caching flags test passed!")
+}
+
+// TestSemanticCache_ConfigurationEdgeCases checks that PreLLMHook neither
+// errors nor short-circuits when the context holds a TTL or a threshold value
+// of the wrong type (a string in both cases).
+func TestSemanticCache_ConfigurationEdgeCases(t *testing.T) {
+	setup := NewTestSetup(t)
+	defer setup.Cleanup()
+
+	// The same request is used for both malformed-context lookups.
+	req := &schemas.BifrostRequest{
+		RequestType: schemas.ChatCompletionRequest,
+		ChatRequest: &schemas.BifrostChatRequest{
+			Provider: schemas.OpenAI,
+			Model:    "gpt-4o-mini",
+			Input: []schemas.ChatMessage{
+				{
+					Role: schemas.ChatMessageRoleUser,
+					Content: &schemas.ChatMessageContent{
+						ContentStr: bifrost.Ptr("Edge case test"),
+					},
+				},
+			},
+		},
+	}
+
+	// Case 1: the TTL slot holds a string instead of a duration.
+	badTTLCtx := schemas.NewBifrostContext(context.Background(), schemas.NoDeadline)
+	badTTLCtx.SetValue(CacheKey, "test-cache-enabled")
+	badTTLCtx.SetValue(CacheTTLKey, "not-a-duration") // Invalid TTL type
+
+	_, ttlHit, err := setup.Plugin.PreLLMHook(badTTLCtx, req)
+	if err != nil {
+		t.Fatalf("PreLLMHook failed with invalid TTL: %v", err)
+	}
+	if ttlHit != nil {
+		t.Fatal("Unexpected cache hit with invalid TTL")
+	}
+
+	// Case 2: the threshold slot holds a string instead of a float.
+	badThresholdCtx := schemas.NewBifrostContext(context.Background(), schemas.NoDeadline)
+	badThresholdCtx.SetValue(CacheKey, "test-cache-enabled")
+	badThresholdCtx.SetValue(CacheThresholdKey, "not-a-float") // Invalid threshold type
+
+	_, thresholdHit, err := setup.Plugin.PreLLMHook(badThresholdCtx, req)
+	if err != nil {
+		t.Fatalf("PreLLMHook failed with invalid threshold: %v", err)
+	}
+	if thresholdHit != nil {
+		t.Fatal("Unexpected cache hit with invalid threshold")
+	}
+
+	t.Log("✅ Configuration edge cases test passed!")
+}
--- a/plugins/semanticcache/plugin_nil_content_test.go
+++ b/plugins/semanticcache/plugin_nil_content_test.go
@@ -0,0 +1,306 @@
+package semanticcache
+
+import (
+	"testing"
+
+	bifrost "github.com/maximhq/bifrost/core"
+	"github.com/maximhq/bifrost/core/schemas"
+)
+
+// TestExtractTextForEmbedding_NilContent calls extractTextForEmbedding on
+// requests whose messages include a nil Content (such as an assistant
+// tool-call message). The returned values are only logged; the test passes as
+// long as the call does not panic.
+func TestExtractTextForEmbedding_NilContent(t *testing.T) {
+	p := &Plugin{
+		config: &Config{},
+	}
+
+	// User prompt followed by an assistant tool-call message with no content.
+	toolCallReq := &schemas.BifrostRequest{
+		RequestType: schemas.ChatCompletionRequest,
+		ChatRequest: &schemas.BifrostChatRequest{
+			Provider: schemas.OpenAI,
+			Model:    "gpt-4o-mini",
+			Input: []schemas.ChatMessage{
+				{
+					Role: schemas.ChatMessageRoleUser,
+					Content: &schemas.ChatMessageContent{
+						ContentStr: bifrost.Ptr("Call the get_weather function"),
+					},
+				},
+				{
+					Role:    schemas.ChatMessageRoleAssistant,
+					Content: nil, // tool-call message with no content
+					ChatAssistantMessage: &schemas.ChatAssistantMessage{
+						ToolCalls: []schemas.ChatAssistantMessageToolCall{
+							{
+								ID:   bifrost.Ptr("call_123"),
+								Type: bifrost.Ptr("function"),
+								Function: schemas.ChatAssistantMessageToolCallFunction{
+									Name:      bifrost.Ptr("get_weather"),
+									Arguments: `{"location": "San Francisco"}`,
+								},
+							},
+						},
+					},
+				},
+			},
+			Params: &schemas.ChatParameters{
+				Temperature:         bifrost.Ptr(0.7),
+				MaxCompletionTokens: bifrost.Ptr(100),
+			},
+		},
+	}
+
+	// A single assistant message that has no content at all.
+	allNilReq := &schemas.BifrostRequest{
+		RequestType: schemas.ChatCompletionRequest,
+		ChatRequest: &schemas.BifrostChatRequest{
+			Provider: schemas.OpenAI,
+			Model:    "gpt-4o-mini",
+			Input: []schemas.ChatMessage{
+				{
+					Role:    schemas.ChatMessageRoleAssistant,
+					Content: nil,
+				},
+			},
+			Params: &schemas.ChatParameters{
+				Temperature:         bifrost.Ptr(0.7),
+				MaxCompletionTokens: bifrost.Ptr(100),
+			},
+		},
+	}
+
+	// Responses-API request built by the shared helper.
+	responsesReq := &schemas.BifrostRequest{
+		RequestType:      schemas.ResponsesRequest,
+		ResponsesRequest: createResponsesRequestWithNilContent(),
+	}
+
+	cases := []struct {
+		name    string
+		request *schemas.BifrostRequest
+	}{
+		{name: "ChatRequest with nil Content in assistant tool-call message", request: toolCallReq},
+		{name: "ChatRequest where all messages have nil Content", request: allNilReq},
+		{name: "ResponsesRequest with nil Content", request: responsesReq},
+	}
+
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			// Reaching the log line means the call returned instead of
+			// panicking; the error value is deliberately not checked.
+			text, hash, err := p.extractTextForEmbedding(tc.request)
+			t.Logf("text=%q, hash=%q, err=%v", text, hash, err)
+		})
+	}
+}
+
+func TestPrepareDirectCacheLookup_ResponsesStreamRequest(t *testing.T) {
+	plugin := &Plugin{
+		config: getDefaultTestConfig(),
+		logger: bifrost.NewDefaultLogger(schemas.LogLevelDebug),
+	}
+
+	req := &schemas.BifrostRequest{
+		RequestType:      schemas.ResponsesStreamRequest,
+		ResponsesRequest: CreateStreamingResponsesRequest("Explain cache invalidation", 0.2, 200),
+	}
+
+	ctx := CreateContextWithCacheKey("responses-stream-direct")
+	directID, err := plugin.prepareDirectCacheLookup(ctx, req, "responses-stream-direct")
+	if err != nil {
+		t.Fatalf("prepareDirectCacheLookup failed: %v", err)
+	}
+	if directID == "" {
+		t.Fatal("expected deterministic direct cache id")
+	}
+	if got, _ := ctx.Value(requestHashKey).(string); got == "" {
+		t.Fatal("expected request hash to be stored in context")
+	}
+	if got, _ := ctx.Value(requestParamsHashKey).(string); got == "" {
+		t.Fatal("expected params hash to be stored in context")
+	}
+}
+
+func TestPrepareDirectCacheLookup_UnsupportedRequestTypeFailsClosed(t *testing.T) {
+	plugin := &Plugin{
+		config: getDefaultTestConfig(),
+		logger: bifrost.NewDefaultLogger(schemas.LogLevelDebug),
+	}
+
+	req := &schemas.BifrostRequest{
+		RequestType: schemas.PassthroughRequest,
+		PassthroughRequest: &schemas.BifrostPassthroughRequest{
+			Provider: schemas.OpenAI,
+			Model:    "gpt-4o-mini",
+			Method:   "GET",
+			Path:     "/v1/models",
+		},
+	}
+
+	ctx := CreateContextWithCacheKey("unsupported-direct")
+	directID, err := plugin.prepareDirectCacheLookup(ctx, req, "unsupported-direct")
+	if err == nil {
+		t.Fatal("expected prepareDirectCacheLookup to reject unsupported request type")
+	}
+	if directID != "" {
+		t.Fatalf("expected no direct cache id, got %q", directID)
+	}
+	if got, _ := ctx.Value(requestHashKey).(string); got != "" {
+		t.Fatalf("expected request hash to remain unset, got %q", got)
+	}
+	if got, _ := ctx.Value(requestParamsHashKey).(string); got != "" {
+		t.Fatalf("expected params hash to remain unset, got %q", got)
+	}
+	if got, _ := ctx.Value(requestStorageIDKey).(string); got != "" {
+		t.Fatalf("expected storage id to remain unset, got %q", got)
+	}
+}
+
+// TestPreLLMHookSkipsUnsupportedCountTokensRequest seeds the context with
+// stale per-request cache values, runs PreLLMHook on a CountTokensRequest, and
+// checks that the hook returns the same request pointer, does not
+// short-circuit, and that none of the seeded values is still observable.
+func TestPreLLMHookSkipsUnsupportedCountTokensRequest(t *testing.T) {
+	p := &Plugin{
+		config: getDefaultTestConfig(),
+		logger: bifrost.NewDefaultLogger(schemas.LogLevelDebug),
+	}
+
+	countReq := &schemas.BifrostRequest{
+		RequestType: schemas.CountTokensRequest,
+		CountTokensRequest: &schemas.BifrostResponsesRequest{
+			Provider: schemas.Anthropic,
+			Model:    "claude-sonnet-4-5",
+			Input: []schemas.ResponsesMessage{
+				{
+					Role: bifrost.Ptr(schemas.ResponsesInputMessageRoleUser),
+					Content: &schemas.ResponsesMessageContent{
+						ContentStr: bifrost.Ptr("How many tokens is this message?"),
+					},
+				},
+			},
+		},
+	}
+
+	// Pre-populate every per-request key with a stale value so the checks
+	// below can tell whether the hook left any of them in place.
+	ctx := CreateContextWithCacheKey("count-tokens-test")
+	ctx.SetValue(requestIDKey, "stale-request-id")
+	ctx.SetValue(requestStorageIDKey, "stale-storage-id")
+	ctx.SetValue(requestHashKey, "stale-request-hash")
+	ctx.SetValue(requestParamsHashKey, "stale-params-hash")
+	ctx.SetValue(requestModelKey, "stale-model")
+	ctx.SetValue(requestProviderKey, schemas.OpenAI)
+	ctx.SetValue(requestEmbeddingKey, []float32{1, 2, 3})
+	ctx.SetValue(requestEmbeddingTokensKey, 99)
+	ctx.SetValue(isCacheHitKey, true)
+	ctx.SetValue(cacheHitTypeKey, CacheTypeDirect)
+
+	returnedReq, hit, err := p.PreLLMHook(ctx, countReq)
+	if err != nil {
+		t.Fatalf("PreLLMHook failed: %v", err)
+	}
+	if returnedReq != countReq {
+		t.Fatal("expected original request to be returned unchanged")
+	}
+	if hit != nil {
+		t.Fatal("expected no short-circuit for unsupported count tokens request")
+	}
+
+	// String-valued keys: a missing or non-string value reads back as "".
+	if v, _ := ctx.Value(requestIDKey).(string); v != "" {
+		t.Fatalf("expected requestIDKey to remain unset, got %q", v)
+	}
+	if v, _ := ctx.Value(requestHashKey).(string); v != "" {
+		t.Fatalf("expected requestHashKey to remain unset, got %q", v)
+	}
+	if v, _ := ctx.Value(requestParamsHashKey).(string); v != "" {
+		t.Fatalf("expected requestParamsHashKey to remain unset, got %q", v)
+	}
+	if v, _ := ctx.Value(requestStorageIDKey).(string); v != "" {
+		t.Fatalf("expected requestStorageIDKey to remain unset, got %q", v)
+	}
+	if v, _ := ctx.Value(requestModelKey).(string); v != "" {
+		t.Fatalf("expected requestModelKey to remain unset, got %q", v)
+	}
+
+	// Typed keys: fail only if a value of the expected type is present and
+	// non-zero; the embedding key must be absent altogether.
+	if v, present := ctx.Value(requestProviderKey).(schemas.ModelProvider); present && v != "" {
+		t.Fatalf("expected requestProviderKey to remain unset, got %q", v)
+	}
+	if v := ctx.Value(requestEmbeddingKey); v != nil {
+		t.Fatalf("expected requestEmbeddingKey to remain unset, got %#v", v)
+	}
+	if v, present := ctx.Value(requestEmbeddingTokensKey).(int); present && v != 0 {
+		t.Fatalf("expected requestEmbeddingTokensKey to remain unset, got %d", v)
+	}
+	if v, present := ctx.Value(isCacheHitKey).(bool); present && v {
+		t.Fatal("expected isCacheHitKey to remain unset")
+	}
+	if v, present := ctx.Value(cacheHitTypeKey).(CacheType); present && v != "" {
+		t.Fatalf("expected cacheHitTypeKey to remain unset, got %q", v)
+	}
+}
+
+// TestGetNormalizedInputForCaching_NilContent calls getNormalizedInputForCaching
+// on a chat request containing an assistant tool-call message with nil Content.
+// Only the result's type is logged; the test passes if the call does not panic.
+func TestGetNormalizedInputForCaching_NilContent(t *testing.T) {
+	p := &Plugin{
+		config: &Config{},
+	}
+
+	userPrompt := schemas.ChatMessage{
+		Role: schemas.ChatMessageRoleUser,
+		Content: &schemas.ChatMessageContent{
+			ContentStr: bifrost.Ptr("Call the get_weather function"),
+		},
+	}
+
+	// Assistant turn that carries a tool call and no content.
+	toolCallTurn := schemas.ChatMessage{
+		Role:    schemas.ChatMessageRoleAssistant,
+		Content: nil,
+		ChatAssistantMessage: &schemas.ChatAssistantMessage{
+			ToolCalls: []schemas.ChatAssistantMessageToolCall{
+				{
+					ID:   bifrost.Ptr("call_123"),
+					Type: bifrost.Ptr("function"),
+					Function: schemas.ChatAssistantMessageToolCallFunction{
+						Name:      bifrost.Ptr("get_weather"),
+						Arguments: `{"location": "San Francisco"}`,
+					},
+				},
+			},
+		},
+	}
+
+	req := &schemas.BifrostRequest{
+		RequestType: schemas.ChatCompletionRequest,
+		ChatRequest: &schemas.BifrostChatRequest{
+			Provider: schemas.OpenAI,
+			Model:    "gpt-4o-mini",
+			Input:    []schemas.ChatMessage{userPrompt, toolCallTurn},
+			Params: &schemas.ChatParameters{
+				Temperature:         bifrost.Ptr(0.7),
+				MaxCompletionTokens: bifrost.Ptr(100),
+			},
+		},
+	}
+
+	// Reaching the log line means the call returned instead of panicking.
+	normalized := p.getNormalizedInputForCaching(req)
+	t.Logf("result type: %T", normalized)
+}
+
+// createResponsesRequestWithNilContent returns a BifrostResponsesRequest for
+// tests: a user message with text content followed by an assistant message
+// whose Content is nil.
+func createResponsesRequestWithNilContent() *schemas.BifrostResponsesRequest {
+	userMsg := schemas.ResponsesMessage{
+		Role: bifrost.Ptr(schemas.ResponsesInputMessageRoleUser),
+		Content: &schemas.ResponsesMessageContent{
+			ContentStr: bifrost.Ptr("Hello"),
+		},
+	}
+
+	// Content is left nil on purpose; this is the case the callers exercise.
+	assistantMsg := schemas.ResponsesMessage{
+		Role:    bifrost.Ptr(schemas.ResponsesInputMessageRoleAssistant),
+		Content: nil,
+	}
+
+	req := &schemas.BifrostResponsesRequest{
+		Provider: schemas.OpenAI,
+		Model:    "gpt-4o-mini",
+		Input:    []schemas.ResponsesMessage{userMsg, assistantMsg},
+		Params: &schemas.ResponsesParameters{
+			Temperature:     bifrost.Ptr(0.7),
+			MaxOutputTokens: bifrost.Ptr(100),
+		},
+	}
+	return req
+}
--- a/plugins/semanticcache/plugin_no_store_test.go
+++ b/plugins/semanticcache/plugin_no_store_test.go
@@ -0,0 +1,326 @@
+package semanticcache
+
+import (
+	"testing"
+
+	"github.com/maximhq/bifrost/core/schemas"
+)
+
+// TestCacheNoStoreBasicFunctionality covers the three basic CacheNoStoreKey
+// cases for the same chat request: flag absent (second call is a direct hit),
+// flag true (second call is still a miss) and flag false (second call is a
+// direct hit).
+func TestCacheNoStoreBasicFunctionality(t *testing.T) {
+	setup := NewTestSetup(t)
+	defer setup.Cleanup()
+
+	req := CreateBasicChatRequest("What is artificial intelligence?", 0.7, 100)
+
+	// Case 1: control run without the no-store flag.
+	controlCtx := CreateContextWithCacheKey("test-no-store-control")
+	t.Log("Making normal request (should be cached)...")
+	controlFirst, controlFirstErr := setup.Client.ChatCompletionRequest(controlCtx, req)
+	if controlFirstErr != nil {
+		return // presumably already skipped by the retry wrapper
+	}
+	AssertNoCacheHit(t, &schemas.BifrostResponse{ChatResponse: controlFirst}) // first call is always fresh
+
+	WaitForCache(setup.Plugin)
+
+	// The repeat of the control request must come from the cache.
+	t.Log("Verifying normal caching worked...")
+	controlSecond, controlSecondErr := setup.Client.ChatCompletionRequest(controlCtx, req)
+	if controlSecondErr != nil {
+		if controlSecondErr.Error != nil {
+			t.Fatalf("Second request failed: %v", controlSecondErr.Error.Message)
+		}
+		t.Fatalf("Second request failed: %v", controlSecondErr)
+	}
+	AssertCacheHit(t, &schemas.BifrostResponse{ChatResponse: controlSecond}, "direct")
+
+	// Case 2: no-store set to true; nothing may be written.
+	noStoreCtx := CreateContextWithCacheKeyAndNoStore("test-no-store-disabled", true)
+	t.Log("Making request with CacheNoStoreKey=true (should not be cached)...")
+	noStoreFirst, noStoreFirstErr := setup.Client.ChatCompletionRequest(noStoreCtx, req)
+	if noStoreFirstErr != nil {
+		return // presumably already skipped by the retry wrapper
+	}
+	AssertNoCacheHit(t, &schemas.BifrostResponse{ChatResponse: noStoreFirst}) // first call is always fresh
+
+	WaitForCache(setup.Plugin)
+
+	// A repeat under the same key must still miss.
+	t.Log("Verifying no-store request was not cached...")
+	noStoreSecond, noStoreSecondErr := setup.Client.ChatCompletionRequest(noStoreCtx, req)
+	if noStoreSecondErr != nil {
+		return // presumably already skipped by the retry wrapper
+	}
+	AssertNoCacheHit(t, &schemas.BifrostResponse{ChatResponse: noStoreSecond}) // still fresh: nothing was stored
+
+	// Case 3: no-store explicitly false behaves like the control case.
+	storeCtx := CreateContextWithCacheKeyAndNoStore("test-no-store-enabled", false)
+	t.Log("Making request with CacheNoStoreKey=false (should be cached)...")
+	storeFirst, storeFirstErr := setup.Client.ChatCompletionRequest(storeCtx, req)
+	if storeFirstErr != nil {
+		return // presumably already skipped by the retry wrapper
+	}
+	AssertNoCacheHit(t, &schemas.BifrostResponse{ChatResponse: storeFirst}) // first call is always fresh
+
+	WaitForCache(setup.Plugin)
+
+	// The repeat must now be served from the cache.
+	t.Log("Verifying no-store=false request was cached...")
+	storeSecond, storeSecondErr := setup.Client.ChatCompletionRequest(storeCtx, req)
+	if storeSecondErr != nil {
+		t.Fatalf("Sixth request failed: %v", storeSecondErr)
+	}
+	AssertCacheHit(t, &schemas.BifrostResponse{ChatResponse: storeSecond}, "direct")
+
+	t.Log("✅ CacheNoStoreKey basic functionality works correctly")
+}
+
+// TestCacheNoStoreWithDifferentRequestTypes tests NoStore with various request types
+// (chat completion and embedding). For each type it issues the same request
+// twice under a no-store context and asserts that neither call is a cache hit.
+//
+// NOTE: the test is currently disabled by the unconditional t.Skip below, so
+// nothing after that call runs.
+func TestCacheNoStoreWithDifferentRequestTypes(t *testing.T) {
+	t.Skip("Skipping Embedding Tests")
+
+	setup := NewTestSetup(t)
+	defer setup.Cleanup()
+
+	// Test with chat completion
+	chatRequest := CreateBasicChatRequest("Test no-store with chat", 0.7, 50)
+	ctx1 := CreateContextWithCacheKeyAndNoStore("test-no-store-chat", true)
+
+	t.Log("Testing no-store with chat completion...")
+	response1, err1 := setup.Client.ChatCompletionRequest(ctx1, chatRequest)
+	if err1 != nil {
+		return // Test will be skipped by retry function
+	}
+	AssertNoCacheHit(t, &schemas.BifrostResponse{ChatResponse: response1})
+
+	WaitForCache(setup.Plugin)
+
+	// Verify not cached: repeating the identical request must still miss
+	response2, err2 := setup.Client.ChatCompletionRequest(ctx1, chatRequest)
+	if err2 != nil {
+		return // Test will be skipped by retry function
+	}
+	AssertNoCacheHit(t, &schemas.BifrostResponse{ChatResponse: response2}) // Should not be cached
+
+	// Test with embedding request
+	embeddingRequest := CreateEmbeddingRequest([]string{"Test no-store with embeddings"})
+	ctx2 := CreateContextWithCacheKeyAndNoStore("test-no-store-embedding", true)
+
+	t.Log("Testing no-store with embedding request...")
+	response3, err3 := setup.Client.EmbeddingRequest(ctx2, embeddingRequest)
+	if err3 != nil {
+		return // Test will be skipped by retry function
+	}
+	AssertNoCacheHit(t, &schemas.BifrostResponse{EmbeddingResponse: response3})
+
+	WaitForCache(setup.Plugin)
+
+	// Verify not cached: repeating the identical request must still miss
+	response4, err4 := setup.Client.EmbeddingRequest(ctx2, embeddingRequest)
+	if err4 != nil {
+		return // Test will be skipped by retry function
+	}
+	AssertNoCacheHit(t, &schemas.BifrostResponse{EmbeddingResponse: response4}) // Should not be cached
+
+	t.Log("✅ CacheNoStoreKey works with different request types")
+}
+
+// TestCacheNoStoreWithConversationHistory sends the same multi-turn chat
+// request twice under a no-store context and asserts that neither call is a
+// cache hit.
+func TestCacheNoStoreWithConversationHistory(t *testing.T) {
+	setup := NewTestSetup(t)
+	defer setup.Cleanup()
+
+	// System prompt plus one prior exchange, followed by the new user question.
+	history := BuildConversationHistory(
+		"You are a helpful assistant",
+		[]string{"Hello", "Hi! How can I help?"},
+	)
+	req := CreateConversationRequest(AddUserMessage(history, "What is machine learning?"), 0.7, 100)
+
+	// Every call below runs with no-store enabled.
+	noStoreCtx := CreateContextWithCacheKeyAndNoStore("test-no-store-conversation", true)
+
+	t.Log("Testing no-store with conversation history...")
+	first, firstErr := setup.Client.ChatCompletionRequest(noStoreCtx, req)
+	if firstErr != nil {
+		return // presumably already skipped by the retry wrapper
+	}
+	AssertNoCacheHit(t, &schemas.BifrostResponse{ChatResponse: first})
+
+	WaitForCache(setup.Plugin)
+
+	// The identical conversation must miss again because nothing was stored.
+	second, secondErr := setup.Client.ChatCompletionRequest(noStoreCtx, req)
+	if secondErr != nil {
+		return // presumably already skipped by the retry wrapper
+	}
+	AssertNoCacheHit(t, &schemas.BifrostResponse{ChatResponse: second})
+
+	t.Log("✅ CacheNoStoreKey works with conversation history")
+}
+
+// TestCacheNoStoreWithCacheTypes combines CacheNoStoreKey=true with an explicit
+// CacheTypeKey (first direct, then semantic) and asserts that, for either cache
+// type, repeating the request never yields a cache hit.
+func TestCacheNoStoreWithCacheTypes(t *testing.T) {
+	setup := NewTestSetup(t)
+	defer setup.Cleanup()
+
+	req := CreateBasicChatRequest("Test no-store with cache types", 0.7, 50)
+
+	// Direct cache type with no-store.
+	directCtx := CreateContextWithCacheKey("test-no-store-cache-types")
+	directCtx = directCtx.WithValue(CacheNoStoreKey, true)
+	directCtx = directCtx.WithValue(CacheTypeKey, CacheTypeDirect)
+
+	t.Log("Testing no-store with CacheTypeKey=direct...")
+	directFirst, directFirstErr := setup.Client.ChatCompletionRequest(directCtx, req)
+	if directFirstErr != nil {
+		return // presumably already skipped by the retry wrapper
+	}
+	AssertNoCacheHit(t, &schemas.BifrostResponse{ChatResponse: directFirst})
+
+	WaitForCache(setup.Plugin)
+
+	// The repeat must miss: no-store takes precedence over the cache type.
+	directSecond, directSecondErr := setup.Client.ChatCompletionRequest(directCtx, req)
+	if directSecondErr != nil {
+		return // presumably already skipped by the retry wrapper
+	}
+	AssertNoCacheHit(t, &schemas.BifrostResponse{ChatResponse: directSecond})
+
+	// Semantic cache type with no-store (same cache key as above).
+	semanticCtx := CreateContextWithCacheKey("test-no-store-cache-types")
+	semanticCtx = semanticCtx.WithValue(CacheNoStoreKey, true)
+	semanticCtx = semanticCtx.WithValue(CacheTypeKey, CacheTypeSemantic)
+
+	t.Log("Testing no-store with CacheTypeKey=semantic...")
+	semanticFirst, semanticFirstErr := setup.Client.ChatCompletionRequest(semanticCtx, req)
+	if semanticFirstErr != nil {
+		return // presumably already skipped by the retry wrapper
+	}
+	AssertNoCacheHit(t, &schemas.BifrostResponse{ChatResponse: semanticFirst})
+
+	WaitForCache(setup.Plugin)
+
+	// The repeat must miss here as well.
+	semanticSecond, semanticSecondErr := setup.Client.ChatCompletionRequest(semanticCtx, req)
+	if semanticSecondErr != nil {
+		return // presumably already skipped by the retry wrapper
+	}
+	AssertNoCacheHit(t, &schemas.BifrostResponse{ChatResponse: semanticSecond})
+
+	t.Log("✅ CacheNoStoreKey correctly overrides cache type settings")
+}
+
+// TestCacheNoStoreErrorHandling sets CacheNoStoreKey to values that are not a
+// bool (a string, then nil) and asserts that caching proceeds normally in both
+// cases: the first call is a miss and the repeat is a direct hit.
+func TestCacheNoStoreErrorHandling(t *testing.T) {
+	setup := NewTestSetup(t)
+	defer setup.Cleanup()
+
+	req := CreateBasicChatRequest("Test no-store error handling", 0.7, 50)
+
+	// Non-boolean value for the no-store key.
+	invalidCtx := CreateContextWithCacheKey("test-no-store-errors")
+	invalidCtx = invalidCtx.WithValue(CacheNoStoreKey, "invalid")
+
+	t.Log("Testing no-store with invalid value (should cache normally)...")
+	invalidFirst, invalidFirstErr := setup.Client.ChatCompletionRequest(invalidCtx, req)
+	if invalidFirstErr != nil {
+		return // presumably already skipped by the retry wrapper
+	}
+	AssertNoCacheHit(t, &schemas.BifrostResponse{ChatResponse: invalidFirst})
+
+	WaitForCache(setup.Plugin)
+
+	// The invalid value is expected to be ignored, so the repeat is a hit.
+	invalidSecond, invalidSecondErr := setup.Client.ChatCompletionRequest(invalidCtx, req)
+	if invalidSecondErr != nil {
+		if invalidSecondErr.Error != nil {
+			t.Fatalf("Second request failed: %v", invalidSecondErr.Error.Message)
+		}
+		t.Fatalf("Second request failed: %v", invalidSecondErr)
+	}
+	AssertCacheHit(t, &schemas.BifrostResponse{ChatResponse: invalidSecond}, "direct")
+
+	// Nil value for the no-store key.
+	nilCtx := CreateContextWithCacheKey("test-no-store-nil")
+	nilCtx = nilCtx.WithValue(CacheNoStoreKey, nil)
+
+	t.Log("Testing no-store with nil value (should cache normally)...")
+	nilFirst, nilFirstErr := setup.Client.ChatCompletionRequest(nilCtx, req)
+	if nilFirstErr != nil {
+		return // presumably already skipped by the retry wrapper
+	}
+	AssertNoCacheHit(t, &schemas.BifrostResponse{ChatResponse: nilFirst})
+
+	WaitForCache(setup.Plugin)
+
+	// Nil is expected to behave like "flag not set", so the repeat is a hit.
+	nilSecond, nilSecondErr := setup.Client.ChatCompletionRequest(nilCtx, req)
+	if nilSecondErr != nil {
+		t.Fatalf("Fourth request failed: %v", nilSecondErr)
+	}
+	AssertCacheHit(t, &schemas.BifrostResponse{ChatResponse: nilSecond}, "direct")
+
+	t.Log("✅ CacheNoStoreKey error handling works correctly")
+}
+
+// TestCacheNoStoreReadButNoWrite asserts that a no-store context can still read
+// existing cache entries. An entry is seeded without no-store; the same request
+// under no-store must then be a direct hit, and a paraphrased request under
+// no-store must be a semantic hit on both of its two attempts.
+func TestCacheNoStoreReadButNoWrite(t *testing.T) {
+	setup := NewTestSetup(t)
+	defer setup.Cleanup()
+
+	seedRequest := CreateBasicChatRequest("Describe Isaac Newton's three laws of motion", 0.7, 50)
+
+	// Step 1: seed the cache with a normal (storing) request.
+	seedCtx := CreateContextWithCacheKey("test-no-store-read")
+	t.Log("Caching response normally...")
+	seeded, seedErr := setup.Client.ChatCompletionRequest(seedCtx, seedRequest)
+	if seedErr != nil {
+		return // presumably already skipped by the retry wrapper
+	}
+	AssertNoCacheHit(t, &schemas.BifrostResponse{ChatResponse: seeded})
+
+	WaitForCache(setup.Plugin)
+
+	// Step 2: same key and request, now with no-store; reads must still work.
+	noStoreCtx := CreateContextWithCacheKeyAndNoStore("test-no-store-read", true)
+	t.Log("Reading with no-store enabled (should still hit cache for reads)...")
+	exact, exactErr := setup.Client.ChatCompletionRequest(noStoreCtx, seedRequest)
+	if exactErr != nil {
+		if exactErr.Error != nil {
+			t.Fatalf("Second request failed: %v", exactErr.Error.Message)
+		}
+		t.Fatalf("Second request failed: %v", exactErr)
+	}
+	// no-store is expected to affect writes only, so this is a direct hit.
+	AssertCacheHit(t, &schemas.BifrostResponse{ChatResponse: exact}, "direct")
+
+	// Step 3: a close paraphrase under no-store should match semantically.
+	paraphrase := CreateBasicChatRequest("Describe the three laws of motion by Isaac Newton", 0.7, 50)
+	t.Log("Making semantically similar request with no-store (should get semantic hit, but not cache response)...")
+	similar, similarErr := setup.Client.ChatCompletionRequest(noStoreCtx, paraphrase)
+	if similarErr != nil {
+		t.Fatalf("Third request failed: %v", similarErr)
+	}
+	AssertCacheHit(t, &schemas.BifrostResponse{ChatResponse: similar}, "semantic")
+
+	WaitForCache(setup.Plugin)
+
+	// Step 4: the paraphrase again; the result must again be a semantic hit.
+	t.Log("Repeating similar request with no-store (should still get semantic hit)...")
+	similarAgain, similarAgainErr := setup.Client.ChatCompletionRequest(noStoreCtx, paraphrase)
+	if similarAgainErr != nil {
+		t.Fatalf("Fourth request failed: %v", similarAgainErr)
+	}
+	AssertCacheHit(t, &schemas.BifrostResponse{ChatResponse: similarAgain}, "semantic")
+
+	t.Log("✅ CacheNoStoreKey allows reading but prevents writing")
+}
--- a/plugins/semanticcache/plugin_normalization_test.go
+++ b/plugins/semanticcache/plugin_normalization_test.go
@@ -0,0 +1,332 @@
+package semanticcache
+
+import (
+	"testing"
+
+	"github.com/maximhq/bifrost/core/schemas"
+)
+
+// TestTextNormalizationDirectCache runs the direct-cache (hash-based) text
+// normalization checks as subtests, one per input type, all sharing a single
+// test setup.
+func TestTextNormalizationDirectCache(t *testing.T) {
+	setup := NewTestSetup(t)
+	defer setup.Cleanup()
+
+	// Subtests execute in the listed order.
+	subtests := []struct {
+		name string
+		run  func(*testing.T, *TestSetup)
+	}{
+		{"ChatCompletion", testChatCompletionNormalization},
+		{"Speech", testSpeechNormalization},
+	}
+
+	for i := range subtests {
+		st := subtests[i]
+		t.Run(st.name, func(t *testing.T) {
+			st.run(t, setup)
+		})
+	}
+}
+
+// testChatCompletionNormalization verifies that chat requests differing only in
+// letter case or leading/trailing whitespace (in both the system and the user
+// message) resolve to the same direct cache entry.
+//
+// The first test case is sent as the cache-populating request; every other case
+// must then come back as a "direct" cache hit.
+func testChatCompletionNormalization(t *testing.T, setup *TestSetup) {
+	ctx := CreateContextWithCacheKey("test-chat-normalization")
+
+	// Test cases with different case and whitespace variations
+	testCases := []struct {
+		name      string
+		userMsg   string
+		systemMsg string
+	}{
+		{
+			name:      "Original",
+			userMsg:   "Explain quantum physics",
+			systemMsg: "You are a helpful science teacher",
+		},
+		{
+			name:      "Lowercase",
+			userMsg:   "explain quantum physics",
+			systemMsg: "you are a helpful science teacher",
+		},
+		{
+			name:      "Uppercase",
+			userMsg:   "EXPLAIN QUANTUM PHYSICS",
+			systemMsg: "YOU ARE A HELPFUL SCIENCE TEACHER",
+		},
+		{
+			name:      "Mixed Case",
+			userMsg:   "ExPlAiN QuAnTuM PhYsIcS",
+			systemMsg: "YoU aRe A hElPfUl ScIeNcE tEaChEr",
+		},
+		{
+			name:      "With Whitespace",
+			userMsg:   "  Explain quantum physics  ",
+			systemMsg: "  You are a helpful science teacher  ",
+		},
+		{
+			name:      "Extra Whitespace",
+			userMsg:   "    Explain quantum physics    ",
+			systemMsg: "    You are a helpful science teacher    ",
+		},
+	}
+
+	// Create chat completion requests for all test cases
+	requests := make([]*schemas.BifrostChatRequest, len(testCases))
+	for i := range testCases {
+		// Point at the slice element, not at a range variable: before Go 1.22 the
+		// range variable is shared by all iterations, so taking its address would
+		// make every request alias the last test case's strings and the test
+		// would pass without exercising any normalization.
+		tc := &testCases[i]
+		requests[i] = &schemas.BifrostChatRequest{
+			Provider: schemas.OpenAI,
+			Model:    "gpt-4o-mini",
+			Input: []schemas.ChatMessage{
+				{
+					Role: schemas.ChatMessageRoleSystem,
+					Content: &schemas.ChatMessageContent{
+						ContentStr: &tc.systemMsg,
+					},
+				},
+				{
+					Role: schemas.ChatMessageRoleUser,
+					Content: &schemas.ChatMessageContent{
+						ContentStr: &tc.userMsg,
+					},
+				},
+			},
+			Params: &schemas.ChatParameters{
+				Temperature:         PtrFloat64(0.5),
+				MaxCompletionTokens: PtrInt(50),
+			},
+		}
+	}
+
+	// Make first request (should miss cache and be stored)
+	t.Logf("Making first request with user: '%s', system: '%s'", testCases[0].userMsg, testCases[0].systemMsg)
+	response1, err1 := setup.Client.ChatCompletionRequest(ctx, requests[0])
+	if err1 != nil {
+		return // Test will be skipped by retry function
+	}
+
+	if response1 == nil || len(response1.Choices) == 0 {
+		t.Fatal("First response is invalid")
+	}
+
+	AssertNoCacheHit(t, &schemas.BifrostResponse{ChatResponse: response1})
+	WaitForCache(setup.Plugin)
+
+	// Test all other variations should hit cache due to normalization
+	for i := 1; i < len(testCases); i++ {
+		tc := testCases[i]
+		t.Logf("Testing variation '%s' with user: '%s', system: '%s'", tc.name, tc.userMsg, tc.systemMsg)
+
+		response, err := setup.Client.ChatCompletionRequest(ctx, requests[i])
+		if err != nil {
+			t.Fatalf("Request for case '%s' failed: %v", tc.name, err)
+		}
+
+		if response == nil || len(response.Choices) == 0 {
+			t.Fatalf("Response for case '%s' is invalid", tc.name)
+		}
+
+		// Should be cache hit due to normalization
+		AssertCacheHit(t, &schemas.BifrostResponse{ChatResponse: response}, "direct")
+		t.Logf("✓ Cache hit for '%s' variation", tc.name)
+	}
+}
+
+// testSpeechNormalization verifies that speech requests whose input text differs
+// only in letter case or surrounding whitespace resolve to the same direct cache
+// entry. The first variant populates the cache; all later ones must be direct
+// hits.
+func testSpeechNormalization(t *testing.T, setup *TestSetup) {
+	ctx := CreateContextWithCacheKey("test-speech-normalization")
+
+	// Index 0 is the seeding input; the rest are case/whitespace variants of it.
+	variants := []struct {
+		name  string
+		input string
+	}{
+		{"Original", "Hello, this is a test speech synthesis"},
+		{"Lowercase", "hello, this is a test speech synthesis"},
+		{"Uppercase", "HELLO, THIS IS A TEST SPEECH SYNTHESIS"},
+		{"Mixed Case", "HeLLo, ThIs Is A tEsT sPeEcH sYnThEsIs"},
+		{"Leading Whitespace", "  Hello, this is a test speech synthesis"},
+		{"Trailing Whitespace", "Hello, this is a test speech synthesis  "},
+		{"Both Whitespace", "  Hello, this is a test speech synthesis  "},
+		{"Extra Spaces", "   Hello, this is a test speech synthesis   "},
+	}
+
+	// Build every request up front, all with the "alloy" voice.
+	speechRequests := make([]*schemas.BifrostSpeechRequest, len(variants))
+	for idx := range variants {
+		speechRequests[idx] = CreateSpeechRequest(variants[idx].input, "alloy")
+	}
+
+	// Seed the cache with the first variant; it must be a miss.
+	t.Logf("Making first speech request with: '%s'", variants[0].input)
+	seedResp, seedErr := setup.Client.SpeechRequest(ctx, speechRequests[0])
+	if seedErr != nil {
+		return // presumably already skipped by the retry wrapper
+	}
+
+	if seedResp == nil {
+		t.Fatal("First response is invalid")
+	}
+
+	AssertNoCacheHit(t, &schemas.BifrostResponse{SpeechResponse: seedResp})
+	WaitForCache(setup.Plugin)
+
+	// Every remaining variant must be served from the direct cache.
+	for offset, variant := range variants[1:] {
+		t.Logf("Testing variation '%s' with input: '%s'", variant.name, variant.input)
+
+		resp, reqErr := setup.Client.SpeechRequest(ctx, speechRequests[offset+1])
+		if reqErr != nil {
+			t.Fatalf("Request for case '%s' failed: %v", variant.name, reqErr)
+		}
+
+		if resp == nil {
+			t.Fatalf("Response for case '%s' is invalid", variant.name)
+		}
+
+		AssertCacheHit(t, &schemas.BifrostResponse{SpeechResponse: resp}, "direct")
+		t.Logf("✓ Cache hit for '%s' variation", variant.name)
+	}
+}
+
+// TestChatCompletionContentBlocksNormalization tests normalization for content blocks:
+// a user message made of several text blocks must map to the same direct cache
+// entry regardless of letter case or surrounding whitespace in each block.
+//
+// The first test case populates the cache; every other case must then come back
+// as a "direct" cache hit.
+func TestChatCompletionContentBlocksNormalization(t *testing.T) {
+	setup := NewTestSetup(t)
+	defer setup.Cleanup()
+
+	ctx := CreateContextWithCacheKey("test-content-blocks-normalization")
+
+	// Test cases with content blocks having different text normalization
+	testCases := []struct {
+		name       string
+		textBlocks []string
+	}{
+		{
+			name:       "Original",
+			textBlocks: []string{"Hello World", "How are you today?"},
+		},
+		{
+			name:       "Lowercase",
+			textBlocks: []string{"hello world", "how are you today?"},
+		},
+		{
+			name:       "With Whitespace",
+			textBlocks: []string{"  Hello World  ", "  How are you today?  "},
+		},
+		{
+			name:       "Mixed Case",
+			textBlocks: []string{"HeLLo WoRLd", "HoW aRe YoU tOdAy?"},
+		},
+	}
+
+	// Create chat completion requests with content blocks
+	requests := make([]*schemas.BifrostChatRequest, len(testCases))
+	for i := range testCases {
+		// Create content blocks. Each Text pointer refers to the slice element
+		// itself rather than to a range variable: before Go 1.22 the range variable
+		// is shared across iterations, so all blocks of a request would end up
+		// pointing at the same (last) string.
+		texts := testCases[i].textBlocks
+		contentBlocks := make([]schemas.ChatContentBlock, len(texts))
+		for j := range texts {
+			contentBlocks[j] = schemas.ChatContentBlock{
+				Type: schemas.ChatContentBlockTypeText,
+				Text: &texts[j],
+			}
+		}
+
+		requests[i] = &schemas.BifrostChatRequest{
+			Provider: schemas.OpenAI,
+			Model:    "gpt-4o-mini",
+			Input: []schemas.ChatMessage{
+				{
+					Role: schemas.ChatMessageRoleUser,
+					Content: &schemas.ChatMessageContent{
+						ContentBlocks: contentBlocks,
+					},
+				},
+			},
+			Params: &schemas.ChatParameters{
+				Temperature:         PtrFloat64(0.5),
+				MaxCompletionTokens: PtrInt(50),
+			},
+		}
+	}
+
+	// Make first request (should miss cache and be stored)
+	t.Logf("Making first request with content blocks: %v", testCases[0].textBlocks)
+	response1, err1 := setup.Client.ChatCompletionRequest(ctx, requests[0])
+	if err1 != nil {
+		return // Test will be skipped by retry function
+	}
+
+	if response1 == nil || len(response1.Choices) == 0 {
+		t.Fatal("First response is invalid")
+	}
+
+	AssertNoCacheHit(t, &schemas.BifrostResponse{ChatResponse: response1})
+	WaitForCache(setup.Plugin)
+
+	// Test all other variations should hit cache due to normalization
+	for i := 1; i < len(testCases); i++ {
+		tc := testCases[i]
+		t.Logf("Testing variation '%s' with content blocks: %v", tc.name, tc.textBlocks)
+
+		response, err := setup.Client.ChatCompletionRequest(ctx, requests[i])
+		if err != nil {
+			t.Fatalf("Request for case '%s' failed: %v", tc.name, err)
+		}
+
+		if response == nil || len(response.Choices) == 0 {
+			t.Fatalf("Response for case '%s' is invalid", tc.name)
+		}
+
+		// Should be cache hit due to normalization
+		AssertCacheHit(t, &schemas.BifrostResponse{ChatResponse: response}, "direct")
+		t.Logf("✓ Cache hit for '%s' variation", tc.name)
+	}
+}
+
+// TestNormalizationWithSemanticCache tests that normalization works with semantic cache as well.
+// It issues three requests under one cache key: the original prompt (expected
+// miss), the same prompt in lowercase (expected direct hit) and a differently
+// worded prompt on the same topic (expected semantic hit).
+func TestNormalizationWithSemanticCache(t *testing.T) {
+	setup := NewTestSetup(t)
+	defer setup.Cleanup()
+
+	ctx := CreateContextWithCacheKey("test-normalization-semantic")
+
+	// Make first request with original text
+	originalRequest := CreateBasicChatRequest("What is Machine Learning?", 0.5, 50)
+	t.Log("Making first request with original text...")
+	response1, err1 := setup.Client.ChatCompletionRequest(ctx, originalRequest)
+	if err1 != nil {
+		return // Test will be skipped by retry function
+	}
+
+	AssertNoCacheHit(t, &schemas.BifrostResponse{ChatResponse: response1})
+	WaitForCache(setup.Plugin)
+
+	// Same prompt with different letter case only; the assertion below expects
+	// this to be a direct hit, not a semantic one
+	normalizedRequest := CreateBasicChatRequest("what is machine learning?", 0.5, 50)
+	t.Log("Making semantic request with normalized case...")
+	response2, err2 := setup.Client.ChatCompletionRequest(ctx, normalizedRequest)
+	if err2 != nil {
+		if err2.Error != nil {
+			t.Fatalf("Second request failed: %v", err2.Error.Message)
+		} else {
+			t.Fatalf("Second request failed: %v", err2)
+		}
+	}
+
+	// This should be a direct cache hit since the normalized text is identical
+	AssertCacheHit(t, &schemas.BifrostResponse{ChatResponse: response2}, "direct")
+	t.Log("✓ Direct cache hit with normalized text")
+
+	// Test with semantically similar but different text
+	semanticRequest := CreateBasicChatRequest("can you explain machine learning concepts?", 0.5, 50)
+	t.Log("Making semantically similar request...")
+	response3, err3 := setup.Client.ChatCompletionRequest(ctx, semanticRequest)
+	if err3 != nil {
+		t.Fatalf("Third request failed: %v", err3)
+	}
+
+	// This should be a semantic cache hit
+	AssertCacheHit(t, &schemas.BifrostResponse{ChatResponse: response3}, "semantic")
+	t.Log("✓ Semantic cache hit with similar content")
+}
+
+// PtrFloat64 returns a pointer to a fresh float64 holding the value f.
+func PtrFloat64(f float64) *float64 {
+	p := new(float64)
+	*p = f
+	return p
+}
+
+// PtrInt returns a pointer to a fresh int holding the value i.
+func PtrInt(i int) *int {
+	p := new(int)
+	*p = i
+	return p
+}
--- a/plugins/semanticcache/plugin_responses_test.go
+++ b/plugins/semanticcache/plugin_responses_test.go
@@ -0,0 +1,451 @@
+package semanticcache
+
+import (
+	"testing"
+	"time"
+
+	"github.com/maximhq/bifrost/core/schemas"
+)
+
+// TestResponsesAPIBasicFunctionality tests the core caching functionality with Responses API.
+// The same request is sent twice under one cache key: the second call must be
+// reported as a direct cache hit and must carry the provider of the original
+// request. Both durations are logged for comparison but are not asserted on.
+func TestResponsesAPIBasicFunctionality(t *testing.T) {
+	setup := NewTestSetup(t)
+	defer setup.Cleanup()
+
+	ctx := CreateContextWithCacheKey("test-responses-basic")
+
+	// Create test request
+	testRequest := CreateBasicResponsesRequest(
+		"What is Bifrost? Answer in one short sentence.",
+		0.7,
+		500,
+	)
+
+	t.Log("Making first Responses API request (should go to OpenAI and be cached)...")
+
+	// Make first request (will go to OpenAI and be cached) - with retries
+	start1 := time.Now()
+	response1, err1 := setup.Client.ResponsesRequest(ctx, testRequest)
+	duration1 := time.Since(start1)
+
+	if err1 != nil {
+		return // Test will be skipped by retry function
+	}
+
+	if response1 == nil || len(response1.Output) == 0 {
+		t.Fatal("First Responses response is invalid")
+	}
+
+	t.Logf("First request completed in %v", duration1)
+	t.Logf("Response contains %d output messages", len(response1.Output))
+	if c := response1.Output[0].Content; c != nil && c.ContentStr != nil {
+		t.Logf("Response: %s", *c.ContentStr)
+	} else if c != nil && len(c.ContentBlocks) > 0 && c.ContentBlocks[0].Text != nil {
+		t.Logf("Response: %s", *c.ContentBlocks[0].Text)
+	} else {
+		t.Log("Response: <no text>")
+	}
+
+	// Wait for cache to be written
+	WaitForCache(setup.Plugin)
+
+	t.Log("Making second identical Responses API request (should be served from cache)...")
+
+	// Make second identical request (should be cached)
+	start2 := time.Now()
+	response2, err2 := setup.Client.ResponsesRequest(ctx, testRequest)
+	duration2 := time.Since(start2)
+
+	if err2 != nil {
+		t.Fatalf("Second Responses request failed: %v", err2)
+	}
+
+	if response2 == nil || len(response2.Output) == 0 {
+		t.Fatal("Second Responses response is invalid")
+	}
+	// Log the text defensively, exactly as for the first response: Content may be
+	// nil and ContentBlocks may be empty or carry a nil Text, and logging must not
+	// panic before the real assertions run.
+	if c := response2.Output[0].Content; c != nil && c.ContentStr != nil {
+		t.Logf("Response: %s", *c.ContentStr)
+	} else if c != nil && len(c.ContentBlocks) > 0 && c.ContentBlocks[0].Text != nil {
+		t.Logf("Response: %s", *c.ContentBlocks[0].Text)
+	} else {
+		t.Log("Response: <no text>")
+	}
+
+	t.Logf("Second request completed in %v", duration2)
+
+	// Verify cache hit
+	AssertCacheHit(t, &schemas.BifrostResponse{ResponsesResponse: response2}, string(CacheTypeDirect))
+
+	// Performance comparison
+	t.Logf("Performance Summary:")
+	t.Logf("First request (OpenAI):  %v", duration1)
+	t.Logf("Second request (Cache):  %v", duration2)
+
+	// Informational only: timing is too environment-dependent to fail the test on.
+	if duration2 >= duration1 {
+		t.Log("⚠️  Cache doesn't seem faster, but this could be due to test environment")
+	}
+
+	// Verify provider information is maintained in cached response
+	if response2.ExtraFields.Provider != testRequest.Provider {
+		t.Errorf("Provider mismatch in cached response: expected %s, got %s",
+			testRequest.Provider, response2.ExtraFields.Provider)
+	}
+
+	t.Log("✅ Basic Responses API semantic caching test completed successfully!")
+}
+
+// TestResponsesAPIDifferentParameters sends pairs of Responses requests under
+// one cache key and asserts that the second request is a direct hit only when
+// the pair is identical; a difference in temperature, max output tokens or
+// instructions must produce a miss.
+func TestResponsesAPIDifferentParameters(t *testing.T) {
+	setup := NewTestSetup(t)
+	defer setup.Cleanup()
+
+	ctx := CreateContextWithCacheKey("test-responses-params")
+	prompt := "Explain quantum computing"
+
+	// first is sent to populate the cache; second is the request under test.
+	cases := []struct {
+		name        string
+		first       *schemas.BifrostResponsesRequest
+		second      *schemas.BifrostResponsesRequest
+		shouldCache bool
+	}{
+		{
+			name:        "Identical Requests",
+			first:       CreateBasicResponsesRequest(prompt, 0.5, 500),
+			second:      CreateBasicResponsesRequest(prompt, 0.5, 500),
+			shouldCache: true,
+		},
+		{
+			name:        "Different Temperature",
+			first:       CreateBasicResponsesRequest(prompt, 0.1, 500),
+			second:      CreateBasicResponsesRequest(prompt, 0.9, 500),
+			shouldCache: false,
+		},
+		{
+			name:        "Different MaxOutputTokens",
+			first:       CreateBasicResponsesRequest(prompt, 0.5, 500),
+			second:      CreateBasicResponsesRequest(prompt, 0.5, 200),
+			shouldCache: false,
+		},
+		{
+			name:        "Different Instructions",
+			first:       CreateResponsesRequestWithInstructions(prompt, "Be concise", 0.5, 500),
+			second:      CreateResponsesRequestWithInstructions(prompt, "Be detailed", 0.5, 500),
+			shouldCache: false,
+		},
+	}
+
+	for i := range cases {
+		tc := cases[i]
+		t.Run(tc.name, func(t *testing.T) {
+			// Start every subtest from an empty store.
+			clearTestKeysWithStore(t, setup.Store)
+
+			// Populate the cache with the first request of the pair.
+			if _, firstErr := setup.Client.ResponsesRequest(ctx, tc.first); firstErr != nil {
+				return // presumably already skipped by the retry wrapper
+			}
+
+			WaitForCache(setup.Plugin)
+
+			// Send the second request and classify the outcome.
+			secondResp, secondErr := setup.Client.ResponsesRequest(ctx, tc.second)
+			if secondErr != nil {
+				if secondErr.Error != nil {
+					t.Fatalf("Second request failed: %v", secondErr.Error.Message)
+				}
+				t.Fatalf("Second request failed: %v", secondErr)
+			}
+
+			if !tc.shouldCache {
+				AssertNoCacheHit(t, &schemas.BifrostResponse{ResponsesResponse: secondResp})
+				t.Log("✓ Parameters differ: no cache hit as expected")
+				return
+			}
+			AssertCacheHit(t, &schemas.BifrostResponse{ResponsesResponse: secondResp}, "direct")
+			t.Log("✓ Parameters match: cache hit as expected")
+		})
+	}
+}
+
+// TestResponsesAPISemanticMatching seeds the cache with one Responses request
+// and asserts that a differently worded request on the same topic is returned
+// as a semantic cache hit. The context selects the semantic cache type.
+func TestResponsesAPISemanticMatching(t *testing.T) {
+	setup := NewTestSetup(t)
+	defer setup.Cleanup()
+
+	ctx := CreateContextWithCacheKeyAndType("test-responses-semantic", CacheTypeSemantic)
+
+	// Seed request: must be a miss.
+	seedRequest := CreateBasicResponsesRequest("What is machine learning?", 0.5, 500)
+	t.Log("Making first Responses request with original text...")
+	seedResp, seedErr := setup.Client.ResponsesRequest(ctx, seedRequest)
+	if seedErr != nil {
+		return // presumably already skipped by the retry wrapper
+	}
+
+	AssertNoCacheHit(t, &schemas.BifrostResponse{ResponsesResponse: seedResp})
+	WaitForCache(setup.Plugin)
+
+	// Reworded request: expected to match the seed semantically.
+	rewordedRequest := CreateBasicResponsesRequest("Can you explain machine learning concepts?", 0.5, 500)
+	t.Log("Making semantically similar Responses request...")
+	rewordedResp, rewordedErr := setup.Client.ResponsesRequest(ctx, rewordedRequest)
+	if rewordedErr != nil {
+		if rewordedErr.Error != nil {
+			t.Fatalf("Second request failed: %v", rewordedErr.Error.Message)
+		}
+		t.Fatalf("Second request failed: %v", rewordedErr)
+	}
+
+	AssertCacheHit(t, &schemas.BifrostResponse{ResponsesResponse: rewordedResp}, "semantic")
+	t.Log("✓ Semantic cache hit with similar content")
+}
+
+// TestResponsesAPIWithInstructions asserts that two separately constructed but
+// identical Responses requests carrying system instructions share one cache
+// entry: the first is a miss, the second a direct hit.
+func TestResponsesAPIWithInstructions(t *testing.T) {
+	setup := NewTestSetup(t)
+	defer setup.Cleanup()
+
+	ctx := CreateContextWithCacheKey("test-responses-instructions")
+
+	// First instance of the request.
+	seedRequest := CreateResponsesRequestWithInstructions(
+		"Explain artificial intelligence",
+		"You are a helpful assistant. Be concise and accurate.",
+		0.7,
+		500,
+	)
+
+	t.Log("Making first Responses request with instructions...")
+	seedResp, seedErr := setup.Client.ResponsesRequest(ctx, seedRequest)
+	if seedErr != nil {
+		return // presumably already skipped by the retry wrapper
+	}
+
+	AssertNoCacheHit(t, &schemas.BifrostResponse{ResponsesResponse: seedResp})
+	WaitForCache(setup.Plugin)
+
+	// Second instance: a fresh object with exactly the same arguments.
+	repeatRequest := CreateResponsesRequestWithInstructions(
+		"Explain artificial intelligence",
+		"You are a helpful assistant. Be concise and accurate.",
+		0.7,
+		500,
+	)
+
+	t.Log("Making second identical Responses request with instructions...")
+	repeatResp, repeatErr := setup.Client.ResponsesRequest(ctx, repeatRequest)
+	if repeatErr != nil {
+		if repeatErr.Error != nil {
+			t.Fatalf("Second request failed: %v", repeatErr.Error.Message)
+		}
+		t.Fatalf("Second request failed: %v", repeatErr)
+	}
+
+	AssertCacheHit(t, &schemas.BifrostResponse{ResponsesResponse: repeatResp}, "direct")
+	t.Log("✓ Responses API with instructions cached correctly")
+}
+
+// TestResponsesAPICacheExpiration exercises the per-request TTL for Responses
+// requests: with a 5-second TTL the first call is a miss, an immediate repeat
+// is a direct hit, and a repeat issued after sleeping past the TTL is a miss
+// again.
+func TestResponsesAPICacheExpiration(t *testing.T) {
+	setup := NewTestSetup(t)
+	defer setup.Cleanup()
+
+	// Short TTL so that expiry can be observed within the test.
+	ttl := 5 * time.Second
+	ctx := CreateContextWithCacheKeyAndTTL("test-responses-ttl", ttl)
+
+	req := CreateBasicResponsesRequest("TTL test for Responses API", 0.5, 500)
+
+	t.Log("Making first Responses request with short TTL...")
+	first, firstErr := setup.Client.ResponsesRequest(ctx, req)
+	if firstErr != nil {
+		return // presumably already skipped by the retry wrapper
+	}
+	AssertNoCacheHit(t, &schemas.BifrostResponse{ResponsesResponse: first})
+
+	WaitForCache(setup.Plugin)
+
+	t.Log("Making second Responses request before TTL expiration...")
+	beforeExpiry, beforeExpiryErr := setup.Client.ResponsesRequest(ctx, req)
+	if beforeExpiryErr != nil {
+		if beforeExpiryErr.Error != nil {
+			t.Fatalf("Second request failed: %v", beforeExpiryErr.Error.Message)
+		}
+		t.Fatalf("Second request failed: %v", beforeExpiryErr)
+	}
+	AssertCacheHit(t, &schemas.BifrostResponse{ResponsesResponse: beforeExpiry}, "direct")
+
+	// Sleep for the TTL plus a two-second margin.
+	t.Logf("Waiting for TTL expiration (%v)...", ttl)
+	time.Sleep(ttl + 2*time.Second)
+
+	t.Log("Making third Responses request after TTL expiration...")
+	afterExpiry, afterExpiryErr := setup.Client.ResponsesRequest(ctx, req)
+	if afterExpiryErr != nil {
+		return // presumably already skipped by the retry wrapper
+	}
+	// The entry has expired, so this call must be a miss.
+	AssertNoCacheHit(t, &schemas.BifrostResponse{ResponsesResponse: afterExpiry})
+
+	t.Log("✅ Responses API requests properly handle TTL expiration")
+}
+
+// TestResponsesAPIWithoutCacheKey sends a Responses request using a context
+// built with an empty cache key and asserts the response is not a cache hit.
+func TestResponsesAPIWithoutCacheKey(t *testing.T) {
+	env := NewTestSetup(t)
+	defer env.Cleanup()
+
+	// The context is created with an empty string as its cache key.
+	noKeyCtx := CreateContextWithCacheKey("")
+	req := CreateBasicResponsesRequest("Test Responses without cache key", 0.5, 500)
+
+	t.Log("Making Responses request without cache key...")
+
+	resp, reqErr := env.Client.ResponsesRequest(noKeyCtx, req)
+	if reqErr != nil {
+		// Per the original note, the retry function is expected to skip the test.
+		return
+	}
+
+	AssertNoCacheHit(t, &schemas.BifrostResponse{ResponsesResponse: resp})
+
+	t.Log("✅ Responses requests without cache key are properly not cached")
+}
+
+// TestResponsesAPINoStoreFlag issues the same Responses request twice on a
+// context created with the no-store flag set and asserts that neither response
+// is reported as a cache hit.
+func TestResponsesAPINoStoreFlag(t *testing.T) {
+	env := NewTestSetup(t)
+	defer env.Cleanup()
+
+	req := CreateBasicResponsesRequest("Test no-store with Responses API", 0.7, 500)
+	noStoreCtx := CreateContextWithCacheKeyAndNoStore("test-no-store-responses", true)
+
+	t.Log("Testing no-store with Responses API...")
+	firstResp, firstErr := env.Client.ResponsesRequest(noStoreCtx, req)
+	if firstErr != nil {
+		// Per the original note, the retry function is expected to skip the test.
+		return
+	}
+	AssertNoCacheHit(t, &schemas.BifrostResponse{ResponsesResponse: firstResp})
+
+	WaitForCache(env.Plugin)
+
+	// Repeat the request: with no-store set, nothing should have been cached.
+	secondResp, secondErr := env.Client.ResponsesRequest(noStoreCtx, req)
+	if secondErr != nil {
+		// Per the original note, the retry function is expected to skip the test.
+		return
+	}
+	AssertNoCacheHit(t, &schemas.BifrostResponse{ResponsesResponse: secondResp})
+
+	t.Log("✅ Responses API no-store flag working correctly")
+}
+
+// TestResponsesAPIStreaming first issues a non-streaming Responses request and
+// then a streaming request with the same prompt and parameters. It fails on any
+// stream error or on an empty stream; whether a chunk is flagged as a cache hit
+// is only logged, not asserted.
+func TestResponsesAPIStreaming(t *testing.T) {
+	t.Log("Responses streaming not supported yet")
+
+	env := NewTestSetup(t)
+	defer env.Cleanup()
+
+	cacheCtx := CreateContextWithCacheKey("test-responses-streaming")
+	const prompt = "Explain the basics of quantum computing in simple terms"
+
+	// Non-streaming request goes first.
+	t.Log("Making non-streaming Responses request...")
+	plainReq := CreateBasicResponsesRequest(prompt, 0.5, 500)
+	if _, plainErr := env.Client.ResponsesRequest(cacheCtx, plainReq); plainErr != nil {
+		// Per the original note, the retry function is expected to skip the test.
+		return
+	}
+
+	WaitForCache(env.Plugin)
+
+	// Streaming request with the same prompt and parameters.
+	t.Log("Making streaming Responses request with same prompt...")
+	streamReq := CreateStreamingResponsesRequest(prompt, 0.5, 500)
+	stream, streamErr := env.Client.ResponsesStreamRequest(cacheCtx, streamReq)
+	if streamErr != nil {
+		t.Fatalf("Streaming Responses request failed: %v", streamErr)
+	}
+
+	// Drain the stream, counting chunks and noting whether any is a cache hit.
+	chunkCount := 0
+	cacheHitFound := false
+	for msg := range stream {
+		if msg.BifrostError != nil {
+			t.Fatalf("Error in Responses stream: %v", msg.BifrostError)
+		}
+		chunk := msg.BifrostResponsesStreamResponse
+		if chunk == nil {
+			continue
+		}
+		chunkCount++
+		if debug := chunk.ExtraFields.CacheDebug; debug != nil && debug.CacheHit {
+			cacheHitFound = true
+		}
+	}
+
+	if chunkCount == 0 {
+		t.Fatal("No streaming responses received")
+	}
+
+	if cacheHitFound {
+		t.Log("✓ Cache hit detected in streaming Responses API")
+	} else {
+		t.Log("⚠️  No cache hit detected in streaming responses - this could be expected behavior")
+	}
+
+	t.Log("✅ Streaming Responses API test completed")
+}
+
+// TestResponsesAPIComplexParameters checks that a Responses request with several
+// optional parameters set (TopP, Background, ParallelToolCalls, ServiceTier,
+// Store) misses the cache the first time and yields a "direct" hit when an
+// identical, separately constructed request is sent again.
+func TestResponsesAPIComplexParameters(t *testing.T) {
+	// Small pointer helpers; each call yields a distinct pointer.
+	boolPtr := func(v bool) *bool { return &v }
+	strPtr := func(v string) *string { return &v }
+
+	env := NewTestSetup(t)
+	defer env.Cleanup()
+
+	cacheCtx := CreateContextWithCacheKey("test-responses-complex-params")
+
+	// First request with the extra parameters populated.
+	firstReq := CreateBasicResponsesRequest("Test complex parameters", 0.8, 500)
+	firstReq.Params.TopP = PtrFloat64(0.9)
+	firstReq.Params.Background = boolPtr(true)
+	firstReq.Params.ParallelToolCalls = boolPtr(false)
+	firstReq.Params.ServiceTier = strPtr("default")
+	firstReq.Params.Store = boolPtr(true)
+
+	t.Log("Making first Responses request with complex parameters...")
+	firstResp, firstErr := env.Client.ResponsesRequest(cacheCtx, firstReq)
+	if firstErr != nil {
+		// Per the original note, the retry function is expected to skip the test.
+		return
+	}
+
+	AssertNoCacheHit(t, &schemas.BifrostResponse{ResponsesResponse: firstResp})
+	WaitForCache(env.Plugin)
+
+	// Second request: same values, freshly allocated.
+	secondReq := CreateBasicResponsesRequest("Test complex parameters", 0.8, 500)
+	secondReq.Params.TopP = PtrFloat64(0.9)
+	secondReq.Params.Background = boolPtr(true)
+	secondReq.Params.ParallelToolCalls = boolPtr(false)
+	secondReq.Params.ServiceTier = strPtr("default")
+	secondReq.Params.Store = boolPtr(true)
+
+	t.Log("Making second identical Responses request with complex parameters...")
+	secondResp, secondErr := env.Client.ResponsesRequest(cacheCtx, secondReq)
+	if secondErr != nil {
+		if secondErr.Error != nil {
+			t.Fatalf("Second request failed: %v", secondErr.Error.Message)
+		}
+		t.Fatalf("Second request failed: %v", secondErr)
+	}
+
+	AssertCacheHit(t, &schemas.BifrostResponse{ResponsesResponse: secondResp}, "direct")
+	t.Log("✓ Responses API with complex parameters cached correctly")
+}
--- a/plugins/semanticcache/plugin_streaming_test.go
+++ b/plugins/semanticcache/plugin_streaming_test.go
@@ -0,0 +1,333 @@
+package semanticcache
+
+import (
+	"testing"
+	"time"
+
+	"github.com/maximhq/bifrost/core/schemas"
+)
+
+// TestStreamingCacheBasicFunctionality sends the same streaming chat request
+// twice and verifies that the second stream is served from cache: it must
+// contain at least one chunk flagged as a cache hit, have the same number of
+// chunks as the first stream, complete faster, and preserve chunk indices.
+func TestStreamingCacheBasicFunctionality(t *testing.T) {
+	setup := NewTestSetup(t)
+	defer setup.Cleanup()
+
+	ctx := CreateContextWithCacheKey("test-stream-value")
+
+	// Create a test streaming request
+	testRequest := CreateStreamingChatRequest(
+		"Count from 1 to 3, each number on a new line.",
+		0.0, // Use 0 temperature for more predictable responses
+		20,
+	)
+
+	t.Log("Making first streaming request (should go to OpenAI and be cached)...")
+
+	// Make first streaming request
+	start1 := time.Now()
+	stream1, err1 := setup.Client.ChatCompletionStreamRequest(ctx, testRequest)
+	if err1 != nil {
+		return // Test will be skipped by retry function
+	}
+
+	var responses1 []schemas.BifrostChatResponse
+	for streamMsg := range stream1 {
+		if streamMsg.BifrostError != nil {
+			t.Fatalf("Error in first stream: %v", streamMsg.BifrostError)
+		}
+		if streamMsg.BifrostChatResponse != nil {
+			responses1 = append(responses1, *streamMsg.BifrostChatResponse)
+		}
+	}
+	duration1 := time.Since(start1)
+
+	if len(responses1) == 0 {
+		t.Fatal("First streaming request returned no responses")
+	}
+
+	t.Logf("First streaming request completed in %v with %d chunks", duration1, len(responses1))
+
+	// Wait for cache to be written
+	WaitForCache(setup.Plugin)
+
+	t.Log("Making second identical streaming request (should be served from cache)...")
+
+	// Make second identical streaming request
+	start2 := time.Now()
+	stream2, err2 := setup.Client.ChatCompletionStreamRequest(ctx, testRequest)
+	if err2 != nil {
+		t.Fatalf("Second streaming request failed: %v", err2)
+	}
+
+	var responses2 []schemas.BifrostChatResponse
+	for streamMsg := range stream2 {
+		if streamMsg.BifrostError != nil {
+			t.Fatalf("Error in second stream: %v", streamMsg.BifrostError)
+		}
+		if streamMsg.BifrostChatResponse != nil {
+			responses2 = append(responses2, *streamMsg.BifrostChatResponse)
+		}
+	}
+	duration2 := time.Since(start2)
+
+	if len(responses2) == 0 {
+		t.Fatal("Second streaming request returned no responses")
+	}
+
+	t.Logf("Second streaming request completed in %v with %d chunks", duration2, len(responses2))
+
+	// Validate that both streams have the same number of chunks.
+	// This is reported with Errorf (non-fatal), so later checks must tolerate
+	// slices of different lengths.
+	if len(responses1) != len(responses2) {
+		t.Errorf("Stream chunk count mismatch: original=%d, cached=%d", len(responses1), len(responses2))
+	}
+
+	// Validate that the second stream was cached
+	cached := false
+	for _, response := range responses2 {
+		if response.ExtraFields.CacheDebug != nil && response.ExtraFields.CacheDebug.CacheHit {
+			cached = true
+			break
+		}
+	}
+
+	if !cached {
+		t.Fatal("Second streaming request was not served from cache")
+	}
+
+	// Validate performance improvement
+	if duration2 >= duration1 {
+		t.Errorf("Cached stream took longer than original: cache=%v, original=%v", duration2, duration1)
+	} else {
+		speedup := float64(duration1) / float64(duration2)
+		t.Logf("Streaming cache speedup: %.2fx faster", speedup)
+	}
+
+	// Validate chunk ordering is maintained. Only the positions present in both
+	// streams are compared; indexing responses1 with the full range of
+	// responses2 would panic when the cached stream is longer.
+	for i := 0; i < len(responses2) && i < len(responses1); i++ {
+		if responses2[i].ExtraFields.ChunkIndex != responses1[i].ExtraFields.ChunkIndex {
+			t.Errorf("Chunk index mismatch at position %d: original=%d, cached=%d",
+				i, responses1[i].ExtraFields.ChunkIndex, responses2[i].ExtraFields.ChunkIndex)
+		}
+	}
+
+	t.Log("✅ Streaming cache test completed successfully!")
+}
+
+// TestStreamingVsNonStreaming issues a non-streaming chat request followed by a
+// streaming request with the same prompt and parameters, and reports an error
+// if any streamed chunk carries a true "bifrost_cached" flag in its raw
+// response map. It finally asserts the non-streaming response was not a hit.
+//
+// NOTE(review): other tests in this file detect hits through
+// ExtraFields.CacheDebug.CacheHit, whereas this one inspects RawResponse;
+// confirm that the "bifrost_cached" flag is still populated.
+func TestStreamingVsNonStreaming(t *testing.T) {
+	env := NewTestSetup(t)
+	defer env.Cleanup()
+
+	cacheCtx := CreateContextWithCacheKey("stream-vs-non-test")
+
+	const prompt = "What is the meaning of life?"
+
+	// Non-streaming request goes first.
+	t.Log("Making non-streaming request...")
+	plainReq := CreateBasicChatRequest(prompt, 0.5, 50)
+	nonStreamResponse, plainErr := env.Client.ChatCompletionRequest(cacheCtx, plainReq)
+	if plainErr != nil {
+		// Per the original note, the retry function is expected to skip the test.
+		return
+	}
+
+	WaitForCache(env.Plugin)
+
+	// Streaming request with the same prompt and parameters.
+	t.Log("Making streaming request with same prompt...")
+	streamReq := CreateStreamingChatRequest(prompt, 0.5, 50)
+	stream, streamErr := env.Client.ChatCompletionStreamRequest(cacheCtx, streamReq)
+	if streamErr != nil {
+		t.Fatalf("Streaming request failed: %v", streamErr)
+	}
+
+	var chunks []schemas.BifrostChatResponse
+	for msg := range stream {
+		if msg.BifrostError != nil {
+			t.Fatalf("Error in stream: %v", msg.BifrostError)
+		}
+		if msg.BifrostChatResponse != nil {
+			chunks = append(chunks, *msg.BifrostChatResponse)
+		}
+	}
+
+	if len(chunks) == 0 {
+		t.Fatal("Streaming request returned no responses")
+	}
+
+	// Streaming and non-streaming entries are expected to be cached separately,
+	// so no chunk should be flagged as cached here.
+	streamCached := false
+	for _, chunk := range chunks {
+		rawMap, isMap := chunk.ExtraFields.RawResponse.(map[string]interface{})
+		if !isMap {
+			continue
+		}
+		// A missing key yields a nil interface, which fails the bool assertion.
+		if flag, isBool := rawMap["bifrost_cached"].(bool); isBool && flag {
+			streamCached = true
+			break
+		}
+	}
+
+	if !streamCached {
+		t.Log("✅ Streaming request correctly not cached from non-streaming cache")
+	} else {
+		t.Error("Streaming request should not be cached from non-streaming cache")
+	}
+
+	// The earlier non-streaming response must itself not be a cache hit.
+	AssertNoCacheHit(t, &schemas.BifrostResponse{ChatResponse: nonStreamResponse})
+
+	t.Log("✅ Streaming vs non-streaming test completed!")
+}
+
+// TestStreamingChunkOrdering replays a multi-chunk streaming chat request and
+// checks that the second (cached) stream has the same chunk count and chunk
+// indices as the first, that its indices are strictly increasing, and that its
+// last chunk is reported as a direct cache hit.
+func TestStreamingChunkOrdering(t *testing.T) {
+	env := NewTestSetup(t)
+	defer env.Cleanup()
+
+	cacheCtx := CreateContextWithCacheKey("chunk-order-test")
+
+	// Prompt chosen so that the reply spans several chunks.
+	req := CreateStreamingChatRequest(
+		"List the first 5 prime numbers, one per line with explanation.",
+		0.0,
+		100,
+	)
+
+	t.Log("Making first streaming request to establish cache...")
+	firstStream, firstErr := env.Client.ChatCompletionStreamRequest(cacheCtx, req)
+	if firstErr != nil {
+		// Per the original note, the retry function is expected to skip the test.
+		return
+	}
+
+	var originalChunks []schemas.BifrostChatResponse
+	for msg := range firstStream {
+		if msg.BifrostError != nil {
+			t.Fatalf("Error in first stream: %v", msg.BifrostError)
+		}
+		if msg.BifrostChatResponse != nil {
+			originalChunks = append(originalChunks, *msg.BifrostChatResponse)
+		}
+	}
+
+	if len(originalChunks) < 2 {
+		t.Skipf("Need at least 2 chunks to test ordering, got %d", len(originalChunks))
+	}
+
+	t.Logf("Original stream had %d chunks", len(originalChunks))
+
+	WaitForCache(env.Plugin)
+
+	t.Log("Making second streaming request to test cached chunk ordering...")
+	secondStream, secondErr := env.Client.ChatCompletionStreamRequest(cacheCtx, req)
+	if secondErr != nil {
+		t.Fatalf("Second streaming request failed: %v", secondErr)
+	}
+
+	var cachedChunks []schemas.BifrostChatResponse
+	for msg := range secondStream {
+		if msg.BifrostError != nil {
+			t.Fatalf("Error in second stream: %v", msg.BifrostError)
+		}
+		if msg.BifrostChatResponse != nil {
+			cachedChunks = append(cachedChunks, *msg.BifrostChatResponse)
+		}
+	}
+
+	originalCount, cachedCount := len(originalChunks), len(cachedChunks)
+	if cachedCount != originalCount {
+		t.Errorf("Cached stream chunk count mismatch: original=%d, cached=%d",
+			originalCount, cachedCount)
+	}
+
+	// Compare chunk indices position by position over the shared prefix.
+	for pos := 0; pos < cachedCount && pos < originalCount; pos++ {
+		want := originalChunks[pos].ExtraFields.ChunkIndex
+		got := cachedChunks[pos].ExtraFields.ChunkIndex
+		if want != got {
+			t.Errorf("Chunk index mismatch at position %d: original=%d, cached=%d",
+				pos, want, got)
+		}
+	}
+
+	// Only the last cached chunk is checked for the cache-hit marker (the
+	// original comment says that is where CacheDebug is set). As before, the
+	// check runs only when that position also exists in the original stream.
+	if cachedCount > 0 && cachedCount <= originalCount {
+		lastChunk := &cachedChunks[cachedCount-1]
+		AssertCacheHit(t, &schemas.BifrostResponse{ChatResponse: lastChunk}, string(CacheTypeDirect))
+	}
+
+	// Cached chunk indices must be strictly increasing.
+	for pos := 1; pos < cachedCount; pos++ {
+		prev := cachedChunks[pos-1].ExtraFields.ChunkIndex
+		curr := cachedChunks[pos].ExtraFields.ChunkIndex
+		if curr <= prev {
+			t.Errorf("Chunks not in sequential order: chunk %d has index %d, chunk %d has index %d",
+				pos-1, prev, pos, curr)
+		}
+	}
+
+	t.Log("✅ Streaming chunk ordering test completed successfully!")
+}
+
+// TestSpeechSynthesisStreaming sends the same speech synthesis request twice
+// through SpeechRequest, asserts that the second response is a direct cache
+// hit, and logs the timing of both calls for comparison.
+func TestSpeechSynthesisStreaming(t *testing.T) {
+	env := NewTestSetup(t)
+	defer env.Cleanup()
+
+	cacheCtx := CreateContextWithCacheKey("speech-stream-test")
+
+	req := CreateSpeechRequest(
+		"This is a test of speech synthesis streaming cache.",
+		"alloy",
+	)
+
+	t.Log("Making first speech synthesis request...")
+	firstStart := time.Now()
+	firstResp, firstErr := env.Client.SpeechRequest(cacheCtx, req)
+	firstElapsed := time.Since(firstStart)
+
+	if firstErr != nil {
+		// Per the original note, the retry function is expected to skip the test.
+		return
+	}
+	if firstResp == nil {
+		t.Fatal("First speech response is nil")
+	}
+
+	t.Logf("First speech request completed in %v", firstElapsed)
+
+	WaitForCache(env.Plugin)
+
+	t.Log("Making second identical speech synthesis request...")
+	secondStart := time.Now()
+	secondResp, secondErr := env.Client.SpeechRequest(cacheCtx, req)
+	secondElapsed := time.Since(secondStart)
+
+	if secondErr != nil {
+		t.Fatalf("Second speech request failed: %v", secondErr)
+	}
+	if secondResp == nil {
+		t.Fatal("Second speech response is nil")
+	}
+
+	t.Logf("Second speech request completed in %v", secondElapsed)
+
+	// The repeated request must come back as a direct cache hit.
+	AssertCacheHit(t, &schemas.BifrostResponse{SpeechResponse: secondResp}, string(CacheTypeDirect))
+
+	// Timing is informational only; nothing is asserted on it.
+	t.Logf("Speech Synthesis Performance:")
+	t.Logf("First request:   %v", firstElapsed)
+	t.Logf("Second request:  %v", secondElapsed)
+
+	if secondElapsed < firstElapsed {
+		t.Logf("Speech cache speedup: %.2fx faster", float64(firstElapsed)/float64(secondElapsed))
+	}
+
+	t.Log("✅ Speech synthesis streaming test completed successfully!")
+}
--- a/plugins/semanticcache/plugin_vectorstore_test.go
+++ b/plugins/semanticcache/plugin_vectorstore_test.go
@@ -0,0 +1,428 @@
+package semanticcache
+
+import (
+	"context"
+	"os"
+	"strings"
+	"testing"
+
+	"github.com/google/uuid"
+	bifrost "github.com/maximhq/bifrost/core"
+	"github.com/maximhq/bifrost/core/schemas"
+	"github.com/maximhq/bifrost/framework/vectorstore"
+)
+
+// requiresVectors reports whether storeType is one of the stores these tests
+// treat as needing a vector for every entry: Qdrant, Pinecone, or Weaviate.
+// Any other store type (for example Redis) yields false.
+func requiresVectors(storeType vectorstore.VectorStoreType) bool {
+	return storeType == vectorstore.VectorStoreTypeQdrant ||
+		storeType == vectorstore.VectorStoreTypePinecone ||
+		storeType == vectorstore.VectorStoreTypeWeaviate
+}
+
+// skipIfNoAPIKey skips the calling test when storeType requires vectors (see
+// requiresVectors) and the OPENAI_API_KEY environment variable is empty or
+// unset. For any other store type it does nothing.
+func skipIfNoAPIKey(t *testing.T, storeType vectorstore.VectorStoreType) {
+	// Mark as a helper so the skip message is attributed to the calling test's
+	// line instead of this function.
+	t.Helper()
+
+	if requiresVectors(storeType) && os.Getenv("OPENAI_API_KEY") == "" {
+		t.Skipf("Skipping %s test: OPENAI_API_KEY not set (required for embedding generation)", storeType)
+	}
+}
+
+// VectorStoreTestCase describes one vector store backend for the table-driven tests: Name labels the subtest and is embedded in cache keys and log lines, StoreType selects the backend handed to the test setup.
+type VectorStoreTestCase struct {
+	Name      string
+	StoreType vectorstore.VectorStoreType
+}
+
+// getVectorStoreTestCases lists the vector store backends every table-driven
+// test iterates over, in the order Weaviate, Redis, Qdrant, Pinecone.
+func getVectorStoreTestCases() []VectorStoreTestCase {
+	cases := []VectorStoreTestCase{
+		{Name: "Weaviate", StoreType: vectorstore.VectorStoreTypeWeaviate},
+		{Name: "Redis", StoreType: vectorstore.VectorStoreTypeRedis},
+		{Name: "Qdrant", StoreType: vectorstore.VectorStoreTypeQdrant},
+		{Name: "Pinecone", StoreType: vectorstore.VectorStoreTypePinecone},
+	}
+	return cases
+}
+
+// getDefaultTestConfig builds the plugin configuration shared by the vector
+// store tests: OpenAI as provider with the text-embedding-3-small model,
+// dimension 1536, threshold 0.8, cleanup on shutdown enabled, and a single key
+// whose value is taken from the "env.OPENAI_API_KEY" env var reference.
+func getDefaultTestConfig() *Config {
+	openAIKey := schemas.Key{
+		Value:  *schemas.NewEnvVar("env.OPENAI_API_KEY"),
+		Models: schemas.WhiteList{"*"},
+		Weight: 1.0,
+	}
+
+	cfg := &Config{
+		Provider:          schemas.OpenAI,
+		EmbeddingModel:    "text-embedding-3-small",
+		Dimension:         1536,
+		Threshold:         0.8,
+		CleanUpOnShutdown: true,
+		Keys:              []schemas.Key{openAIKey},
+	}
+	return cfg
+}
+
+// TestSemanticCache_AllVectorStores_BasicFlow tests the basic cache flow across all vector stores
+func TestSemanticCache_AllVectorStores_BasicFlow(t *testing.T) {
+	for _, tc := range getVectorStoreTestCases() {
+		t.Run(tc.Name, func(t *testing.T) {
+			skipIfNoAPIKey(t, tc.StoreType)
+			setup := NewTestSetupWithVectorStore(t, getDefaultTestConfig(), tc.StoreType)
+			defer setup.Cleanup()
+
+			ctx := schemas.NewBifrostContext(context.Background(), schemas.NoDeadline)
+			ctx.SetValue(CacheKey, "test-"+strings.ToLower(tc.Name)+"-basic")
+
+			// Test request
+			request := &schemas.BifrostRequest{
+				RequestType: schemas.ChatCompletionRequest,
+				ChatRequest: &schemas.BifrostChatRequest{
+					Provider: schemas.OpenAI,
+					Model:    "gpt-4o-mini",
+					Input: []schemas.ChatMessage{
+						{
+							Role: schemas.ChatMessageRoleUser,
+							Content: &schemas.ChatMessageContent{
+								ContentStr: bifrost.Ptr("Hello from " + tc.Name + " test!"),
+							},
+						},
+					},
+					Params: &schemas.ChatParameters{
+						Temperature:         bifrost.Ptr(0.7),
+						MaxCompletionTokens: bifrost.Ptr(100),
+					},
+				},
+			}
+
+			t.Logf("[%s] Testing first request (cache miss)...", tc.Name)
+
+			// First request - should be a cache miss
+			modifiedReq, shortCircuit, err := setup.Plugin.PreLLMHook(ctx, request)
+			if err != nil {
+				t.Fatalf("[%s] PreHook failed: %v", tc.Name, err)
+			}
+
+			if shortCircuit != nil {
+				t.Fatalf("[%s] Expected cache miss, but got cache hit", tc.Name)
+			}
+
+			if modifiedReq == nil {
+				t.Fatalf("[%s] Modified request is nil", tc.Name)
+			}
+
+			t.Logf("[%s] Cache miss handled correctly", tc.Name)
+
+			// Simulate a response
+			response := &schemas.BifrostResponse{
+				ChatResponse: &schemas.BifrostChatResponse{
+					ID: uuid.New().String(),
+					Choices: []schemas.BifrostResponseChoice{
+						{
+							Index: 0,
+							ChatNonStreamResponseChoice: &schemas.ChatNonStreamResponseChoice{
+								Message: &schemas.ChatMessage{
+									Role: schemas.ChatMessageRoleAssistant,
+									Content: &schemas.ChatMessageContent{
+										ContentStr: bifrost.Ptr("Hello! Response from " + tc.Name + " test."),
+									}},
+							},
+						},
+					},
+					ExtraFields: schemas.BifrostResponseExtraFields{
+						Provider:               schemas.OpenAI,
+						OriginalModelRequested: "gpt-4o-mini",
+						RequestType:            schemas.ChatCompletionRequest,
+					},
+				},
+			}
+
+			// Cache the response
+			t.Logf("[%s] Caching response...", tc.Name)
+			_, _, err = setup.Plugin.PostLLMHook(ctx, response, nil)
+			if err != nil {
+				t.Fatalf("[%s] PostHook failed: %v", tc.Name, err)
+			}
+
+			// Wait for async caching to complete
+			WaitForCache(setup.Plugin)
+			t.Logf("[%s] Response cached successfully", tc.Name)
+
+			// Second request - should be a cache hit
+			t.Logf("[%s] Testing second identical request (expecting cache hit)...", tc.Name)
+
+			ctx2 := schemas.NewBifrostContext(context.Background(), schemas.NoDeadline)
+			ctx2.SetValue(CacheKey, "test-"+strings.ToLower(tc.Name)+"-basic")
+
+			_, shortCircuit2, err := setup.Plugin.PreLLMHook(ctx2, request)
+			if err != nil {
+				t.Fatalf("[%s] Second PreHook failed: %v", tc.Name, err)
+			}
+
+			if shortCircuit2 == nil {
+				t.Fatalf("[%s] Expected cache hit on identical request, but got cache miss", tc.Name)
+			}
+
+			if shortCircuit2.Response == nil {
+				t.Fatalf("[%s] Cache hit but response is nil", tc.Name)
+			}
+
+			t.Logf("[%s] Cache hit detected and response returned", tc.Name)
+			t.Logf("[%s] Basic flow test passed!", tc.Name)
+		})
+	}
+}
+
+// TestSemanticCache_AllVectorStores_DirectHashMatch sends the same chat request
+// twice per vector store using contexts restricted to CacheTypeDirect. The
+// first response must not be a cache hit and the second must be a "direct" hit.
+// The subtest is skipped when the first request fails.
+func TestSemanticCache_AllVectorStores_DirectHashMatch(t *testing.T) {
+	for _, tc := range getVectorStoreTestCases() {
+		t.Run(tc.Name, func(t *testing.T) {
+			skipIfNoAPIKey(t, tc.StoreType)
+			setup := NewTestSetupWithVectorStore(t, getDefaultTestConfig(), tc.StoreType)
+			defer setup.Cleanup()
+
+			// Use unique cache key per test run to avoid stale data from previous runs
+			// (Pinecone Local doesn't support deletion by metadata filter)
+			testRunID := uuid.New().String()[:8]
+			cacheKey := "test-" + strings.ToLower(tc.Name) + "-direct-" + testRunID
+
+			ctx := CreateContextWithCacheKeyAndType(cacheKey, CacheTypeDirect)
+
+			testRequest := CreateBasicChatRequest("Direct hash test for "+tc.Name+" "+testRunID, 0.7, 50)
+
+			t.Logf("[%s] Making first request to populate cache...", tc.Name)
+			response1, err1 := setup.Client.ChatCompletionRequest(ctx, testRequest)
+			if err1 != nil {
+				// Skipf stops the subtest, so no explicit return is needed.
+				t.Skipf("[%s] First request failed (likely no API key): %v", tc.Name, err1)
+			}
+			AssertNoCacheHit(t, &schemas.BifrostResponse{ChatResponse: response1})
+
+			WaitForCache(setup.Plugin)
+
+			// Second request with direct-only cache type
+			ctx2 := CreateContextWithCacheKeyAndType(cacheKey, CacheTypeDirect)
+
+			t.Logf("[%s] Making second request with CacheTypeDirect...", tc.Name)
+			response2, err2 := setup.Client.ChatCompletionRequest(ctx2, testRequest)
+			if err2 != nil {
+				// The nested Error may be nil; guard before reading its message
+				// so a failure is reported instead of panicking.
+				if err2.Error != nil {
+					t.Fatalf("[%s] Second request failed: %v", tc.Name, err2.Error.Message)
+				}
+				t.Fatalf("[%s] Second request failed: %v", tc.Name, err2)
+			}
+
+			AssertCacheHit(t, &schemas.BifrostResponse{ChatResponse: response2}, "direct")
+			t.Logf("[%s] Direct hash match test passed!", tc.Name)
+		})
+	}
+}
+
+// TestSemanticCache_AllVectorStores_NamespaceIsolation verifies, per vector
+// store, that cache entries are scoped by cache key: a request cached under
+// key 1 must miss when replayed under key 2 and must be a "direct" hit when
+// replayed under key 1 again. The subtest is skipped when the first request
+// fails.
+func TestSemanticCache_AllVectorStores_NamespaceIsolation(t *testing.T) {
+	for _, tc := range getVectorStoreTestCases() {
+		t.Run(tc.Name, func(t *testing.T) {
+			skipIfNoAPIKey(t, tc.StoreType)
+			setup := NewTestSetupWithVectorStore(t, getDefaultTestConfig(), tc.StoreType)
+			defer setup.Cleanup()
+
+			// Use unique cache keys per test run to avoid stale data from previous runs
+			// (Pinecone Local doesn't support deletion by metadata filter)
+			testRunID := uuid.New().String()[:8]
+			cacheKey1 := "test-" + strings.ToLower(tc.Name) + "-namespace-1-" + testRunID
+			cacheKey2 := "test-" + strings.ToLower(tc.Name) + "-namespace-2-" + testRunID
+
+			// Cache with first key
+			ctx1 := CreateContextWithCacheKey(cacheKey1)
+			testRequest := CreateBasicChatRequest("Namespace isolation test for "+tc.Name+" "+testRunID, 0.7, 50)
+
+			t.Logf("[%s] Making request with cache key 1...", tc.Name)
+			response1, err1 := setup.Client.ChatCompletionRequest(ctx1, testRequest)
+			if err1 != nil {
+				// Skipf stops the subtest, so no explicit return is needed.
+				t.Skipf("[%s] First request failed (likely no API key): %v", tc.Name, err1)
+			}
+			AssertNoCacheHit(t, &schemas.BifrostResponse{ChatResponse: response1})
+
+			WaitForCache(setup.Plugin)
+
+			// Try with different cache key - should miss
+			ctx2 := CreateContextWithCacheKey(cacheKey2)
+
+			t.Logf("[%s] Making same request with different cache key (expecting miss)...", tc.Name)
+			response2, err2 := setup.Client.ChatCompletionRequest(ctx2, testRequest)
+			if err2 != nil {
+				// The nested Error may be nil; guard before reading its message.
+				if err2.Error != nil {
+					t.Fatalf("[%s] Second request failed: %v", tc.Name, err2.Error.Message)
+				}
+				t.Fatalf("[%s] Second request failed: %v", tc.Name, err2)
+			}
+
+			// Should be a cache miss because different namespace
+			AssertNoCacheHit(t, &schemas.BifrostResponse{ChatResponse: response2})
+
+			// Try with original key - should hit
+			ctx3 := CreateContextWithCacheKey(cacheKey1)
+
+			t.Logf("[%s] Making same request with original cache key (expecting hit)...", tc.Name)
+			response3, err3 := setup.Client.ChatCompletionRequest(ctx3, testRequest)
+			if err3 != nil {
+				// Same nil guard as for the second request.
+				if err3.Error != nil {
+					t.Fatalf("[%s] Third request failed: %v", tc.Name, err3.Error.Message)
+				}
+				t.Fatalf("[%s] Third request failed: %v", tc.Name, err3)
+			}
+
+			AssertCacheHit(t, &schemas.BifrostResponse{ChatResponse: response3}, "direct")
+			t.Logf("[%s] Namespace isolation test passed!", tc.Name)
+		})
+	}
+}
+
+// TestSemanticCache_AllVectorStores_ParameterFiltering tests that different parameters don't share cache
+func TestSemanticCache_AllVectorStores_ParameterFiltering(t *testing.T) {
+	for _, tc := range getVectorStoreTestCases() {
+		t.Run(tc.Name, func(t *testing.T) {
+			skipIfNoAPIKey(t, tc.StoreType)
+			setup := NewTestSetupWithVectorStore(t, getDefaultTestConfig(), tc.StoreType)
+			defer setup.Cleanup()
+
+			ctx := schemas.NewBifrostContext(context.Background(), schemas.NoDeadline)
+			ctx.SetValue(CacheKey, "test-"+strings.ToLower(tc.Name)+"-params")
+
+			// First request with temperature=0.7
+			request1 := &schemas.BifrostRequest{
+				RequestType: schemas.ChatCompletionRequest,
+				ChatRequest: &schemas.BifrostChatRequest{
+					Provider: schemas.OpenAI,
+					Model:    "gpt-4o-mini",
+					Input: []schemas.ChatMessage{
+						{
+							Role: schemas.ChatMessageRoleUser,
+							Content: &schemas.ChatMessageContent{
+								ContentStr: bifrost.Ptr("Parameter test for " + tc.Name),
+							},
+						},
+					},
+					Params: &schemas.ChatParameters{
+						Temperature:         bifrost.Ptr(0.7),
+						MaxCompletionTokens: bifrost.Ptr(100),
+					},
+				},
+			}
+
+			t.Logf("[%s] Testing first request with temperature=0.7...", tc.Name)
+
+			_, shortCircuit1, err := setup.Plugin.PreLLMHook(ctx, request1)
+			if err != nil {
+				t.Fatalf("[%s] First PreHook failed: %v", tc.Name, err)
+			}
+
+			if shortCircuit1 != nil {
+				t.Fatalf("[%s] Expected cache miss for first request", tc.Name)
+			}
+
+			// Cache a response
+			response := &schemas.BifrostResponse{
+				ChatResponse: &schemas.BifrostChatResponse{
+					ID: uuid.New().String(),
+					Choices: []schemas.BifrostResponseChoice{
+						{
+							ChatNonStreamResponseChoice: &schemas.ChatNonStreamResponseChoice{
+								Message: &schemas.ChatMessage{
+									Role: schemas.ChatMessageRoleAssistant,
+									Content: &schemas.ChatMessageContent{
+										ContentStr: bifrost.Ptr("Response for " + tc.Name),
+									}},
+							},
+						},
+					},
+					ExtraFields: schemas.BifrostResponseExtraFields{
+						Provider:               schemas.OpenAI,
+						OriginalModelRequested: "gpt-4o-mini",
+						RequestType:            schemas.ChatCompletionRequest,
+					},
+				},
+			}
+
+			_, _, err = setup.Plugin.PostLLMHook(ctx, response, nil)
+			if err != nil {
+				t.Fatalf("[%s] PostHook failed: %v", tc.Name, err)
+			}
+
+			WaitForCache(setup.Plugin)
+			t.Logf("[%s] First response cached", tc.Name)
+
+			// Second request with different temperature - should be cache miss
+			t.Logf("[%s] Testing second request with temperature=0.5 (expecting cache miss)...", tc.Name)
+
+			ctx2 := schemas.NewBifrostContext(context.Background(), schemas.NoDeadline)
+			ctx2.SetValue(CacheKey, "test-"+strings.ToLower(tc.Name)+"-params")
+
+			request2 := &schemas.BifrostRequest{
+				RequestType: schemas.ChatCompletionRequest,
+				ChatRequest: &schemas.BifrostChatRequest{
+					Provider: schemas.OpenAI,
+					Model:    "gpt-4o-mini",
+					Input: []schemas.ChatMessage{
+						{
+							Role: schemas.ChatMessageRoleUser,
+							Content: &schemas.ChatMessageContent{
+								ContentStr: bifrost.Ptr("Parameter test for " + tc.Name),
+							},
+						},
+					},
+					Params: &schemas.ChatParameters{
+						Temperature:         bifrost.Ptr(0.5), // Different temperature
+						MaxCompletionTokens: bifrost.Ptr(100),
+					},
+				},
+			}
+
+			_, shortCircuit2, err := setup.Plugin.PreLLMHook(ctx2, request2)
+			if err != nil {
+				t.Fatalf("[%s] Second PreHook failed: %v", tc.Name, err)
+			}
+
+			if shortCircuit2 != nil {
+				t.Fatalf("[%s] Expected cache miss due to different temperature, but got cache hit", tc.Name)
+			}
+
+			t.Logf("[%s] Parameter filtering test passed!", tc.Name)
+		})
+	}
+}
+
+// TestSemanticCache_AllVectorStores_EmbeddingRequest sends the same embedding
+// request twice per vector store under one cache key. The first response must
+// not be a cache hit and the second must be a "direct" hit. The subtest is
+// skipped when the first request fails.
+func TestSemanticCache_AllVectorStores_EmbeddingRequest(t *testing.T) {
+	for _, tc := range getVectorStoreTestCases() {
+		t.Run(tc.Name, func(t *testing.T) {
+			skipIfNoAPIKey(t, tc.StoreType)
+			setup := NewTestSetupWithVectorStore(t, getDefaultTestConfig(), tc.StoreType)
+			defer setup.Cleanup()
+
+			// Use unique cache key per test run to avoid stale data from previous runs
+			// (Pinecone Local doesn't support deletion by metadata filter)
+			testRunID := uuid.New().String()[:8]
+			cacheKey := "test-" + strings.ToLower(tc.Name) + "-embedding-" + testRunID
+
+			embeddingRequest := CreateEmbeddingRequest([]string{"Test embedding with " + tc.Name + " " + testRunID})
+
+			// Cache first request
+			ctx1 := CreateContextWithCacheKey(cacheKey)
+			t.Logf("[%s] Making first embedding request...", tc.Name)
+			response1, err1 := setup.Client.EmbeddingRequest(ctx1, embeddingRequest)
+			if err1 != nil {
+				// Skipf stops the subtest, so no explicit return is needed.
+				t.Skipf("[%s] First embedding request failed (likely no API key): %v", tc.Name, err1)
+			}
+			AssertNoCacheHit(t, &schemas.BifrostResponse{EmbeddingResponse: response1})
+
+			WaitForCache(setup.Plugin)
+
+			// Second request - should be cache hit
+			ctx2 := CreateContextWithCacheKey(cacheKey)
+			t.Logf("[%s] Making second embedding request (expecting cache hit)...", tc.Name)
+			response2, err2 := setup.Client.EmbeddingRequest(ctx2, embeddingRequest)
+			if err2 != nil {
+				// The nested Error may be nil; guard before reading its message
+				// so a failure is reported instead of panicking.
+				if err2.Error != nil {
+					t.Fatalf("[%s] Second embedding request failed: %v", tc.Name, err2.Error.Message)
+				}
+				t.Fatalf("[%s] Second embedding request failed: %v", tc.Name, err2)
+			}
+			AssertCacheHit(t, &schemas.BifrostResponse{EmbeddingResponse: response2}, "direct")
+
+			t.Logf("[%s] Embedding request caching test passed!", tc.Name)
+		})
+	}
+}
--- a/plugins/semanticcache/search.go
+++ b/plugins/semanticcache/search.go
@@ -0,0 +1,466 @@
+package semanticcache
+
+import (
+	"context"
+	"encoding/json"
+	"errors"
+	"fmt"
+	"strconv"
+	"strings"
+	"time"
+
+	bifrost "github.com/maximhq/bifrost/core"
+	"github.com/maximhq/bifrost/core/schemas"
+	"github.com/maximhq/bifrost/framework/vectorstore"
+)
+
+// prepareDirectCacheLookup computes the storage ID used for a direct (exact-match) cache lookup.
+// It hashes the request and its parameters, records both hashes on ctx under requestHashKey and
+// requestParamsHashKey, and returns the ID produced by generateDirectCacheID.
+// ctx is left untouched when either hash cannot be computed.
+func (plugin *Plugin) prepareDirectCacheLookup(ctx *schemas.BifrostContext, req *schemas.BifrostRequest, cacheKey string) (string, error) {
+	requestHash, hashErr := plugin.generateRequestHash(req)
+	if hashErr != nil {
+		return "", fmt.Errorf("failed to generate request hash: %w", hashErr)
+	}
+	plugin.logger.Debug(PluginLoggerPrefix + " Generated Hash for Request: " + requestHash)
+
+	requestParamsHash, paramsErr := plugin.computeRequestParamsHash(req)
+	if paramsErr != nil {
+		return "", fmt.Errorf("failed to compute direct lookup params hash: %w", paramsErr)
+	}
+
+	// Both hashes are published together so later stages can read them back from ctx.
+	ctx.SetValue(requestHashKey, requestHash)
+	ctx.SetValue(requestParamsHashKey, requestParamsHash)
+
+	provider, model, _ := req.GetRequestFields()
+	return plugin.generateDirectCacheID(provider, model, cacheKey, requestHash, requestParamsHash), nil
+}
+
+// performLegacyDirectSearch looks for an exact-match cache entry by metadata filters
+// (request hash, cache key, params hash) rather than by storage ID.
+//
+// The request and params hashes are read back from ctx (requestHashKey / requestParamsHashKey),
+// so they must already have been stored there, e.g. by prepareDirectCacheLookup.
+// It returns (nil, nil) on a cache miss, including when the store reports ErrNotFound or
+// ErrQuerySyntax; any other store error is wrapped and returned.
+func (plugin *Plugin) performLegacyDirectSearch(ctx *schemas.BifrostContext, req *schemas.BifrostRequest, cacheKey string) (*schemas.LLMPluginShortCircuit, error) {
+	// The assertions ignore failure on purpose: a missing or non-string value yields "".
+	hash, _ := ctx.Value(requestHashKey).(string)
+	paramsHash, _ := ctx.Value(requestParamsHashKey).(string)
+
+	provider, model, _ := req.GetRequestFields()
+
+	filters := []vectorstore.Query{
+		{Field: "request_hash", Operator: vectorstore.QueryOperatorEqual, Value: hash},
+		{Field: "cache_key", Operator: vectorstore.QueryOperatorEqual, Value: cacheKey},
+		{Field: "params_hash", Operator: vectorstore.QueryOperatorEqual, Value: paramsHash},
+		{Field: "from_bifrost_semantic_cache_plugin", Operator: vectorstore.QueryOperatorEqual, Value: true},
+	}
+
+	// Provider and model only narrow the match when the corresponding config flag is explicitly true.
+	if plugin.config.CacheByProvider != nil && *plugin.config.CacheByProvider {
+		filters = append(filters, vectorstore.Query{Field: "provider", Operator: vectorstore.QueryOperatorEqual, Value: string(provider)})
+	}
+	if plugin.config.CacheByModel != nil && *plugin.config.CacheByModel {
+		filters = append(filters, vectorstore.Query{Field: "model", Operator: vectorstore.QueryOperatorEqual, Value: model})
+	}
+
+	plugin.logger.Debug(fmt.Sprintf("%s Searching for legacy direct hash match with %d filters", PluginLoggerPrefix, len(filters)))
+
+	// Copy SelectFields before editing so the shared package-level slice is never mutated,
+	// then drop whichever payload field does not apply to this request type.
+	selectFields := append([]string(nil), SelectFields...)
+	if bifrost.IsStreamRequestType(req.RequestType) {
+		selectFields = removeField(selectFields, "response")
+	} else {
+		selectFields = removeField(selectFields, "stream_chunks")
+	}
+
+	searchCtx := vectorstore.WithDisableScanFallback(ctx)
+	// A nil cursor and a limit of 1: only the first matching entry is needed.
+	var cursor *string
+	results, _, err := plugin.store.GetAll(searchCtx, plugin.config.VectorStoreNamespace, filters, selectFields, cursor, 1)
+	if err != nil {
+		// Not-found and query-syntax errors are treated as a plain cache miss.
+		if errors.Is(err, vectorstore.ErrNotFound) || errors.Is(err, vectorstore.ErrQuerySyntax) {
+			return nil, nil
+		}
+		return nil, fmt.Errorf("failed to search for legacy direct hash match: %w", err)
+	}
+
+	if len(results) == 0 {
+		plugin.logger.Debug(PluginLoggerPrefix + " No legacy direct hash match found")
+		return nil, nil
+	}
+
+	result := results[0]
+	plugin.logger.Debug(fmt.Sprintf("%s Found legacy direct hash match with ID: %s", PluginLoggerPrefix, result.ID))
+	// Direct hits are reported with threshold 1.0 and zero embedding input tokens.
+	return plugin.buildResponseFromResult(ctx, req, result, CacheTypeDirect, 1.0, 0)
+}
+
+// performDirectChunkLookup fetches the cache entry whose ID is the direct cache ID of the request.
+// The computed ID is stored on ctx under requestStorageIDKey before the fetch.
+// A not-found condition (sentinel error, "not found" text, or an HTTP 404 status text) is
+// reported as a cache miss via (nil, nil); every other fetch error is wrapped and returned.
+func (plugin *Plugin) performDirectChunkLookup(ctx *schemas.BifrostContext, req *schemas.BifrostRequest, cacheKey string) (*schemas.LLMPluginShortCircuit, error) {
+	storageID, err := plugin.prepareDirectCacheLookup(ctx, req, cacheKey)
+	if err != nil {
+		return nil, err
+	}
+	ctx.SetValue(requestStorageIDKey, storageID)
+
+	entry, fetchErr := plugin.store.GetChunk(ctx, plugin.config.VectorStoreNamespace, storageID)
+	if fetchErr == nil {
+		plugin.logger.Debug(fmt.Sprintf("%s Found direct chunk match with ID: %s", PluginLoggerPrefix, entry.ID))
+		return plugin.buildResponseFromResult(ctx, req, entry, CacheTypeDirect, 1.0, 0)
+	}
+
+	// Some failures only signal "not found" through their message text, so the text is inspected too.
+	lowered := strings.ToLower(fetchErr.Error())
+	switch {
+	case errors.Is(fetchErr, vectorstore.ErrNotFound),
+		strings.Contains(lowered, "not found"),
+		strings.Contains(lowered, "status code: 404"):
+		plugin.logger.Debug(PluginLoggerPrefix + " No direct chunk match found")
+		return nil, nil
+	}
+	return nil, fmt.Errorf("failed to fetch direct cache chunk: %w", fetchErr)
+}
+
+// performDirectSearch looks for an exact-match cache entry: first through the direct ID lookup,
+// and, when that yields neither a hit nor an error, through the legacy metadata-filter search.
+func (plugin *Plugin) performDirectSearch(ctx *schemas.BifrostContext, req *schemas.BifrostRequest, cacheKey string) (*schemas.LLMPluginShortCircuit, error) {
+	hit, lookupErr := plugin.performDirectChunkLookup(ctx, req, cacheKey)
+	switch {
+	case lookupErr != nil:
+		return nil, lookupErr
+	case hit != nil:
+		return hit, nil
+	}
+
+	return plugin.performLegacyDirectSearch(ctx, req, cacheKey)
+}
+
+// generateEmbeddingsForStorage computes the request embedding and parks it on ctx so a later
+// hook can persist it alongside the cache entry.
+// It is meant for direct-only cache mode with a vector store that still requires vectors.
+// In contrast to performSemanticSearch, no lookup is performed here: the embedding, its input
+// token count and the params hash are only written to ctx.
+func (plugin *Plugin) generateEmbeddingsForStorage(ctx *schemas.BifrostContext, req *schemas.BifrostRequest) error {
+	// Derive the text to embed together with the params hash of the request.
+	text, paramsHash, extractErr := plugin.extractTextForEmbedding(req)
+	if extractErr != nil {
+		return fmt.Errorf("failed to extract text for embedding: %w", extractErr)
+	}
+
+	vector, tokenCount, embedErr := plugin.generateEmbedding(ctx, text)
+	if embedErr != nil {
+		return fmt.Errorf("failed to generate embedding: %w", embedErr)
+	}
+
+	// Nothing is written to ctx unless both steps above succeeded.
+	ctx.SetValue(requestEmbeddingKey, vector)
+	ctx.SetValue(requestEmbeddingTokensKey, tokenCount)
+	ctx.SetValue(requestParamsHashKey, paramsHash)
+
+	return nil
+}
+
+// performSemanticSearch embeds the request text and queries the vector store for the nearest
+// cached entry that also matches the strict metadata filters.
+//
+// The embedding, its input token count and the params hash are stored on ctx before searching so
+// that a later hook can reuse them. The similarity threshold defaults to plugin.config.Threshold
+// and can be overridden per request with a float64 stored on ctx under CacheThresholdKey.
+//
+// It returns (nil, nil) when no entry satisfies the threshold and filters, and a wrapped error
+// when text extraction, embedding generation or the store query fails.
+func (plugin *Plugin) performSemanticSearch(ctx *schemas.BifrostContext, req *schemas.BifrostRequest, cacheKey string) (*schemas.LLMPluginShortCircuit, error) {
+	// Extract text and metadata for embedding
+	text, paramsHash, err := plugin.extractTextForEmbedding(req)
+	if err != nil {
+		return nil, fmt.Errorf("failed to extract text for embedding: %w", err)
+	}
+
+	// Generate embedding
+	embedding, inputTokens, err := plugin.generateEmbedding(ctx, text)
+	if err != nil {
+		return nil, fmt.Errorf("failed to generate embedding: %w", err)
+	}
+
+	// Store embedding and metadata in context for PostLLMHook
+	ctx.SetValue(requestEmbeddingKey, embedding)
+	ctx.SetValue(requestEmbeddingTokensKey, inputTokens)
+	ctx.SetValue(requestParamsHashKey, paramsHash)
+
+	// A per-request threshold wins over the configured one, but only when it is a float64.
+	cacheThreshold := plugin.config.Threshold
+	if thresholdValue := ctx.Value(CacheThresholdKey); thresholdValue != nil {
+		if threshold, ok := thresholdValue.(float64); ok {
+			cacheThreshold = threshold
+		} else {
+			plugin.logger.Warn(PluginLoggerPrefix + " Threshold is not a float64, using default threshold")
+		}
+	}
+
+	provider, model, _ := req.GetRequestFields()
+
+	// Build strict metadata filters as Query slices (provider, model, and all params)
+	strictFilters := []vectorstore.Query{
+		{Field: "cache_key", Operator: vectorstore.QueryOperatorEqual, Value: cacheKey},
+		{Field: "params_hash", Operator: vectorstore.QueryOperatorEqual, Value: paramsHash},
+		{Field: "from_bifrost_semantic_cache_plugin", Operator: vectorstore.QueryOperatorEqual, Value: true},
+	}
+
+	if plugin.config.CacheByProvider != nil && *plugin.config.CacheByProvider {
+		strictFilters = append(strictFilters, vectorstore.Query{Field: "provider", Operator: vectorstore.QueryOperatorEqual, Value: string(provider)})
+	}
+	if plugin.config.CacheByModel != nil && *plugin.config.CacheByModel {
+		strictFilters = append(strictFilters, vectorstore.Query{Field: "model", Operator: vectorstore.QueryOperatorEqual, Value: model})
+	}
+
+	plugin.logger.Debug(fmt.Sprintf("%s Performing semantic search with %d metadata filters", PluginLoggerPrefix, len(strictFilters)))
+
+	// Make a full copy so we don't mutate the original backing array
+	selectFields := append([]string(nil), SelectFields...)
+	if bifrost.IsStreamRequestType(req.RequestType) {
+		selectFields = removeField(selectFields, "response")
+	} else {
+		selectFields = removeField(selectFields, "stream_chunks")
+	}
+
+	// For semantic search, we want semantic similarity in content but exact parameter matching
+	results, err := plugin.store.GetNearest(ctx, plugin.config.VectorStoreNamespace, embedding, strictFilters, selectFields, cacheThreshold, 1)
+	if err != nil {
+		return nil, fmt.Errorf("failed to search semantic cache: %w", err)
+	}
+
+	if len(results) == 0 {
+		plugin.logger.Debug(PluginLoggerPrefix + " No semantic match found")
+		return nil, nil
+	}
+
+	// Found a semantically similar entry
+	result := results[0]
+	// Score is a pointer and buildResponseFromResult already treats it as optional, so it must
+	// not be dereferenced unguarded here: a nil score would otherwise panic inside a debug log.
+	if result.Score != nil {
+		plugin.logger.Debug(fmt.Sprintf("%s Found semantic match with ID: %s, Score: %f", PluginLoggerPrefix, result.ID, *result.Score))
+	} else {
+		plugin.logger.Debug(fmt.Sprintf("%s Found semantic match with ID: %s (no score reported)", PluginLoggerPrefix, result.ID))
+	}
+
+	// Build response from cached result
+	return plugin.buildResponseFromResult(ctx, req, result, CacheTypeSemantic, cacheThreshold, inputTokens)
+}
+
+// buildResponseFromResult turns a cached vectorstore.SearchResult into a LLMPluginShortCircuit.
+//
+// It first enforces the entry's TTL (the "expires_at" property): an expired entry is deleted in the
+// background and reported as a cache miss via (nil, nil). It then dispatches to the streaming or
+// single-response builder depending on which of "stream_chunks" / "response" holds usable data.
+// An error is returned when the result has no properties, or when both or neither payload is usable.
+//
+// threshold and inputTokens are passed through to the builders for cache debug reporting.
+func (plugin *Plugin) buildResponseFromResult(ctx *schemas.BifrostContext, req *schemas.BifrostRequest, result vectorstore.SearchResult, cacheType CacheType, threshold float64, inputTokens int) (*schemas.LLMPluginShortCircuit, error) {
+	// Extract response data from the result properties
+	properties := result.Properties
+	if properties == nil {
+		return nil, fmt.Errorf("no properties found in cached result")
+	}
+
+	// Check TTL - if entry has expired, delete it and return cache miss
+	if expiresAtRaw, exists := properties["expires_at"]; exists && expiresAtRaw != nil {
+		var expiresAt int64
+		var validType bool
+		// expires_at is accepted as a decimal string or as a numeric value; it is compared
+		// against time.Now().Unix() below, i.e. it is interpreted as Unix seconds.
+		switch v := expiresAtRaw.(type) {
+		case string:
+			var err error
+			expiresAt, err = strconv.ParseInt(v, 10, 64)
+			if err != nil {
+				validType = false
+			} else {
+				validType = true
+			}
+		case float64:
+			expiresAt = int64(v)
+			validType = true
+		case int64:
+			expiresAt = v
+			validType = true
+		case int:
+			expiresAt = int64(v)
+			validType = true
+		}
+		// An unparseable or unsupported expires_at leaves validType false, so the entry is
+		// treated as if it had no expiry at all.
+		if validType {
+			currentTime := time.Now().Unix()
+			if expiresAt < currentTime {
+				// Entry has expired, delete it asynchronously
+				// A fresh background context with its own 5s timeout is used, so the delete
+				// does not depend on the request context.
+				go func() {
+					deleteCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
+					defer cancel()
+					err := plugin.store.Delete(deleteCtx, plugin.config.VectorStoreNamespace, result.ID)
+					if err != nil {
+						plugin.logger.Warn("%s Failed to delete expired entry %s: %v", PluginLoggerPrefix, result.ID, err)
+					}
+				}()
+				// Return nil to indicate cache miss
+				return nil, nil
+			}
+		}
+	}
+
+	// Check if this is a streaming response - need to check for non-null values
+	streamResponses, hasStreamingResponse := properties["stream_chunks"]
+	singleResponse, hasSingleResponse := properties["response"]
+
+	// Consider fields present only if they're not null
+	hasValidSingleResponse := hasSingleResponse && singleResponse != nil
+	hasValidStreamingResponse := hasStreamingResponse && streamResponses != nil
+
+	// Parse stream_chunks
+	// Only used as a validity probe here: a parse failure or an empty chunk list means there is
+	// no usable streaming payload. The streaming builder parses the raw value again itself.
+	streamChunks, err := plugin.parseStreamChunks(streamResponses)
+	if err != nil || len(streamChunks) == 0 {
+		hasValidStreamingResponse = false
+	}
+
+	// Similarity stays 0.0 when the result carries no score.
+	similarity := 0.0
+	if result.Score != nil {
+		similarity = *result.Score
+	}
+
+	if hasValidStreamingResponse && !hasValidSingleResponse {
+		// Handle streaming response
+		return plugin.buildStreamingResponseFromResult(ctx, req, result, streamResponses, cacheType, threshold, similarity, inputTokens)
+	} else if hasValidSingleResponse && !hasValidStreamingResponse {
+		// Handle single response
+		return plugin.buildSingleResponseFromResult(ctx, req, result, singleResponse, cacheType, threshold, similarity, inputTokens)
+	} else {
+		return nil, fmt.Errorf("cached result has invalid response data: both or neither response/stream_chunks are present (response: %v, stream_chunks: %v)", singleResponse, streamResponses)
+	}
+}
+
+// buildSingleResponseFromResult rebuilds a non-streaming response from cached data.
+//
+// responseData must be the JSON-encoded schemas.BifrostResponse as a string; any other type, or
+// JSON that fails to decode, yields an error. The decoded response gets its CacheDebug extra field
+// filled in (hit type, cache ID, requested provider/model, plus embedding details for semantic
+// hits), and ctx is marked as a cache hit via isCacheHitKey / cacheHitTypeKey.
+func (plugin *Plugin) buildSingleResponseFromResult(ctx *schemas.BifrostContext, req *schemas.BifrostRequest, result vectorstore.SearchResult, responseData interface{}, cacheType CacheType, threshold float64, similarity float64, inputTokens int) (*schemas.LLMPluginShortCircuit, error) {
+	requestedProvider, requestedModel, _ := req.GetRequestFields()
+
+	responseStr, ok := responseData.(string)
+	if !ok {
+		return nil, fmt.Errorf("cached response is not a string")
+	}
+
+	// Unmarshal the cached response
+	var cachedResponse schemas.BifrostResponse
+	if err := json.Unmarshal([]byte(responseStr), &cachedResponse); err != nil {
+		return nil, fmt.Errorf("failed to unmarshal cached response: %w", err)
+	}
+
+	// NOTE(review): the writes below assume GetExtraFields returns a non-nil pointer into
+	// cachedResponse, so that they are visible on the returned response — confirm.
+	extraFields := cachedResponse.GetExtraFields()
+
+	if extraFields.CacheDebug == nil {
+		extraFields.CacheDebug = &schemas.BifrostCacheDebug{}
+	}
+	extraFields.CacheDebug.CacheHit = true
+	extraFields.CacheDebug.HitType = bifrost.Ptr(string(cacheType))
+	extraFields.CacheDebug.CacheID = bifrost.Ptr(result.ID)
+	extraFields.CacheDebug.RequestedProvider = bifrost.Ptr(string(requestedProvider))
+	extraFields.CacheDebug.RequestedModel = bifrost.Ptr(requestedModel)
+	if cacheType == CacheTypeSemantic {
+		// Semantic hits report the embedding provider/model and the match quality.
+		extraFields.CacheDebug.ProviderUsed = bifrost.Ptr(string(plugin.config.Provider))
+		extraFields.CacheDebug.ModelUsed = bifrost.Ptr(plugin.config.EmbeddingModel)
+		extraFields.CacheDebug.Threshold = &threshold
+		extraFields.CacheDebug.Similarity = &similarity
+		extraFields.CacheDebug.InputTokens = &inputTokens
+	} else {
+		// Explicitly cleared because CacheDebug may have been decoded from the cached JSON
+		// with these fields already populated.
+		extraFields.CacheDebug.ProviderUsed = nil
+		extraFields.CacheDebug.ModelUsed = nil
+		extraFields.CacheDebug.Threshold = nil
+		extraFields.CacheDebug.Similarity = nil
+		extraFields.CacheDebug.InputTokens = nil
+	}
+
+	ctx.SetValue(isCacheHitKey, true)
+	ctx.SetValue(cacheHitTypeKey, cacheType)
+
+	return &schemas.LLMPluginShortCircuit{
+		Response: &cachedResponse,
+	}, nil
+}
+
+// buildStreamingResponseFromResult replays a cached stream as a LLMPluginShortCircuit whose
+// Stream channel yields one BifrostStreamChunk per usable stored chunk.
+//
+// streamData is parsed with parseStreamChunks; a parse failure is returned as an error before any
+// goroutine is started. ctx is marked as a cache hit (isCacheHitKey / cacheHitTypeKey) up front.
+// Chunks that are not strings or that fail to decode are logged and skipped.
+//
+// The last chunk that is actually emitted carries the cache debug information, and
+// BifrostContextKeyStreamEndIndicator is set on ctx just before it is sent. This holds even when
+// trailing stored chunks are malformed. The replay goroutine stops early, closing the channel,
+// once ctx is done, so an abandoned consumer cannot leave it blocked on the unbuffered send.
+func (plugin *Plugin) buildStreamingResponseFromResult(ctx *schemas.BifrostContext, req *schemas.BifrostRequest, result vectorstore.SearchResult, streamData interface{}, cacheType CacheType, threshold float64, similarity float64, inputTokens int) (*schemas.LLMPluginShortCircuit, error) {
+	requestedProvider, requestedModel, _ := req.GetRequestFields()
+
+	// Parse stream_chunks
+	streamArray, err := plugin.parseStreamChunks(streamData)
+	if err != nil {
+		return nil, fmt.Errorf("failed to parse stream_chunks: %w", err)
+	}
+
+	// Mark the cache hit once, before the goroutine starts; the goroutine does not repeat it.
+	ctx.SetValue(isCacheHitKey, true)
+	ctx.SetValue(cacheHitTypeKey, cacheType)
+
+	// Create stream channel
+	streamChan := make(chan *schemas.BifrostStreamChunk)
+
+	go func() {
+		defer close(streamChan)
+
+		// send delivers one chunk and reports false when ctx was cancelled first.
+		send := func(resp *schemas.BifrostResponse) bool {
+			chunk := &schemas.BifrostStreamChunk{
+				BifrostTextCompletionResponse:        resp.TextCompletionResponse,
+				BifrostChatResponse:                  resp.ChatResponse,
+				BifrostResponsesStreamResponse:       resp.ResponsesStreamResponse,
+				BifrostSpeechStreamResponse:          resp.SpeechStreamResponse,
+				BifrostTranscriptionStreamResponse:   resp.TranscriptionStreamResponse,
+				BifrostImageGenerationStreamResponse: resp.ImageGenerationStreamResponse,
+			}
+			select {
+			case streamChan <- chunk:
+				return true
+			case <-ctx.Done():
+				return false
+			}
+		}
+
+		// Each decoded chunk is held back by one step, so that the final usable chunk is
+		// known when the loop ends and can be decorated before it is sent.
+		var pending *schemas.BifrostResponse
+		for i, chunkData := range streamArray {
+			chunkStr, ok := chunkData.(string)
+			if !ok {
+				plugin.logger.Warn("%s Stream chunk %d is not a string, skipping", PluginLoggerPrefix, i)
+				continue
+			}
+
+			// Unmarshal the chunk as BifrostResponse
+			cachedResponse := new(schemas.BifrostResponse)
+			if err := json.Unmarshal([]byte(chunkStr), cachedResponse); err != nil {
+				plugin.logger.Warn("%s Failed to unmarshal stream chunk %d, skipping: %v", PluginLoggerPrefix, i, err)
+				continue
+			}
+
+			if pending != nil && !send(pending) {
+				return
+			}
+			pending = cachedResponse
+		}
+
+		// No usable chunk at all: just close the channel.
+		if pending == nil {
+			return
+		}
+
+		// Add cache debug to only the last chunk and set stream end indicator
+		ctx.SetValue(schemas.BifrostContextKeyStreamEndIndicator, true)
+		cacheDebug := &schemas.BifrostCacheDebug{
+			CacheHit:          true,
+			HitType:           bifrost.Ptr(string(cacheType)),
+			CacheID:           bifrost.Ptr(result.ID),
+			RequestedProvider: bifrost.Ptr(string(requestedProvider)),
+			RequestedModel:    bifrost.Ptr(requestedModel),
+		}
+		// Embedding details only apply to semantic hits; for direct hits they stay nil.
+		if cacheType == CacheTypeSemantic {
+			cacheDebug.ProviderUsed = bifrost.Ptr(string(plugin.config.Provider))
+			cacheDebug.ModelUsed = bifrost.Ptr(plugin.config.EmbeddingModel)
+			cacheDebug.Threshold = &threshold
+			cacheDebug.Similarity = &similarity
+			cacheDebug.InputTokens = &inputTokens
+		}
+		pending.GetExtraFields().CacheDebug = cacheDebug
+
+		send(pending)
+	}()
+
+	return &schemas.LLMPluginShortCircuit{
+		Stream: streamChan,
+	}, nil
+}
+
+// parseStreamChunks normalises the stored stream_chunks value into []interface{}.
+// Accepted inputs are []interface{} (returned as is), []string, and a string holding a JSON
+// array of strings. A nil input or any other type is an error.
+func (plugin *Plugin) parseStreamChunks(streamData interface{}) ([]interface{}, error) {
+	if streamData == nil {
+		return nil, fmt.Errorf("stream data is nil")
+	}
+
+	var chunks []string
+	switch typed := streamData.(type) {
+	case []interface{}:
+		return typed, nil
+	case []string:
+		chunks = typed
+	case string:
+		// Parse JSON string from Redis
+		if err := json.Unmarshal([]byte(typed), &chunks); err != nil {
+			return nil, fmt.Errorf("failed to parse JSON string: %w", err)
+		}
+	default:
+		return nil, fmt.Errorf("unsupported stream data type: %T", streamData)
+	}
+
+	// Both string-based inputs share the same widening step to []interface{}.
+	converted := make([]interface{}, len(chunks))
+	for idx := range chunks {
+		converted[idx] = chunks[idx]
+	}
+	return converted, nil
+}
--- a/plugins/semanticcache/stream.go
+++ b/plugins/semanticcache/stream.go
@@ -0,0 +1,201 @@
+package semanticcache
+
+import (
+	"context"
+	"encoding/json"
+	"fmt"
+	"sort"
+	"sync"
+	"time"
+)
+
+// Streaming State Management Methods
+
+// createStreamAccumulator builds a fresh, empty StreamAccumulator for the given request.
+// The accumulator starts with no chunks and is not marked complete.
+func (plugin *Plugin) createStreamAccumulator(requestID string, storageID string, embedding []float32, metadata map[string]interface{}, ttl time.Duration) *StreamAccumulator {
+	// IsComplete and mu are left at their zero values (false / unlocked mutex).
+	acc := &StreamAccumulator{
+		RequestID: requestID,
+		StorageID: storageID,
+		Embedding: embedding,
+		Metadata:  metadata,
+		TTL:       ttl,
+	}
+	acc.Chunks = make([]*StreamChunk, 0)
+	return acc
+}
+
+// getOrCreateStreamAccumulator returns the accumulator registered for requestID, creating and
+// registering a new one from the remaining arguments when none exists yet.
+func (plugin *Plugin) getOrCreateStreamAccumulator(requestID string, storageID string, embedding []float32, metadata map[string]interface{}, ttl time.Duration) *StreamAccumulator {
+	entry, found := plugin.streamAccumulators.Load(requestID)
+	if !found {
+		// LoadOrStore settles the race between concurrent creators: whichever accumulator
+		// was stored first is the one every caller gets back.
+		fresh := plugin.createStreamAccumulator(requestID, storageID, embedding, metadata, ttl)
+		entry, _ = plugin.streamAccumulators.LoadOrStore(requestID, fresh)
+	}
+	return entry.(*StreamAccumulator)
+}
+
+// addStreamChunk appends chunk to the accumulator registered for requestID.
+// When isFinalChunk is true the chunk's timestamp is also recorded as the accumulator's
+// FinalTimestamp. An error is returned if no accumulator exists for the request.
+func (plugin *Plugin) addStreamChunk(requestID string, chunk *StreamChunk, isFinalChunk bool) error {
+	// The accumulator is expected to have been created before the first chunk arrives.
+	stored, ok := plugin.streamAccumulators.Load(requestID)
+	if !ok {
+		return fmt.Errorf("stream accumulator not found for request %s", requestID)
+	}
+
+	acc := stored.(*StreamAccumulator)
+	acc.mu.Lock()
+	defer acc.mu.Unlock()
+
+	// Chunks are appended in arrival order.
+	acc.Chunks = append(acc.Chunks, chunk)
+	if isFinalChunk {
+		// Covers both normal completion chunks and usage-only last chunks.
+		acc.FinalTimestamp = chunk.Timestamp
+	}
+
+	plugin.logger.Debug(fmt.Sprintf("%s Added chunk to stream accumulator for request %s", PluginLoggerPrefix, requestID))
+
+	return nil
+}
+
+// processAccumulatedStream processes all accumulated chunks and caches the complete stream
+// Flow: Collect everything → Check for ANY errors → If no errors, order and send to .Add() → If any errors, drop operation
+func (plugin *Plugin) processAccumulatedStream(ctx context.Context, requestID string) error {
+	accumulatorInterface, exists := plugin.streamAccumulators.Load(requestID)
+	if !exists {
+		return fmt.Errorf("stream accumulator not found for request %s", requestID)
+	}
+
+	accumulator := accumulatorInterface.(*StreamAccumulator)
+	accumulator.mu.Lock()
+
+	// Ensure unlock happens after cleanup
+	defer accumulator.mu.Unlock()
+	// Ensure cleanup happens
+	defer plugin.cleanupStreamAccumulator(requestID)
+
+	// STEP 1: Check if any chunk in the entire stream had an error
+	if accumulator.HasError {
+		plugin.logger.Debug(fmt.Sprintf("%s Stream for request %s had errors, dropping entire operation (not caching)", PluginLoggerPrefix, requestID))
+		return nil
+	}
+
+	// STEP 2: All chunks are clean, now sort and build ordered stream for caching
+	plugin.logger.Debug(fmt.Sprintf("%s Stream for request %s completed successfully, processing %d chunks for caching", PluginLoggerPrefix, requestID, len(accumulator.Chunks)))
+
+	// Sort chunks by their ChunkIndex to ensure proper order (stable + nil-safe)
+	sort.SliceStable(accumulator.Chunks, func(i, j int) bool {
+		if accumulator.Chunks[i].Response == nil || accumulator.Chunks[j].Response == nil {
+			// Push nils to the end deterministically
+			return accumulator.Chunks[j].Response != nil
+		}
+		if accumulator.Chunks[i].Response.TextCompletionResponse != nil {
+			return accumulator.Chunks[i].Response.TextCompletionResponse.ExtraFields.ChunkIndex < accumulator.Chunks[j].Response.TextCompletionResponse.ExtraFields.ChunkIndex
+		}
+		if accumulator.Chunks[i].Response.ChatResponse != nil {
+			return accumulator.Chunks[i].Response.ChatResponse.ExtraFields.ChunkIndex < accumulator.Chunks[j].Response.ChatResponse.ExtraFields.ChunkIndex
+		}
+		if accumulator.Chunks[i].Response.ResponsesResponse != nil {
+			return accumulator.Chunks[i].Response.ResponsesResponse.ExtraFields.ChunkIndex < accumulator.Chunks[j].Response.ResponsesResponse.ExtraFields.ChunkIndex
+		}
+		if accumulator.Chunks[i].Response.ResponsesStreamResponse != nil {
+			return accumulator.Chunks[i].Response.ResponsesStreamResponse.ExtraFields.ChunkIndex < accumulator.Chunks[j].Response.ResponsesStreamResponse.ExtraFields.ChunkIndex
+		}
+		if accumulator.Chunks[i].Response.SpeechResponse != nil {
+			return accumulator.Chunks[i].Response.SpeechResponse.ExtraFields.ChunkIndex < accumulator.Chunks[j].Response.SpeechResponse.ExtraFields.ChunkIndex
+		}
+		if accumulator.Chunks[i].Response.SpeechStreamResponse != nil {
+			return accumulator.Chunks[i].Response.SpeechStreamResponse.ExtraFields.ChunkIndex < accumulator.Chunks[j].Response.SpeechStreamResponse.ExtraFields.ChunkIndex
+		}
+		if accumulator.Chunks[i].Response.TranscriptionResponse != nil {
+			return accumulator.Chunks[i].Response.TranscriptionResponse.ExtraFields.ChunkIndex < accumulator.Chunks[j].Response.TranscriptionResponse.ExtraFields.ChunkIndex
+		}
+		if accumulator.Chunks[i].Response.TranscriptionStreamResponse != nil {
+			return accumulator.Chunks[i].Response.TranscriptionStreamResponse.ExtraFields.ChunkIndex < accumulator.Chunks[j].Response.TranscriptionStreamResponse.ExtraFields.ChunkIndex
+		}
+		if accumulator.Chunks[i].Response.ImageGenerationStreamResponse != nil {
+			// For image generation, sort by Index first, then ChunkIndex
+			if accumulator.Chunks[i].Response.ImageGenerationStreamResponse.Index != accumulator.Chunks[j].Response.ImageGenerationStreamResponse.Index {
+				return accumulator.Chunks[i].Response.ImageGenerationStreamResponse.Index < accumulator.Chunks[j].Response.ImageGenerationStreamResponse.Index
+			}
+			return accumulator.Chunks[i].Response.ImageGenerationStreamResponse.ChunkIndex < accumulator.Chunks[j].Response.ImageGenerationStreamResponse.ChunkIndex
+		}
+		return false
+	})
+
+	var streamResponses []string
+	for i, chunk := range accumulator.Chunks {
+		if chunk.Response != nil {
+			chunkData, err := json.Marshal(chunk.Response)
+			if err != nil {
+				plugin.logger.Warn("%s Failed to marshal stream chunk %d: %v", PluginLoggerPrefix, i, err)
+				continue
+			}
+			streamResponses = append(streamResponses, string(chunkData))
+		}
+	}
+
+	// STEP 3: Validate we have valid chunks to cache
+	if len(streamResponses) == 0 {
+		plugin.logger.Warn("%s Stream for request %s has no valid response chunks, skipping cache storage", PluginLoggerPrefix, requestID)
+		return nil
+	}
+
+	// STEP 4: Build final metadata and submit to .Add() method
+	finalMetadata := make(map[string]interface{})
+	for k, v := range accumulator.Metadata {
+		finalMetadata[k] = v
+	}
+	finalMetadata["stream_chunks"] = streamResponses
+
+	// Store complete unified entry using the final cache storage ID.
+	if err := plugin.store.Add(ctx, plugin.config.VectorStoreNamespace, accumulator.StorageID, accumulator.Embedding, finalMetadata); err != nil {
+		return fmt.Errorf("failed to store complete streaming cache entry: %w", err)
+	}
+
+	plugin.logger.Debug(fmt.Sprintf("%s Successfully cached complete stream with %d ordered chunks, ID: %s", PluginLoggerPrefix, len(streamResponses), accumulator.StorageID))
+	return nil
+}
+
+// cleanupStreamAccumulator removes the stream accumulator registered for requestID.
+// It only drops the entry from plugin.streamAccumulators; the accumulator's lock is not taken.
+func (plugin *Plugin) cleanupStreamAccumulator(requestID string) {
+	plugin.streamAccumulators.Delete(requestID)
+}
+
+// cleanupOldStreamAccumulators drops stream accumulators whose first chunk is more than
+// five minutes old. Accumulators that hold no chunks yet are never selected.
+func (plugin *Plugin) cleanupOldStreamAccumulators() {
+	cutoff := time.Now().Add(-5 * time.Minute)
+	stale := make([]string, 0)
+
+	plugin.streamAccumulators.Range(func(key, value interface{}) bool {
+		id := key.(string)
+		acc := value.(*StreamAccumulator)
+
+		// Age is measured from the first chunk's timestamp, read under the accumulator's lock.
+		acc.mu.Lock()
+		defer acc.mu.Unlock()
+		if len(acc.Chunks) == 0 {
+			return true
+		}
+		if acc.Chunks[0].Timestamp.Before(cutoff) {
+			stale = append(stale, id)
+			plugin.logger.Debug(fmt.Sprintf("%s Cleaned up old stream accumulator for request %s", PluginLoggerPrefix, id))
+		}
+		return true
+	})
+
+	// Entries are removed only after the Range pass has finished.
+	for _, id := range stale {
+		plugin.streamAccumulators.Delete(id)
+	}
+
+	if removed := len(stale); removed > 0 {
+		plugin.logger.Debug(fmt.Sprintf("%s Cleaned up %d old stream accumulators", PluginLoggerPrefix, removed))
+	}
+}
--- a/plugins/semanticcache/test_utils.go
+++ b/plugins/semanticcache/test_utils.go
@@ -0,0 +1,781 @@
+package semanticcache
+
+import (
+	"context"
+	"os"
+	"strconv"
+	"testing"
+	"time"
+
+	bifrost "github.com/maximhq/bifrost/core"
+	"github.com/maximhq/bifrost/core/schemas"
+	"github.com/maximhq/bifrost/framework/vectorstore"
+	mocker "github.com/maximhq/bifrost/plugins/mocker"
+)
+
+// getWeaviateConfigFromEnv assembles the Weaviate settings used by the tests.
+// WEAVIATE_SCHEME defaults to "http", the host falls back to "localhost:9000"
+// when the env.WEAVIATE_HOST variable yields an empty value, and
+// WEAVIATE_TIMEOUT is read as a whole number of seconds (default 30).
+func getWeaviateConfigFromEnv() vectorstore.WeaviateConfig {
+	cfg := vectorstore.WeaviateConfig{
+		Scheme:  "http",
+		Timeout: 30 * time.Second,
+	}
+	if scheme := os.Getenv("WEAVIATE_SCHEME"); scheme != "" {
+		cfg.Scheme = scheme
+	}
+
+	cfg.Host = schemas.NewEnvVar("env.WEAVIATE_HOST")
+	if cfg.Host.GetValue() == "" {
+		cfg.Host = schemas.NewEnvVar("localhost:9000")
+	}
+	cfg.APIKey = schemas.NewEnvVar("env.WEAVIATE_API_KEY")
+
+	// A value that is not an integer leaves the default timeout in place.
+	if raw := os.Getenv("WEAVIATE_TIMEOUT"); raw != "" {
+		if seconds, err := strconv.Atoi(raw); err == nil {
+			cfg.Timeout = time.Duration(seconds) * time.Second
+		}
+	}
+
+	return cfg
+}
+
+// getRedisConfigFromEnv assembles the Redis settings used by the tests. The
+// address falls back to "localhost:6379" when the env.REDIS_ADDR variable
+// yields an empty value. REDIS_TIMEOUT is parsed with time.ParseDuration and
+// defaults to 10s when it is unset or cannot be parsed.
+func getRedisConfigFromEnv() vectorstore.RedisConfig {
+	addr := schemas.NewEnvVar("env.REDIS_ADDR")
+	if addr.GetValue() == "" {
+		addr = schemas.NewEnvVar("localhost:6379")
+	}
+
+	cfg := vectorstore.RedisConfig{
+		Addr:           addr,
+		Username:       schemas.NewEnvVar("env.REDIS_USERNAME"),
+		Password:       schemas.NewEnvVar("env.REDIS_PASSWORD"),
+		DB:             schemas.NewEnvVar("env.REDIS_DB"),
+		ContextTimeout: 10 * time.Second,
+	}
+
+	// Only a non-empty, parsable REDIS_TIMEOUT replaces the 10s default.
+	if raw := os.Getenv("REDIS_TIMEOUT"); raw != "" {
+		if parsed, err := time.ParseDuration(raw); err == nil {
+			cfg.ContextTimeout = parsed
+		}
+	}
+
+	return cfg
+}
+
+// getQdrantConfigFromEnv retrieves Qdrant configuration from environment variables
+func getQdrantConfigFromEnv() vectorstore.QdrantConfig {
+	host := schemas.NewEnvVar("env.QDRANT_HOST")
+	if host.GetValue() == "" {
+		host = schemas.NewEnvVar("localhost")
+	}
+	port := schemas.NewEnvVar("env.QDRANT_PORT")
+	if port.GetValue() == "" {
+		port = schemas.NewEnvVar("6334")
+	}
+	apiKey := schemas.NewEnvVar("env.QDRANT_API_KEY")
+	useTLS := schemas.NewEnvVar("env.QDRANT_USE_TLS")
+	if useTLS.GetValue() == "" {
+		useTLS = schemas.NewEnvVar("false")
+	}
+
+	return vectorstore.QdrantConfig{
+		Host:   *host,
+		Port:   *port,
+		APIKey: *apiKey,
+		UseTLS: *useTLS,
+	}
+}
+
+// getPineconeConfigFromEnv retrieves Pinecone configuration from environment variables
+func getPineconeConfigFromEnv() vectorstore.PineconeConfig {
+	apiKey := schemas.NewEnvVar("env.PINECONE_API_KEY")
+	if apiKey.GetValue() == "" {
+		apiKey = schemas.NewEnvVar("pclocal") // Pinecone Local doesn't validate API keys
+	}
+	indexHost := schemas.NewEnvVar("env.PINECONE_INDEX_HOST")
+	if indexHost.GetValue() == "" {
+		indexHost = schemas.NewEnvVar("localhost:5081") // Pinecone Local default port
+	}
+
+	return vectorstore.PineconeConfig{
+		APIKey:    *apiKey,
+		IndexHost: *indexHost,
+	}
+}
+
+// BaseAccount implements the schemas.Account interface for testing purposes.
+type BaseAccount struct{}
+
+// GetConfiguredProviders reports OpenAI as the only configured provider.
+func (a *BaseAccount) GetConfiguredProviders() ([]schemas.ModelProvider, error) {
+	return []schemas.ModelProvider{schemas.OpenAI}, nil
+}
+
+// GetKeysForProvider returns a single key built from env.OPENAI_API_KEY for
+// every provider; ctx and providerKey are not consulted.
+func (a *BaseAccount) GetKeysForProvider(ctx context.Context, providerKey schemas.ModelProvider) ([]schemas.Key, error) {
+	openAIKey := schemas.Key{
+		Value:  *schemas.NewEnvVar("env.OPENAI_API_KEY"),
+		Models: schemas.WhiteList{"*"}, // "*" means allow all models
+		Weight: 1.0,
+	}
+	return []schemas.Key{openAIKey}, nil
+}
+
+// GetConfigForProvider returns the same fixed network and concurrency settings
+// for every provider; providerKey is not consulted.
+func (a *BaseAccount) GetConfigForProvider(providerKey schemas.ModelProvider) (*schemas.ProviderConfig, error) {
+	network := schemas.NetworkConfig{
+		DefaultRequestTimeoutInSeconds: 60,
+		MaxRetries:                     5,
+		RetryBackoffInitial:            100 * time.Millisecond,
+		RetryBackoffMax:                30 * time.Second,
+	}
+	limits := schemas.ConcurrencyAndBufferSize{Concurrency: 10, BufferSize: 10}
+	return &schemas.ProviderConfig{NetworkConfig: network, ConcurrencyAndBufferSize: limits}, nil
+}
+
+// getMockRules returns the mocker rules for the semantic cache tests: regex-matched canned replies plus a condition-less default
+func getMockRules() []mocker.MockRule {
+	return []mocker.MockRule{
+		// Core test prompts
+		{
+			Name:        "bifrost-definition",
+			Enabled:     true,
+			Conditions:  mocker.Conditions{MessageRegex: bifrost.Ptr("(?i)What is Bifrost.*")},
+			Probability: 1.0,
+			Responses: []mocker.Response{
+				{Type: mocker.ResponseTypeSuccess, Content: &mocker.SuccessResponse{Message: "Bifrost is a unified API for interacting with multiple AI providers."}},
+			},
+		},
+		{
+			Name:        "machine-learning-explanation",
+			Enabled:     true,
+			Conditions:  mocker.Conditions{MessageRegex: bifrost.Ptr("(?i)what is machine learning\\?|explain machine learning|machine learning concepts|can you explain machine learning|explain the basics of machine learning")},
+			Probability: 1.0,
+			Responses: []mocker.Response{
+				{Type: mocker.ResponseTypeSuccess, Content: &mocker.SuccessResponse{Message: "Machine learning is a field of AI that uses statistical techniques to give computer systems the ability to learn from data."}},
+			},
+		},
+		{
+			Name:        "ai-explanation",
+			Enabled:     true,
+			Conditions:  mocker.Conditions{MessageRegex: bifrost.Ptr("(?i)what is artificial intelligence\\?|can you explain what ai is\\?|define artificial intelligence")},
+			Probability: 1.0,
+			Responses: []mocker.Response{
+				{Type: mocker.ResponseTypeSuccess, Content: &mocker.SuccessResponse{Message: "Artificial intelligence is the simulation of human intelligence in machines."}},
+			},
+		},
+		{
+			Name:        "capital-of-france",
+			Enabled:     true,
+			Conditions:  mocker.Conditions{MessageRegex: bifrost.Ptr("What is the capital of France\\?")}, // unlike most patterns here, no (?i) flag
+			Probability: 1.0,
+			Responses: []mocker.Response{
+				{Type: mocker.ResponseTypeSuccess, Content: &mocker.SuccessResponse{Message: "The capital of France is Paris."}},
+			},
+		},
+		{
+			Name:        "newton-laws",
+			Enabled:     true,
+			Conditions:  mocker.Conditions{MessageRegex: bifrost.Ptr("(?i)describe.*newton.*three laws|describe.*three laws.*newton")},
+			Probability: 1.0,
+			Responses: []mocker.Response{
+				{Type: mocker.ResponseTypeSuccess, Content: &mocker.SuccessResponse{Message: "Newton's three laws of motion are: 1. An object at rest stays at rest and an object in motion stays in motion with the same speed and in the same direction unless acted upon by an unbalanced force. 2. The acceleration of an object as produced by a net force is directly proportional to the magnitude of the net force, in the same direction as the net force, and inversely proportional to the mass of the object. 3. For every action, there is an equal and opposite reaction."}},
+			},
+		},
+		// Weather-related prompts
+		{
+			Name:        "weather-question",
+			Enabled:     true,
+			Conditions:  mocker.Conditions{MessageRegex: bifrost.Ptr("(?i)what.*weather|weather.*like")},
+			Probability: 1.0,
+			Responses: []mocker.Response{
+				{Type: mocker.ResponseTypeSuccess, Content: &mocker.SuccessResponse{Message: "It's sunny today with a temperature of 72°F."}},
+			},
+		},
+		// Blockchain and deep learning
+		{
+			Name:        "blockchain-definition",
+			Enabled:     true,
+			Conditions:  mocker.Conditions{MessageRegex: bifrost.Ptr("(?i)define blockchain|blockchain technology")},
+			Probability: 1.0,
+			Responses: []mocker.Response{
+				{Type: mocker.ResponseTypeSuccess, Content: &mocker.SuccessResponse{Message: "Blockchain is a distributed ledger technology that maintains a continuously growing list of records."}},
+			},
+		},
+		{
+			Name:        "deep-learning",
+			Enabled:     true,
+			Conditions:  mocker.Conditions{MessageRegex: bifrost.Ptr("(?i)what is deep learning")},
+			Probability: 1.0,
+			Responses: []mocker.Response{
+				{Type: mocker.ResponseTypeSuccess, Content: &mocker.SuccessResponse{Message: "Deep learning is a subset of machine learning that uses neural networks with multiple layers."}},
+			},
+		},
+		// Quantum computing
+		{
+			Name:        "quantum-computing",
+			Enabled:     true,
+			Conditions:  mocker.Conditions{MessageRegex: bifrost.Ptr("(?i)quantum computing|explain quantum")},
+			Probability: 1.0,
+			Responses: []mocker.Response{
+				{Type: mocker.ResponseTypeSuccess, Content: &mocker.SuccessResponse{Message: "Quantum computing uses quantum mechanical phenomena to process information in ways that classical computers cannot."}},
+			},
+		},
+		// Conversation prompts
+		{
+			Name:        "hello-greeting",
+			Enabled:     true,
+			Conditions:  mocker.Conditions{MessageRegex: bifrost.Ptr("(?i)^hello$|^hi$|hello.*world")},
+			Probability: 1.0,
+			Responses: []mocker.Response{
+				{Type: mocker.ResponseTypeSuccess, Content: &mocker.SuccessResponse{Message: "Hello! How can I help you today?"}},
+			},
+		},
+		{
+			Name:        "how-are-you",
+			Enabled:     true,
+			Conditions:  mocker.Conditions{MessageRegex: bifrost.Ptr("(?i)how are you")},
+			Probability: 1.0,
+			Responses: []mocker.Response{
+				{Type: mocker.ResponseTypeSuccess, Content: &mocker.SuccessResponse{Message: "I'm doing well, thank you for asking!"}},
+			},
+		},
+		{
+			Name:        "meaning-of-life",
+			Enabled:     true,
+			Conditions:  mocker.Conditions{MessageRegex: bifrost.Ptr("(?i)meaning of life")},
+			Probability: 1.0,
+			Responses: []mocker.Response{
+				{Type: mocker.ResponseTypeSuccess, Content: &mocker.SuccessResponse{Message: "The meaning of life is a philosophical question that has been pondered for centuries. Some say it's 42!"}},
+			},
+		},
+		{
+			Name:        "short-story",
+			Enabled:     true,
+			Conditions:  mocker.Conditions{MessageRegex: bifrost.Ptr("(?i)tell me.*short story")},
+			Probability: 1.0,
+			Responses: []mocker.Response{
+				{Type: mocker.ResponseTypeSuccess, Content: &mocker.SuccessResponse{Message: "Once upon a time, there was a brave knight who saved the day."}},
+			},
+		},
+		// Test-specific prompts
+		{
+			Name:        "test-configuration",
+			Enabled:     true,
+			Conditions:  mocker.Conditions{MessageRegex: bifrost.Ptr("(?i)test configuration")},
+			Probability: 1.0,
+			Responses: []mocker.Response{
+				{Type: mocker.ResponseTypeSuccess, Content: &mocker.SuccessResponse{Message: "This is a test configuration response."}},
+			},
+		},
+		{
+			Name:        "test-messages",
+			Enabled:     true,
+			Conditions:  mocker.Conditions{MessageRegex: bifrost.Ptr("(?i)test.*message|test.*no-store|test.*cache|test.*error|ttl test|threshold test|provider.*test|edge case test")},
+			Probability: 1.0,
+			Responses: []mocker.Response{
+				{Type: mocker.ResponseTypeSuccess, Content: &mocker.SuccessResponse{Message: "This is a test response for various test scenarios."}},
+			},
+		},
+		{
+			Name:        "long-prompt",
+			Enabled:     true,
+			Conditions:  mocker.Conditions{MessageRegex: bifrost.Ptr("(?i)very long prompt")},
+			Probability: 1.0,
+			Responses: []mocker.Response{
+				{Type: mocker.ResponseTypeSuccess, Content: &mocker.SuccessResponse{Message: "This is a response to a very long prompt."}},
+			},
+		},
+		{
+			Name:        "parameter-tests",
+			Enabled:     true,
+			Conditions:  mocker.Conditions{MessageRegex: bifrost.Ptr("(?i)test.*parameters|performance test")},
+			Probability: 1.0,
+			Responses: []mocker.Response{
+				{Type: mocker.ResponseTypeSuccess, Content: &mocker.SuccessResponse{Message: "Parameter test response with various settings."}},
+			},
+		},
+		// Dynamic message patterns (for conversation tests)
+		{
+			Name:        "message-pattern",
+			Enabled:     true,
+			Conditions:  mocker.Conditions{MessageRegex: bifrost.Ptr("(?i)message \\d+")},
+			Probability: 1.0,
+			Responses: []mocker.Response{
+				{Type: mocker.ResponseTypeSuccess, Content: &mocker.SuccessResponse{Message: "Response to numbered message."}},
+			},
+		},
+		// Default catch-all rule: empty conditions and the only rule with an explicit priority
+		{
+			Name:        "default-mock",
+			Enabled:     true,
+			Priority:    -1, // Lower priority
+			Conditions:  mocker.Conditions{},
+			Probability: 1.0,
+			Responses: []mocker.Response{
+				{Type: mocker.ResponseTypeSuccess, Content: &mocker.SuccessResponse{Message: "This is a generic mocked response."}},
+			},
+		},
+	}
+}
+
+// getMockedBifrostClient builds a Bifrost client whose LLM plugins are the given
+// semantic cache plugin followed by a mocker plugin loaded with getMockRules.
+// A failure to initialize either the mocker or the client aborts the test.
+func getMockedBifrostClient(t *testing.T, ctx *schemas.BifrostContext, logger schemas.Logger, semanticCachePlugin schemas.LLMPlugin) *bifrost.Bifrost {
+	mockPlugin, err := mocker.Init(mocker.MockerConfig{
+		Enabled: true,
+		Rules:   getMockRules(),
+	})
+	if err != nil {
+		t.Fatalf("Failed to initialize mocker plugin: %v", err)
+	}
+
+	bifrostCfg := schemas.BifrostConfig{
+		Account:    &BaseAccount{},
+		LLMPlugins: []schemas.LLMPlugin{semanticCachePlugin, mockPlugin},
+		Logger:     logger,
+	}
+	client, err := bifrost.Init(ctx, bifrostCfg)
+	if err != nil {
+		t.Fatalf("Error initializing Bifrost with mocker: %v", err)
+	}
+
+	return client
+}
+
+// TestSetup bundles the components that NewTestSetupWithVectorStore creates for a test
+type TestSetup struct {
+	Logger schemas.Logger          // logger passed to the store, plugin and client
+	Store  vectorstore.VectorStore // vector store handed to the plugin
+	Plugin schemas.LLMPlugin       // plugin returned by Init
+	Client *bifrost.Bifrost        // mocked Bifrost client; shut down by Cleanup
+	Config *Config                 // plugin configuration the setup was built from
+}
+
+// NewTestSetup creates a test setup with the default OpenAI embedding configuration
+func NewTestSetup(t *testing.T) *TestSetup {
+	openAIKey := schemas.Key{
+		Value:  *schemas.NewEnvVar("env.OPENAI_API_KEY"),
+		Models: schemas.WhiteList{"*"},
+		Weight: 1.0,
+	}
+	defaults := &Config{
+		Provider:          schemas.OpenAI,
+		EmbeddingModel:    "text-embedding-3-small",
+		Dimension:         1536,
+		Threshold:         0.8,
+		CleanUpOnShutdown: true,
+		Keys:              []schemas.Key{openAIKey},
+	}
+	return NewTestSetupWithConfig(t, defaults)
+}
+
+// NewTestSetupWithConfig creates a test setup from config, always using the Weaviate vector store type
+func NewTestSetupWithConfig(t *testing.T, config *Config) *TestSetup {
+	return NewTestSetupWithVectorStore(t, config, vectorstore.VectorStoreTypeWeaviate)
+}
+
+// NewTestSetupWithVectorStore creates a test setup for config on the given
+// vector store type. The test is skipped when the store cannot be created and
+// fails on an unsupported store type or a plugin/client initialization error.
+func NewTestSetupWithVectorStore(t *testing.T, config *Config, storeType vectorstore.VectorStoreType) *TestSetup {
+	ctx := schemas.NewBifrostContext(context.Background(), schemas.NoDeadline)
+	logger := bifrost.NewDefaultLogger(schemas.LogLevelDebug)
+
+	var storeConfig interface{}
+	switch storeType {
+	case vectorstore.VectorStoreTypeWeaviate:
+		storeConfig = getWeaviateConfigFromEnv()
+	case vectorstore.VectorStoreTypeRedis:
+		storeConfig = getRedisConfigFromEnv()
+	case vectorstore.VectorStoreTypeQdrant:
+		storeConfig = getQdrantConfigFromEnv()
+	case vectorstore.VectorStoreTypePinecone:
+		storeConfig = getPineconeConfigFromEnv()
+	default:
+		t.Fatalf("Unsupported vector store type: %s", storeType)
+	}
+
+	store, err := vectorstore.NewVectorStore(context.Background(), &vectorstore.Config{
+		Type:    storeType,
+		Config:  storeConfig,
+		Enabled: true,
+	}, logger)
+	if err != nil {
+		t.Skipf("Vector store %s not available or failed to connect: %v", storeType, err)
+	}
+
+	plugin, err := Init(schemas.NewBifrostContext(context.Background(), schemas.NoDeadline), config, logger, store)
+	if err != nil {
+		t.Fatalf("Failed to initialize plugin: %v", err)
+	}
+
+	// Checked assertion (as in WaitForCache): an unexpected type fails the test instead of panicking.
+	pluginImpl, ok := plugin.(*Plugin)
+	if !ok {
+		t.Fatalf("Failed to initialize plugin: unexpected type %T", plugin)
+	}
+	clearTestKeysWithStore(t, pluginImpl.store)
+	return &TestSetup{
+		Logger: logger,
+		Store:  store,
+		Plugin: plugin,
+		Client: getMockedBifrostClient(t, ctx, logger, plugin),
+		Config: config,
+	}
+}
+
+// Cleanup shuts down the Bifrost client if one is set; nothing else is released here
+func (ts *TestSetup) Cleanup() {
+	if ts.Client != nil {
+		ts.Client.Shutdown()
+	}
+}
+
+// clearTestKeysWithStore currently only logs a message; it deletes nothing and does not use store
+func clearTestKeysWithStore(t *testing.T, store vectorstore.VectorStore) {
+	// With the new unified VectorStore interface, cleanup is typically handled
+	// by the vector store implementation (e.g., dropping entire classes)
+	t.Logf("Test cleanup delegated to vector store implementation")
+}
+
+// CreateBasicChatRequest creates an OpenAI "gpt-4o-mini" chat completion request
+// holding a single user message with the given temperature and token limit.
+func CreateBasicChatRequest(content string, temperature float64, maxTokens int) *schemas.BifrostChatRequest {
+	userMessage := schemas.ChatMessage{
+		Role:    "user",
+		Content: &schemas.ChatMessageContent{ContentStr: &content},
+	}
+	params := &schemas.ChatParameters{
+		Temperature:         &temperature,
+		MaxCompletionTokens: &maxTokens,
+	}
+
+	return &schemas.BifrostChatRequest{
+		Provider: schemas.OpenAI,
+		Model:    "gpt-4o-mini",
+		Input:    []schemas.ChatMessage{userMessage},
+		Params:   params,
+	}
+}
+
+// CreateStreamingChatRequest creates the request used by streaming chat tests; it returns exactly what CreateBasicChatRequest returns
+func CreateStreamingChatRequest(content string, temperature float64, maxTokens int) *schemas.BifrostChatRequest {
+	return CreateBasicChatRequest(content, temperature, maxTokens)
+}
+
+// CreateSpeechRequest creates an OpenAI "tts-1" speech synthesis request for
+// input, using the given voice and the "mp3" response format.
+func CreateSpeechRequest(input string, voice string) *schemas.BifrostSpeechRequest {
+	req := &schemas.BifrostSpeechRequest{
+		Provider: schemas.OpenAI,
+		Model:    "tts-1",
+		Input:    &schemas.SpeechInput{Input: input},
+	}
+
+	req.Params = &schemas.SpeechParameters{
+		VoiceConfig:    &schemas.SpeechVoiceInput{Voice: &voice},
+		ResponseFormat: "mp3",
+	}
+
+	return req
+}
+
+// AssertCacheHit verifies that response carries cache debug metadata marking a
+// cache hit and, when expectedCacheType is non-empty, that a hit type is
+// recorded and equals it. Problems are reported with t.Error (non-fatal).
+func AssertCacheHit(t *testing.T, response *schemas.BifrostResponse, expectedCacheType string) {
+	t.Helper() // attribute failures to the calling test line
+	extraFields := response.GetExtraFields()
+	if extraFields.CacheDebug == nil {
+		t.Error("Cache metadata missing 'cache_debug'")
+		return
+	}
+	if !extraFields.CacheDebug.CacheHit {
+		t.Error("❌ Expected cache hit but response was not cached")
+		return
+	}
+	if expectedCacheType != "" {
+		switch hitType := extraFields.CacheDebug.HitType; {
+		case hitType == nil:
+			t.Errorf("Expected cache type '%s', got none", expectedCacheType)
+			return
+		case *hitType != expectedCacheType:
+			t.Errorf("Expected cache type '%s', got '%s'", expectedCacheType, *hitType)
+			return
+		}
+	}
+
+	t.Log("✅ Response correctly served from cache")
+}
+
+// AssertNoCacheHit verifies that response was NOT served from cache: it passes
+// when the cache debug metadata is absent or reports CacheHit=false, and
+// records a non-fatal error when CacheHit is true.
+func AssertNoCacheHit(t *testing.T, response *schemas.BifrostResponse) {
+	// Attribute failures and logs to the calling test line.
+	t.Helper()
+
+	cacheDebug := response.GetExtraFields().CacheDebug
+	switch {
+	case cacheDebug == nil:
+		t.Log("✅ Response correctly not served from cache (no 'cache_debug' flag)")
+	case cacheDebug.CacheHit:
+		t.Error("❌ Response was cached when it shouldn't be")
+	default:
+		t.Log("✅ Response correctly not served from cache (cache_debug present but CacheHit=false)")
+	}
+}
+
+// WaitForCache waits for async cache operations to complete
+func WaitForCache(plugin schemas.LLMPlugin) {
+	if p, ok := plugin.(*Plugin); ok {
+		p.WaitForPendingOperations()
+	}
+	// Small buffer for Weaviate index consistency
+	time.Sleep(500 * time.Millisecond)
+}
+
+// CreateEmbeddingRequest creates an OpenAI "text-embedding-3-small" embedding
+// request whose input is the given texts.
+func CreateEmbeddingRequest(texts []string) *schemas.BifrostEmbeddingRequest {
+	req := &schemas.BifrostEmbeddingRequest{
+		Provider: schemas.OpenAI,
+		Model:    "text-embedding-3-small",
+	}
+	req.Input = &schemas.EmbeddingInput{Texts: texts}
+	return req
+}
+
+// CreateBasicResponsesRequest creates an OpenAI "gpt-4o" Responses API request
+// holding a single user message with the given temperature and token limit.
+func CreateBasicResponsesRequest(content string, temperature float64, maxTokens int) *schemas.BifrostResponsesRequest {
+	role := schemas.ResponsesInputMessageRoleUser
+	userMessage := schemas.ResponsesMessage{
+		Role:    &role,
+		Content: &schemas.ResponsesMessageContent{ContentStr: &content},
+	}
+	params := &schemas.ResponsesParameters{
+		Temperature:     &temperature,
+		MaxOutputTokens: &maxTokens,
+	}
+
+	return &schemas.BifrostResponsesRequest{
+		Provider: schemas.OpenAI,
+		Model:    "gpt-4o",
+		Input:    []schemas.ResponsesMessage{userMessage},
+		Params:   params,
+	}
+}
+
+// CreateResponsesRequestWithTools creates a basic Responses API request and sets tools on its parameters
+func CreateResponsesRequestWithTools(content string, temperature float64, maxTokens int, tools []schemas.ResponsesTool) *schemas.BifrostResponsesRequest {
+	request := CreateBasicResponsesRequest(content, temperature, maxTokens)
+	request.Params.Tools = tools
+	return request
+}
+
+// CreateResponsesRequestWithInstructions creates a basic Responses API request and sets instructions on its parameters
+func CreateResponsesRequestWithInstructions(content string, instructions string, temperature float64, maxTokens int) *schemas.BifrostResponsesRequest {
+	request := CreateBasicResponsesRequest(content, temperature, maxTokens)
+	request.Params.Instructions = &instructions
+	return request
+}
+
+// CreateStreamingResponsesRequest creates the request used by streaming Responses tests; it returns exactly what CreateBasicResponsesRequest returns
+func CreateStreamingResponsesRequest(content string, temperature float64, maxTokens int) *schemas.BifrostResponsesRequest {
+	return CreateBasicResponsesRequest(content, temperature, maxTokens)
+}
+
+// CreateImageGenerationRequest creates an OpenAI "gpt-image-1" request for a
+// single image (N=1) from prompt with the given size and quality.
+func CreateImageGenerationRequest(prompt string, size string, quality string) *schemas.BifrostImageGenerationRequest {
+	params := &schemas.ImageGenerationParameters{
+		Size:    bifrost.Ptr(size),
+		Quality: bifrost.Ptr(quality),
+		N:       bifrost.Ptr(1),
+	}
+	return &schemas.BifrostImageGenerationRequest{
+		Provider: schemas.OpenAI,
+		Model:    "gpt-image-1",
+		Input:    &schemas.ImageGenerationInput{Prompt: prompt},
+		Params:   params,
+	}
+}
+
+// CreateContextWithCacheKey creates a context (schemas.NoDeadline) that stores value under CacheKey
+func CreateContextWithCacheKey(value string) *schemas.BifrostContext {
+	return schemas.NewBifrostContextWithValue(context.Background(), schemas.NoDeadline, CacheKey, value)
+}
+
+// CreateContextWithCacheKeyAndType creates a context that stores value under CacheKey and cacheType under CacheTypeKey
+func CreateContextWithCacheKeyAndType(value string, cacheType CacheType) *schemas.BifrostContext {
+	return schemas.NewBifrostContextWithValue(context.Background(), schemas.NoDeadline, CacheKey, value).WithValue(CacheTypeKey, cacheType)
+}
+
+// CreateContextWithCacheKeyAndTTL creates a context that stores value under CacheKey and ttl under CacheTTLKey
+func CreateContextWithCacheKeyAndTTL(value string, ttl time.Duration) *schemas.BifrostContext {
+	return schemas.NewBifrostContextWithValue(context.Background(), schemas.NoDeadline, CacheKey, value).WithValue(CacheTTLKey, ttl)
+}
+
+// CreateContextWithCacheKeyAndThreshold creates a context that stores value under CacheKey and threshold under CacheThresholdKey
+func CreateContextWithCacheKeyAndThreshold(value string, threshold float64) *schemas.BifrostContext {
+	return schemas.NewBifrostContext(context.Background(), schemas.NoDeadline).WithValue(CacheKey, value).WithValue(CacheThresholdKey, threshold)
+}
+
+// CreateContextWithCacheKeyAndNoStore creates a context that stores value under CacheKey and noStore under CacheNoStoreKey
+func CreateContextWithCacheKeyAndNoStore(value string, noStore bool) *schemas.BifrostContext {
+	return schemas.NewBifrostContext(context.Background(), schemas.NoDeadline).WithValue(CacheKey, value).WithValue(CacheNoStoreKey, noStore)
+}
+
+// CreateTestSetupWithConversationThreshold creates a test setup that uses the
+// default OpenAI embedding settings with ConversationHistoryThreshold set to
+// threshold.
+func CreateTestSetupWithConversationThreshold(t *testing.T, threshold int) *TestSetup {
+	openAIKey := schemas.Key{
+		Value:  *schemas.NewEnvVar("env.OPENAI_API_KEY"),
+		Models: []string{"*"},
+		Weight: 1.0,
+	}
+
+	return NewTestSetupWithConfig(t, &Config{
+		Provider:                     schemas.OpenAI,
+		EmbeddingModel:               "text-embedding-3-small",
+		Dimension:                    1536,
+		CleanUpOnShutdown:            true,
+		Threshold:                    0.8,
+		ConversationHistoryThreshold: threshold,
+		Keys:                         []schemas.Key{openAIKey},
+	})
+}
+
+// CreateTestSetupWithExcludeSystemPrompt creates a test setup that uses the
+// default OpenAI embedding settings with ExcludeSystemPrompt pointing at
+// excludeSystem.
+func CreateTestSetupWithExcludeSystemPrompt(t *testing.T, excludeSystem bool) *TestSetup {
+	openAIKey := schemas.Key{
+		Value:  *schemas.NewEnvVar("env.OPENAI_API_KEY"),
+		Models: []string{"*"},
+		Weight: 1.0,
+	}
+
+	return NewTestSetupWithConfig(t, &Config{
+		Provider:            schemas.OpenAI,
+		EmbeddingModel:      "text-embedding-3-small",
+		Dimension:           1536,
+		CleanUpOnShutdown:   true,
+		Threshold:           0.8,
+		ExcludeSystemPrompt: &excludeSystem,
+		Keys:                []schemas.Key{openAIKey},
+	})
+}
+
+// CreateTestSetupWithThresholdAndExcludeSystem creates a test setup that uses
+// the default OpenAI embedding settings with both the conversation history
+// threshold and the exclude-system-prompt setting applied.
+func CreateTestSetupWithThresholdAndExcludeSystem(t *testing.T, threshold int, excludeSystem bool) *TestSetup {
+	openAIKey := schemas.Key{
+		Value:  *schemas.NewEnvVar("env.OPENAI_API_KEY"),
+		Models: []string{"*"},
+		Weight: 1.0,
+	}
+
+	return NewTestSetupWithConfig(t, &Config{
+		Provider:                     schemas.OpenAI,
+		EmbeddingModel:               "text-embedding-3-small",
+		Dimension:                    1536,
+		CleanUpOnShutdown:            true,
+		Threshold:                    0.8,
+		ConversationHistoryThreshold: threshold,
+		ExcludeSystemPrompt:          &excludeSystem,
+		Keys:                         []schemas.Key{openAIKey},
+	})
+}
+
+// CreateConversationRequest creates an OpenAI "gpt-4o-mini" chat request whose
+// input is the given conversation history, with the given temperature and
+// completion token limit.
+func CreateConversationRequest(messages []schemas.ChatMessage, temperature float64, maxTokens int) *schemas.BifrostChatRequest {
+	req := &schemas.BifrostChatRequest{Provider: schemas.OpenAI, Model: "gpt-4o-mini", Input: messages}
+	req.Params = &schemas.ChatParameters{
+		Temperature:         &temperature,
+		MaxCompletionTokens: &maxTokens,
+	}
+
+	return req
+}
+
+// BuildConversationHistory creates a chat history from an optional system
+// prompt followed by user/assistant pairs. In each pair element 0 is the user
+// text and element 1 the assistant text; empty or missing elements are
+// skipped and elements beyond index 1 are ignored. The returned slice is
+// non-nil even when no message is produced.
+func BuildConversationHistory(systemPrompt string, userAssistantPairs ...[]string) []schemas.ChatMessage {
+	history := make([]schemas.ChatMessage, 0)
+
+	// The system prompt, when given, always comes first.
+	if systemPrompt != "" {
+		history = append(history, schemas.ChatMessage{
+			Role:    schemas.ChatMessageRoleSystem,
+			Content: &schemas.ChatMessageContent{ContentStr: &systemPrompt},
+		})
+	}
+
+	// Each pair contributes up to two messages, user first.
+	for _, turn := range userAssistantPairs {
+		// User side of the pair.
+		if len(turn) > 0 && turn[0] != "" {
+			userText := turn[0]
+			history = append(history, schemas.ChatMessage{
+				Role:    schemas.ChatMessageRoleUser,
+				Content: &schemas.ChatMessageContent{ContentStr: &userText},
+			})
+		}
+		// Assistant side of the pair.
+		if len(turn) > 1 && turn[1] != "" {
+			assistantText := turn[1]
+			history = append(history, schemas.ChatMessage{
+				Role:    schemas.ChatMessageRoleAssistant,
+				Content: &schemas.ChatMessageContent{ContentStr: &assistantText},
+			})
+		}
+	}
+
+	return history
+}
+
+// AddUserMessage returns messages with a user message containing userMessage
+// appended. The input is capacity-clipped before appending, so the result never
+// writes into spare backing-array slots shared with the caller's slice.
+func AddUserMessage(messages []schemas.ChatMessage, userMessage string) []schemas.ChatMessage {
+	newMessage := schemas.ChatMessage{
+		Role:    schemas.ChatMessageRoleUser,
+		Content: &schemas.ChatMessageContent{ContentStr: &userMessage},
+	}
+	return append(messages[:len(messages):len(messages)], newMessage)
+}
+
+// RetryConfig defines retry configuration for API requests
+type RetryConfig struct {
+	MaxRetries int           // 2 in DefaultRetryConfig
+	BaseDelay  time.Duration // 5ms in DefaultRetryConfig
+}
+
+// DefaultRetryConfig returns the default retry configuration: MaxRetries 2
+// and a BaseDelay of 5 milliseconds.
+func DefaultRetryConfig() RetryConfig {
+	cfg := RetryConfig{MaxRetries: 2}
+	cfg.BaseDelay = 5 * time.Millisecond
+	return cfg
+}
--- a/plugins/semanticcache/utils.go
+++ b/plugins/semanticcache/utils.go
--- a/plugins/semanticcache/version
+++ b/plugins/semanticcache/version
@@ -0,0 +1 @@
+1.5.4