 """PyTorch Mistral baseline model.
 https://github.com/huggingface/transformers/blob/v4.36-release/src/transformers/models/mistral/modeling_mistral.py
 Please write change log here:
-[YL] save attention weights
-[YL] for benchmarking
+[SnapKV] save attention weights
+[SnapKV] for benchmarking
 """

 import inspect
@@ -307,7 +307,7 @@ def forward(
         # upcast attention to fp32
         attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)

-        # [YL] get stats =====================
+        # [SnapKV] get stats =====================
         self.features_per_data = []
         threshold = self.threshold
         prev_len = self.prev_len
@@ -319,23 +319,8 @@ def forward(
         prev_attn_typical = attn_weights[0, :, start:end, :start] > threshold
         prev_attn_typical = prev_attn_typical.sum(1) > 0
         self.prev_attn_typical = prev_attn_typical
-
-        # for step in range(steps):
-        #     start = prev_len - window_size
-        #     end = prev_len
-        #     shift = window_size * step
-        #     prev_attn_sum = attn_weights[0, :, start:end, :start].sum(1)
-        #     cur_attn_sum = attn_weights[0, :, start + shift + window_size:end + shift + window_size, :start]
-        #     values, indices = torch.topk(prev_attn_sum, k=int(top_k * prev_len), dim=1)
-        #     mask = torch.zeros_like(prev_attn_sum, dtype=torch.bool, device=prev_attn_sum.device)
-        #     batch_indices = torch.arange(prev_attn_sum.size(0)).unsqueeze(1).expand_as(indices)
-        #     mask[batch_indices, indices] = 1
-        #     mask.unsqueeze_(1)
-        #     cur_attn_sum_threshold = cur_attn_sum > threshold
-        #     activation_overlap = cur_attn_sum_threshold & mask
-        #     hit_rate = activation_overlap.sum().float() / cur_attn_sum_threshold.sum().float()
-        #     self.features_per_data.append(hit_rate.item())
-        # [YL] end ==========================
+
+        # [SnapKV] end ==========================
 
         attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training)
         attn_output = torch.matmul(attn_weights, value_states)
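
For context, here is a minimal, self-contained sketch (not part of the commit) of the statistic the retained lines compute. The shapes and the values of threshold, prev_len, and window_size are illustrative assumptions, not the repo's configured ones, and the start/end definitions are inferred from the commented-out block deleted above (start = prev_len - window_size, end = prev_len):

import torch

# Illustrative setup (assumption): batch size 1, random softmax-normalized weights.
num_heads, seq_len = 32, 128
attn_weights = torch.softmax(torch.randn(1, num_heads, seq_len, seq_len), dim=-1)

threshold = 0.01       # assumed cutoff on per-position attention weight
prev_len = seq_len     # length of the already-processed prefix (assumption)
window_size = 16       # observation window = last window_size queries (assumption)
start, end = prev_len - window_size, prev_len

# For each head, flag prefix positions (columns before `start`) that receive
# more than `threshold` attention from at least one query in the window:
# (heads, window_size, start) -> any over the window dim -> (heads, start).
prev_attn_typical = attn_weights[0, :, start:end, :start] > threshold
prev_attn_typical = prev_attn_typical.sum(1) > 0
print(prev_attn_typical.shape)  # torch.Size([32, 112])

A position flagged here is one that the recent window of queries still attends to above the threshold, which is the per-head benchmarking signal the patch stores on the attention module.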