.dockerignore
.env.example
.gitignore
.pre-commit-config.yaml
.python-version
Dockerfile.sglang
LICENSE.md
README.md
make_submission.sh
pyproject.toml
uv.lock
.github/workflows/lint.yml
configs/deepseek_v2_lite_chat.json
configs/deepseek_v2_lite_chat_high.json
configs/deepseek_v2_lite_chat_high_sw.json
configs/deepseek_v2_lite_chat_no_extend.json
configs/delta_landmark_0716_fast.json
configs/gptoss.jinja
configs/llama33_70b_1m.json
configs/mixed_landmark_0707_fast.json
configs/mixed_landmark_0707_fast_debug.json
configs/mixed_landmark_0707_slow.json
configs/mixed_landmark_0708_slow_decode.json
configs/mixed_landmark_0717_fast.json
configs/mixed_landmark_0717_slow.json
configs/mixed_landmark_0722_extend_fast_sparse_decode.json
configs/mixed_landmark_0722_no_extend_fast.json
configs/mixed_landmark_0722_no_extend_fast_16k.json
configs/mixed_landmark_0722_no_extend_fast_3k.json
configs/mixed_landmark_0722_no_extend_fast_8k.json
configs/mixed_landmark_0722_no_extend_fast_8k_copy.json
configs/mixed_landmark_0722_no_extend_sparse_decode.json
configs/mixed_landmark_0801_extend_fast.json
configs/mixed_landmark_0804_glm45.json
configs/mixed_landmark_0804_glm45_dense_decode.json
configs/mixed_landmark_0806_gptoss.json
configs/mixed_landmark_0814_no_extend_qsa.json
configs/qwen3_1b_norm_const.json
configs/qwen3_30b_a3b.json
configs/qwen3_30b_a3b_0701.json
configs/qwen3_30b_a3b_config_1m.json
configs/qwen3_30b_a3b_fast.json
configs/qwen3_30b_a3b_qsa.json
configs/qwen3_64k.json
configs/qwen3_nothinking.jinja
configs/qwq_32b_1m.json
configs/streaming_llm.json
configs/streaming_llm_delta.json
configs/rebuttal/llama31_extend.json
configs/rebuttal/llama31_noextend.json
configs/rebuttal/llama31_noextend_dense_decode.json
configs/rebuttal/qwen3_extend.json
configs/rebuttal/qwen3_noextend.json
docs/REPRODUCE.md
docs/USAGE.sglang.md
docs/demo_infinite.gif
docs/demo_infinite.mp4
docs/demo_vllm.gif
docs/demo_vllm.mp4
hip-research/README.md
hip-research/pyproject.toml
hip-research/src/hip_research/__init__.py
hip-research/src/hip_research/dataset/__init__.py
hip-research/src/hip_research/dataset/alpaca.py
hip-research/src/hip_research/dataset/booksum.py
hip-research/src/hip_research/dataset/calib_loft_rag.py
hip-research/src/hip_research/dataset/calib_loft_retrieval.py
hip-research/src/hip_research/dataset/glue.py
hip-research/src/hip_research/dataset/labdataset.py
hip-research/src/hip_research/dataset/lmsys.py
hip-research/src/hip_research/dataset/openwebtext.py
hip-research/src/hip_research/dataset/passkey.py
hip-research/src/hip_research/dataset/wikitext.py
hip-research/src/hip_research/dataset/wikitext2.py
hip-research/src/hip_research/main/__init__.py
hip-research/src/hip_research/main/chat.py
hip-research/src/hip_research/main/eval_args.py
hip-research/src/hip_research/main/flux_eval.py
hip-research/src/hip_research/main/llava_eval.py
hip-research/src/hip_research/main/model_eval.py
hip-research/src/hip_research/main/web_demo.py
hip-research/src/hip_research/main/jobs/__init__.py
hip-research/src/hip_research/main/jobs/bench_single_layer.py
hip-research/src/hip_research/main/jobs/booksum.py
hip-research/src/hip_research/main/jobs/ga.py
hip-research/src/hip_research/main/jobs/greedy_replace.py
hip-research/src/hip_research/main/jobs/merge_lora.py
hip-research/src/hip_research/main/jobs/mmlu.py
hip-research/src/hip_research/main/jobs/mmmu.py
hip-research/src/hip_research/main/jobs/passkey.py
hip-research/src/hip_research/main/jobs/ppl.py
hip-research/src/hip_research/main/jobs/sample_diag.py
hip-research/src/hip_research/main/jobs/stream.py
hip-research/src/hip_research/main/jobs/stream_demo.py
hip-research/src/hip_research/main/scripts/batch_size_report.py
hip-research/src/hip_research/main/scripts/bench_chunked_prefill_offload.py
hip-research/src/hip_research/main/scripts/bench_sglang_latency.py
hip-research/src/hip_research/main/scripts/latency_sample_wikitext2.py
hip-research/src/hip_research/main/scripts/mmlu_report.py
hip-research/src/hip_research/main/scripts/openai_mrcr.py
hip-research/src/hip_research/main/scripts/plot_ablation_ld.py
hip-research/src/hip_research/main/scripts/plot_correlations.py
hip-research/src/hip_research/main/scripts/plot_infbench_gen3.py
hip-research/src/hip_research/main/scripts/plot_infllm_latency.py
hip-research/src/hip_research/main/scripts/plot_latency_breakdown.py
hip-research/src/hip_research/main/scripts/plot_masking_iteration.py
hip-research/src/hip_research/main/scripts/plot_occupancy.py
hip-research/src/hip_research/main/scripts/plot_offloading.py
hip-research/src/hip_research/main/scripts/plot_ppl.py
hip-research/src/hip_research/main/scripts/plot_sglang_decoding_gen3.py
hip-research/src/hip_research/main/scripts/plot_topk_recall.py
hip-research/src/hip_research/main/scripts/ppl_report.py
hip-research/src/hip_research/main/scripts/rader_report.py
hip-research/src/hip_research/main/scripts/rouge.py
hip-research/src/hip_research/main/scripts/seqlen_speed_report.py
hip-research/src/hip_research/main/scripts/speedup_report.py
hip-research/src/hip_research/main/scripts/throughput_benchmark.py
hip-research/src/hip_research/models/__init__.py
hip-research/src/hip_research/models/dynamic_sparse_flash_attention.py
hip-research/src/hip_research/models/landmark_attention.py
hip-research/src/hip_research/models/modeling_llama_legacy.py
hip-research/src/hip_research/models/modeling_llama_permute.py
hip-research/src/hip_research/models/sglang_model.py
hip-research/src/hip_research/models/sink_attention.py
hip-research/src/hip_research/models/skewed_attention.py
hip-research/src/hip_research/models/h2o/bench_latency.py
hip-research/src/hip_research/models/h2o/h2o_llama.py
hip-research/src/hip_research/models/h2o/scripts/exp_mmlu.sh
hip-research/src/hip_research/models/h2o/scripts/exp_ppl_pg19.sh
hip-research/src/hip_research/models/h2o/scripts/exp_ppl_wikitext.sh
hip-research/src/hip_research/models/hyper_attention/__init__.py
hip-research/src/hip_research/models/hyper_attention/angular_lsh.py
hip-research/src/hip_research/models/hyper_attention/flash_attn_triton_for_hyper.py
hip-research/src/hip_research/models/hyper_attention/hyper_attn.py
hip-research/src/hip_research/models/hyper_attention/utils.py
hip-research/src/hip_research/models/sea_attention/__init__.py
hip-research/src/hip_research/models/sea_attention/attention.py
hip-research/src/hip_research/models/tova/__init__.py
hip-research/src/hip_research/models/tova/convert_tova.py
hip-research/src/hip_research/models/tova/llama_custom.py
hip-research/src/hip_research/models/tova/mistral_custom.py
hip-research/src/hip_research/models/tova/tova_cache.py
hip-research/src/hip_research/trainer/common.py
hip-research/src/hip_research/trainer/permute_trainer.py
hip-research/src/hip_research/trainer/timber_trainer.py
hip-research/src/hip_research/trainer/timber_trainer_hf.py
hip-research/src/hip_research/utils/__init__.py
hip-research/src/hip_research/utils/bench.py
hip-research/src/hip_research/utils/checkpoint.py
hip-research/src/hip_research/utils/ddp.py
hip-research/src/hip_research/utils/example.secrets.py
hip-research/src/hip_research/utils/get_optimizer.py
hip-research/src/hip_research/utils/load_checkouts.py
hip-research/src/hip_research/utils/seed.py
hip-research/src/hip_research/utils/triton_argsort.py
hip-research/src/hip_research/utils/triton_sort.py
notebook/hist_runner_indices_unique.ipynb
notebook/inspect_sglang_decode.ipynb
notebook/mmmu_image.png
notebook/plot_latency_after_prefetch.ipynb
notebook/qwen_vl.ipynb
samples/128k.md
samples/15k.md
samples/16k.md
samples/1m.md
samples/256k.md
samples/2k.md
samples/2m.md
samples/32k.md
samples/4k.md
samples/512k.md
samples/64k.md
samples/8k.md
samples/booksum32k.md
samples/booksum32k_gemma.md
samples/booksum40k.md
samples/booksum9k.md
samples/code_qa.md
samples/code_qa_128K.md
samples/mmlu.md
samples/passkey10k.md
samples/passkey5k.md
scripts/bench_latency_sglang.py
scripts/bench_nsys_micro.sh
scripts/bench_ruler_models.sh
scripts/bench_stream.sh
scripts/bench_stream_1.sh
scripts/bench_stream_inner.sh
scripts/exp_mmlu.sh
scripts/exp_ppl_llama.sh
scripts/exp_ppl_pg19.sh
scripts/run_blocksize_albation.sh
scripts/run_dl_ablation.sh
scripts/simulate_offload_hit_ratio.sh
scripts/test_openai.py
scripts/test_openai_long.py
src/hip_attn/__init__.py
src/hip_attn.egg-info/PKG-INFO
src/hip_attn.egg-info/SOURCES.txt
src/hip_attn.egg-info/dependency_links.txt
src/hip_attn.egg-info/requires.txt
src/hip_attn.egg-info/top_level.txt
src/hip_attn/models/__init__.py
src/hip_attn/models/modeling_llama.py
src/hip_attn/models/gemma/__init__.py
src/hip_attn/models/gemma/modeling_gemma2.py
src/hip_attn/models/llava/__init__.py
src/hip_attn/models/llava/builder.py
src/hip_attn/models/llava/configuration_llava.py
src/hip_attn/models/llava/llava_llama.py
src/hip_attn/models/llava/modeling_llava.py
src/hip_attn/models/llava/processing_llava.py
src/hip_attn/models/qwen/__init__.py
src/hip_attn/models/qwen/modeling_qwen2.py
src/hip_attn/utils/__init__.py
src/hip_attn/utils/attention.py
src/hip_attn/utils/attention_norm.py
src/hip_attn/utils/attn_l1_loss.py
src/hip_attn/utils/benchmarking.py
src/hip_attn/utils/memory_efficient_llm_ce.py
src/hip_attn/utils/new_block_sparse.py
src/hip_attn/utils/rope.py
src/hip_attn/utils/rotate.py
src/hip_attn/utils/triton_argsort.py
src/hip_attn/v1_0/__init__.py
src/hip_attn/v1_0/attention1.py
src/hip_attn/v1_0/attention1_block_gpu.py
src/hip_attn/v1_0/attention1_gpu.py
src/hip_attn/v1_0/attention1_block_gpu_kernel/__init__.py
src/hip_attn/v1_0/attention1_block_gpu_kernel/calc_prob_return_context.py
src/hip_attn/v1_0/attention1_block_gpu_kernel/calc_score_return_prob.py
src/hip_attn/v1_0/attention1_block_gpu_kernel/masking_iteration.py
src/hip_attn/v1_0/attention1_block_gpu_kernel/paged_cache_vllm_compat.py
src/hip_attn/v1_0/attention1_block_gpu_kernel/safe_indices.py
src/hip_attn/v1_1/__init__.py
src/hip_attn/v1_1/attention2_draft.py
src/hip_attn/v1_1/attention2_draft_causal_batch.py
src/hip_attn/v1_1/attention2_draft_causal_batch_gpu.py
src/hip_attn/v1_1/attention2_draft_causal_batch_gpu_fused.py
src/hip_attn/v1_1/attention2_draft_causal_batch_gpu_fused_vec.py
src/hip_attn/v1_1/attention2_draft_prefetch.py
src/hip_attn/v1_1/attention2_draft_sampling.py
src/hip_attn/v1_1/attention2_draft_sampling_extend.py
src/hip_attn/v1_1/offload_runner/__init__.py
src/hip_attn/v1_1/offload_runner/cache_policy.py
src/hip_attn/v1_1/offload_runner/llama_model.py
src/hip_attn/v1_1/offload_runner/offload_runner.py
src/hip_attn/v1_1/offload_runner/tensor_from_pointer.cpp
src/hip_attn/v1_1/offload_runner/tensor_from_pointer.py
src/hip_attn/v1_2/__init__.py
src/hip_attn/v1_2/attention_decode_bsa.py
src/hip_attn/v1_2/attention_extend.py
src/hip_attn/v1_2/attention_extend_bsa.py
src/hip_attn/v1_2/attention_extend_bsa_tilelang.py
src/hip_attn/v1_2/attention_metadata.py
src/hip_attn/v1_2/compute_scores_landmark.py
src/hip_attn/v1_2/compute_v_cos.py
src/hip_attn/v1_2/eval_stage.py
src/hip_attn/v1_2/hip_config.py
src/hip_attn/v1_2/hip_memory_pool.py
src/hip_attn/v1_2/landmark_sample.py
src/hip_attn/v1_2/mask_refresh_interval.py
src/hip_attn/v1_2/model_offload_cache.py
src/hip_attn/v1_2/paged_hip.py
src/hip_attn/v1_2/query_sparse_attention.py
src/hip_attn/v1_2/scan_stage.py
src/hip_attn/v1_2/stage_prologue.py
src/hip_attn/v1_2/triton_argsort.py
src/hip_attn/v1_2/triton_jit.py
src/hip_attn/v1_2/utils.py
src/hip_attn/v1_2/uvm_gpu_cache.py
src/hip_attn/v1_2/delta/__init__.py
src/hip_attn/v1_2/delta/apply_delta.py
tests/test_openvino_bsa.py
tests/utils/bench_argsort_cupy_median.py
tests/utils/test_attention_norm.py
tests/utils/test_attn_l1_loss.py
tests/utils/test_memory_efficient_llm_ce.py
tests/v1_0/test_attention1.py
tests/v1_0/test_attention1_block_gpu.py
tests/v1_0/test_attention1_block_gpu_bwd.py
tests/v1_0/test_attention1_block_gpu_fwd.py
tests/v1_0/test_attention1_gpu.py
tests/v1_0/test_attention1_gpu_bwd.py
tests/v1_0/test_gen3_cpu.py
tests/v1_0/test_paged_attention.py
tests/v1_0/test_score_prob.py
tests/v1_1/bench_attention2_fused_vec_shuffle_head.py
tests/v1_1/bench_dual_stream_masking.py
tests/v1_1/test_attention2_draft.py
tests/v1_1/test_attention2_draft_causal_batch.py
tests/v1_1/test_attention2_draft_causal_batch_gpu.py
tests/v1_1/test_attention2_draft_causal_batch_gpu_fused.py
tests/v1_1/test_attention2_draft_causal_batch_gpu_fused_vec.py
tests/v1_1/test_attention2_draft_prefetch.py
tests/v1_1/test_attention2_draft_sampling.py
tests/v1_1/test_attention2_draft_sampling_extend.py
tests/v1_1/test_attention2_extend_exps.py
tests/v1_1/test_attention2_paged_decode.py
tests/v1_1/test_attention2_prefetch_bsa.py
tests/v1_1/offload_runner/test_cache_policy.py
tests/v1_1/offload_runner/test_offload_runner.py
tests/v1_1/offload_runner/test_tensor_from_pointer.py
tests/v1_2/test_attention_decode_bsa.py
tests/v1_2/test_attention_extend.py
tests/v1_2/test_chunked_sw.py
tests/v1_2/test_gen3_stage_caching.py
tests/v1_2/test_latency_e2e.py
tests/v1_2/test_latency_e2e.sh
tests/v1_2/test_openrouter.py
tests/v1_2/test_qsa_bsa_masking.py
tests/v1_2/test_qsa_estimate.py
tests/v1_2/test_query_sparse_attention.py
tests/v1_2/test_reasoning_effort.py
tests/v1_2/test_structured_gen.py
tests/v1_2/test_tool_calling.py