Table Batched Embedding Operators

std::tuple<at::Tensor, at::Tensor, std::optional<at::Tensor>> get_unique_indices_cuda(const at::Tensor &linear_indices, const int64_t max_indices, const bool compute_count)

Deduplicate the indices.
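
A hedged usage sketch: the prototype below is copied from the signature above, and the program is assumed to be linked against the fbgemm_gpu extension that defines it.

// Sketch only: deduplicate a small batch of linearized indices on the GPU.
#include <ATen/ATen.h>
#include <iostream>
#include <optional>
#include <tuple>

std::tuple<at::Tensor, at::Tensor, std::optional<at::Tensor>> get_unique_indices_cuda(
    const at::Tensor &linear_indices, const int64_t max_indices, const bool compute_count);

int main() {
  // Duplicated linear indices, already flattened into one index space.
  auto indices = at::tensor({3, 7, 3, 15, 7, 7},
                            at::dtype(at::kLong).device(at::kCUDA));
  // max_indices bounds the linearized index space; compute_count also requests
  // per-unique-index occurrence counts.
  auto [unique, unique_length, counts] = get_unique_indices_cuda(
      indices, /*max_indices=*/16, /*compute_count=*/true);
  // Assumption: only the first unique_length entries of `unique` are meaningful.
  std::cout << "unique indices: " << unique << "\n"
            << "unique count: " << unique_length << "\n";
  if (counts.has_value()) {
    std::cout << "occurrence counts: " << *counts << "\n";
  }
  return 0;
}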

std::tuple<at::Tensor, at::Tensor, std::optional<at::Tensor>, std::optional<at::Tensor>> get_unique_indices_with_inverse_cuda(const at::Tensor &linear_indices, const int64_t max_indices, const bool compute_count, const bool compute_inverse_indices)

Deduplicate the indices. Variant of get_unique_indices_cuda that can additionally return the inverse indices when compute_inverse_indices is true.

std::tuple<at::Tensor, at::Tensor, std::optional<at::Tensor>> lru_cache_find_uncached_cuda(at::Tensor unique_indices, at::Tensor unique_indices_length, int64_t max_indices, at::Tensor lxu_cache_state, int64_t time_stamp, at::Tensor lru_state, bool gather_cache_stats, at::Tensor uvm_cache_stats, bool lock_cache_line, at::Tensor lxu_cache_locking_counter, const bool compute_inverse_indices)

Look up the LRU cache to find uncached indices, then sort them by cache set.

int64_t host_lxu_cache_slot(int64_t h_in, int64_t C)

Map an index to a cache set. h_in: the linear index; C: the number of cache sets.
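
A minimal host-side sketch, assuming the program links against the fbgemm_gpu extension that defines this function; the cache-set count below is an arbitrary example value.

// Sketch only: map a few linear indices to their cache sets.
#include <cstdint>
#include <iostream>

int64_t host_lxu_cache_slot(int64_t h_in, int64_t C);

int main() {
  const int64_t C = 1024;  // number of cache sets (example value)
  for (int64_t h_in : {3, 7, 1000003}) {
    const int64_t cache_set = host_lxu_cache_slot(h_in, C);
    // The returned set index is expected to lie in [0, C).
    std::cout << "index " << h_in << " -> cache set " << cache_set << "\n";
  }
  return 0;
}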

at::Tensor linearize_cache_indices_cuda(const at::Tensor &cache_hash_size_cumsum, const at::Tensor &indices, const at::Tensor &offsets, const std::optional<at::Tensor> &B_offsets, const int64_t max_B, const int64_t indices_base_offset)

Linearize the indices of all tables to make them unique.
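
Conceptually, linearization offsets each table's local row index by the cumulative hash sizes of the preceding tables, so rows from different tables can no longer collide. The host-side sketch below restates that mapping for illustration; it is an assumption drawn from the description and parameter names, not the CUDA implementation.

// Illustration only: what "linearizing" per-table indices means.
#include <cstdint>
#include <iostream>
#include <utility>
#include <vector>

int main() {
  // Hash sizes (row counts) of three embedding tables.
  const std::vector<int64_t> hash_sizes = {10, 20, 30};

  // cache_hash_size_cumsum = {0, 10, 30, 60}: exclusive prefix sum.
  std::vector<int64_t> cache_hash_size_cumsum = {0};
  for (int64_t h : hash_sizes) {
    cache_hash_size_cumsum.push_back(cache_hash_size_cumsum.back() + h);
  }

  // The same row id in different tables maps to distinct linear indices.
  const std::vector<std::pair<int64_t, int64_t>> lookups = {{0, 4}, {1, 4}, {2, 4}};
  for (const auto &[table, row] : lookups) {
    const int64_t linear = cache_hash_size_cumsum[table] + row;
    std::cout << "table " << table << ", row " << row
              << " -> linear index " << linear << "\n";  // 4, 14, 34
  }
  return 0;
}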

at::Tensor linearize_cache_indices_from_row_idx_cuda(at::Tensor cache_hash_size_cumsum, at::Tensor update_table_indices, at::Tensor update_row_indices)

Linearize the indices of all tables to make them unique. Note that update_table_indices and update_row_indices are in the row-index format used for in-place updates.

void lru_cache_populate_cuda(at::Tensor weights, at::Tensor hash_size_cumsum, int64_t total_cache_hash_size, at::Tensor cache_index_table_map, at::Tensor weights_offsets, at::Tensor D_offsets, at::Tensor linear_cache_indices, at::Tensor lxu_cache_state, at::Tensor lxu_cache_weights, int64_t time_stamp, at::Tensor lru_state, bool stochastic_rounding, bool gather_cache_stats, std::optional<at::Tensor> uvm_cache_stats, bool lock_cache_line, std::optional<at::Tensor> lxu_cache_locking_counter)

LRU cache: fetch the rows corresponding to linear_cache_indices from weights and insert them into the cache at timestep time_stamp.
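
For orientation, a hedged sketch of the typical call order around this op: linearize the raw per-table indices, populate the cache for the current timestep, then look up the cache slot of every index. The helper name prefetch_with_lru_cache is hypothetical; the three prototypes are copied from the signatures in this document, and all tensors are assumed to be pre-allocated by the caller with the shapes the extension expects.

// Sketch only: linearize -> populate -> lookup for one batch at `time_stamp`.
#include <ATen/ATen.h>
#include <optional>

at::Tensor linearize_cache_indices_cuda(
    const at::Tensor &cache_hash_size_cumsum, const at::Tensor &indices,
    const at::Tensor &offsets, const std::optional<at::Tensor> &B_offsets,
    const int64_t max_B, const int64_t indices_base_offset);

void lru_cache_populate_cuda(
    at::Tensor weights, at::Tensor hash_size_cumsum, int64_t total_cache_hash_size,
    at::Tensor cache_index_table_map, at::Tensor weights_offsets, at::Tensor D_offsets,
    at::Tensor linear_cache_indices, at::Tensor lxu_cache_state,
    at::Tensor lxu_cache_weights, int64_t time_stamp, at::Tensor lru_state,
    bool stochastic_rounding, bool gather_cache_stats,
    std::optional<at::Tensor> uvm_cache_stats, bool lock_cache_line,
    std::optional<at::Tensor> lxu_cache_locking_counter);

at::Tensor lxu_cache_lookup_cuda(
    at::Tensor linear_cache_indices, at::Tensor lxu_cache_state, int64_t invalid_index,
    bool gather_cache_stats, std::optional<at::Tensor> uvm_cache_stats,
    std::optional<at::Tensor> num_uniq_cache_indices,
    std::optional<at::Tensor> lxu_cache_locations_output);

// Hypothetical helper: returns the lxu_cache_locations for this batch.
at::Tensor prefetch_with_lru_cache(
    const at::Tensor &cache_hash_size_cumsum, const at::Tensor &indices,
    const at::Tensor &offsets, at::Tensor weights, at::Tensor hash_size_cumsum,
    int64_t total_cache_hash_size, at::Tensor cache_index_table_map,
    at::Tensor weights_offsets, at::Tensor D_offsets, at::Tensor lxu_cache_state,
    at::Tensor lxu_cache_weights, at::Tensor lru_state, int64_t time_stamp) {
  // 1. Make the indices of all tables unique within one linear index space.
  //    (max_B = -1: assumed unused when B_offsets is not provided.)
  auto linear_cache_indices = linearize_cache_indices_cuda(
      cache_hash_size_cumsum, indices, offsets,
      /*B_offsets=*/std::nullopt, /*max_B=*/-1, /*indices_base_offset=*/0);
  // 2. Pull the needed rows from `weights` into the cache at this timestep.
  lru_cache_populate_cuda(
      weights, hash_size_cumsum, total_cache_hash_size, cache_index_table_map,
      weights_offsets, D_offsets, linear_cache_indices, lxu_cache_state,
      lxu_cache_weights, time_stamp, lru_state, /*stochastic_rounding=*/false,
      /*gather_cache_stats=*/false, std::nullopt,
      /*lock_cache_line=*/false, std::nullopt);
  // 3. Resolve every index to its cache slot; -1 is the chosen miss sentinel.
  return lxu_cache_lookup_cuda(
      linear_cache_indices, lxu_cache_state, /*invalid_index=*/-1,
      /*gather_cache_stats=*/false, std::nullopt, std::nullopt, std::nullopt);
}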

void lru_cache_populate_byte_cuda(at::Tensor weights, at::Tensor hash_size_cumsum, int64_t total_cache_hash_size, at::Tensor cache_index_table_map, at::Tensor weights_offsets, at::Tensor weights_tys, at::Tensor D_offsets, at::Tensor linear_cache_indices, at::Tensor lxu_cache_state, at::Tensor lxu_cache_weights, int64_t time_stamp, at::Tensor lru_state, int64_t row_alignment, bool gather_cache_stats, std::optional<at::Tensor> uvm_cache_stats)

LRU cache: fetch the rows corresponding to linear_cache_indices from weights and insert them into the cache at timestep time_stamp. weights and lxu_cache_weights have uint8_t byte elements.

void direct_mapped_lru_cache_populate_byte_cuda(at::Tensor weights, at::Tensor hash_size_cumsum, int64_t total_cache_hash_size, at::Tensor cache_index_table_map, at::Tensor weights_offsets, at::Tensor weights_tys, at::Tensor D_offsets, at::Tensor linear_cache_indices, at::Tensor lxu_cache_state, at::Tensor lxu_cache_weights, int64_t time_stamp, at::Tensor lru_state, at::Tensor lxu_cache_miss_timestamp, int64_t row_alignment, bool gather_cache_stats, std::optional<at::Tensor> uvm_cache_stats)

Direct-mapped (assoc=1) variant of lru_cache_populate_byte_cuda.

void lfu_cache_populate_cuda(at::Tensor weights, at::Tensor cache_hash_size_cumsum, int64_t total_cache_hash_size, at::Tensor cache_index_table_map, at::Tensor weights_offsets, at::Tensor D_offsets, at::Tensor linear_cache_indices, at::Tensor lxu_cache_state, at::Tensor lxu_cache_weights, at::Tensor lfu_state, bool stochastic_rounding)

LFU cache: fetch the rows corresponding to linear_cache_indices from weights and insert them into the cache.

void lfu_cache_populate_byte_cuda(at::Tensor weights, at::Tensor cache_hash_size_cumsum, int64_t total_cache_hash_size, at::Tensor cache_index_table_map, at::Tensor weights_offsets, at::Tensor weights_tys, at::Tensor D_offsets, at::Tensor linear_cache_indices, at::Tensor lxu_cache_state, at::Tensor lxu_cache_weights, at::Tensor lfu_state, int64_t row_alignment)

LFU cache: fetch the rows corresponding to linear_cache_indices from weights and insert them into the cache. weights and lxu_cache_weights have uint8_t byte elements.

at::Tensor lxu_cache_lookup_cuda(at::Tensor linear_cache_indices, at::Tensor lxu_cache_state, int64_t invalid_index, bool gather_cache_stats, std::optional<at::Tensor> uvm_cache_stats, std::optional<at::Tensor> num_uniq_cache_indices, std::optional<at::Tensor> lxu_cache_locations_output)

Look up the LRU/LFU cache: find the cache weight location for every index. Looks up the cache slots corresponding to linear_cache_indices, using a sentinel value (invalid_index) for misses.
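
A small consumption sketch, assuming invalid_index was passed as -1 so that -1 marks a miss in the returned locations.

// Sketch only: count cache misses in the locations returned by lxu_cache_lookup_cuda.
#include <ATen/ATen.h>
#include <iostream>

void report_cache_misses(const at::Tensor &lxu_cache_locations) {
  // Entries equal to the sentinel (-1 here) were not found in the cache.
  const int64_t misses = (lxu_cache_locations == -1).sum().item<int64_t>();
  std::cout << misses << " of " << lxu_cache_locations.numel()
            << " lookups missed the cache\n";
}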

at::Tensor direct_mapped_lxu_cache_lookup_cuda(at::Tensor linear_cache_indices, at::Tensor lxu_cache_state, int64_t invalid_index, bool gather_cache_stats, std::optional<at::Tensor> uvm_cache_stats)

Direct-mapped variant of lxu_cache_lookup_cuda: find the cache weight location for every index, using a sentinel value (invalid_index) for misses.

void lxu_cache_flush_cuda(at::Tensor uvm_weights, at::Tensor cache_hash_size_cumsum, at::Tensor cache_index_table_map, at::Tensor weights_offsets, at::Tensor D_offsets, int64_t total_D, at::Tensor lxu_cache_state, at::Tensor lxu_cache_weights, bool stochastic_rounding)

Flush the cache: store the weights from the cache back to the backing storage.

void reset_weight_momentum_cuda(at::Tensor dev_weights, at::Tensor uvm_weights, at::Tensor lxu_cache_weights, at::Tensor weights_placements, at::Tensor weights_offsets, at::Tensor momentum1_dev, at::Tensor momentum1_uvm, at::Tensor momentum1_placements, at::Tensor momentum1_offsets, at::Tensor D_offsets, at::Tensor pruned_indices, at::Tensor pruned_indices_offsets, at::Tensor logical_table_ids, at::Tensor buffer_ids, at::Tensor cache_hash_size_cumsum, at::Tensor lxu_cache_state, int64_t total_cache_hash_size)

void lxu_cache_locking_counter_decrement_cuda(at::Tensor lxu_cache_locking_counter, at::Tensor lxu_cache_locations)

Decrement the LRU/LFU cache locking counter based on lxu_cache_locations.

void lxu_cache_locations_update_cuda(at::Tensor lxu_cache_locations, at::Tensor lxu_cache_locations_new, std::optional<at::Tensor> num_uniq_cache_indices)

Update lxu_cache_locations in place with the new values; an element is updated only if lxu_cache_locations[i] == -1 and lxu_cache_locations_new[i] >= 0.
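
The rule restated element-wise, as a host-side illustration only (the real op performs this on CUDA tensors):

// Illustration only: fill in entries that were previously a miss (-1) and now
// have a valid location (>= 0); leave everything else untouched.
#include <cstdint>
#include <vector>

void update_locations(std::vector<int64_t> &lxu_cache_locations,
                      const std::vector<int64_t> &lxu_cache_locations_new) {
  for (size_t i = 0; i < lxu_cache_locations.size(); ++i) {
    if (lxu_cache_locations[i] == -1 && lxu_cache_locations_new[i] >= 0) {
      lxu_cache_locations[i] = lxu_cache_locations_new[i];
    }
  }
}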
