Update on "[Cutlass] Remove unused dtype conversion map" · pytorch/pytorch@735ab65 · GitHub

Commit 735ab65

Update on "[Cutlass] Remove unused dtype conversion map"
cc voznesenskym penguinwu EikanWang jgong5 Guobing-Chen XiaobingSuper zhuhaozhe blzheng wenzhe-nrv jiayisunx ipiszy chenyang78 kadeng muchulee8 amjames chauhang aakhundov [ghstack-poisoned]
2 parents b652a21 + 41797f6 commit 735ab65

File tree

110 files changed: +1313, -1002 lines


.ci/docker/common/install_inductor_benchmark_deps.sh

Lines changed: 0 additions & 7 deletions
@@ -14,13 +14,6 @@ function install_timm() {
   local commit
   commit=$(get_pinned_commit timm)
 
-  # TODO (huydhn): There is no torchvision release on 3.13 when I write this, so
-  # I'm using nightly here instead. We just need to package to be able to install
-  # TIMM. Removing this once vision has a release on 3.13
-  if [[ "${ANACONDA_PYTHON_VERSION}" == "3.13" ]]; then
-    pip_install --pre torch torchvision --index-url https://download.pytorch.org/whl/nightly/cu124
-  fi
-
   pip_install "git+https://github.com/huggingface/pytorch-image-models@${commit}"
   # Clean up
   conda_run pip uninstall -y cmake torch torchvision triton

.ci/magma/Makefile

Lines changed: 1 addition & 7 deletions
@@ -12,13 +12,12 @@ DOCKER_RUN = set -eou pipefail; ${DOCKER_CMD} run --rm -i \
 	-e PACKAGE_NAME=${PACKAGE_NAME}${DESIRED_CUDA_SHORT} \
 	-e DESIRED_CUDA=${DESIRED_CUDA} \
 	-e CUDA_ARCH_LIST="${CUDA_ARCH_LIST}" \
-	"pytorch/manylinux2_28-builder:cuda${DESIRED_CUDA}-main" \
+	"pytorch/almalinux-builder:cuda${DESIRED_CUDA}-main" \
 	magma/build_magma.sh
 
 .PHONY: all
 all: magma-cuda128
 all: magma-cuda126
-all: magma-cuda124
 all: magma-cuda118
 
 .PHONY:
@@ -37,11 +36,6 @@ magma-cuda126: DESIRED_CUDA := 12.6
 magma-cuda126:
 	$(DOCKER_RUN)
 
-.PHONY: magma-cuda124
-magma-cuda124: DESIRED_CUDA := 12.4
-magma-cuda124:
-	$(DOCKER_RUN)
-
 .PHONY: magma-cuda118
 magma-cuda118: DESIRED_CUDA := 11.8
 magma-cuda118: CUDA_ARCH_LIST += -gencode arch=compute_37,code=sm_37

.github/workflows/build-magma-linux.yml

Lines changed: 1 addition & 1 deletion
@@ -34,7 +34,7 @@ jobs:
       id-token: write
     strategy:
       matrix:
-        cuda_version: ["128", "126", "124", "118"]
+        cuda_version: ["128", "126", "118"]
     steps:
       - name: Checkout PyTorch
         uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

.github/workflows/llm_td_retrieval.yml

Lines changed: 5 additions & 9 deletions
@@ -50,18 +50,13 @@ jobs:
           ref: v0.0.2
           path: llm-target-determinator
 
-      - name: Setup miniconda
-        uses: pytorch/test-infra/.github/actions/setup-miniconda@main
-        with:
-          python-version: "3.9"
-
       - name: Install requirements
         shell: bash
         run: |
           set -euxo pipefail
-          ${CONDA_RUN} pip install -r llm-target-determinator/requirements.txt
+          python3 -m pip install -r llm-target-determinator/requirements.txt
           cd "${GITHUB_WORKSPACE}/codellama"
-          ${CONDA_RUN} pip install -e .
+          python3 -m pip install -e .
 
       - name: Fetch CodeLlama Checkpoint
         shell: bash
@@ -80,7 +75,7 @@ jobs:
           shell: bash
           command: |
             set -euxo pipefail
-            ${CONDA_RUN} python -m pip install awscli==1.29.40
+            python3 -m pip install awscli==1.29.40
             cd "${GITHUB_WORKSPACE}"/llm-target-determinator/assets
             aws s3 cp "s3://target-determinator-assets/indexes/latest" . --recursive
 
@@ -94,7 +89,8 @@ jobs:
         run: |
           set -euxo pipefail
           cd "${GITHUB_WORKSPACE}"/llm-target-determinator
-          ${CONDA_RUN} torchrun \
+          export PATH="$HOME/.local/bin:$PATH"
+          torchrun \
             --standalone \
             --nnodes=1 \
             --nproc-per-node=1 \

.github/workflows/upload-test-stats-while-running.yml

Lines changed: 2 additions & 7 deletions
@@ -24,17 +24,12 @@ jobs:
       - name: Setup Linux
        uses: ./.github/actions/setup-linux
 
-      - name: Setup miniconda
-        uses: pytorch/test-infra/.github/actions/setup-miniconda@main
-        with:
-          python-version: "3.10"
-
       - name: Install requirements
         run: |
-          ${CONDA_RUN} pip install requests==2.32.2 boto3==1.35.42
+          python3 -m pip install requests==2.32.2 boto3==1.35.42
 
       - name: Upload test stats
         env:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
         run: |
-          ${CONDA_RUN} python -m tools.stats.upload_test_stats_running_jobs
+          python3 -m tools.stats.upload_test_stats_running_jobs

aten/src/ATen/core/List_test.cpp

Lines changed: 1 addition & 0 deletions
@@ -2,6 +2,7 @@
 #include <gtest/gtest.h>
 
 using namespace c10;
+using std::string;
 
 // NOLINTBEGIN(performance-move-const-arg, bugprone-use-after-move, *analyzer*Move)
 TEST(ListTestIValueBasedList, givenEmptyList_whenCallingEmpty_thenReturnsTrue) {

aten/src/ATen/core/boxing/impl/kernel_function_legacy_test.cpp

Lines changed: 2 additions & 2 deletions
@@ -519,7 +519,7 @@ TEST(OperatorRegistrationTestLegacyFunctionBasedKernel, givenKernelWithDictInput
   EXPECT_EQ(2, captured_dict_size);
 }
 
-string kernelWithDictInputWithOutput(Dict<string, string> input1) {
+std::string kernelWithDictInputWithOutput(Dict<string, string> input1) {
   return input1.at("key2");
 }
 
@@ -581,7 +581,7 @@ TEST(OperatorRegistrationTestLegacyFunctionBasedKernel, givenKernelWithUnordered
   EXPECT_EQ(2, captured_dict_size);
 }
 
-string kernelWithUnorderedMapInputWithOutput(std::unordered_map<string, string> input1) {
+std::string kernelWithUnorderedMapInputWithOutput(std::unordered_map<string, string> input1) {
   return input1.at("key2");
 }

aten/src/ATen/core/boxing/impl/kernel_function_test.cpp

Lines changed: 1 addition & 1 deletion
@@ -468,7 +468,7 @@ TEST(OperatorRegistrationTestFunctionBasedKernel, givenKernelWithDictInput_witho
   EXPECT_EQ(2, captured_dict_size);
 }
 
-string kernelWithDictInputWithOutput(Dict<string, string> input1) {
+std::string kernelWithDictInputWithOutput(Dict<string, string> input1) {
   return input1.at("key2");
 }

aten/src/ATen/core/boxing/impl/make_boxed_from_unboxed_functor_test.cpp

Lines changed: 6 additions & 6 deletions
@@ -463,7 +463,7 @@ TEST(OperatorRegistrationTestFunctorBasedKernel, givenKernelWithDictInput_withou
 }
 
 struct KernelWithDictInputWithOutput final : OperatorKernel {
-  string operator()(Dict<string, string> input1) {
+  std::string operator()(Dict<string, std::string> input1) {
     return input1.at("key2");
   }
 };
@@ -475,7 +475,7 @@ TEST(OperatorRegistrationTestFunctorBasedKernel, givenKernelWithDictInput_withOu
   auto op = c10::Dispatcher::singleton().findSchema({"_test::dict_input", ""});
   ASSERT_TRUE(op.has_value());
 
-  Dict<string, string> dict;
+  Dict<string, std::string> dict;
   dict.insert("key1", "value1");
   dict.insert("key2", "value2");
   auto outputs = callOp(*op, dict);
@@ -484,7 +484,7 @@ TEST(OperatorRegistrationTestFunctorBasedKernel, givenKernelWithDictInput_withOu
 }
 
 struct KernelWithDictOutput final : OperatorKernel {
-  Dict<string, string> operator()(Dict<string, string> input) {
+  Dict<string, std::string> operator()(Dict<string, std::string> input) {
     return input;
   }
 };
@@ -496,12 +496,12 @@ TEST(OperatorRegistrationTestFunctorBasedKernel, givenKernelWithDictOutput_whenR
   auto op = c10::Dispatcher::singleton().findSchema({"_test::dict_output", ""});
   ASSERT_TRUE(op.has_value());
 
-  Dict<string, string> dict;
+  Dict<string, std::string> dict;
   dict.insert("key1", "value1");
   dict.insert("key2", "value2");
   auto outputs = callOp(*op, dict);
   EXPECT_EQ(1, outputs.size());
-  auto output = c10::impl::toTypedDict<string, string>(outputs[0].toGenericDict());
+  auto output = c10::impl::toTypedDict<string, std::string>(outputs[0].toGenericDict());
 
   EXPECT_EQ(2, output.size());
   EXPECT_EQ("value1", output.at("key1"));
@@ -520,7 +520,7 @@ class KernelWithCache final : public OperatorKernel {
 };
 
 struct KernelWithTupleInput final : OperatorKernel {
-  string operator()(std::tuple<string, int64_t, double> input1) {
+  std::string operator()(std::tuple<string, int64_t, double> input1) {
     return std::get<0>(input1);
   }
 };

aten/src/ATen/native/MathBitsFallback.h

Lines changed: 2 additions & 2 deletions
@@ -22,7 +22,7 @@ namespace at::native {
 
 // NOTE: To use this fallback, `clone` and `copy_` should fully understand and be able to correctly handle the semantic of your math bit.
 struct MathOpFallback {
-  MathOpFallback(DispatchKey key_, string op_name_) : key(key_), op_name(std::move(op_name_)) {}
+  MathOpFallback(DispatchKey key_, std::string op_name_) : key(key_), op_name(std::move(op_name_)) {}
   virtual bool is_bit_set(const Tensor&) = 0;
   void fallback_impl(const c10::OperatorHandle& op, DispatchKeySet dispatch_keys, torch::jit::Stack* stack) {
     /*
@@ -151,7 +151,7 @@ struct MathOpFallback {
   virtual ~MathOpFallback() = default;
 
   DispatchKey key;
-  string op_name;
+  std::string op_name;
 };
 
 } // namespace at::native

aten/src/ATen/native/cuda/Indexing.cu

Lines changed: 1 addition & 1 deletion
@@ -1735,7 +1735,7 @@ Tensor& index_select_out_cuda(
     int64_t dim,
     const Tensor& index,
     Tensor& out) {
-  static constexpr string_view DIM_WARNING =
+  static constexpr std::string_view DIM_WARNING =
       "Tensor too large or too many (> 25) dimensions";
   TORCH_CHECK(
       at::cuda::check_device({out, self, index}),

aten/src/ATen/native/hip/ck_gemm_bfloat16.hip

Lines changed: 1 addition & 1 deletion
@@ -772,7 +772,7 @@ void dispatch_bfloat16_gemm_wmma(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) {
 template <>
 void gemm_internal_ck<at::BFloat16>(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) {
   auto dprops = at::cuda::getCurrentDeviceProperties();
-  c10::string_view arch(dprops->gcnArchName);
+  std::string_view arch(dprops->gcnArchName);
   if (arch == "gfx1100") {
     dispatch_bfloat16_gemm_wmma(CUDABLAS_GEMM_ARGS(at::BFloat16));
   } else{

aten/src/ATen/native/mkldnn/xpu/detail/QMatmul.cpp

Lines changed: 3 additions & 3 deletions
@@ -101,11 +101,11 @@ void quantized_matmul(
     std::optional<at::Tensor> other, // extra input for binary-post-op
     double other_scale,
     int64_t other_zero_point,
-    const c10::string_view& binary_post_op,
+    const std::string_view& binary_post_op,
     double binary_alpha,
-    const c10::string_view& unary_post_op,
+    const std::string_view& unary_post_op,
     torch::List<std::optional<at::Scalar>>& unary_post_op_args,
-    c10::string_view unary_post_op_algorithm,
+    std::string_view unary_post_op_algorithm,
     bool m2_trans) {
   // [Note] Quantized Matrix Multiplication at XPU
   // The following code integrates oneDNN quantized gemm. The quantization

aten/src/ATen/native/mkldnn/xpu/detail/oneDNN.h

Lines changed: 3 additions & 3 deletions
@@ -156,11 +156,11 @@ void quantized_matmul(
     std::optional<at::Tensor> other, // extra input for binary-post-op
     double other_scale,
     int64_t other_zero_point,
-    const c10::string_view& binary_post_op,
+    const std::string_view& binary_post_op,
     double binary_alpha,
-    const c10::string_view& unary_post_op,
+    const std::string_view& unary_post_op,
     torch::List<std::optional<at::Scalar>>& unary_post_op_args,
-    c10::string_view unary_post_op_algorithm,
+    std::string_view unary_post_op_algorithm,
     bool m2_trnas);
 
 void gpu_float_sdpa(

aten/src/ATen/native/mkldnn/xpu/qlinear.cpp

Lines changed: 6 additions & 6 deletions
@@ -151,11 +151,11 @@ static Tensor q_linear_pointwise_binary(
     std::optional<c10::ScalarType> output_dtype,
     double other_scale,
     int64_t other_zero_point,
-    c10::string_view binary_post_op,
+    std::string_view binary_post_op,
     double binary_alpha,
-    c10::string_view unary_post_op,
+    std::string_view unary_post_op,
     torch::List<std::optional<at::Scalar>> unary_post_op_args,
-    c10::string_view unary_post_op_algorithm) {
+    std::string_view unary_post_op_algorithm) {
   TORCH_CHECK(
       act.device() == weight.device() &&
       act.device() == weight_scales.device() &&
@@ -222,11 +222,11 @@ static Tensor q_linear_pointwise_binary_tensor(
     std::optional<c10::ScalarType> output_dtype,
     double other_scale,
     int64_t other_zero_point,
-    c10::string_view binary_post_op,
+    std::string_view binary_post_op,
     double binary_alpha,
-    c10::string_view unary_post_op,
+    std::string_view unary_post_op,
     torch::List<std::optional<at::Scalar>> unary_post_op_args,
-    c10::string_view unary_post_op_algorithm) {
+    std::string_view unary_post_op_algorithm) {
   return q_linear_pointwise_binary(
       act,
       act_scale.item().toDouble(),

aten/src/ATen/native/mps/MetalShaderLibrary.h

Lines changed: 7 additions & 0 deletions
@@ -29,6 +29,9 @@ struct TensorIteratorBase;
 
 namespace at::native::mps {
 
+// Forward declaration of MPSScalar - for exec_binary_alpha_kernel()
+struct MPSScalar;
+
 namespace detail {
 template <typename T>
 class has_size_type {
@@ -138,6 +141,10 @@ class MetalShaderLibrary {
       const std::string& name,
       std::optional<int64_t> extra = std::nullopt);
   void exec_binary_kernel(TensorIteratorBase& iter, const std::string& name);
+  void exec_binary_alpha_kernel(
+      TensorIteratorBase& iter,
+      const std::string& name,
+      const MPSScalar& alpha);
 
  protected:
   virtual MTLLibrary_t getLibrary();

aten/src/ATen/native/mps/OperationUtils.h

Lines changed: 1 addition & 1 deletion
@@ -156,7 +156,7 @@ MPSGraphTensor* mpsGraphRankedPlaceHolder(MPSGraph* mpsGraph, const TensorBase&
 MPSGraphTensor* mpsGraphScalarPlaceHolder(MPSGraph* mpsGraph, MPSDataType dataType);
 MPSGraphTensor* mpsGraphScalarPlaceHolder(MPSGraph* mpsGraph, const Scalar& scalar);
 
-string get_mem_format_string(c10::MemoryFormat memory_format);
+std::string get_mem_format_string(c10::MemoryFormat memory_format);
 
 using MPSCacheKey = uint64_t;

0 commit comments
