PaddlePaddle / Paddle
Commit b702d2ae (unverified), authored Aug 29, 2023 by gouzil; committed via GitHub on Aug 29, 2023.
[clang-tidy] NO.8 enable `cppcoreguidelines-narrowing-conversions`. step:1 (#56218)
Parent: 0236771e
Showing 113 changed files with 903 additions and 751 deletions (+903 -751).
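The recurring pattern in this commit is to make each implicit narrowing conversion explicit with static_cast, which is what clang-tidy's cppcoreguidelines-narrowing-conversions check expects. Below is a minimal standalone sketch of that pattern; it is not Paddle code, and the function and variable names are illustrative only:

#include <cstddef>
#include <vector>

// Before: returning size_t from a function declared int narrows implicitly
// and is flagged by cppcoreguidelines-narrowing-conversions:
//   int Depth(const std::vector<int>& stack) { return stack.size(); }

// After: the cast documents that the narrowing is intentional.
int Depth(const std::vector<int>& stack) {
  return static_cast<int>(stack.size());
}

// The same idea for floating-point -> integer narrowing, mirroring the
// CpuMaxAllocSize() change in this commit: do the multiply in double, then
// narrow the result explicitly.
std::size_t ScaledBudget(double fraction, std::size_t total_bytes) {
  return static_cast<std::size_t>(fraction * static_cast<double>(total_bytes));
}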
paddle/phi/api/profiler/device_tracer.cc  +3 -3
paddle/phi/api/profiler/event.h  +1 -1
paddle/phi/api/profiler/profiler.cc  +1 -1
paddle/phi/api/profiler/profiler.proto  +1 -1
paddle/phi/backends/cpu/cpu_info.cc  +4 -2
paddle/phi/backends/device_manager.cc  +1 -1
paddle/phi/core/ddim.cc  +8 -8
paddle/phi/core/dense_tensor_impl.cc  +2 -2
paddle/phi/core/distributed/auto_parallel/device_mesh.cc  +5 -5
paddle/phi/core/distributed/auto_parallel/device_mesh.h  +2 -2
paddle/phi/core/distributed/auto_parallel/dist_attr.cc  +3 -3
paddle/phi/core/distributed/auto_parallel/dist_mapper.cc  +4 -4
paddle/phi/core/distributed/auto_parallel/process_mesh.cc  +3 -3
paddle/phi/core/distributed/auto_parallel/reshard_utils.cc  +2 -2
paddle/phi/core/distributed/store/tcp_store.cc  +5 -3
paddle/phi/core/generator.cc  +1 -1
paddle/phi/core/infermeta_utils.cc  +4 -4
paddle/phi/core/kernel_context.cc  +4 -4
paddle/phi/core/selected_rows_impl.cc  +2 -2
paddle/phi/core/sparse_coo_tensor.cc  +1 -1
paddle/phi/core/threadpool.cc  +1 -1
paddle/phi/kernels/coalesce_tensor_kernel.cc  +2 -2
paddle/phi/kernels/cpu/accuracy_kernel.cc  +1 -1
paddle/phi/kernels/cpu/adagrad_kernel.cc  +1 -1
paddle/phi/kernels/cpu/affine_grid_grad_kernel.cc  +7 -7
paddle/phi/kernels/cpu/affine_grid_kernel.cc  +7 -7
paddle/phi/kernels/cpu/argsort_grad_kernel.cc  +1 -1
paddle/phi/kernels/cpu/argsort_kernel.cc  +1 -1
paddle/phi/kernels/cpu/auc_kernel.cc  +2 -2
paddle/phi/kernels/cpu/batch_norm_grad_kernel.cc  +7 -7
paddle/phi/kernels/cpu/batch_norm_kernel.cc  +4 -4
paddle/phi/kernels/cpu/bce_loss_grad_kernel.cc  +1 -1
paddle/phi/kernels/cpu/box_coder_kernel.cc  +1 -1
paddle/phi/kernels/cpu/broadcast_tensors_grad_kernel.cc  +1 -1
paddle/phi/kernels/cpu/cholesky_kernel.cc  +1 -1
paddle/phi/kernels/cpu/class_center_sample_kernel.cc  +1 -1
paddle/phi/kernels/cpu/concat_kernel.cc  +2 -2
paddle/phi/kernels/cpu/cross_entropy_grad_kernel.cc  +1 -1
paddle/phi/kernels/cpu/cross_entropy_kernel.cc  +1 -1
paddle/phi/kernels/cpu/cross_grad_kernel.cc  +2 -2
paddle/phi/kernels/cpu/cross_kernel.cc  +2 -2
paddle/phi/kernels/cpu/cum_kernel.cc  +3 -3
paddle/phi/kernels/cpu/deformable_conv_grad_kernel.cc  +9 -5
paddle/phi/kernels/cpu/diagonal_grad_kernel.cc  +4 -2
paddle/phi/kernels/cpu/diagonal_kernel.cc  +4 -2
paddle/phi/kernels/cpu/distribute_fpn_proposals_kernel.cc  +10 -7
paddle/phi/kernels/cpu/eig_grad_kernel.cc  +1 -1
paddle/phi/kernels/cpu/eig_kernel.cc  +2 -2
paddle/phi/kernels/cpu/eigvals_kernel.cc  +5 -5
paddle/phi/kernels/cpu/fill_diagonal_tensor_grad_kernel.cc  +2 -2
paddle/phi/kernels/cpu/fill_diagonal_tensor_kernel.cc  +1 -1
paddle/phi/kernels/cpu/gather_tree_kernel.cc  +2 -2
paddle/phi/kernels/cpu/generate_proposals_kernel.cc  +2 -2
paddle/phi/kernels/cpu/graph_reindex_kernel.cc  +3 -3
paddle/phi/kernels/cpu/graph_sample_neighbors_kernel.cc  +1 -1
paddle/phi/kernels/cpu/grid_sample_grad_kernel.cc  +58 -58
paddle/phi/kernels/cpu/grid_sample_kernel.cc  +30 -30
paddle/phi/kernels/cpu/group_norm_grad_kernel.cc  +4 -4
paddle/phi/kernels/cpu/group_norm_kernel.cc  +4 -4
paddle/phi/kernels/cpu/gumbel_softmax_kernel.cc  +1 -1
paddle/phi/kernels/cpu/hsigmoid_loss_kernel.cc  +4 -3
paddle/phi/kernels/cpu/index_put_grad_kernel.cc  +2 -1
paddle/phi/kernels/cpu/index_put_kernel.cc  +2 -1
paddle/phi/kernels/cpu/index_sample_grad_kernel.cc  +1 -1
paddle/phi/kernels/cpu/index_sample_kernel.cc  +2 -2
paddle/phi/kernels/cpu/instance_norm_grad_kernel.cc  +4 -4
paddle/phi/kernels/cpu/instance_norm_kernel.cc  +3 -3
paddle/phi/kernels/cpu/interpolate_grad_kernel.cc  +107 -66
paddle/phi/kernels/cpu/interpolate_kernel.cc  +93 -63
paddle/phi/kernels/cpu/kthvalue_grad_kernel.cc  +2 -2
paddle/phi/kernels/cpu/kthvalue_kernel.cc  +2 -2
paddle/phi/kernels/cpu/label_smooth_kernel.cc  +5 -3
paddle/phi/kernels/cpu/log_softmax_grad_kernel.cc  +1 -1
paddle/phi/kernels/cpu/log_softmax_kernel.cc  +1 -1
paddle/phi/kernels/cpu/lstsq_kernel.cc  +4 -4
paddle/phi/kernels/cpu/masked_select_grad_kernel.cc  +1 -1
paddle/phi/kernels/cpu/matrix_nms_kernel.cc  +2 -2
paddle/phi/kernels/cpu/matrix_rank_tol_kernel.cc  +3 -4
paddle/phi/kernels/cpu/mode_grad_kernel.cc  +2 -2
paddle/phi/kernels/cpu/mode_kernel.cc  +2 -2
paddle/phi/kernels/cpu/multiclass_nms3_kernel.cc  +21 -18
paddle/phi/kernels/cpu/multinomial_kernel.cc  +1 -1
paddle/phi/kernels/cpu/mv_grad_kernel.cc  +2 -2
paddle/phi/kernels/cpu/nanmedian_grad_kernel.cc  +1 -1
paddle/phi/kernels/cpu/nanmedian_kernel.cc  +2 -2
paddle/phi/kernels/cpu/overlap_add_grad_kernel.cc  +13 -8
paddle/phi/kernels/cpu/overlap_add_kernel.cc  +12 -7
paddle/phi/kernels/cpu/p_norm_grad_kernel.cc  +4 -4
paddle/phi/kernels/cpu/p_norm_kernel.cc  +4 -4
paddle/phi/kernels/cpu/pad3d_grad_kernel.cc  +18 -18
paddle/phi/kernels/cpu/pad3d_kernel.cc  +18 -18
paddle/phi/kernels/cpu/prelu_grad_kernel.cc  +9 -9
paddle/phi/kernels/funcs/concat_and_split_functor.cc  +3 -3
paddle/phi/kernels/funcs/cross_entropy.cc  +2 -2
paddle/phi/kernels/funcs/deformable_conv_functor.cc  +4 -2
paddle/phi/kernels/funcs/gather_scatter_functor.cc  +5 -5
paddle/phi/kernels/funcs/gpc.cc  +38 -30
paddle/phi/kernels/funcs/gpc.h  +3 -2
paddle/phi/kernels/funcs/im2col.cc  +24 -24
paddle/phi/kernels/funcs/jit/gen_base.cc  +2 -1
paddle/phi/kernels/funcs/jit/helper.cc  +1 -1
paddle/phi/kernels/funcs/jit/kernel_key.cc  +4 -4
paddle/phi/kernels/funcs/jit/more/intrinsic/layer_norm.cc  +13 -12
paddle/phi/kernels/funcs/math_function.cc  +1 -1
paddle/phi/kernels/funcs/matrix_reduce.cc  +1 -1
paddle/phi/kernels/funcs/maxouting.cc  +12 -8
paddle/phi/kernels/funcs/pooling.cc  +151 -124
paddle/phi/kernels/funcs/segment_pooling.cc  +1 -1
paddle/phi/kernels/funcs/selected_rows_functor.cc  +7 -4
paddle/phi/kernels/funcs/sequence_padding.cc  +8 -7
paddle/phi/kernels/funcs/sequence_pooling.cc  +4 -4
paddle/phi/kernels/funcs/vol2col.cc  +19 -19
paddle/utils/string/string_helper.cc  +2 -1
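A second pattern that recurs in the diffs below is hoisting the cast into loop bounds and subscripts so the loop index keeps a signed type (as in the ddim.cc changes). A minimal sketch with illustrative names, not Paddle code:

#include <vector>

void FillSquares(std::vector<int>& v) {
  // Before: `for (uint64_t i = 0; i < v.size(); ++i)` mixed an unsigned
  // index into code that also uses signed arithmetic.
  // After: keep an int index and narrow the container size explicitly,
  // once, in the loop bound.
  for (int i = 0; i < static_cast<int>(v.size()); ++i) {
    v[i] = i * i;  // subscript and value stay in signed arithmetic
  }
}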
paddle/phi/api/profiler/device_tracer.cc

@@ -797,8 +797,8 @@ void ClearCurAnnotation() {
   if (!main_thread_annotation_stack.empty()) {
     std::string name = annotation_stack.back()->name();
     std::string main_name = main_thread_annotation_stack.back()->name();
-    int main_name_len = main_name.length();
-    int name_len = name.length();
+    int main_name_len = static_cast<int>(main_name.length());
+    int name_len = static_cast<int>(name.length());
     int prefix_len = main_name_len - name_len;
     if ((prefix_len > 0 && main_name.at(prefix_len - 1) == '/' &&
@@ -825,7 +825,7 @@ void SetCurBlock(int block_id) { block_id_stack.push_back(block_id); }
 void ClearCurBlock() { block_id_stack.pop_back(); }
-int BlockDepth() { return block_id_stack.size(); }
+int BlockDepth() { return static_cast<int>(block_id_stack.size()); }
 uint32_t GetCurSystemThreadId() {
   std::stringstream ss;
paddle/phi/api/profiler/event.h

@@ -78,7 +78,7 @@ class Event {
   Event* parent_{nullptr};
   uint64_t thread_id_;
   EventRole role_{};
-  int64_t cpu_ns_;
+  uint64_t cpu_ns_;
   bool visited_status_{false};
   std::string attr_;
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
paddle/phi/api/profiler/profiler.cc

@@ -72,7 +72,7 @@ Event::Event(EventType type,
 const EventType& Event::type() const { return type_; }

 double Event::CpuElapsedMs(const Event& e) const {
-  return (e.cpu_ns_ - cpu_ns_) / (1000000.0);
+  return (static_cast<double>(e.cpu_ns_ - cpu_ns_)) / (1000000.0);
 }

 double Event::CudaElapsedMs(const Event& e) const {
paddle/phi/api/profiler/profiler.proto

@@ -29,7 +29,7 @@ message Event {
   optional uint64 end_ns = 3;
   // When positive, it represents gpu id. When -1, it represents CPU.
   optional int64 device_id = 5;
-  optional int64 sub_device_id = 6;
+  optional uint64 sub_device_id = 6;
   optional MemCopy memcopy = 7;
   optional string detail_info = 9;
paddle/phi/backends/cpu/cpu_info.cc

@@ -78,7 +78,8 @@ size_t CpuTotalPhysicalMemory() {
 size_t CpuMaxAllocSize() {
   // For distributed systems, it requires configuring and limiting
   // the fraction of memory to use.
-  return FLAGS_fraction_of_cpu_memory_to_use * CpuTotalPhysicalMemory();
+  return static_cast<size_t>(FLAGS_fraction_of_cpu_memory_to_use *
+                             static_cast<double>(CpuTotalPhysicalMemory()));
 }

 size_t CpuMaxChunkSize() {
@@ -97,7 +98,8 @@ size_t CpuMinChunkSize() {
 size_t CUDAPinnedMaxAllocSize() {
   // For distributed systems, it requires configuring and limiting
   // the fraction of memory to use.
-  return FLAGS_fraction_of_cuda_pinned_memory_to_use * CpuTotalPhysicalMemory();
+  return static_cast<size_t>(FLAGS_fraction_of_cuda_pinned_memory_to_use *
+                             static_cast<double>(CpuTotalPhysicalMemory()));
 }

 size_t CUDAPinnedMinChunkSize() {
paddle/phi/backends/device_manager.cc

@@ -491,7 +491,7 @@ std::vector<size_t> DeviceManager::GetSelectedDeviceList(
       device_list.push_back(atoi(id.c_str()));
     }
   } else {
-    int count = DeviceManager::GetDeviceCount(device_type);
+    int count = static_cast<int>(DeviceManager::GetDeviceCount(device_type));
     for (int i = 0; i < count; ++i) {
       device_list.push_back(i);
     }
paddle/phi/core/ddim.cc

@@ -19,15 +19,15 @@
 namespace phi {

 DDim make_ddim(std::initializer_list<int64_t> dims) {
-  return DDim(dims.begin(), dims.size());
+  return DDim(dims.begin(), static_cast<int>(dims.size()));
 }

 DDim make_ddim(const std::vector<int64_t>& dims) {
-  return DDim(dims.data(), dims.size());
+  return DDim(dims.data(), static_cast<int>(dims.size()));
 }

 DDim make_ddim(const std::vector<int>& dims) {
-  return DDim(dims.data(), dims.size());
+  return DDim(dims.data(), static_cast<int>(dims.size()));
 }

 struct DDimEqualityVisitor {
@@ -186,19 +186,19 @@ DDim stride_numel(const DDim& ddim) {
 DDim DDim::reshape(std::vector<int>& shape) const {
   const DDim& in_dims = *this;

-  for (uint64_t i = 0; i < shape.size(); ++i) {
+  for (int i = 0; i < static_cast<int>(shape.size()); ++i) {
     if (shape[i] == 0) {
-      shape[i] = in_dims.at(i);
+      shape[i] = static_cast<int>(in_dims.at(i));
     }
   }

   // Dim marked as "-1" must be inferred
   auto it = std::find(shape.begin(), shape.end(), -1);
   if (it != shape.end()) {
-    int index = std::distance(shape.begin(), it);
+    int index = static_cast<int>(std::distance(shape.begin(), it));
     int reshape_out_product = std::accumulate(
         shape.begin(), shape.end(), -1, std::multiplies<int>());
-    shape[index] = product(in_dims) / reshape_out_product;
+    shape[index] = static_cast<int>(product(in_dims)) / reshape_out_product;
   }

   return phi::make_ddim(shape);
@@ -208,7 +208,7 @@ DDim DDim::transpose(const std::vector<int>& axis) const {
   const DDim& in_dims = *this;
   DDim out_dims(in_dims);
-  for (size_t i = 0; i < axis.size(); i++) {
+  for (int i = 0; i < static_cast<int>(axis.size()); i++) {
     out_dims[i] = in_dims[axis[i]];
   }
   return out_dims;
paddle/phi/core/dense_tensor_impl.cc

@@ -340,7 +340,7 @@ std::vector<DenseTensor> DenseTensor::Split(int64_t split_size,
           "split expects split_size be non-negative, but got split_size is %d",
           split_size));

-  int64_t numel_size = meta_.dims[axis];
+  int64_t numel_size = meta_.dims[static_cast<int>(axis)];

   int64_t num_splits = 1;
   if (split_size != 0) {
@@ -371,7 +371,7 @@ std::vector<DenseTensor> DenseTensor::Chunk(int64_t chunks,
       phi::errors::OutOfRange(
           "chunks expects to be greater than 0, but got chunks is %d", chunks));

-  int64_t numel_size = meta_.dims[axis];
+  int64_t numel_size = meta_.dims[static_cast<int>(axis)];
   int64_t split_size = (numel_size + chunks - 1) / chunks;
   return Split(split_size, axis);
 }
paddle/phi/core/distributed/auto_parallel/device_mesh.cc

@@ -330,25 +330,25 @@ DeviceMesh DeviceMesh::from_proto(const DeviceMeshProto &proto) {
   mesh.name_ = proto.name();

   mesh.shape_.resize(proto.shape_size());
-  for (int64_t i = 0; i < proto.shape_size(); ++i) {
+  for (int i = 0; i < proto.shape_size(); ++i) {
     mesh.shape_[i] = proto.shape(i);
   }

   mesh.device_ids_.resize(proto.device_ids_size());
-  for (int64_t i = 0; i < proto.device_ids_size(); ++i) {
+  for (int i = 0; i < proto.device_ids_size(); ++i) {
     mesh.device_ids_[i] = proto.device_ids(i);
   }

   mesh.dim_names_.resize(proto.dim_names_size());
-  for (int64_t i = 0; i < proto.dim_names_size(); ++i) {
+  for (int i = 0; i < proto.dim_names_size(); ++i) {
     mesh.dim_names_[i] = proto.dim_names(i);
   }

-  for (int64_t i = 0; i < proto.devices_size(); ++i) {
+  for (int i = 0; i < proto.devices_size(); ++i) {
     mesh.add_device(Device::from_proto(proto.devices(i)));
   }

-  for (int64_t i = 0; i < proto.links_size(); ++i) {
+  for (int i = 0; i < proto.links_size(); ++i) {
     mesh.add_link(Link::from_proto(proto.links(i)));
   }
paddle/phi/core/distributed/auto_parallel/device_mesh.h

@@ -96,8 +96,8 @@ inline bool operator!=(const Device& lhs, const Device& rhs) {
 }

 struct LinkCapability {
-  double bandwidth = 0.0;  // Bytes/s
-  double latency = 0.0;
+  int64_t bandwidth = 0.0;  // Bytes/s
+  int64_t latency = 0.0;

   // LinkCapability from_string(const std::string& str);
   std::string to_string() const;
paddle/phi/core/distributed/auto_parallel/dist_attr.cc

@@ -186,7 +186,7 @@ bool TensorDistAttr::verify_dims_mapping(
 bool TensorDistAttr::verify_batch_dim(
     int64_t dim, const std::vector<int64_t>& tensor_shape) const {
   VLOG(4) << "[TensorDistAttr verify_batch_dim] " << dim;
-  int64_t ndim = tensor_shape.size();
+  int64_t ndim = static_cast<int64_t>(tensor_shape.size());
   if (ndim > 0) {
     if (dim < 0) {
       dim = dim + ndim;
@@ -270,12 +270,12 @@ std::string TensorDistAttr::to_string() const {
 void TensorDistAttr::from_proto(const TensorDistAttrProto& proto) {
   process_mesh_ = ProcessMesh::from_proto(proto.process_mesh());
   dims_mapping_.resize(proto.dims_mapping_size());
-  for (int64_t i = 0; i < proto.dims_mapping_size(); ++i) {
+  for (int i = 0; i < proto.dims_mapping_size(); ++i) {
     dims_mapping_[i] = proto.dims_mapping(i);
   }
   batch_dim_ = proto.batch_dim();
   dynamic_dims_.resize(proto.dynamic_dims_size());
-  for (int64_t i = 0; i < proto.dynamic_dims_size(); ++i) {
+  for (int i = 0; i < proto.dynamic_dims_size(); ++i) {
     dynamic_dims_[i] = proto.dynamic_dims(i);
   }
 }
paddle/phi/core/distributed/auto_parallel/dist_mapper.cc

@@ -72,17 +72,17 @@ void DistributedMapper::set_process_id_to_device_ids(
 DistributedMapper DistributedMapper::from_proto(
     const DistributedMapperProto& proto) {
   DistributedMapper dist_mapper;
-  for (int64_t i = 0; i < proto.device_meshes_size(); ++i) {
+  for (int i = 0; i < proto.device_meshes_size(); ++i) {
     dist_mapper.device_meshes_[proto.device_meshes(i).name()] =
         DeviceMesh::from_proto(proto.device_meshes(i));
   }
-  for (int64_t i = 0; i < proto.process_id_to_device_ids_size(); ++i) {
+  for (int i = 0; i < proto.process_id_to_device_ids_size(); ++i) {
     int64_t process_id = proto.process_id_to_device_ids(i).process_id();
     std::string device_mesh_name =
         proto.process_id_to_device_ids(i).device_mesh_name();
     std::vector<int64_t> device_ids;
-    int64_t num_devices = proto.process_id_to_device_ids(i).device_ids_size();
-    for (int64_t j = 0; j < num_devices; ++j) {
+    int num_devices = proto.process_id_to_device_ids(i).device_ids_size();
+    for (int j = 0; j < num_devices; ++j) {
       device_ids.push_back(proto.process_id_to_device_ids(i).device_ids(j));
     }
     dist_mapper.process_id_to_device_ids_[process_id].first = device_mesh_name;
paddle/phi/core/distributed/auto_parallel/process_mesh.cc

@@ -88,17 +88,17 @@ ProcessMesh ProcessMesh::from_proto(const ProcessMeshProto &proto) {
   ProcessMesh mesh;

   mesh.shape_.resize(proto.shape_size());
-  for (int64_t i = 0; i < proto.shape_size(); ++i) {
+  for (int i = 0; i < proto.shape_size(); ++i) {
     mesh.shape_[i] = proto.shape(i);
   }

   mesh.process_ids_.resize(proto.process_ids_size());
-  for (int64_t i = 0; i < proto.process_ids_size(); ++i) {
+  for (int i = 0; i < proto.process_ids_size(); ++i) {
     mesh.process_ids_[i] = proto.process_ids(i);
   }

   mesh.dim_names_.resize(proto.dim_names_size());
-  for (int64_t i = 0; i < proto.dim_names_size(); ++i) {
+  for (int i = 0; i < proto.dim_names_size(); ++i) {
     mesh.dim_names_[i] = proto.dim_names(i);
   }
paddle/phi/core/distributed/auto_parallel/reshard_utils.cc

@@ -46,7 +46,7 @@ bool IsDimsMappingReplicated(const std::vector<int64_t>& dims_mapping) {
 std::vector<int64_t> GetCurRankCoordInMesh(const ProcessMesh& process_mesh) {
   const auto& process_shape = process_mesh.shape();
   const auto& process_ids = process_mesh.process_ids();
-  int64_t ndims_mesh = process_shape.size();
+  int64_t ndims_mesh = static_cast<int64_t>(process_shape.size());
   int64_t cur_global_rank = GetCurGlobalRank();

   VLOG(3) << "Searching current global rank " << cur_global_rank
@@ -162,7 +162,7 @@ CommContext* CreateOrGetCommContext(const DeviceContext& dev_ctx,
   std::string unique_comm_key = GenUniqueCommKey(process_ids);
   if (!CommContextManager::GetInstance().Has(unique_comm_key)) {
-    int64_t world_size = process_ids.size();
+    int64_t world_size = static_cast<int64_t>(process_ids.size());
     int64_t rank = GetLocalRankInParticipate(process_ids);
     VLOG(3) << "local world size: " << world_size << " local rank: " << rank;
paddle/phi/core/distributed/store/tcp_store.cc

@@ -172,7 +172,7 @@ void MasterDaemon::ProcessCommands(std::vector<struct pollfd>* p_fds) {
   for (size_t i = 1; i < fds.size(); i++) {
 #else
   // 0: listen socket, 1:controller pipe, so loop from 2.
-  for (size_t i = 2; i < fds.size(); i++) {
+  for (uint i = 2; i < fds.size(); i++) {
 #endif
     try {
       if (fds[i].revents == 0) {
@@ -345,14 +345,16 @@ TCPStore::TCPStore(std::string host,
                    bool is_master,
                    size_t num_workers,
                    int timeout)
-    : Store(timeout), _is_master(is_master), _num_workers(num_workers) {
+    : Store(timeout),
+      _is_master(is_master),
+      _num_workers(static_cast<int>(num_workers)) {
   _timeout = timeout;
   PADDLE_ENFORCE_GT(
       timeout, 0, phi::errors::InvalidArgument("timeout must >= %d", timeout));
   VLOG(3) << "input timeout" << timeout << ", member timeout:" << _timeout;
   if (_is_master) {
-    _server = detail::TCPServer::create(port, num_workers, timeout);
+    _server = detail::TCPServer::create(port, this->_num_workers, timeout);
   }
   _client = detail::TCPClient::connect(host, port);
paddle/phi/core/generator.cc

@@ -205,7 +205,7 @@ Generator::Generator(uint64_t seed, uint64_t device_id) {
   std::seed_seq seq({seed});
   auto engine = std::make_shared<std::mt19937_64>(seq);
   this->state_.cpu_engine = *engine;
-  this->state_.device = device_id;
+  this->state_.device = static_cast<int64_t>(device_id);
   this->state_.current_seed = seed;
   this->state_.thread_offset = 0;
   this->engine_ = engine;
paddle/phi/core/infermeta_utils.cc

@@ -21,12 +21,12 @@ void InferMetaContext::SetMetaConfig(MetaConfig config) {
 }

 void InferMetaContext::EmplaceBackInput(MetaTensor input) {
-  int index = inputs_.size();
+  int index = static_cast<int>(inputs_.size());
   inputs_.emplace_back(std::move(input));
   input_range_.emplace_back(std::pair<int, int>(index, index + 1));
 }
 void InferMetaContext::EmplaceBackOutput(MetaTensor output) {
-  int index = outputs_.size();
+  int index = static_cast<int>(outputs_.size());
   outputs_.emplace_back(std::move(output));
   output_range_.emplace_back(std::pair<int, int>(index, index + 1));
 }
@@ -36,7 +36,7 @@ void InferMetaContext::EmplaceBackAttr(Attribute attr) {
 void InferMetaContext::EmplaceBackInputs(
     paddle::small_vector<MetaTensor, phi::kInputSmallVectorSize> inputs) {
-  int index = inputs_.size();
+  int index = static_cast<int>(inputs_.size());
   input_range_.emplace_back(std::pair<int, int>(index, index + inputs.size()));
   inputs_.insert(inputs_.end(),
                  std::make_move_iterator(inputs.begin()),
@@ -44,7 +44,7 @@ void InferMetaContext::EmplaceBackInputs(
 }
 void InferMetaContext::EmplaceBackOutputs(
     paddle::small_vector<MetaTensor, phi::kOutputSmallVectorSize> outputs) {
-  int index = outputs_.size();
+  int index = static_cast<int>(outputs_.size());
   output_range_.emplace_back(
       std::pair<int, int>(index, index + outputs.size()));
   outputs_.insert(outputs_.end(),
paddle/phi/core/kernel_context.cc

@@ -17,7 +17,7 @@
 namespace phi {

 void KernelContext::EmplaceBackInput(const TensorBase* input) {
-  int index = inputs_.size();
+  int index = static_cast<int>(inputs_.size());
   inputs_.emplace_back(input);
   // Record the start and end index of the input
   input_range_.emplace_back(std::pair<int, int>(index, index + 1));
@@ -29,7 +29,7 @@ void KernelContext::EmplaceBackInputWithoutSetRange(const TensorBase* input) {
 void KernelContext::EmplaceBackInputs(
     paddle::small_vector<const TensorBase*> inputs) {
-  int index = inputs_.size();
+  int index = static_cast<int>(inputs_.size());
   // Record the start and end index of the input
   input_range_.emplace_back(std::pair<int, int>(index, index + inputs.size()));
   inputs_.insert(inputs_.end(),
@@ -45,7 +45,7 @@ void KernelContext::EmplaceBackInputsWithoutSetRange(
 }

 void KernelContext::EmplaceBackOutput(TensorBase* output) {
-  int index = outputs_.size();
+  int index = static_cast<int>(outputs_.size());
   outputs_.emplace_back(output);
   // Record the start and end index of the input
   output_range_.emplace_back(std::pair<int, int>(index, index + 1));
@@ -57,7 +57,7 @@ void KernelContext::EmplaceBackOutputWithoutSetRange(TensorBase* output) {
 void KernelContext::EmplaceBackOutputs(
     paddle::small_vector<TensorBase*> outputs) {
-  int index = outputs_.size();
+  int index = static_cast<int>(outputs_.size());
   // Record the start and end index of the input
   output_range_.emplace_back(
       std::pair<int, int>(index, index + outputs.size()));
paddle/phi/core/selected_rows_impl.cc

@@ -136,7 +136,7 @@ int64_t SelectedRowsImpl::AutoGrownIndex(int64_t key,
   }
   auto write_iter = id_to_index_.find(key);
   if (write_iter == id_to_index_.end()) {
-    int row_num = rows_.size();
+    int row_num = static_cast<int>(rows_.size());
     if (row_num == value_->dims()[0]) {
       rwlock_->UNLock();
       PADDLE_THROW(phi::errors::InvalidArgument(
@@ -165,7 +165,7 @@ int64_t SelectedRowsImpl::AutoGrownIndex(int64_t key,
 void SelectedRowsImpl::SyncIndex() {
   rwlock_->WRLock();
   id_to_index_.clear();
-  for (size_t i = 0; i < rows_.size(); ++i) {
+  for (int i = 0; i < static_cast<int>(rows_.size()); ++i) {
     id_to_index_[rows_[i]] = i;
   }
   rwlock_->UNLock();
paddle/phi/core/sparse_coo_tensor.cc

@@ -147,7 +147,7 @@ void SparseCooTensor::SetMember(const DenseTensor& non_zero_indices,
 }

 int32_t SparseCooTensor::sparse_dim() const {
-  return non_zero_indices_.dims()[0];
+  return static_cast<int32_t>(non_zero_indices_.dims()[0]);
 }

 int32_t SparseCooTensor::dense_dim() const {
paddle/phi/core/threadpool.cc

@@ -38,7 +38,7 @@ ThreadPool* ThreadPool::GetInstance() {
 void ThreadPool::Init() {
   if (threadpool_.get() == nullptr) {
     // TODO(Yancey1989): specify the max threads number
-    int num_threads = std::thread::hardware_concurrency();
+    int num_threads = static_cast<int>(std::thread::hardware_concurrency());
     if (FLAGS_dist_threadpool_size > 0) {
       num_threads = FLAGS_dist_threadpool_size;
       VLOG(1) << "set dist_threadpool_size to " << num_threads;
paddle/phi/kernels/coalesce_tensor_kernel.cc

@@ -143,7 +143,7 @@ void CoalesceTensorKernel(const Context &dev_ctx,
     int64_t accumulated_ranks = 0;
     for (size_t i = 0; i < input.size(); ++i) {
       phi::DDim dims(concated_shapes.data() + accumulated_ranks,
-                     concated_ranks[i]);
+                     static_cast<int>(concated_ranks[i]));
       if (!input[i]->initialized()) {
         PADDLE_ENFORCE_EQ(
             input[i],
@@ -187,7 +187,7 @@ void CoalesceTensorKernel(const Context &dev_ctx,
   size_t numel = 0;

   if (size_of_dtype == -1) {
-    size_of_dtype = phi::SizeOf(dtype);
+    size_of_dtype = static_cast<int>(phi::SizeOf(dtype));
   }
   GetMemSizeAndDtype(
       input, &numel, size_of_dtype, dev_ctx.GetPlace(), use_align, align_size);
paddle/phi/kernels/cpu/accuracy_kernel.cc

@@ -85,7 +85,7 @@ void AccuracyKernel(const Context& dev_ctx,
   }

   *correct_data = num_correct;
-  *total_data = num_samples;
+  *total_data = static_cast<int>(num_samples);
   *accuracy_data =
       static_cast<float>(num_correct) / static_cast<float>(num_samples);
 }
paddle/phi/kernels/cpu/adagrad_kernel.cc

@@ -57,7 +57,7 @@ struct DenseAdagradFunctor<phi::CPUContext, T> {
     auto place = *ctx.eigen_device();
     moment_out.device(place) = moment + grad * grad;
-    Eigen::DSizes<int, 1> m_dsize(moment_out_tensor->numel());
+    Eigen::DSizes<int, 1> m_dsize(static_cast<int>(moment_out_tensor->numel()));
     auto* lr = learning_rate.data<T>();
     param_out.device(place) =
         param - lr[0] * grad / (moment_out.sqrt() + epsilon);
paddle/phi/kernels/cpu/affine_grid_grad_kernel.cc

@@ -49,12 +49,12 @@ void AffineGridGrad4DKernel(const Context& dev_ctx,
                             bool align_corners,
                             DenseTensor* input_grad) {
   auto& theta_grad = input_grad;
-  int n = output_grad.dims()[0];
+  int n = static_cast<int>(output_grad.dims()[0]);
   auto& size_attr = outputShape.GetData();
   int h = 0;
   int w = 0;
-  h = size_attr[2];
-  w = size_attr[3];
+  h = static_cast<int>(size_attr[2]);
+  w = static_cast<int>(size_attr[3]);
   theta_grad->Resize(phi::make_ddim({n, 2, 3}));
   dev_ctx.template Alloc<T>(theta_grad);
   phi::funcs::SetConstant<Context, T>()(dev_ctx, theta_grad, static_cast<T>(0));
@@ -86,14 +86,14 @@ void AffineGridGrad5DKernel(const Context& dev_ctx,
                             bool align_corners,
                             DenseTensor* input_grad) {
   auto& theta_grad = input_grad;
-  int n = output_grad.dims()[0];
+  int n = static_cast<int>(output_grad.dims()[0]);
   auto& size_attr = outputShape.GetData();
   int d = 0;
   int h = 0;
   int w = 0;
-  d = size_attr[2];
-  h = size_attr[3];
-  w = size_attr[4];
+  d = static_cast<int>(size_attr[2]);
+  h = static_cast<int>(size_attr[3]);
+  w = static_cast<int>(size_attr[4]);
   theta_grad->Resize(phi::make_ddim({n, 3, 4}));
   dev_ctx.template Alloc<T>(theta_grad);
   phi::funcs::SetConstant<Context, T>()(dev_ctx, theta_grad, static_cast<T>(0));
paddle/phi/kernels/cpu/affine_grid_kernel.cc

@@ -49,12 +49,12 @@ void AffineGrid4DKernel(const Context& dev_ctx,
                         bool align_corners,
                         DenseTensor* output) {
   auto* theta = &input;
-  int n = theta->dims()[0];
+  int n = static_cast<int>(theta->dims()[0]);
   auto& size_attr = outputShape.GetData();
   int h = 0;
   int w = 0;
-  h = size_attr[2];
-  w = size_attr[3];
+  h = static_cast<int>(size_attr[2]);
+  w = static_cast<int>(size_attr[3]);
   output->Resize(phi::make_ddim({n, h, w, 2}));
   dev_ctx.template Alloc<T>(output);
   phi::funcs::SetConstant<Context, T>()(dev_ctx, output, static_cast<T>(0));
@@ -81,14 +81,14 @@ void AffineGrid5DKernel(const Context& dev_ctx,
                         bool align_corners,
                         DenseTensor* output) {
   auto* theta = &input;
-  int n = theta->dims()[0];
+  int n = static_cast<int>(theta->dims()[0]);
   auto& size_attr = outputShape.GetData();
   int d = 0;
   int h = 0;
   int w = 0;
-  d = size_attr[2];
-  h = size_attr[3];
-  w = size_attr[4];
+  d = static_cast<int>(size_attr[2]);
+  h = static_cast<int>(size_attr[3]);
+  w = static_cast<int>(size_attr[4]);
   output->Resize(phi::make_ddim({n, d, h, w, 3}));
   dev_ctx.template Alloc<T>(output);
   phi::funcs::SetConstant<Context, T>()(dev_ctx, output, static_cast<T>(0));
paddle/phi/kernels/cpu/argsort_grad_kernel.cc

@@ -96,7 +96,7 @@ void ArgsortGradKernel(const Context& dev_ctx,
   trans.push_back(axis);
   phi::DDim trans_dims(in_dims);
   for (size_t i = 0; i < trans.size(); i++) {
-    trans_dims[i] = in_dims[trans[i]];
+    trans_dims[static_cast<int>(i)] = in_dims[trans[i]];
   }
   DenseTensor trans_dO;
paddle/phi/kernels/cpu/argsort_kernel.cc

@@ -114,7 +114,7 @@ void ArgsortKernel(const Context& dev_ctx,
   trans.push_back(axis);
   phi::DDim trans_dims(in_dims);
   for (size_t i = 0; i < trans.size(); i++) {
-    trans_dims[i] = in_dims[trans[i]];
+    trans_dims[static_cast<int>(i)] = in_dims[trans[i]];
   }
   DenseTensor trans_inp;
paddle/phi/kernels/cpu/auc_kernel.cc

@@ -124,8 +124,8 @@ inline static void calcAuc(const int64_t *stat_pos,
   while (idx >= 0) {
     totPosPrev = totPos;
     totNegPrev = totNeg;
-    totPos += stat_pos[idx];
-    totNeg += stat_neg[idx];
+    totPos += static_cast<double>(stat_pos[idx]);
+    totNeg += static_cast<double>(stat_neg[idx]);
     *auc += trapezoidArea(totNeg, totNegPrev, totPos, totPosPrev);
     --idx;
   }
paddle/phi/kernels/cpu/batch_norm_grad_kernel.cc

@@ -104,10 +104,10 @@ void BatchNormGradFunctor(const Context& ctx,
                         "The size of input X's dimensions should be less than 6."
                         "But received: the size of input X's dimensions is [%d]",
                         x_dims.size()));
-  const int N = x_dims[0];
-  const int C = (data_layout == DataLayout::kNCHW ? x_dims[1]
-                                                  : x_dims[x_dims.size() - 1]);
-  const int sample_size = x.numel() / N / C;
+  const int N = static_cast<int>(x_dims[0]);
+  const int C = static_cast<int>(
+      data_layout == DataLayout::kNCHW ? x_dims[1] : x_dims[x_dims.size() - 1]);
+  const int sample_size = static_cast<int>(x.numel() / N / C);

   // input dimension is 2 and the format is NCHW. The input can be regarded as
   // NHWC format
@@ -382,9 +382,9 @@ void BatchNormDoubleGradKernel(
   ctx.template Alloc<T>(ddY);
   const auto& x_dims = X->dims();
-  const int C = (data_layout == DataLayout::kNCHW ? x_dims[1]
-                                                  : x_dims[x_dims.size() - 1]);
-  const int sample_size = X->numel() / C;
+  const int C = static_cast<int>(
+      data_layout == DataLayout::kNCHW ? x_dims[1] : x_dims[x_dims.size() - 1]);
+  const int sample_size = static_cast<int>(X->numel() / C);
   phi::funcs::SetConstant<Context, T> set_constant;
   const T* mean_data = Saved_mean->data<T>();
paddle/phi/kernels/cpu/batch_norm_kernel.cc

@@ -72,10 +72,10 @@ void BatchNormKernel(const Context& ctx,
                         "The size of input X's dimensions should be less than 6."
                         "But received: the size of input X's dimensionss is [%d]",
                         x_dims.size()));
-  const int N = x_dims[0];
-  const int C = (data_layout == DataLayout::kNCHW ? x_dims[1]
-                                                  : x_dims[x_dims.size() - 1]);
-  const int sample_size = x.numel() / N / C;
+  const int N = static_cast<int>(x_dims[0]);
+  const int C = static_cast<int>(
+      data_layout == DataLayout::kNCHW ? x_dims[1] : x_dims[x_dims.size() - 1]);
+  const int sample_size = static_cast<int>(x.numel() / N / C);

   // alloc memory
   ctx.template Alloc<T>(y);
paddle/phi/kernels/cpu/bce_loss_grad_kernel.cc

@@ -32,7 +32,7 @@ void BCELossGradKernel(const Context& dev_ctx,
   auto x_data = input.data<T>();
   auto label_data = label.data<T>();
-  int x_numel = input.numel();
+  int x_numel = static_cast<int>(input.numel());

   // dx = dout * ((x - label)/(x - x^2))
   for (int i = 0; i < x_numel; ++i) {
paddle/phi/kernels/cpu/box_coder_kernel.cc

@@ -78,7 +78,7 @@ void EncodeCenterSize(const DenseTensor *target_box,
     for (int64_t j = 0; j < col; ++j) {
       for (int k = 0; k < 4; ++k) {
         size_t offset = i * col * len + j * len;
-        int prior_var_offset = j * len;
+        int prior_var_offset = static_cast<int>(j * len);
         output[offset + k] /= prior_box_var_data[prior_var_offset + k];
       }
     }
paddle/phi/kernels/cpu/broadcast_tensors_grad_kernel.cc

@@ -108,7 +108,7 @@ void BroadcastTensorsGradKernel(const Context& ctx,
       int out_axis = out_rank - j - 1;
       int in_axis = in_rank - j - 1;

-      reshape_dims_vec.push_back(input_dims[j]);
+      reshape_dims_vec.push_back(static_cast<int>(input_dims[j]));
       if (out_axis < 0 || output_dims[out_axis] != input_dims[in_axis]) {
         reduce_dims_vec.push_back(in_axis);
       }
paddle/phi/kernels/cpu/cholesky_kernel.cc

@@ -35,7 +35,7 @@ void CholeskyKernel(const Context& dev_ctx,
   auto& dims = x.dims();
   int batch_count = 1;
   for (int i = 0; i < dims.size() - 2; i++) {
-    batch_count *= dims[i];
+    batch_count *= static_cast<int>(dims[i]);
   }
   auto m = dims[dims.size() - 1];
paddle/phi/kernels/cpu/class_center_sample_kernel.cc

@@ -80,7 +80,7 @@ void ClassCenterSampleKernel(const Context& dev_ctx,
   if (!fix_seed) {
     std::random_device rnd;
-    seed = rnd();
+    seed = static_cast<int>(rnd());
   }
   std::uniform_int_distribution<T> dist(0, num_classes - 1);
   std::shared_ptr<std::mt19937_64> engine;
paddle/phi/kernels/cpu/concat_kernel.cc

@@ -93,8 +93,8 @@ void ConcatKernel(const Context& dev_ctx,
                           out_stride,
                           in->data<T>(),
                           in_stride,
-                          in_stride[axis]);
-      output_offset += in_stride[axis];
+                          in_stride[static_cast<int>(axis)]);
+      output_offset += in_stride[static_cast<int>(axis)];
     }
   } else {
     // TODO(chenweihang): concat functor support vector<DenseTensor*> input
paddle/phi/kernels/cpu/cross_entropy_grad_kernel.cc

@@ -43,7 +43,7 @@ void CrossEntropyWithSoftmaxGradCPUKernel(const CPUContext& dev_ctx,
   const int rank = logit_grad->dims().size();
   const int axis_v = phi::funcs::CanonicalAxis(axis, rank);
-  int axis_dim = logit_grad->dims()[axis_v];
+  int axis_dim = static_cast<int>(logit_grad->dims()[axis_v]);

   PADDLE_ENFORCE_GT(
       axis_dim,
       0,
paddle/phi/kernels/cpu/cross_entropy_kernel.cc

@@ -34,7 +34,7 @@ void CrossEntropy(const CPUContext& dev_ctx,
                   DenseTensor* out) {
   const int rank = x.dims().size();
   const int axis_v = phi::funcs::CanonicalAxis(axis, rank);
-  int axis_dim = x.dims()[axis_v];
+  int axis_dim = static_cast<int>(x.dims()[axis_v]);

   PADDLE_ENFORCE_GT(
       axis_dim,
paddle/phi/kernels/cpu/cross_grad_kernel.cc

@@ -74,11 +74,11 @@ void CrossGradKernel(const Context &dev_ctx,
   }
   auto outer_loops = 1;
   for (auto i = 0; i < dim; i++) {
-    outer_loops *= input_x_dims[i];
+    outer_loops *= static_cast<int>(input_x_dims[i]);
   }
   auto slice_size = 1;
   for (auto i = dim + 1; i < input_x_dims.size(); i++) {
-    slice_size *= input_x_dims[i];
+    slice_size *= static_cast<int>(input_x_dims[i]);
   }

   std::vector<T> input_x_vec, input_y_vec, input_dout_vec;
paddle/phi/kernels/cpu/cross_kernel.cc

@@ -72,11 +72,11 @@ void CrossKernel(const Context& dev_ctx,
   }
   auto outer_loops = 1;
   for (auto i = 0; i < dim; i++) {
-    outer_loops *= input_x_dims[i];
+    outer_loops *= static_cast<int>(input_x_dims[i]);
   }
   auto slice_size = 1;
   for (auto i = dim + 1; i < input_x_dims.size(); i++) {
-    slice_size *= input_x_dims[i];
+    slice_size *= static_cast<int>(input_x_dims[i]);
   }

   std::vector<T> input_x_vec, input_y_vec;
paddle/phi/kernels/cpu/cum_kernel.cc

@@ -82,12 +82,12 @@ void ScanKernel(const Context& dev_ctx,
   int pre = 1;
   int post = 1;
-  int mid = out_dims[axis];
+  int mid = static_cast<int>(out_dims[axis]);
   for (int i = 0; i < axis; ++i) {
-    pre *= out_dims[i];
+    pre *= static_cast<int>(out_dims[i]);
   }
   for (int i = axis + 1; i < out_dims.size(); ++i) {
-    post *= out_dims[i];
+    post *= static_cast<int>(out_dims[i]);
   }

   auto x0 = EigenVector<T>::Flatten(x);
paddle/phi/kernels/cpu/deformable_conv_grad_kernel.cc

@@ -117,8 +117,10 @@ void ModulatedDeformableCol2im(const Context& dev_ctx,
                                const std::vector<int>& dilation,
                                const int deformable_group,
                                T* grad_im) {
-  int channel_per_deformable_group = im_shape[0] / deformable_group;
-  int num_kernels = col_shape[0] * col_shape[1] * col_shape[2] * col_shape[3];
+  int channel_per_deformable_group =
+      static_cast<int>(im_shape[0] / deformable_group);
+  int num_kernels = static_cast<int>(col_shape[0] * col_shape[1] *
+                                     col_shape[2] * col_shape[3]);
   ModulatedDeformableCol2imCPUKernel(num_kernels,
                                      data_col,
@@ -275,9 +277,11 @@ void ModulatedDeformableCol2imCoord(const Context& dev_ctx,
                                     const int deformable_groups,
                                     T* grad_offset,
                                     T* grad_mask) {
-  int num_kernels = 2 * kernel_shape[2] * kernel_shape[3] * col_shape[1] *
-                    col_shape[2] * col_shape[3] * deformable_groups;
-  int channel_per_deformable_group = col_shape[0] / deformable_groups;
+  int num_kernels =
+      static_cast<int>(2 * kernel_shape[2] * kernel_shape[3] * col_shape[1] *
+                       col_shape[2] * col_shape[3] * deformable_groups);
+  int channel_per_deformable_group =
+      static_cast<int>(col_shape[0] / deformable_groups);
   ModulatedDeformableCol2imCoordCPUKernel(
       num_kernels,
paddle/phi/kernels/cpu/diagonal_grad_kernel.cc

@@ -38,8 +38,10 @@ void DiagonalGradKernel(const Context& dev_ctx,
   auto dx_dim_size = dx_dim.size();

   const int64_t offset_ = offset;
-  int64_t axis1_ = axis1 < 0 ? dx_dim_size + axis1 : axis1;
-  int64_t axis2_ = axis2 < 0 ? dx_dim_size + axis2 : axis2;
+  int64_t axis1_ =
+      static_cast<int64_t>(axis1 < 0 ? dx_dim_size + axis1 : axis1);
+  int64_t axis2_ =
+      static_cast<int64_t>(axis2 < 0 ? dx_dim_size + axis2 : axis2);

   std::vector<int64_t> dout_stride = funcs::ComputeDimStride(dout_dim);
   std::vector<int64_t> dx_stride = funcs::ComputeDimStride(dx_dim);
paddle/phi/kernels/cpu/diagonal_kernel.cc

@@ -38,8 +38,10 @@ void DiagonalKernel(const Context& dev_ctx,
   auto output_dim_size = output_dim.size();

   const int64_t offset_ = offset;
-  int64_t axis1_ = axis1 < 0 ? input_dim_size + axis1 : axis1;
-  int64_t axis2_ = axis2 < 0 ? input_dim_size + axis2 : axis2;
+  int64_t axis1_ =
+      static_cast<int64_t>(axis1 < 0 ? input_dim_size + axis1 : axis1);
+  int64_t axis2_ =
+      static_cast<int64_t>(axis2 < 0 ? input_dim_size + axis2 : axis2);

   std::vector<int64_t> input_stride = funcs::ComputeDimStride(input_dim);
   std::vector<int64_t> output_stride = funcs::ComputeDimStride(output_dim);
paddle/phi/kernels/cpu/distribute_fpn_proposals_kernel.cc

@@ -52,14 +52,15 @@ void DistributeFpnProposalsKernel(
   } else {
     fpn_rois_lod = fpn_rois.lod().back();
   }
-  fpn_rois_num = fpn_rois_lod[fpn_rois_lod.size() - 1];
+  fpn_rois_num = static_cast<int>(fpn_rois_lod[fpn_rois_lod.size() - 1]);
   std::vector<int> target_level;

   // record the number of rois in each level
   std::vector<int> num_rois_level(num_level, 0);
   std::vector<int> num_rois_level_integral(num_level + 1, 0);
   for (size_t i = 0; i < fpn_rois_lod.size() - 1; ++i) {
-    auto fpn_rois_slice = fpn_rois.Slice(fpn_rois_lod[i], fpn_rois_lod[i + 1]);
+    auto fpn_rois_slice =
+        fpn_rois.Slice(static_cast<int>(fpn_rois_lod[i]),
+                       static_cast<int>(fpn_rois_lod[i + 1]));
     const T* rois_data = fpn_rois_slice.data<T>();
     for (int j = 0; j < fpn_rois_slice.dims()[0]; ++j) {
       // get the target level of current rois
@@ -92,7 +93,8 @@ void DistributeFpnProposalsKernel(
   std::vector<int> restore_index_inter(fpn_rois_num, -1);
   // distribute the rois into different fpn level by target level
   for (size_t i = 0; i < fpn_rois_lod.size() - 1; ++i) {
-    auto fpn_rois_slice = fpn_rois.Slice(fpn_rois_lod[i], fpn_rois_lod[i + 1]);
+    auto fpn_rois_slice =
+        fpn_rois.Slice(static_cast<int>(fpn_rois_lod[i]),
+                       static_cast<int>(fpn_rois_lod[i + 1]));
     const T* rois_data = fpn_rois_slice.data<T>();
     size_t cur_offset = fpn_rois_lod[i];
@@ -105,9 +107,10 @@ void DistributeFpnProposalsKernel(
                rois_data,
                funcs::kBoxDim * sizeof(T));
         multi_fpn_rois_data[lvl - min_level] += funcs::kBoxDim;
-        int index_in_shuffle = num_rois_level_integral[lvl - min_level] +
-                               multi_fpn_rois_lod0[lvl - min_level][i + 1];
-        restore_index_inter[index_in_shuffle] = cur_offset + j;
+        int index_in_shuffle =
+            static_cast<int>(num_rois_level_integral[lvl - min_level] +
+                             multi_fpn_rois_lod0[lvl - min_level][i + 1]);
+        restore_index_inter[index_in_shuffle] =
+            static_cast<int>(cur_offset + j);
         multi_fpn_rois_lod0[lvl - min_level][i + 1]++;
         rois_data += funcs::kBoxDim;
       }
@@ -117,7 +120,7 @@ void DistributeFpnProposalsKernel(
   }
   if (!multi_level_rois_num.empty()) {
-    int batch_size = fpn_rois_lod.size() - 1;
+    int batch_size = static_cast<int>(fpn_rois_lod.size() - 1);
     for (int i = 0; i < num_level; ++i) {
       multi_level_rois_num[i]->Resize({batch_size});
       int* rois_num_data = dev_ctx.template Alloc<int>(multi_level_rois_num[i]);
paddle/phi/kernels/cpu/eig_grad_kernel.cc

@@ -32,7 +32,7 @@ void EigGradKernel(const Context& dev_ctx,
   phi::DDim dim_origin = dims;
   int num_dims = dim_origin.size();
   int batch_count = BatchCount(out_v);
-  const int order = dim_origin[num_dims - 1];
+  const int order = static_cast<int>(dim_origin[num_dims - 1]);

   ComputeBackwardForComplexInput<phi::dtype::Complex<T>, Context>(
       out_w, out_v, dout_w, dout_v, dx_data, batch_count, order, dev_ctx);
paddle/phi/kernels/cpu/eig_kernel.cc

@@ -33,7 +33,7 @@ void EigKernel(const Context& dev_ctx,
     dev_ctx.template Alloc<phi::dtype::Complex<T>>(out_v);

     int batch_count = BatchCount(x);
-    int order = x.dims()[x.dims().size() - 1];
+    int order = static_cast<int>(x.dims()[x.dims().size() - 1]);

     PADDLE_ENFORCE_LT(
         0,
         order,
@@ -69,7 +69,7 @@ void EigKernel(const Context& dev_ctx,
     // 2. construct complex values
     auto* real_part_data = real_part.data<phi::dtype::Real<T>>();
     auto* imag_part_data = imag_part.data<phi::dtype::Real<T>>();
-    int out_w_numel = out_w->numel();
+    int out_w_numel = static_cast<int>(out_w->numel());
     phi::funcs::ForRange<Context> for_range(dev_ctx, out_w_numel);
     phi::funcs::RealImagToComplexFunctor<phi::dtype::Complex<T>> functor(
paddle/phi/kernels/cpu/eigvals_kernel.cc

@@ -81,7 +81,7 @@ typename std::enable_if<std::is_floating_point<T>::value>::type LapackEigvals(
   w.Resize(make_ddim({n_dim << 1}));
   T* w_data = ctx.template Alloc<T>(&w);

-  int64_t work_mem = work->memory_size();
+  int64_t work_mem = static_cast<int64_t>(work->memory_size());
   int64_t required_work_mem = 3 * n_dim * sizeof(T);
   PADDLE_ENFORCE_GE(
       work_mem,
@@ -132,7 +132,7 @@ LapackEigvals(const Context& ctx,
   DenseTensor a;  // will be overwritten when lapackEig exit
   Copy(ctx, input, input.place(), /*blocking=*/true, &a);

-  int64_t work_mem = work->memory_size();
+  int64_t work_mem = static_cast<int64_t>(work->memory_size());
   int64_t n_dim = input.dims()[1];
   int64_t required_work_mem = 3 * n_dim * sizeof(T);
   PADDLE_ENFORCE_GE(
@@ -145,7 +145,7 @@ LapackEigvals(const Context& ctx,
           required_work_mem,
           work_mem));

-  int64_t rwork_mem = rwork->memory_size();
+  int64_t rwork_mem = static_cast<int64_t>(rwork->memory_size());
   int64_t required_rwork_mem = (n_dim << 1) * sizeof(dtype::Real<T>);
   PADDLE_ENFORCE_GE(
       rwork_mem,
@@ -185,7 +185,7 @@ void SpiltBatchSquareMatrix(const DenseTensor& input,
                             std::vector<DenseTensor>* output) {
   DDim input_dims = input.dims();
   int last_dim = input_dims.size() - 1;
-  int n_dim = input_dims[last_dim];
+  int n_dim = static_cast<int>(input_dims[last_dim]);

   DDim flattened_input_dims, flattened_output_dims;
   if (input_dims.size() > 2) {
@@ -209,7 +209,7 @@ void EigvalsKernel(const Context& ctx, const DenseTensor& x, DenseTensor* out) {
   SpiltBatchSquareMatrix(x, /*->*/ &x_matrices);

   int64_t n_dim = x_matrices[0].dims()[1];
-  int64_t n_batch = x_matrices.size();
+  int64_t n_batch = static_cast<int64_t>(x_matrices.size());
   DDim out_dims = out->dims();
   out->Resize(make_ddim({n_batch, n_dim}));
   std::vector<DenseTensor> out_vectors = out->Split(1, 0);
paddle/phi/kernels/cpu/fill_diagonal_tensor_grad_kernel.cc

@@ -27,7 +27,7 @@ void FillDiagonalTensorGradKernel(const Context& ctx,
                                   int dim1,
                                   int dim2,
                                   DenseTensor* x_grad) {
-  auto matrows = 1;
+  int matrows = 1;

   if (x_grad) {
     auto* data = ctx.template Alloc<T>(x_grad);
@@ -35,7 +35,7 @@ void FillDiagonalTensorGradKernel(const Context& ctx,
     auto dx_dims = x_grad->dims();
     for (int i = 0; i < dx_dims.size(); i++) {
       if (i != dim1 && i != dim2) {
-        matrows *= dx_dims[i];
+        matrows *= static_cast<int>(dx_dims[i]);
       }
     }
paddle/phi/kernels/cpu/fill_diagonal_tensor_kernel.cc

@@ -59,7 +59,7 @@ void CalMatDims(phi::DDim out_dims,
     dimprod *= out_dims[i];
   }

-  auto diagdim = dim1;
+  int64_t diagdim = dim1;
   if (*offset >= 0) {
     diagdim = std::min(out_dims[dim1], out_dims[dim2] - *offset);
     *offset *= strides[0];
paddle/phi/kernels/cpu/gather_tree_kernel.cc

@@ -30,7 +30,7 @@ void GatherTreeKernel(const Context &dev_ctx,
   T* out_data = dev_ctx.template Alloc<T>(out);
   auto& ids_dims = ids.dims();
-  auto max_length = ids_dims[0];
+  int64_t max_length = ids_dims[0];
   auto batch_size = ids_dims[1];
   auto beam_size = ids_dims[2];
@@ -49,7 +49,7 @@ void GatherTreeKernel(const Context &dev_ctx,
           (max_length - 1) * batch_size * beam_size + batch * beam_size + beam;
       out_data[idx] = ids_data[idx];
       auto parent = parents_data[idx];
-      for (int step = max_length - 2; step >= 0; step--) {
+      for (int64_t step = max_length - 2; step >= 0; step--) {
        PADDLE_ENFORCE_LT(
            parent,
            beam_size,
paddle/phi/kernels/cpu/generate_proposals_kernel.cc

@@ -28,7 +28,7 @@ static void AppendProposals(DenseTensor* dst,
   auto* out_data = dst->data();
   auto* to_add_data = src.data();
   size_t size_of_t = SizeOf(src.dtype());
-  offset *= size_of_t;
+  offset *= static_cast<int64_t>(size_of_t);
   std::memcpy(
       reinterpret_cast<void*>(reinterpret_cast<uintptr_t>(out_data) + offset),
       to_add_data,
@@ -367,7 +367,7 @@ void GenerateProposalsKernel(const Context& ctx,
     AppendProposals(rpn_roi_probs, num_proposals, nscores);
     num_proposals += proposals.dims()[0];
     lod0.push_back(num_proposals);
-    tmp_num.push_back(proposals.dims()[0]);
+    tmp_num.push_back(static_cast<int>(proposals.dims()[0]));
   }
   if (rpn_rois_num != nullptr) {
     rpn_rois_num->Resize(phi::make_ddim({num}));
paddle/phi/kernels/cpu/graph_reindex_kernel.cc

@@ -35,8 +35,8 @@ void GraphReindexKernel(const Context& dev_ctx,
   const T* x_data = x.data<T>();
   const T* neighbors_data = neighbors.data<T>();
   const int* count_data = count.data<int>();
-  const int bs = x.dims()[0];
-  const int num_edges = neighbors.dims()[0];
+  const int bs = static_cast<int>(x.dims()[0]);
+  const int num_edges = static_cast<int>(neighbors.dims()[0]);

   std::unordered_map<T, T> node_map;
   std::vector<T> unique_nodes;
@@ -63,7 +63,7 @@ void GraphReindexKernel(const Context& dev_ctx,
   }
   // Reindex Dst
   // Add support for multi-type edges reindex
-  int num_edge_types = count.dims()[0] / bs;
+  int num_edge_types = static_cast<int>(count.dims()[0] / bs);
   int cnt = 0;
   for (int i = 0; i < num_edge_types; i++) {
     for (int j = 0; j < bs; j++) {
paddle/phi/kernels/cpu/graph_sample_neighbors_kernel.cc

@@ -178,7 +178,7 @@ void GraphSampleNeighborsKernel(
   const T* row_data = row.data<T>();
   const T* col_ptr_data = col_ptr.data<T>();
   const T* x_data = x.data<T>();
-  int bs = x.dims()[0];
+  int bs = static_cast<int>(x.dims()[0]);

   std::vector<T> output;
   std::vector<int> output_count;
paddle/phi/kernels/cpu/grid_sample_grad_kernel.cc View file @ b702d2ae
...
@@ -154,9 +154,9 @@ static void CalcGridLocationsWithGrad(const CPUContext& ctx,
                                       DenseTensor* grid_y,
                                       DenseTensor* grid_x_scale,
                                       DenseTensor* grid_y_scale) {
-  const int n = grid.dims()[0];
-  const int out_h = grid.dims()[1];
-  const int out_w = grid.dims()[2];
+  const int n = static_cast<int>(grid.dims()[0]);
+  const int out_h = static_cast<int>(grid.dims()[1]);
+  const int out_w = static_cast<int>(grid.dims()[2]);
   // split grid with shape (n, h, w, 2) into (x, y) by the 3rd Dim
   grid_x->Resize({n, out_h, out_w});
...
@@ -193,10 +193,10 @@ static void Calc3DGridLocationsWithGrad(const CPUContext& ctx,
                                         DenseTensor* grid_x_scale,
                                         DenseTensor* grid_y_scale,
                                         DenseTensor* grid_z_scale) {
-  const int n = grid.dims()[0];
-  const int out_d = grid.dims()[1];
-  const int out_h = grid.dims()[2];
-  const int out_w = grid.dims()[3];
+  const int n = static_cast<int>(grid.dims()[0]);
+  const int out_d = static_cast<int>(grid.dims()[1]);
+  const int out_h = static_cast<int>(grid.dims()[2]);
+  const int out_w = static_cast<int>(grid.dims()[3]);
   // split grid with shape (n, d, h, w, 3) into (x, y, z) by the 3rd Dim
   grid_x->Resize({n, out_d, out_h, out_w});
...
@@ -232,12 +232,12 @@ static void GatherOutputGradToInputGrad(const DenseTensor& output_grad,
                                         const DenseTensor& y,
                                         const DenseTensor& d1,
                                         const DenseTensor& d2) {
-  const int n = output_grad.dims()[0];
-  const int c = output_grad.dims()[1];
-  const int out_h = output_grad.dims()[2];
-  const int out_w = output_grad.dims()[3];
-  const int in_h = input_grad->dims()[2];
-  const int in_w = input_grad->dims()[3];
+  const int n = static_cast<int>(output_grad.dims()[0]);
+  const int c = static_cast<int>(output_grad.dims()[1]);
+  const int out_h = static_cast<int>(output_grad.dims()[2]);
+  const int out_w = static_cast<int>(output_grad.dims()[3]);
+  const int in_h = static_cast<int>(input_grad->dims()[2]);
+  const int in_w = static_cast<int>(input_grad->dims()[3]);
   auto x_t = EigenTensor<T, 3>::From(x);
   auto y_t = EigenTensor<T, 3>::From(y);
   auto d1_t = EigenTensor<T, 3>::From(d1);
...
@@ -272,14 +272,14 @@ static void Gather3DOutputGradToInputGrad(const DenseTensor& output_grad,
                                           const DenseTensor& d1,
                                           const DenseTensor& d2,
                                           const DenseTensor& d3) {
-  const int n = output_grad.dims()[0];
-  const int c = output_grad.dims()[1];
-  const int out_d = output_grad.dims()[2];
-  const int out_h = output_grad.dims()[3];
-  const int out_w = output_grad.dims()[4];
-  const int in_d = input_grad->dims()[2];
-  const int in_h = input_grad->dims()[3];
-  const int in_w = input_grad->dims()[4];
+  const int n = static_cast<int>(output_grad.dims()[0]);
+  const int c = static_cast<int>(output_grad.dims()[1]);
+  const int out_d = static_cast<int>(output_grad.dims()[2]);
+  const int out_h = static_cast<int>(output_grad.dims()[3]);
+  const int out_w = static_cast<int>(output_grad.dims()[4]);
+  const int in_d = static_cast<int>(input_grad->dims()[2]);
+  const int in_h = static_cast<int>(input_grad->dims()[3]);
+  const int in_w = static_cast<int>(input_grad->dims()[4]);
   auto x_t = EigenTensor<T, 4>::From(x);
   auto y_t = EigenTensor<T, 4>::From(y);
   auto z_t = EigenTensor<T, 4>::From(z);
...
@@ -325,10 +325,10 @@ static void GatherBilinearGrad(const CPUContext& ctx,
                                DenseTensor* grid_y_scale,
                                DenseTensor* input_grad,
                                DenseTensor* grid_grad) {
-  const int n = grid_x->dims()[0];
-  const int out_h = grid_x->dims()[1];
-  const int out_w = grid_x->dims()[2];
-  const int c = input.dims()[1];
+  const int n = static_cast<int>(grid_x->dims()[0]);
+  const int out_h = static_cast<int>(grid_x->dims()[1]);
+  const int out_w = static_cast<int>(grid_x->dims()[2]);
+  const int c = static_cast<int>(input.dims()[1]);
   DenseTensor x_w, x_e, y_n, y_s;
   DenseTensor d_w, d_e, d_n, d_s;
...
@@ -427,11 +427,11 @@ static void Gather3DBilinearGrad(const CPUContext& ctx,
                                  DenseTensor* grid_z_scale,
                                  DenseTensor* input_grad,
                                  DenseTensor* grid_grad) {
-  const int n = grid_x->dims()[0];
-  const int out_d = grid_x->dims()[1];
-  const int out_h = grid_x->dims()[2];
-  const int out_w = grid_x->dims()[3];
-  const int c = input.dims()[1];
+  const int n = static_cast<int>(grid_x->dims()[0]);
+  const int out_d = static_cast<int>(grid_x->dims()[1]);
+  const int out_h = static_cast<int>(grid_x->dims()[2]);
+  const int out_w = static_cast<int>(grid_x->dims()[3]);
+  const int c = static_cast<int>(input.dims()[1]);
   DenseTensor x_w, x_e, y_n, y_s, z_t, z_b;
   DenseTensor d_w, d_e, d_n, d_s, d_t, d_b;
...
@@ -577,12 +577,12 @@ static void GatherOutputGradToInputGrad(const DenseTensor& output_grad,
                                         DenseTensor* input_grad,
                                         const DenseTensor& x,
                                         const DenseTensor& y) {
-  const int n = output_grad.dims()[0];
-  const int c = output_grad.dims()[1];
-  const int out_h = output_grad.dims()[2];
-  const int out_w = output_grad.dims()[3];
-  const int in_h = input_grad->dims()[2];
-  const int in_w = input_grad->dims()[3];
+  const int n = static_cast<int>(output_grad.dims()[0]);
+  const int c = static_cast<int>(output_grad.dims()[1]);
+  const int out_h = static_cast<int>(output_grad.dims()[2]);
+  const int out_w = static_cast<int>(output_grad.dims()[3]);
+  const int in_h = static_cast<int>(input_grad->dims()[2]);
+  const int in_w = static_cast<int>(input_grad->dims()[3]);
   auto x_t = EigenTensor<T, 3>::From(x);
   auto y_t = EigenTensor<T, 3>::From(y);
   auto input_grad_t = EigenTensor<T, 4>::From(*input_grad);
...
@@ -611,14 +611,14 @@ static void Gather3DOutputGradToInputGrad(const DenseTensor& output_grad,
                                           const DenseTensor& x,
                                           const DenseTensor& y,
                                           const DenseTensor& z) {
-  const int n = output_grad.dims()[0];
-  const int c = output_grad.dims()[1];
-  const int out_d = output_grad.dims()[2];
-  const int out_h = output_grad.dims()[3];
-  const int out_w = output_grad.dims()[4];
-  const int in_d = input_grad->dims()[2];
-  const int in_h = input_grad->dims()[3];
-  const int in_w = input_grad->dims()[4];
+  const int n = static_cast<int>(output_grad.dims()[0]);
+  const int c = static_cast<int>(output_grad.dims()[1]);
+  const int out_d = static_cast<int>(output_grad.dims()[2]);
+  const int out_h = static_cast<int>(output_grad.dims()[3]);
+  const int out_w = static_cast<int>(output_grad.dims()[4]);
+  const int in_d = static_cast<int>(input_grad->dims()[2]);
+  const int in_h = static_cast<int>(input_grad->dims()[3]);
+  const int in_w = static_cast<int>(input_grad->dims()[4]);
   auto x_t = EigenTensor<T, 4>::From(x);
   auto y_t = EigenTensor<T, 4>::From(y);
   auto z_t = EigenTensor<T, 4>::From(z);
...
@@ -660,12 +660,12 @@ void GridSampleGradKernel(const Context& dev_ctx,
                           DenseTensor* x_grad,
                           DenseTensor* grid_grad) {
   if (x.dims().size() == 4) {
-    const int n = grid.dims()[0];
-    const int out_h = grid.dims()[1];
-    const int out_w = grid.dims()[2];
-    const int c = x.dims()[1];
-    const int in_h = x.dims()[2];
-    const int in_w = x.dims()[3];
+    const int n = static_cast<int>(grid.dims()[0]);
+    const int out_h = static_cast<int>(grid.dims()[1]);
+    const int out_w = static_cast<int>(grid.dims()[2]);
+    const int c = static_cast<int>(x.dims()[1]);
+    const int in_h = static_cast<int>(x.dims()[2]);
+    const int in_w = static_cast<int>(x.dims()[3]);
     x_grad->Resize({n, c, in_h, in_w});
     dev_ctx.template Alloc<T>(x_grad);
...
@@ -708,14 +708,14 @@ void GridSampleGradKernel(const Context& dev_ctx,
       GatherOutputGradToInputGrad<T>(out_grid, x_grad, grid_x, grid_y);
     }
   } else {
-    const int n = grid.dims()[0];
-    const int out_d = grid.dims()[1];
-    const int out_h = grid.dims()[2];
-    const int out_w = grid.dims()[3];
-    const int c = x.dims()[1];
-    const int in_d = x.dims()[2];
-    const int in_h = x.dims()[3];
-    const int in_w = x.dims()[4];
+    const int n = static_cast<int>(grid.dims()[0]);
+    const int out_d = static_cast<int>(grid.dims()[1]);
+    const int out_h = static_cast<int>(grid.dims()[2]);
+    const int out_w = static_cast<int>(grid.dims()[3]);
+    const int c = static_cast<int>(x.dims()[1]);
+    const int in_d = static_cast<int>(x.dims()[2]);
+    const int in_h = static_cast<int>(x.dims()[3]);
+    const int in_w = static_cast<int>(x.dims()[4]);
     x_grad->Resize({n, c, in_d, in_h, in_w});
     dev_ctx.template Alloc<T>(x_grad);
...
paddle/phi/kernels/cpu/grid_sample_kernel.cc View file @ b702d2ae
...
@@ -97,9 +97,9 @@ static void CalcGridLocations(const CPUContext& ctx,
                               std::string padding_mode,
                               DenseTensor* grid_x,
                               DenseTensor* grid_y) {
-  const int n = grid.dims()[0];
-  const int out_h = grid.dims()[1];
-  const int out_w = grid.dims()[2];
+  const int n = static_cast<int>(grid.dims()[0]);
+  const int out_h = static_cast<int>(grid.dims()[1]);
+  const int out_w = static_cast<int>(grid.dims()[2]);
   // split grid with shape (n, h, w, 2) into (x, y) by the 3rd Dim
   grid_x->Resize({n, out_h, out_w});
...
@@ -130,10 +130,10 @@ static void Calc3DGridLocations(const CPUContext& ctx,
                                 DenseTensor* grid_x,
                                 DenseTensor* grid_y,
                                 DenseTensor* grid_z) {
-  const int n = grid.dims()[0];
-  const int out_d = grid.dims()[1];
-  const int out_h = grid.dims()[2];
-  const int out_w = grid.dims()[3];
+  const int n = static_cast<int>(grid.dims()[0]);
+  const int out_d = static_cast<int>(grid.dims()[1]);
+  const int out_h = static_cast<int>(grid.dims()[2]);
+  const int out_w = static_cast<int>(grid.dims()[3]);
   // split grid with shape (n, d, h, w, 3) into (x, y, z) by the 3rd Dim
   grid_x->Resize({n, out_d, out_h, out_w});
...
@@ -165,10 +165,10 @@ static void BilinearInter(const CPUContext& ctx,
                           DenseTensor* grid_y,
                           DenseTensor* out) {
   auto& place = *ctx.eigen_device();
-  const int n = grid_x->dims()[0];
-  const int out_h = grid_x->dims()[1];
-  const int out_w = grid_x->dims()[2];
-  const int c = input.dims()[1];
+  const int n = static_cast<int>(grid_x->dims()[0]);
+  const int out_h = static_cast<int>(grid_x->dims()[1]);
+  const int out_w = static_cast<int>(grid_x->dims()[2]);
+  const int c = static_cast<int>(input.dims()[1]);
   DenseTensor x_w, x_e, y_n, y_s;
   DenseTensor d_w, d_e, d_n, d_s;
...
@@ -224,11 +224,11 @@ static void Bilinear3DInter(const CPUContext& ctx,
                             DenseTensor* grid_z,
                             DenseTensor* out) {
   auto& place = *ctx.eigen_device();
-  const int n = grid_x->dims()[0];
-  const int out_d = grid_x->dims()[1];
-  const int out_h = grid_x->dims()[2];
-  const int out_w = grid_x->dims()[3];
-  const int c = input.dims()[1];
+  const int n = static_cast<int>(grid_x->dims()[0]);
+  const int out_d = static_cast<int>(grid_x->dims()[1]);
+  const int out_h = static_cast<int>(grid_x->dims()[2]);
+  const int out_w = static_cast<int>(grid_x->dims()[3]);
+  const int c = static_cast<int>(input.dims()[1]);
   // get corner pixel values from (x, y, z)
   // for 4d, we used north-east-south-west
...
@@ -313,12 +313,12 @@ void GridSampleKernel(const Context& dev_ctx,
                       bool align_corners,
                       DenseTensor* out) {
   if (x.dims().size() == 4) {
-    const int n = grid.dims()[0];
-    const int out_h = grid.dims()[1];
-    const int out_w = grid.dims()[2];
-    const int c = x.dims()[1];
-    const int in_h = x.dims()[2];
-    const int in_w = x.dims()[3];
+    const int n = static_cast<int>(grid.dims()[0]);
+    const int out_h = static_cast<int>(grid.dims()[1]);
+    const int out_w = static_cast<int>(grid.dims()[2]);
+    const int c = static_cast<int>(x.dims()[1]);
+    const int in_h = static_cast<int>(x.dims()[2]);
+    const int in_w = static_cast<int>(x.dims()[3]);
     out->Resize(phi::make_ddim({n, c, out_h, out_w}));
     dev_ctx.template Alloc<T>(out);
...
@@ -344,14 +344,14 @@ void GridSampleKernel(const Context& dev_ctx,
       GetGridPointValue<T>(x, out, grid_x, grid_y);
     }
   } else {
-    const int n = grid.dims()[0];
-    const int out_d = grid.dims()[1];
-    const int out_h = grid.dims()[2];
-    const int out_w = grid.dims()[3];
-    const int c = x.dims()[1];
-    const int in_d = x.dims()[2];
-    const int in_h = x.dims()[3];
-    const int in_w = x.dims()[4];
+    const int n = static_cast<int>(grid.dims()[0]);
+    const int out_d = static_cast<int>(grid.dims()[1]);
+    const int out_h = static_cast<int>(grid.dims()[2]);
+    const int out_w = static_cast<int>(grid.dims()[3]);
+    const int c = static_cast<int>(x.dims()[1]);
+    const int in_d = static_cast<int>(x.dims()[2]);
+    const int in_h = static_cast<int>(x.dims()[3]);
+    const int in_w = static_cast<int>(x.dims()[4]);
     out->Resize(phi::make_ddim({n, c, out_d, out_h, out_w}));
     dev_ctx.template Alloc<T>(out);
...
paddle/phi/kernels/cpu/group_norm_grad_kernel.cc View file @ b702d2ae
...
@@ -48,8 +48,8 @@ void GroupNormGradKernel(const Context& dev_ctx,
   const auto scale_ptr = scale.get_ptr();
   const auto bias_ptr = bias.get_ptr();
   const auto& x_dims = y.dims();
-  const int C = (data_layout == DataLayout::kNCHW ? x_dims[1]
-                                                  : x_dims[x_dims.size() - 1]);
+  const int C = static_cast<int>(
+      data_layout == DataLayout::kNCHW ? x_dims[1] : x_dims[x_dims.size() - 1]);
   const int group_size = C / groups;
   dev_ctx.template Alloc<T>(d_x);
...
@@ -80,11 +80,11 @@ void GroupNormGradKernel(const Context& dev_ctx,
   int imsize = 1;
   if (data_layout == DataLayout::kNCHW) {
     for (int i = 2; i < x_dims.size(); ++i) {
-      imsize *= x_dims[i];
+      imsize *= static_cast<int>(x_dims[i]);
     }
   } else {
     for (int i = 1; i < x_dims.size() - 1; ++i) {
-      imsize *= x_dims[i];
+      imsize *= static_cast<int>(x_dims[i]);
     }
   }
   auto* iter_x_data = x_data;
...
paddle/phi/kernels/cpu/group_norm_kernel.cc View file @ b702d2ae
...
@@ -45,8 +45,8 @@ void GroupNormKernel(const Context& dev_ctx,
   const auto bias_ptr = bias.get_ptr();
   const auto x_dims = x.dims();
-  const int C = (data_layout == DataLayout::kNCHW ? x_dims[1]
-                                                  : x_dims[x_dims.size() - 1]);
+  const int C = static_cast<int>(
+      data_layout == DataLayout::kNCHW ? x_dims[1] : x_dims[x_dims.size() - 1]);
   const int group_size = C / groups;
   dev_ctx.template Alloc<T>(y);
...
@@ -66,11 +66,11 @@ void GroupNormKernel(const Context& dev_ctx,
   int imsize = 1;
   if (data_layout == DataLayout::kNCHW) {
     for (int i = 2; i < x_dims.size(); ++i) {
-      imsize *= x_dims[i];
+      imsize *= static_cast<int>(x_dims[i]);
     }
   } else {
     for (int i = 1; i < x_dims.size() - 1; ++i) {
-      imsize *= x_dims[i];
+      imsize *= static_cast<int>(x_dims[i]);
     }
   }
   auto* iter_x_data = x_data;
...
paddle/phi/kernels/cpu/gumbel_softmax_kernel.cc View file @ b702d2ae
...
@@ -66,7 +66,7 @@ struct OneHotGenerator<CPUContext, T> {
   const int size_out_axis = funcs::SizeOutAxis(axis, x.dims());
   for (int i = 0; i < x.dims().size(); i++) {
-    if (i != axis) index_dim.push_back(x.dims().Get()[i]);
+    if (i != axis) index_dim.push_back(static_cast<int>(x.dims().Get()[i]));
   }
   DDim index_ddim(index_dim.data(), rank - 1);
   index.Resize(index_ddim);
...
paddle/phi/kernels/cpu/hsigmoid_loss_kernel.cc View file @ b702d2ae
...
@@ -45,9 +45,10 @@ void HSigmoidLossKernel(const Context& ctx,
   if (path.get_ptr()) {
     is_custom = true;
   }
-  int64_t code_length = path.get_ptr()
-                            ? path.get_ptr()->dims()[1]
-                            : phi::funcs::FindLastSet(num_classes_st - 1);
+  int64_t code_length =
+      path.get_ptr() ? static_cast<int64_t>(path.get_ptr()->dims()[1])
+                     : static_cast<int64_t>(
+                           phi::funcs::FindLastSet(num_classes_st - 1));
   int64_t batch_size = x.dims()[0];
   DenseTensor sum;
   pre_out->Resize(phi::make_ddim({batch_size, code_length}));
...
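The hsigmoid hunk casts both arms of the conditional operator rather than one value: when the two arms have different integer types, the compiler picks a common type for the whole `?:` expression, and the conversion back to the declared type is what trips the check (it also flags signedness changes). A minimal sketch with hypothetical stand-ins for `dims()[1]` and `FindLastSet`:

    #include <cstddef>
    #include <cstdint>

    int64_t dim_from_path() { return 12; }          // stand-in for path->dims()[1]
    size_t find_last_set(uint64_t x) { return 5; }  // stand-in helper

    int main() {
      bool has_path = false;
      // Mixing int64_t and size_t arms would give the ?: an unsigned common
      // type, and converting that back to int64_t changes signedness, which
      // the check treats as narrowing. Casting both arms pins the type.
      int64_t code_length =
          has_path ? static_cast<int64_t>(dim_from_path())
                   : static_cast<int64_t>(find_last_set(31));
      return code_length == 5 ? 0 : 1;
    }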
paddle/phi/kernels/cpu/index_put_grad_kernel.cc View file @ b702d2ae
...
@@ -210,7 +210,8 @@ void IndexPutGradKernel(const Context& dev_ctx,
   std::vector<DenseTensor> tmp_res_indices_v;
   std::vector<DenseTensor> range_tensor_v;
-  for (int i = int_indices_v.size(); i < x.dims().size(); ++i) {
+  for (int i = static_cast<int>(int_indices_v.size()); i < x.dims().size();
+       ++i) {
     range_tensor_v.emplace_back(funcs::GetRangeTensor<int64_t, Context>(
         dev_ctx, x.dims()[i], phi::DataType::INT64));
   }
...
paddle/phi/kernels/cpu/index_put_kernel.cc View file @ b702d2ae
...
@@ -134,7 +134,8 @@ void IndexPutKernel(const Context& dev_ctx,
   std::vector<DenseTensor> range_tensor_v;
   const DenseTensor* ptr_value = nullptr;
-  for (int i = int_indices_v.size(); i < x.dims().size(); ++i) {
+  for (int i = static_cast<int>(int_indices_v.size()); i < x.dims().size();
+       ++i) {
     range_tensor_v.emplace_back(funcs::GetRangeTensor<int64_t, Context>(
         dev_ctx, x.dims()[i], phi::DataType::INT64));
   }
...
paddle/phi/kernels/cpu/index_sample_grad_kernel.cc View file @ b702d2ae
...
@@ -35,7 +35,7 @@ void IndexSampleGradInner(const Context& context,
   auto value_length = x_grad_dims[1];
   auto index_length = index_dims[1];
-  int index_ids_num = index.numel();
+  int index_ids_num = static_cast<int>(index.numel());
   std::vector<T> x_grad_vec(x_grad->numel(), 0);
...
paddle/phi/kernels/cpu/index_sample_kernel.cc View file @ b702d2ae
...
@@ -37,10 +37,10 @@ void IndexSampleInner(const Context &context,
   auto input_dims = input.dims();
   auto index_dims = index.dims();
-  int batch_size = input_dims[0];
+  int batch_size = static_cast<int>(input_dims[0]);
   auto value_length = input_dims[1];
   auto index_length = index_dims[1];
-  int index_ids_num = index.numel();
+  int index_ids_num = static_cast<int>(index.numel());
   std::vector<T> input_vec;
   std::vector<IndexT> index_vec;
...
paddle/phi/kernels/cpu/instance_norm_grad_kernel.cc View file @ b702d2ae
...
@@ -55,10 +55,10 @@ void InstanceNormGradKernel(const Context& dev_ctx,
   const auto& x_dims = x.dims();
-  const int N = x_dims[0];
-  const int C = x_dims[1];
+  const int N = static_cast<int>(x_dims[0]);
+  const int C = static_cast<int>(x_dims[1]);
   const int NxC = N * C;
-  const int sample_size = x.numel() / N / C;
+  const int sample_size = static_cast<int>(x.numel() / N / C);
   dev_ctx.template Alloc<T>(d_x);
   auto* place = dev_ctx.eigen_device();
...
@@ -172,7 +172,7 @@ void InstanceNormDoubleGradKernel(const Context& dev_ctx,
   const auto& x_dims = x.dims();
   int N, C, H, W, D;
   funcs::ExtractNCWHD(x_dims, DataLayout::kNCHW, &N, &C, &H, &W, &D);
-  const int sample_size = x.numel() / N / C;
+  const int sample_size = static_cast<int>(x.numel() / N / C);
   const int NxC = N * C;
   const T* mean_data = saved_mean.data<T>();
...
paddle/phi/kernels/cpu/instance_norm_kernel.cc View file @ b702d2ae
...
@@ -40,10 +40,10 @@ void InstanceNormKernel(const Context& dev_ctx,
                         DenseTensor* saved_variance) {
   const auto& x_dims = x.dims();
   T epsilon = static_cast<T>(epsilon_f);
-  const int N = x_dims[0];
-  const int C = x_dims[1];
+  const int N = static_cast<int>(x_dims[0]);
+  const int C = static_cast<int>(x_dims[1]);
   const int NxC = N * C;
-  const int sample_size = x.numel() / N / C;
+  const int sample_size = static_cast<int>(x.numel() / N / C);
   auto* place = dev_ctx.eigen_device();
   Eigen::DSizes<int, 2> shape(NxC, sample_size);
...
paddle/phi/kernels/cpu/interpolate_grad_kernel.cc View file @ b702d2ae
...
@@ -40,15 +40,18 @@ static void LinearInterpolationGrad(const DenseTensor& output_grad,
   bool align_flag = (align_mode == 0 && !align_corners);
   using MT = typename phi::dtype::MPTypeTrait<T>::Type;
   for (int l = 0; l < out_w; l++) {
-    int x_w = align_flag ? static_cast<int>(ratio_w * (l + 0.5) - 0.5)
-                         : static_cast<int>(ratio_w * l);
+    int x_w = static_cast<int>(align_flag
+                                   ? (ratio_w * (l + 0.5) - 0.5)
+                                   : (ratio_w * static_cast<float>(l)));
     x_w = (x_w > 0) ? x_w : 0;                       // w
     int x_e = (x_w < (in_w - 1)) ? (x_w + 1) : x_w;  // w_id
-    float idx_src_x = ratio_w * (l + 0.5) - 0.5;
+    float idx_src_x = ratio_w * (static_cast<float>(l) + 0.5f) - 0.5f;
     idx_src_x = (idx_src_x > 0) ? idx_src_x : 0;
-    float d_w = align_flag ? idx_src_x - x_w : ratio_w * l - x_w;  // w1lambda
-    float d_e = 1.f - d_w;                                         // w2lambda
+    float d_w = static_cast<float>(
+        align_flag
+            ? idx_src_x - static_cast<float>(x_w)
+            : ratio_w * static_cast<float>(l) - static_cast<float>(x_w));  // w1lambda
+    float d_e = 1.f - d_w;  // w2lambda
     for (int i = 0; i < n; i++) {    // loop for batches
       for (int j = 0; j < c; j++) {  // loop for channels
...
@@ -88,23 +91,28 @@ static void BilinearInterpolationGrad(const DenseTensor& output_grad,
   using MT = typename phi::dtype::MPTypeTrait<T>::Type;
   for (int k = 0; k < out_h; k++) {  // loop for images
-    int y_n = align_flag ? static_cast<int>(ratio_h * (k + 0.5) - 0.5)
-                         : static_cast<int>(ratio_h * k);
+    int y_n = static_cast<int>(align_flag
+                                   ? (ratio_h * (k + 0.5) - 0.5)
+                                   : (ratio_h * static_cast<float>(k)));
     y_n = (y_n > 0) ? y_n : 0;
     int y_s = (y_n + 1) < (in_h - 1) ? (y_n + 1) : (in_h - 1);
-    float idx_src_y = ratio_h * (k + 0.5) - 0.5;
+    float idx_src_y = ratio_h * (static_cast<float>(k) + 0.5f) - 0.5f;
     idx_src_y = (idx_src_y > 0) ? idx_src_y : 0;
-    float d_n = align_flag ? idx_src_y - y_n : ratio_h * k - y_n;
+    float d_n = align_flag
+                    ? idx_src_y - static_cast<float>(y_n)
+                    : ratio_h * static_cast<float>(k) - static_cast<float>(y_n);
     float d_s = 1.f - d_n;
     for (int l = 0; l < out_w; l++) {
-      int x_w = align_flag ? static_cast<int>(ratio_w * (l + 0.5) - 0.5)
-                           : static_cast<int>(ratio_w * l);
+      int x_w = static_cast<int>(
+          align_flag ? (ratio_w * (static_cast<float>(l) + 0.5f) - 0.5f)
+                     : (ratio_w * static_cast<float>(l)));
       x_w = (x_w > 0) ? x_w : 0;
       int x_e = (x_w + 1) < (in_w - 1) ? (x_w + 1) : (in_w - 1);
-      float idx_src_x = ratio_w * (l + 0.5) - 0.5;
+      float idx_src_x = ratio_w * (static_cast<float>(l) + 0.5f) - 0.5f;
       idx_src_x = (idx_src_x > 0) ? idx_src_x : 0;
-      float d_w = align_flag ? idx_src_x - x_w : ratio_w * l - x_w;
+      float d_w = align_flag ? idx_src_x - static_cast<float>(x_w)
+                             : ratio_w * static_cast<float>(l) -
+                                   static_cast<float>(x_w);
       float d_e = 1.f - d_w;
       for (int i = 0; i < n; i++) {  // loop for batches
...
@@ -144,12 +152,14 @@ static void NearestNeighborInterpolateGrad(const DenseTensor& output_grad,
   auto output_grad_t = EigenTensor<T, 4>::From(output_grad);
   for (int k = 0; k < out_h; k++) {  // loop for images
-    int in_k = (align_corners) ? static_cast<int>(ratio_h * k + 0.5)
-                               : static_cast<int>(ratio_h * k);
+    int in_k = static_cast<int>(
+        align_corners ? (ratio_h * static_cast<float>(k) + 0.5f)
+                      : (ratio_h * static_cast<float>(k)));
     for (int l = 0; l < out_w; l++) {
-      int in_l = (align_corners) ? static_cast<int>(ratio_w * l + 0.5)
-                                 : static_cast<int>(ratio_w * l);
+      int in_l = static_cast<int>(
+          align_corners ? (ratio_w * static_cast<float>(l) + 0.5f)
+                        : (ratio_w * static_cast<float>(l)));
       for (int i = 0; i < n; i++) {    // loop for batches
         for (int j = 0; j < c; j++) {  // loop for channels
...
@@ -182,12 +192,14 @@ static void BicubicInterpolationGrad(const DenseTensor& output_grad,
   using MT = typename phi::dtype::MPTypeTrait<T>::Type;
   for (int k = 0; k < out_h; k++) {  // loop for images
-    MT y_n = align_corners ? ratio_h * k : ratio_h * (k + 0.5) - 0.5;
+    MT y_n = align_corners
+                 ? ratio_h * static_cast<float>(k)
+                 : ratio_h * (static_cast<float>(k) + 0.5f) - 0.5f;
     int input_y = floorf(y_n);
     MT y_t = y_n - input_y;
     for (int l = 0; l < out_w; l++) {
-      MT x_n = align_corners ? ratio_w * l : ratio_w * (l + 0.5) - 0.5;
+      MT x_n = align_corners
+                   ? ratio_w * static_cast<float>(l)
+                   : ratio_w * (static_cast<float>(l) + 0.5f) - 0.5f;
       int input_x = floorf(x_n);
       MT x_t = x_n - input_x;
...
@@ -245,33 +257,42 @@ static void TrilinearInterpolationGrad(const DenseTensor& output_grad,
   bool align_flag = (align_mode == 0 && !align_corners);
   using MT = typename phi::dtype::MPTypeTrait<T>::Type;
   for (int j = 0; j < out_d; j++) {  // loop for D
-    int t_f = align_flag ? static_cast<int>(ratio_d * (j + 0.5) - 0.5)
-                         : static_cast<int>(ratio_d * j);
+    int t_f = static_cast<int>(
+        align_flag ? (ratio_d * (static_cast<float>(j) + 0.5f) - 0.5f)
+                   : (ratio_d * static_cast<float>(j)));
     t_f = (t_f > 0) ? t_f : 0;
     int t_b = (t_f + 1) < (in_d - 1) ? (t_f + 1) : (in_d - 1);
-    float idx_src_t = ratio_d * (j + 0.5) - 0.5;
+    float idx_src_t = ratio_d * (static_cast<float>(j) + 0.5f) - 0.5f;
     idx_src_t = (idx_src_t > 0) ? idx_src_t : 0;
-    float d_f = align_flag ? idx_src_t - t_f : ratio_d * j - t_f;
+    float d_f = align_flag
+                    ? idx_src_t - static_cast<float>(t_f)
+                    : ratio_d * static_cast<float>(j) - static_cast<float>(t_f);
     float d_b = 1.f - d_f;
     for (int k = 0; k < out_h; k++) {  // loop for H
-      int y_n = align_flag ? static_cast<int>(ratio_h * (k + 0.5) - 0.5)
-                           : static_cast<int>(ratio_h * k);
+      int y_n = static_cast<int>(
+          align_flag ? (ratio_h * (static_cast<float>(k) + 0.5f) - 0.5f)
+                     : (ratio_h * static_cast<float>(k)));
       y_n = (y_n > 0) ? y_n : 0;
       int y_s = (y_n + 1) < (in_h - 1) ? (y_n + 1) : (in_h - 1);
-      float idx_src_y = ratio_h * (k + 0.5) - 0.5;
+      float idx_src_y = ratio_h * (static_cast<float>(k) + 0.5f) - 0.5f;
       idx_src_y = (idx_src_y > 0) ? idx_src_y : 0;
-      float d_n = align_flag ? idx_src_y - y_n : ratio_h * k - y_n;
+      float d_n = align_flag ? idx_src_y - static_cast<float>(y_n)
+                             : ratio_h * static_cast<float>(k) -
+                                   static_cast<float>(y_n);
       float d_s = 1.f - d_n;
       for (int l = 0; l < out_w; l++) {  // loop for W
-        int x_w = align_flag ? static_cast<int>(ratio_w * (l + 0.5) - 0.5)
-                             : static_cast<int>(ratio_w * l);
+        int x_w = static_cast<int>(
+            align_flag ? (ratio_w * (static_cast<float>(l) + 0.5f) - 0.5f)
+                       : (ratio_w * static_cast<float>(l)));
         x_w = (x_w > 0) ? x_w : 0;
         int x_e = (x_w + 1) < (in_w - 1) ? (x_w + 1) : (in_w - 1);
-        float idx_src_x = ratio_w * (l + 0.5) - 0.5;
+        float idx_src_x = ratio_w * (static_cast<float>(l) + 0.5f) - 0.5f;
        idx_src_x = (idx_src_x > 0) ? idx_src_x : 0;
-        float d_w = align_flag ? idx_src_x - x_w : ratio_w * l - x_w;
+        float d_w = align_flag ? idx_src_x - static_cast<float>(x_w)
+                               : ratio_w * static_cast<float>(l) -
+                                     static_cast<float>(x_w);
        float d_e = 1.f - d_w;
        for (int b = 0; b < n; b++) {  // loop for batches
...
@@ -338,15 +359,18 @@ static void NearestNeighbor3DInterpolateGrad(const DenseTensor& output_grad,
   auto output_grad_t = EigenTensor<T, 5>::From(output_grad);
   for (int d = 0; d < out_d; d++) {
-    int in_d = (align_corners) ? static_cast<int>(ratio_d * d + 0.5)
-                               : static_cast<int>(ratio_d * d);
+    int in_d = static_cast<int>(
+        align_corners ? (ratio_d * static_cast<float>(d) + 0.5f)
+                      : (ratio_d * static_cast<float>(d)));
     for (int k = 0; k < out_h; k++) {  // loop for images
-      int in_k = (align_corners) ? static_cast<int>(ratio_h * k + 0.5)
-                                 : static_cast<int>(ratio_h * k);
+      int in_k = static_cast<int>(
+          align_corners ? (ratio_h * static_cast<float>(k) + 0.5f)
+                        : (ratio_h * static_cast<float>(k)));
       for (int l = 0; l < out_w; l++) {
-        int in_l = (align_corners) ? static_cast<int>(ratio_w * l + 0.5)
-                                   : static_cast<int>(ratio_w * l);
+        int in_l = static_cast<int>(
+            align_corners ? (ratio_w * static_cast<float>(l) + 0.5f)
+                          : (ratio_w * static_cast<float>(l)));
         for (int i = 0; i < n; i++) {    // loop for batches
           for (int j = 0; j < c; j++) {  // loop for channels
...
@@ -408,7 +432,7 @@ static void Interpolate1DCPUBwd(
     }
   }
   if (scale_w > 0.) {
-    out_w = static_cast<int>(in_w * scale_w);
+    out_w = static_cast<int>(static_cast<float>(in_w) * scale_w);
   }
   if (out_size) {
     auto out_size_data =
...
@@ -442,10 +466,13 @@ static void Interpolate1DCPUBwd(
   float ratio_w = 0.f;
   if (out_w > 1) {
     float new_scale_w = 0.f;
-    new_scale_w = (scale_w > 0) ? static_cast<float>(1. / scale_w)
-                                : static_cast<float>(in_w) / out_w;
-    ratio_w = (align_corners) ? static_cast<float>(in_w - 1) / (out_w - 1)
-                              : static_cast<float>(new_scale_w);
+    new_scale_w = static_cast<float>(
+        scale_w > 0 ? (1.f / scale_w)
+                    : static_cast<float>(in_w) / static_cast<float>(out_w));
+    ratio_w = static_cast<float>(
+        align_corners ? (static_cast<float>(in_w) - 1.f) /
+                            (static_cast<float>(out_w) - 1.f)
+                      : new_scale_w);
   }
   if ("linear" == interp_method) {
     LinearInterpolationGrad<T>(output_grad,
...
@@ -528,8 +555,8 @@ static void Interpolate2DCPUBwd(
     }
   }
   if (scale_h > 0. && scale_w > 0.) {
-    out_h = static_cast<int>(in_h * scale_h);
-    out_w = static_cast<int>(in_w * scale_w);
+    out_h = static_cast<int>(in_h * scale_h);  // NOLINT
+    out_w = static_cast<int>(in_w * scale_w);  // NOLINT
   }
   if (out_size) {
     auto out_size_data =
...
@@ -566,17 +593,23 @@ static void Interpolate2DCPUBwd(
   float ratio_w = 0.f;
   if (out_h > 1) {
     float new_scale_h = 0.f;
-    new_scale_h = (scale_h > 0) ? static_cast<float>(1. / scale_h)
-                                : static_cast<float>(in_h) / out_h;
-    ratio_h = (align_corners) ? static_cast<float>(in_h - 1) / (out_h - 1)
-                              : static_cast<float>(new_scale_h);
+    new_scale_h = static_cast<float>(
+        (scale_h > 0) ? (1.f / scale_h)
+                      : static_cast<float>(in_h) / static_cast<float>(out_h));
+    ratio_h = static_cast<float>(
+        align_corners ? (static_cast<float>(in_h) - 1.f) /
+                            (static_cast<float>(out_h) - 1.f)
+                      : new_scale_h);
   }
   if (out_w > 1) {
     float new_scale_w = 0.f;
-    new_scale_w = (scale_w > 0) ? static_cast<float>(1. / scale_w)
-                                : static_cast<float>(in_w) / out_w;
-    ratio_w = (align_corners) ? static_cast<float>(in_w - 1) / (out_w - 1)
-                              : static_cast<float>(new_scale_w);
+    new_scale_w = static_cast<float>(
+        (scale_w > 0) ? (1.f / scale_w)
+                      : static_cast<float>(in_w) / static_cast<float>(out_w));
+    ratio_w = static_cast<float>(
+        align_corners ? (static_cast<float>(in_w) - 1.f) /
+                            (static_cast<float>(out_w) - 1.f)
+                      : new_scale_w);
   }
   if ("bilinear" == interp_method) {
...
@@ -706,9 +739,9 @@ static void Interpolate3DCPUBwd(
     }
   }
   if (scale_d > 0. && scale_h > 0. && scale_w > 0.) {
-    out_d = static_cast<int>(in_d * scale_d);
-    out_h = static_cast<int>(in_h * scale_h);
-    out_w = static_cast<int>(in_w * scale_w);
+    out_d = static_cast<int>(in_d * scale_d);  // NOLINT
+    out_h = static_cast<int>(in_h * scale_h);  // NOLINT
+    out_w = static_cast<int>(in_w * scale_w);  // NOLINT
   }
   if (out_size) {
     auto out_size_data =
...
@@ -747,24 +780,32 @@ static void Interpolate3DCPUBwd(
   float ratio_w = 0.f;
   if (out_d > 1) {
     float new_scale_d = 0.f;
-    new_scale_d = (scale_d > 0) ? static_cast<float>(1. / scale_d)
-                                : static_cast<float>(in_d) / out_d;
-    ratio_d = (align_corners) ? static_cast<float>(in_d - 1) / (out_d - 1)
-                              : static_cast<float>(new_scale_d);
+    new_scale_d = static_cast<float>(
+        (scale_d > 0) ? (1.f / scale_d)
+                      : static_cast<float>(in_d) / static_cast<float>(out_d));
+    ratio_d = static_cast<float>(
+        align_corners ? (static_cast<float>(in_d) - 1.f) /
+                            (static_cast<float>(out_d) - 1.f)
+                      : new_scale_d);
   }
   if (out_h > 1) {
     float new_scale_h = 0.f;
-    new_scale_h = (scale_h > 0) ? static_cast<float>(1. / scale_h)
-                                : static_cast<float>(in_h) / out_h;
-    ratio_h = (align_corners) ? static_cast<float>(in_h - 1) / (out_h - 1)
-                              : static_cast<float>(new_scale_h);
+    new_scale_h = static_cast<float>(
+        (scale_h > 0) ? (1.f / scale_h)
+                      : static_cast<float>(in_h) / static_cast<float>(out_h));
+    ratio_h = (align_corners)
+                  ? static_cast<float>(in_h - 1) /
+                        (static_cast<float>(out_h) - 1)
+                  : static_cast<float>(new_scale_h);
   }
   if (out_w > 1) {
     float new_scale_w = 0.f;
-    new_scale_w = (scale_w > 0) ? static_cast<float>(1. / scale_w)
-                                : static_cast<float>(in_w) / out_w;
-    ratio_w = (align_corners) ? static_cast<float>(in_w - 1) / (out_w - 1)
-                              : static_cast<float>(new_scale_w);
+    new_scale_w = static_cast<float>(
+        (scale_w > 0) ? (1.f / scale_w)
+                      : static_cast<float>(in_w) / static_cast<float>(out_w));
+    ratio_w = static_cast<float>(
+        align_corners ? (static_cast<float>(in_w) - 1.f) /
+                            (static_cast<float>(out_w) - 1.f)
+                      : new_scale_w);
   }
   if ("trilinear" == interp_method) {
...
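The interpolation hunks above deal with a different narrowing: in an expression like `ratio_w * (l + 0.5)`, the `0.5` literal promotes the arithmetic to double, so storing the result in a `float` narrows double to float. Rewriting with `static_cast<float>(l)` and `0.5f` literals keeps the whole expression in float. A standalone sketch with made-up values:

    int main() {
      float ratio_w = 0.75f;
      int l = 3;

      // Flagged: (l + 0.5) is a double expression, so the result narrows
      // from double to float on assignment:
      //   float idx_src_x = ratio_w * (l + 0.5) - 0.5;

      // Fixed: cast the index and use float literals so no double appears.
      float idx_src_x = ratio_w * (static_cast<float>(l) + 0.5f) - 0.5f;

      return idx_src_x > 0 ? 0 : 1;
    }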
paddle/phi/kernels/cpu/interpolate_kernel.cc View file @ b702d2ae
...
@@ -57,15 +57,18 @@ static void LinearInterpolation(const DenseTensor& input,
#pragma omp parallel for
#endif
   for (int l = 0; l < out_w; l++) {
-    int x_w = align_flag ? static_cast<int>(ratio_w * (l + 0.5) - 0.5)
-                         : static_cast<int>(ratio_w * l);
+    int x_w = static_cast<int>(
+        align_flag ? (ratio_w * (static_cast<float>(l) + 0.5f) - 0.5f)
+                   : ratio_w * static_cast<float>(l));
     x_w = (x_w > 0) ? x_w : 0;                       // w
     int x_e = (x_w < (in_w - 1)) ? (x_w + 1) : x_w;  // w_id
-    float idx_src_x = ratio_w * (l + 0.5) - 0.5;
+    float idx_src_x = ratio_w * (static_cast<float>(l) + 0.5f) - 0.5f;
     idx_src_x = (idx_src_x > 0) ? idx_src_x : 0;
-    float d_w = align_flag ? idx_src_x - x_w : ratio_w * l - x_w;  // w1lambda
-    float d_e = 1.f - d_w;                                         // w2lambda
+    float d_w = align_flag ? idx_src_x - static_cast<float>(x_w)
+                           : ratio_w * static_cast<float>(l) -
+                                 static_cast<float>(x_w);  // w1lambda
+    float d_e = 1.f - d_w;                                 // w2lambda
     {
       vx_w[l] = x_w;
       vx_e[l] = x_e;
...
@@ -127,13 +130,15 @@ static void BilinearInterpolation(const DenseTensor& input,
#pragma omp parallel for
#endif
   for (int k = 0; k < out_h; k++) {
-    int y_n = align_flag ? static_cast<int>(ratio_h * (k + 0.5) - 0.5)
-                         : static_cast<int>(ratio_h * k);
+    int y_n = static_cast<int>(align_flag
+                                   ? (ratio_h * (k + 0.5) - 0.5)
+                                   : (ratio_h * static_cast<float>(k)));
     y_n = (y_n > 0) ? y_n : 0;
     int y_s = (y_n + 1) < (in_h - 1) ? (y_n + 1) : (in_h - 1);
-    float idx_src_y = ratio_h * (k + 0.5) - 0.5;
+    float idx_src_y = ratio_h * (static_cast<float>(k) + 0.5f) - 0.5f;
     idx_src_y = (idx_src_y > 0) ? idx_src_y : 0;
-    float d_n = align_flag ? idx_src_y - y_n : ratio_h * k - y_n;
+    float d_n = align_flag
+                    ? idx_src_y - static_cast<float>(y_n)
+                    : ratio_h * static_cast<float>(k) - static_cast<float>(y_n);
     float d_s = 1.f - d_n;
     {
       vy_n[k] = y_n;
...
@@ -155,12 +160,14 @@ static void BilinearInterpolation(const DenseTensor& input,
   for (int l = 0; l < out_w; l++) {
     int x_w = (align_mode == 0 && !align_corners)
                   ? static_cast<int>(ratio_w * (l + 0.5) - 0.5)
-                  : static_cast<int>(ratio_w * l);
+                  : static_cast<int>(ratio_w * static_cast<float>(l));
     x_w = (x_w > 0) ? x_w : 0;
     int x_e = (x_w + 1) < (in_w - 1) ? (x_w + 1) : (in_w - 1);
-    float idx_src_x = ratio_w * (l + 0.5) - 0.5;
+    float idx_src_x = ratio_w * (static_cast<float>(l) + 0.5f) - 0.5f;
     idx_src_x = (idx_src_x > 0) ? idx_src_x : 0;
-    float d_w = align_flag ? idx_src_x - x_w : ratio_w * l - x_w;
+    float d_w = align_flag
+                    ? idx_src_x - static_cast<float>(x_w)
+                    : ratio_w * static_cast<float>(l) - static_cast<float>(x_w);
     float d_e = 1.f - d_w;
     {
       vx_w[l] = x_w;
...
@@ -224,12 +231,14 @@ static void NearestNeighborInterpolate(const DenseTensor& input,
   auto output_t = EigenTensor<T, 4>::From(*output);
   for (int k = 0; k < out_h; k++) {  // loop for images
-    int in_k = (align_corners) ? static_cast<int>(ratio_h * k + 0.5)
-                               : static_cast<int>(ratio_h * k);
+    int in_k = (align_corners)
+                   ? static_cast<int>(ratio_h * static_cast<float>(k) + 0.5)
+                   : static_cast<int>(ratio_h * static_cast<float>(k));
     for (int l = 0; l < out_w; l++) {
-      int in_l = (align_corners) ? static_cast<int>(ratio_w * l + 0.5)
-                                 : static_cast<int>(ratio_w * l);
+      int in_l = (align_corners)
+                     ? static_cast<int>(ratio_w * static_cast<float>(l) + 0.5)
+                     : static_cast<int>(ratio_w * static_cast<float>(l));
       for (int i = 0; i < n; i++) {    // loop for batches
         for (int j = 0; j < c; j++) {  // loop for channels
...
@@ -262,13 +271,13 @@ static void BicubicInterpolation(const DenseTensor& input,
   using MT = typename phi::dtype::MPTypeTrait<T>::Type;
   for (int k = 0; k < out_h; k++) {  // loop for images
-    MT y_n = align_corners ? static_cast<MT>(ratio_h * k)
+    MT y_n = align_corners ? static_cast<MT>(ratio_h * static_cast<float>(k))
                            : static_cast<MT>(ratio_h * (k + 0.5) - 0.5);
     int input_y = floorf(y_n);
     const MT y_t = y_n - input_y;
     for (int l = 0; l < out_w; l++) {
-      MT x_n = align_corners ? static_cast<MT>(ratio_w * l)
+      MT x_n = align_corners ? static_cast<MT>(ratio_w * static_cast<float>(l))
                              : static_cast<MT>(ratio_w * (l + 0.5) - 0.5);
       int input_x = floorf(x_n);
       const MT x_t = x_n - input_x;
...
@@ -360,12 +369,14 @@ static void TrilinearInterpolation(const DenseTensor& input,
#endif
   for (int j = 0; j < out_d; j++) {
     int t_f = align_flag ? static_cast<int>(ratio_d * (j + 0.5) - 0.5)
-                         : static_cast<int>(ratio_d * j);
+                         : static_cast<int>(ratio_d * static_cast<float>(j));
     t_f = (t_f > 0) ? t_f : 0;
     int t_b = (t_f + 1) < (in_d - 1) ? (t_f + 1) : (in_d - 1);
-    float idx_src_t = ratio_d * (j + 0.5) - 0.5;
+    float idx_src_t = ratio_d * (static_cast<float>(j) + 0.5f) - 0.5f;
     idx_src_t = (idx_src_t > 0) ? idx_src_t : 0;
-    float d_f = align_flag ? idx_src_t - t_f : ratio_d * j - t_f;
+    float d_f = align_flag
                    ? idx_src_t - static_cast<float>(t_f)
                    : ratio_d * static_cast<float>(j) - static_cast<float>(t_f);
     float d_b = 1.f - d_f;
     {
       vt_f[j] = t_f;
...
@@ -386,12 +397,14 @@ static void TrilinearInterpolation(const DenseTensor& input,
#endif
   for (int k = 0; k < out_h; k++) {
     int y_n = align_flag ? static_cast<int>(ratio_h * (k + 0.5) - 0.5)
-                         : static_cast<int>(ratio_h * k);
+                         : static_cast<int>(ratio_h * static_cast<float>(k));
     y_n = (y_n > 0) ? y_n : 0;
     int y_s = (y_n + 1) < (in_h - 1) ? (y_n + 1) : (in_h - 1);
-    float idx_src_y = ratio_h * (k + 0.5) - 0.5;
+    float idx_src_y = ratio_h * (static_cast<float>(k) + 0.5f) - 0.5f;
     idx_src_y = (idx_src_y > 0) ? idx_src_y : 0;
-    float d_n = align_flag ? idx_src_y - y_n : ratio_h * k - y_n;
+    float d_n = align_flag
                    ? idx_src_y - static_cast<float>(y_n)
                    : ratio_h * static_cast<float>(k) - static_cast<float>(y_n);
     float d_s = 1.f - d_n;
     {
       vy_n[k] = y_n;
...
@@ -413,12 +426,14 @@ static void TrilinearInterpolation(const DenseTensor& input,
   for (int l = 0; l < out_w; l++) {
     int x_w = (align_mode == 0 && !align_corners)
                   ? static_cast<int>(ratio_w * (l + 0.5) - 0.5)
-                  : static_cast<int>(ratio_w * l);
+                  : static_cast<int>(ratio_w * static_cast<float>(l));
     x_w = (x_w > 0) ? x_w : 0;
     int x_e = (x_w + 1) < (in_w - 1) ? (x_w + 1) : (in_w - 1);
-    float idx_src_x = ratio_w * (l + 0.5) - 0.5;
+    float idx_src_x = ratio_w * (static_cast<float>(l) + 0.5f) - 0.5f;
     idx_src_x = (idx_src_x > 0) ? idx_src_x : 0;
-    float d_w = align_flag ? idx_src_x - x_w : ratio_w * l - x_w;
+    float d_w = align_flag
                    ? idx_src_x - static_cast<float>(x_w)
                    : ratio_w * static_cast<float>(l) - static_cast<float>(x_w);
     float d_e = 1.f - d_w;
     {
       vx_w[l] = x_w;
...
@@ -499,15 +514,18 @@ static void NearestNeighbor3DInterpolate(const DenseTensor& input,
   auto input_t = EigenTensor<T, 5>::From(input);
   auto output_t = EigenTensor<T, 5>::From(*output);
   for (int d = 0; d < out_d; d++) {  // loop for images
-    int in_d = (align_corners) ? static_cast<int>(ratio_d * d + 0.5)
-                               : static_cast<int>(ratio_d * d);
+    int in_d = (align_corners)
+                   ? static_cast<int>(ratio_d * static_cast<float>(d) + 0.5)
+                   : static_cast<int>(ratio_d * static_cast<float>(d));
     for (int k = 0; k < out_h; k++) {
-      int in_k = (align_corners) ? static_cast<int>(ratio_h * k + 0.5)
-                                 : static_cast<int>(ratio_h * k);
+      int in_k = (align_corners)
+                     ? static_cast<int>(ratio_h * static_cast<float>(k) + 0.5)
+                     : static_cast<int>(ratio_h * static_cast<float>(k));
       for (int l = 0; l < out_w; l++) {
-        int in_l = (align_corners) ? static_cast<int>(ratio_w * l + 0.5)
-                                   : static_cast<int>(ratio_w * l);
+        int in_l = (align_corners)
+                       ? static_cast<int>(ratio_w * static_cast<float>(l) + 0.5)
+                       : static_cast<int>(ratio_w * static_cast<float>(l));
         for (int i = 0; i < n; i++) {    // loop for batches
           for (int j = 0; j < c; j++) {  // loop for channels
...
@@ -572,7 +590,7 @@ static void Interpolate1DCPUFwd(
     }
   }
   if (scale_w > 0.) {
-    out_w = static_cast<int>(in_w * scale_w);
+    out_w = static_cast<int>(in_w * scale_w);  // NOLINT
   }
   if (out_size) {
     auto out_size_data =
...
@@ -602,10 +620,12 @@ static void Interpolate1DCPUFwd(
   float ratio_w = 0.f;
   if (out_w > 1) {
     float new_scale_w = 0.f;
-    new_scale_w = (scale_w > 0) ? static_cast<float>(1. / scale_w)
-                                : static_cast<float>(in_w) / out_w;
-    ratio_w = (align_corners) ? static_cast<float>(in_w - 1) / (out_w - 1)
-                              : static_cast<float>(new_scale_w);
+    new_scale_w = (scale_w > 0)
+                      ? static_cast<float>(1. / scale_w)
+                      : static_cast<float>(in_w) / static_cast<float>(out_w);
+    ratio_w = (align_corners)
+                  ? static_cast<float>(in_w - 1) / static_cast<float>(out_w - 1)
+                  : static_cast<float>(new_scale_w);
   }
   if ("linear" == interp_method) {
     LinearInterpolation<T>(x,
...
@@ -695,8 +715,8 @@ static void Interpolate2DCPUFwd(
     }
   }
   if (scale_h > 0. && scale_w > 0.) {
-    out_h = static_cast<int>(in_h * scale_h);
-    out_w = static_cast<int>(in_w * scale_w);
+    out_h = static_cast<int>(in_h * scale_h);  // NOLINT
+    out_w = static_cast<int>(in_w * scale_w);  // NOLINT
   }
   if (out_size) {
     auto out_size_data =
...
@@ -733,17 +753,21 @@ static void Interpolate2DCPUFwd(
   float ratio_w = 0.f;
   if (out_h > 1) {
     float new_scale_h = 0.f;
-    new_scale_h = (scale_h > 0) ? static_cast<float>(1. / scale_h)
-                                : static_cast<float>(in_h) / out_h;
-    ratio_h = (align_corners) ? static_cast<float>(in_h - 1) / (out_h - 1)
-                              : static_cast<float>(new_scale_h);
+    new_scale_h = (scale_h > 0)
+                      ? static_cast<float>(1. / scale_h)
+                      : static_cast<float>(in_h) / static_cast<float>(out_h);
+    ratio_h = (align_corners)
+                  ? static_cast<float>(in_h - 1) / static_cast<float>(out_h - 1)
+                  : static_cast<float>(new_scale_h);
   }
   if (out_w > 1) {
     float new_scale_w = 0.f;
-    new_scale_w = (scale_w > 0) ? static_cast<float>(1. / scale_w)
-                                : static_cast<float>(in_w) / out_w;
-    ratio_w = (align_corners) ? static_cast<float>(in_w - 1) / (out_w - 1)
-                              : static_cast<float>(new_scale_w);
+    new_scale_w = (scale_w > 0)
+                      ? static_cast<float>(1. / scale_w)
+                      : static_cast<float>(in_w) / static_cast<float>(out_w);
+    ratio_w = (align_corners)
+                  ? static_cast<float>(in_w - 1) / static_cast<float>(out_w - 1)
+                  : static_cast<float>(new_scale_w);
   }
   if ("bilinear" == interp_method) {
...
@@ -881,9 +905,9 @@ static void Interpolate3DCPUFwd(
     }
   }
   if (scale_w > 0. && scale_h > 0. && scale_d > 0.) {
-    out_d = static_cast<int>(in_d * scale_d);
-    out_h = static_cast<int>(in_h * scale_h);
-    out_w = static_cast<int>(in_w * scale_w);
+    out_d = static_cast<int>(in_d * scale_d);  // NOLINT
+    out_h = static_cast<int>(in_h * scale_h);  // NOLINT
+    out_w = static_cast<int>(in_w * scale_w);  // NOLINT
   }
   if (out_size) {
     auto out_size_data =
...
@@ -929,24 +953,30 @@ static void Interpolate3DCPUFwd(
   float ratio_w = 0.f;
   if (out_d > 1) {
     float new_scale_d = 0.f;
-    new_scale_d = (scale_d > 0) ? static_cast<float>(1. / scale_d)
-                                : static_cast<float>(in_d) / out_d;
-    ratio_d = (align_corners) ? static_cast<float>(in_d - 1) / (out_d - 1)
-                              : static_cast<float>(new_scale_d);
+    new_scale_d = (scale_d > 0)
+                      ? static_cast<float>(1. / scale_d)
+                      : static_cast<float>(in_d) / static_cast<float>(out_d);
+    ratio_d = (align_corners)
+                  ? static_cast<float>(in_d - 1) / static_cast<float>(out_d - 1)
+                  : static_cast<float>(new_scale_d);
   }
   if (out_h > 1) {
     float new_scale_h = 0.f;
-    new_scale_h = (scale_h > 0) ? static_cast<float>(1. / scale_h)
-                                : static_cast<float>(in_h) / out_h;
-    ratio_h = (align_corners) ? static_cast<float>(in_h - 1) / (out_h - 1)
-                              : static_cast<float>(new_scale_h);
+    new_scale_h = (scale_h > 0)
+                      ? static_cast<float>(1. / scale_h)
+                      : static_cast<float>(in_h) / static_cast<float>(out_h);
+    ratio_h = (align_corners)
+                  ? static_cast<float>(in_h - 1) / static_cast<float>(out_h - 1)
+                  : static_cast<float>(new_scale_h);
   }
   if (out_w > 1) {
     float new_scale_w = 0.f;
-    new_scale_w = (scale_w > 0) ? static_cast<float>(1. / scale_w)
-                                : static_cast<float>(in_w) / out_w;
-    ratio_w = (align_corners) ? static_cast<float>(in_w - 1) / (out_w - 1)
-                              : static_cast<float>(new_scale_w);
+    new_scale_w = (scale_w > 0)
+                      ? static_cast<float>(1. / scale_w)
+                      : static_cast<float>(in_w) / static_cast<float>(out_w);
+    ratio_w = (align_corners)
+                  ? static_cast<float>(in_w - 1) / static_cast<float>(out_w - 1)
+                  : static_cast<float>(new_scale_w);
  }
  if ("trilinear" == interp_method) {
...
paddle/phi/kernels/cpu/kthvalue_grad_kernel.cc View file @ b702d2ae
...
@@ -117,7 +117,7 @@ void KthvalueGradKernel(const Context& dev_ctx,
     trans.emplace_back(axis);
     DDim trans_dims(out_dims);
     DDim trans_in_dims(in_dims);
-    for (size_t i = 0; i < trans.size(); i++) {
+    for (int i = 0; i < static_cast<int>(trans.size()); i++) {
       trans_dims[i] = out_dims[trans[i]];
       trans_in_dims[i] = in_dims[trans[i]];
     }
...
@@ -126,7 +126,7 @@ void KthvalueGradKernel(const Context& dev_ctx,
     trans_ind.Resize(trans_dims);
     dev_ctx.template Alloc<T>(&trans_dO);
     dev_ctx.template Alloc<int64_t>(&trans_ind);
-    int ndims = trans.size();
+    int ndims = static_cast<int>(trans.size());
     if (keepdim) {
       funcs::TransCompute<phi::CPUContext, T>(
          ndims, dev_ctx, d_out, &trans_dO, trans);
...
paddle/phi/kernels/cpu/kthvalue_kernel.cc View file @ b702d2ae
...
@@ -136,7 +136,7 @@ void KthvalueKernel(const Context& dev_ctx,
     DDim trans_dims(in_dims);
     DDim trans_out_dims(in_dims);
-    for (size_t i = 0; i < trans.size(); i++) {
+    for (int i = 0; i < static_cast<int>(trans.size()); i++) {
       trans_dims[i] = in_dims[trans[i]];
       trans_out_dims[i] = in_dims[trans[i]];
     }
...
@@ -144,7 +144,7 @@ void KthvalueKernel(const Context& dev_ctx,
     DenseTensor trans_inp;
     trans_inp.Resize(trans_dims);
     dev_ctx.template Alloc<T>(&trans_inp);
-    int ndims = trans.size();
+    int ndims = static_cast<int>(trans.size());
     funcs::TransCompute<phi::CPUContext, T>(
        ndims, dev_ctx, x, &trans_inp, trans);
...
paddle/phi/kernels/cpu/label_smooth_kernel.cc View file @ b702d2ae
...
@@ -37,10 +37,12 @@ void LabelSmoothKernel(const Context& ctx,
     eigen_out.device(dev) =
         static_cast<T>(1 - epsilon) * eigen_in +
         static_cast<T>(epsilon) *
-            dist.broadcast(Eigen::DSizes<int, 1>(label.numel() / label_dim));
+            dist.broadcast(Eigen::DSizes<int, 1>(
+                static_cast<int>(label.numel() / label_dim)));
   } else {
-    eigen_out.device(dev) = static_cast<T>(1 - epsilon) * eigen_in +
-                            static_cast<T>(epsilon / label_dim);
+    eigen_out.device(dev) =
+        static_cast<T>(1 - epsilon) * eigen_in +
+        static_cast<T>(epsilon / static_cast<float>(label_dim));
   }
 }
}
...
paddle/phi/kernels/cpu/log_softmax_grad_kernel.cc View file @ b702d2ae
...
@@ -46,7 +46,7 @@ struct LogSoftmaxGradFunctor {
    auto dy = EigenMatrixTemplate<T>::From(*dY, dim_2d);
    auto dx = EigenMatrixTemplate<T>::From(*dX, dim_2d);
-    const int axis_dim = Y->dims()[axis];
+    const int axis_dim = static_cast<int>(Y->dims()[axis]);
    const int batch_size = y.dimension(kBatchDim);
    const int num_classes = y.dimension(kClassDim);
    const int num_remain = num_classes / axis_dim;
...
paddle/phi/kernels/cpu/log_softmax_kernel.cc View file @ b702d2ae
...
@@ -46,7 +46,7 @@ struct LogSoftmaxFunctor {
    constexpr int kClassDim = 1;
    constexpr int kAxisDim = 1;
-    int axis_dim = X->dims()[axis];
+    int axis_dim = static_cast<int>(X->dims()[axis]);
    const int n = funcs::SizeToAxis(axis, X->dims());
    const int d = funcs::SizeFromAxis(axis, X->dims());
    phi::DDim dim_2d{n, d};
...
paddle/phi/kernels/cpu/lstsq_kernel.cc View file @ b702d2ae
...
@@ -63,9 +63,9 @@ void LstsqKernel(const Context& dev_ctx,
   // lapack is a column-major storge, transpose make the input to
   // have a continuous memory layout
   int info = 0;
-  int m = x_dims[dim_size - 2];
-  int n = x_dims[dim_size - 1];
-  int nrhs = y_dims[dim_size - 1];
+  int m = static_cast<int>(x_dims[dim_size - 2]);
+  int n = static_cast<int>(x_dims[dim_size - 1]);
+  int nrhs = static_cast<int>(y_dims[dim_size - 1]);
   int lda = std::max<int>(m, 1);
   int ldb = std::max<int>(1, std::max(m, n));
...
@@ -115,7 +115,7 @@ void LstsqKernel(const Context& dev_ctx,
     s_data = dev_ctx.template Alloc<T>(singular_values);
     s_working_ptr = s_data;
     auto s_dims = singular_values->dims();
-    s_stride = s_dims[s_dims.size() - 1];
+    s_stride = static_cast<int>(s_dims[s_dims.size() - 1]);
   }
   // "jpvt" is only used for "gelsy" driver
...
paddle/phi/kernels/cpu/masked_select_grad_kernel.cc View file @ b702d2ae
...
@@ -64,7 +64,7 @@ void MaskedSelectGradKernel(const Context& dev_ctx,
   auto* mask_data = mask_expand.data<bool>();
   auto* input_data = out_grad.data<T>();
-  int mask_size = mask_expand.numel();
+  int mask_size = static_cast<int>(mask_expand.numel());
   int index = 0;
   for (int i = 0; i < mask_size; i++) {
...
paddle/phi/kernels/cpu/matrix_nms_kernel.cc View file @ b702d2ae
...
@@ -219,7 +219,7 @@ size_t MultiClassMatrixNMS(const DenseTensor& scores,
   std::iota(perm.begin(), perm.end(), 0);
   std::partial_sort(perm.begin(),
-                    perm.begin() + num_det,
+                    perm.begin() + num_det,  // NOLINT
                     perm.end(),
                     [&all_scores](int lhs, int rhs) {
                       return all_scores[lhs] > all_scores[rhs];
...
@@ -295,7 +295,7 @@ void MatrixNMSKernel(const Context& ctx,
     num_per_batch.emplace_back(num_out);
   }
-  int64_t num_kept = offsets.back();
+  int64_t num_kept = static_cast<int64_t>(offsets.back());
   if (num_kept == 0) {
     out->Resize(phi::make_ddim({0, out_dim}));
     ctx.template Alloc<T>(out);
...
paddle/phi/kernels/cpu/matrix_rank_tol_kernel.cc View file @ b702d2ae
...
@@ -87,11 +87,10 @@ void MatrixRankTolKernel(const Context& dev_ctx,
   dev_ctx.template Alloc<int64_t>(out);
   auto dim_x = x.dims();
   auto dim_out = out->dims();
-  int rows = dim_x[dim_x.size() - 2];
-  int cols = dim_x[dim_x.size() - 1];
+  int rows = static_cast<int>(dim_x[dim_x.size() - 2]);
+  int cols = static_cast<int>(dim_x[dim_x.size() - 1]);
   int k = std::min(rows, cols);
-  auto numel = x.numel();
-  int batches = numel / (rows * cols);
+  int batches = static_cast<int>(x.numel() / (rows * cols));
   T rtol_T = 0;
...
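The matrix_rank_tol hunk also folds the temporary away: the division is carried out in the wide `int64_t` domain and a single explicit cast narrows the final quotient, rather than narrowing `numel`, `rows`, and `cols` individually. A sketch with made-up values showing why this ordering is the safer one:

    #include <cstdint>

    int main() {
      int64_t numel = 6 * 4 * 5;  // stand-in for x.numel()
      int rows = 4, cols = 5;

      // Divide in int64_t, then narrow once; the quotient (the batch
      // count) stays small even when numel itself would not fit in int.
      int batches = static_cast<int>(numel / (rows * cols));
      return batches == 6 ? 0 : 1;
    }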
paddle/phi/kernels/cpu/mode_grad_kernel.cc View file @ b702d2ae
...
@@ -105,7 +105,7 @@ void ModeGradKernel(const Context& dev_ctx,
     trans_axis.emplace_back(axis);
     DDim trans_shape(out_dims);
     DDim trans_in_shape(in_dims);
-    for (size_t i = 0; i < trans_axis.size(); i++) {
+    for (int i = 0; i < static_cast<int>(trans_axis.size()); i++) {
       trans_shape[i] = out_dims[trans_axis[i]];
       trans_in_shape[i] = in_dims[trans_axis[i]];
     }
...
@@ -118,7 +118,7 @@ void ModeGradKernel(const Context& dev_ctx,
     trans_ind.Resize(trans_shape);
     dev_ctx.template Alloc<int64_t>(&trans_ind);
-    int ndims = trans_axis.size();
+    int ndims = static_cast<int>(trans_axis.size());
     if (keepdim) {
       // Do transpose
...
paddle/phi/kernels/cpu/mode_kernel.cc View file @ b702d2ae
...
@@ -89,7 +89,7 @@ void ModeKernel(const Context& dev_ctx,
   DDim trans_shape(in_dims);
   DDim trans_out_shape(in_dims);
-  for (size_t i = 0; i < trans_axis.size(); i++) {
+  for (int i = 0; i < static_cast<int>(trans_axis.size()); i++) {
     trans_shape[i] = in_dims[trans_axis[i]];
     trans_out_shape[i] = in_dims[trans_axis[i]];
   }
...
@@ -98,7 +98,7 @@ void ModeKernel(const Context& dev_ctx,
   DenseTensor trans_input;
   trans_input.Resize(trans_shape);
   dev_ctx.template Alloc<T>(&trans_input);
-  int ndims = trans_axis.size();
+  int ndims = static_cast<int>(trans_axis.size());
   // transpose the input value
   funcs::TransCompute<CPUContext, T>(
...
paddle/phi/kernels/cpu/multiclass_nms3_kernel.cc
View file @ b702d2ae
...
@@ -65,7 +65,7 @@ void Array2Poly(const T* box,
   (*poly).hole[0] = 0;
   (*poly).contour =
       (phi::funcs::gpc_vertex_list*)malloc(sizeof(phi::funcs::gpc_vertex_list));
-  (*poly).contour->num_vertices = pts_num;
+  (*poly).contour->num_vertices = static_cast<int>(pts_num);
   (*poly).contour->vertex =
       (phi::funcs::gpc_vertex*)malloc(sizeof(phi::funcs::gpc_vertex) * pts_num);
   for (size_t i = 0; i < pts_num; ++i) {
...
@@ -255,9 +255,9 @@ void SliceOneClass(const Context& ctx,
   T* item_data = ctx.template Alloc<T>(one_class_item);
   const T* items_data = items.data<T>();
   const int64_t num_item = items.dims()[0];
-  const int class_num = items.dims()[1];
+  const int class_num = static_cast<int>(items.dims()[1]);
   if (items.dims().size() == 3) {
-    int item_size = items.dims()[2];
+    int item_size = static_cast<int>(items.dims()[2]);
     for (int i = 0; i < num_item; ++i) {
       std::memcpy(item_data + i * item_size,
                   items_data + i * class_num * item_size + class_id * item_size,
...
@@ -350,9 +350,10 @@ void MultiClassNMS(const Context& ctx,
   int num_det = 0;
-  int64_t class_num = scores_size == 3 ? scores.dims()[0] : scores.dims()[1];
+  int class_num =
+      static_cast<int>(scores_size == 3 ? scores.dims()[0] : scores.dims()[1]);
   DenseTensor bbox_slice, score_slice;
-  for (int64_t c = 0; c < class_num; ++c) {
+  for (int c = 0; c < class_num; ++c) {
     if (c == background_label) continue;
     if (scores_size == 3) {
       score_slice = scores.Slice(c, c + 1);
...
@@ -374,7 +375,7 @@ void MultiClassNMS(const Context& ctx,
     if (scores_size == 2) {
       std::stable_sort((*indices)[c].begin(), (*indices)[c].end());
     }
-    num_det += (*indices)[c].size();
+    num_det += static_cast<int>((*indices)[c].size());
   }
   *num_nmsed_out = num_det;
...
@@ -466,7 +467,7 @@ void MultiClassOutput(const Context& ctx,
       bdata = bbox.data<T>() + idx * box_size;
       odata[count * out_dim + 1] = *(scores_data + idx * class_num + label);
       if (oindices != nullptr) {
-        oindices[count] = offset + idx * class_num + label;
+        oindices[count] = offset + idx * class_num + label;  // NOLINT
       }
     }
     // xmin, ymin, xmax, ymax or multi-points coordinates
...
@@ -505,9 +506,11 @@ void MultiClassNMSKernel(const Context& ctx,
   DenseTensor boxes_slice, scores_slice;
   int n = 0;
   if (has_roisnum) {
-    n = score_size == 3 ? batch_size : rois_num.get_ptr()->numel();
+    n = static_cast<int>(score_size == 3 ? batch_size
+                                         : rois_num.get_ptr()->numel());
   } else {
-    n = score_size == 3 ? batch_size : bboxes.lod().back().size() - 1;
+    n = static_cast<int>(score_size == 3 ? batch_size
+                                         : bboxes.lod().back().size() - 1);
   }
   for (int i = 0; i < n; ++i) {
     std::map<int, std::vector<int>> indices;
...
@@ -528,8 +531,8 @@ void MultiClassNMSKernel(const Context& ctx,
         batch_starts.push_back(batch_starts.back());
         continue;
       }
-      scores_slice = scores.Slice(boxes_lod[i], boxes_lod[i + 1]);
-      boxes_slice = bboxes.Slice(boxes_lod[i], boxes_lod[i + 1]);
+      scores_slice = scores.Slice(boxes_lod[i], boxes_lod[i + 1]);  // NOLINT
+      boxes_slice = bboxes.Slice(boxes_lod[i], boxes_lod[i + 1]);   // NOLINT
     }
     MultiClassNMS<T, Context>(ctx,
                               scores_slice,
...
@@ -548,7 +551,7 @@ void MultiClassNMSKernel(const Context& ctx,
     batch_starts.push_back(batch_starts.back() + num_nmsed_out);
   }
-  int num_kept = batch_starts.back();
+  int num_kept = static_cast<int>(batch_starts.back());
   if (num_kept == 0) {
     if (return_index) {
       out->Resize({0, out_dim});
...
@@ -583,15 +586,15 @@ void MultiClassNMSKernel(const Context& ctx,
         boxes_lod = bboxes.lod().back();
       }
       if (boxes_lod[i] == boxes_lod[i + 1]) continue;
-      scores_slice = scores.Slice(boxes_lod[i], boxes_lod[i + 1]);
-      boxes_slice = bboxes.Slice(boxes_lod[i], boxes_lod[i + 1]);
+      scores_slice = scores.Slice(boxes_lod[i], boxes_lod[i + 1]);  // NOLINT
+      boxes_slice = bboxes.Slice(boxes_lod[i], boxes_lod[i + 1]);   // NOLINT
       if (return_index) {
-        offset = boxes_lod[i] * score_dims[1];
+        offset = static_cast<int>(boxes_lod[i] * score_dims[1]);
       }
     }
-    int64_t s = batch_starts[i];
-    int64_t e = batch_starts[i + 1];
+    int64_t s = static_cast<int64_t>(batch_starts[i]);
+    int64_t e = static_cast<int64_t>(batch_starts[i + 1]);
     if (e > s) {
       DenseTensor nout = out->Slice(s, e);
       if (return_index) {
...
@@ -615,7 +618,7 @@ void MultiClassNMSKernel(const Context& ctx,
     ctx.template Alloc<int>(nms_rois_num);
     int* num_data = nms_rois_num->data<int>();
     for (int i = 1; i <= n; i++) {
-      num_data[i - 1] = batch_starts[i] - batch_starts[i - 1];
+      num_data[i - 1] = batch_starts[i] - batch_starts[i - 1];  // NOLINT
    }
    nms_rois_num->Resize({n});
  }
...
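The casts in this file are all of one shape: DenseTensor::dims() yields int64_t extents, and storing one in an int is an implicit narrowing. static_cast<int> makes the narrowing deliberate and silences the check; it does not add an overflow guard, which is acceptable here because class and box counts fit comfortably in int. A hedged sketch of the idea:

    #include <cstdint>
    #include <cstdio>

    int main() {
      const int64_t dim = 81;  // hypothetical stand-in for items.dims()[1]
      // Implicit and flagged: int class_num = dim;
      const int class_num = static_cast<int>(dim);  // explicit, intent visible
      std::printf("class_num = %d\n", class_num);
      return 0;
    }

Lines that cannot reasonably be rewritten, such as the Slice() calls fed by unsigned LoD offsets, are suppressed with a trailing // NOLINT instead.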
paddle/phi/kernels/cpu/multinomial_kernel.cc
View file @ b702d2ae
...
@@ -29,7 +29,7 @@ void MultinomialKernel(const Context& dev_ctx,
   auto* in_data = x.data<T>();
   int64_t* out_data = dev_ctx.template Alloc<int64_t>(out);
   auto in_dims = x.dims();
-  int64_t in_rank = in_dims.size();
+  int in_rank = in_dims.size();
   const int64_t num_categories = in_dims[in_rank - 1];
   const int64_t num_distributions = in_rank > 1 ? in_dims[in_rank - 2] : 1;
...
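This hunk goes the other way: DDim::size() appears to return an int already, so declaring in_rank as int64_t only forced conversions at later uses. Shrinking the variable to the producer's type removes the conversion entirely. A sketch with a hypothetical DDim stand-in:

    #include <cstdio>

    struct DDim {                      // simplified stand-in for phi::DDim
      int rank;
      int size() const { return rank; }
    };

    int main() {
      DDim in_dims{3};
      int in_rank = in_dims.size();    // no narrowing or widening left
      std::printf("in_rank = %d\n", in_rank);
      return 0;
    }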
paddle/phi/kernels/cpu/mv_grad_kernel.cc
View file @ b702d2ae
...
@@ -32,8 +32,8 @@ void MvGradKernel(const Context& dev_ctx,
   auto dvec = vec_grad;
   const auto& dim_x = x.dims();
-  int m = dim_x[0];
-  int n = dim_x[1];
+  int m = static_cast<int>(dim_x[0]);
+  int n = static_cast<int>(dim_x[1]);
   // get data ptr
   const T* x_data = x.data<T>();
...
paddle/phi/kernels/cpu/nanmedian_grad_kernel.cc
View file @ b702d2ae
...
@@ -38,7 +38,7 @@ void CalcMedianGradKernel(const Context& dev_ctx,
   int64_t numel = x.numel();
   auto x_dim = x.dims();
   int64_t rank = x_dim.size();
-  int64_t stride = x_dim[rank - 1];
+  int64_t stride = x_dim[static_cast<int>(rank - 1)];
   int64_t pre_dim = numel / stride;
   int64_t i = 0;
...
paddle/phi/kernels/cpu/nanmedian_kernel.cc
View file @ b702d2ae
...
@@ -35,7 +35,7 @@ void CalcMedianFunc(const Context& dev_ctx,
   DenseTensor sort_indices;
   auto sort_dim = x.dims();
   int64_t rank = sort_dim.size();
-  sort_dim[rank - 1] = sort_k;
+  sort_dim[static_cast<int>(rank - 1)] = sort_k;
   sort_out.Resize(sort_dim);
   sort_indices.Resize(sort_dim);
...
@@ -115,7 +115,7 @@ void ProcessMedianKernel(const Context& dev_ctx,
   int64_t numel = x.numel();
   auto x_dim = x.dims();
   int64_t x_rank = x_dim.size();
-  int64_t stride = x_dim[x_rank - 1];
+  int64_t stride = x_dim[static_cast<int>(x_rank - 1)];
   PADDLE_ENFORCE_NE(stride,
...
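Here the narrowing site is the subscript itself: rank stays in int64_t, but the DDim subscript operator takes an int index, so the cast wraps the index expression rather than the result. A sketch under that assumption:

    #include <cstdint>
    #include <cstdio>

    struct Dims {                              // hypothetical stand-in
      int64_t d[4];
      int64_t operator[](int i) const { return d[i]; }
    };

    int main() {
      Dims x_dim{{2, 3, 4, 5}};
      int64_t rank = 4;
      int64_t stride = x_dim[static_cast<int>(rank - 1)];
      std::printf("stride = %lld\n", static_cast<long long>(stride));
      return 0;
    }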
paddle/phi/kernels/cpu/overlap_add_grad_kernel.cc
View file @ b702d2ae
...
@@ -30,12 +30,15 @@ void OverlapAddGradKernel(const Context& dev_ctx,
   const size_t out_grad_rank = out_grad.dims().size();
   const size_t x_grad_rank = x_grad->dims().size();
-  const int n_frames =
-      (axis == 0) ? x_grad->dims()[0] : x_grad->dims()[x_grad_rank - 1];
-  const int frame_length =
-      (axis == 0) ? x_grad->dims()[1] : x_grad->dims()[x_grad_rank - 2];
-  const int seq_length =
-      (axis == 0) ? out_grad.dims()[0] : out_grad.dims()[out_grad_rank - 1];
+  const int n_frames = static_cast<int>(
+      (axis == 0) ? x_grad->dims()[0]
+                  : x_grad->dims()[static_cast<int>(x_grad_rank) - 1]);
+  const int frame_length = static_cast<int>(
+      (axis == 0) ? x_grad->dims()[1]
+                  : x_grad->dims()[static_cast<int>(x_grad_rank) - 2]);
+  const int seq_length = static_cast<int>(
+      (axis == 0) ? out_grad.dims()[0]
+                  : out_grad.dims()[static_cast<int>(out_grad_rank) - 1]);
   // When the number of input dims is larger than 2, it needs to copy
   // from x to resize input into 2d and output into 3d. Morevoer, output
...
@@ -50,12 +53,14 @@ void OverlapAddGradKernel(const Context& dev_ctx,
   phi::DDim x_grad_resized_dims;
   phi::DDim out_grad_resized_dims;
   if (axis == 0) {
-    preserved_dims = phi::slice_ddim(out_grad_.dims(), 1, out_grad_rank);
+    preserved_dims =
+        phi::slice_ddim(out_grad_.dims(), 1, static_cast<int>(out_grad_rank));
     x_grad_resized_dims = {n_frames, frame_length, phi::product(preserved_dims)};
     out_grad_resized_dims = {seq_length, phi::product(preserved_dims)};
   } else {
-    preserved_dims = phi::slice_ddim(out_grad_.dims(), 0, out_grad_rank - 1);
+    preserved_dims = phi::slice_ddim(
+        out_grad_.dims(), 0, static_cast<int>(out_grad_rank) - 1);
     x_grad_resized_dims = {phi::product(preserved_dims), frame_length, n_frames};
     out_grad_resized_dims = {phi::product(preserved_dims), seq_length};
...
paddle/phi/kernels/cpu/overlap_add_kernel.cc
View file @ b702d2ae
...
@@ -26,13 +26,16 @@ void OverlapAddKernel(const Context& dev_ctx,
                       int axis,
                       DenseTensor* out) {
   dev_ctx.template Alloc<T>(out);
-  const size_t x_rank = x.dims().size();
+  const int x_rank = x.dims().size();
   const size_t out_rank = out->dims().size();
-  const int n_frames = (axis == 0) ? x.dims()[0] : x.dims()[x_rank - 1];
-  const int frame_length = (axis == 0) ? x.dims()[1] : x.dims()[x_rank - 2];
-  const int seq_length =
-      (axis == 0) ? out->dims()[0] : out->dims()[out_rank - 1];
+  const int n_frames =
+      static_cast<int>((axis == 0) ? x.dims()[0] : x.dims()[x_rank - 1]);
+  const int frame_length =
+      static_cast<int>((axis == 0) ? x.dims()[1] : x.dims()[x_rank - 2]);
+  const int seq_length = static_cast<int>(
+      (axis == 0) ? out->dims()[0]
+                  : out->dims()[static_cast<int>(out_rank) - 1]);
   // auto& dev_ctx = ctx.device_context<Context>();
...
@@ -46,11 +49,13 @@ void OverlapAddKernel(const Context& dev_ctx,
   phi::DDim x_resized_dims;
   phi::DDim out_resized_dims;
   if (axis == 0) {
-    preserved_dims = phi::slice_ddim(out->dims(), 1, out_rank);
+    preserved_dims =
+        phi::slice_ddim(out->dims(), 1, static_cast<int>(out_rank));
     x_resized_dims = {n_frames, frame_length, phi::product(preserved_dims)};
     out_resized_dims = {seq_length, phi::product(preserved_dims)};
   } else {
-    preserved_dims = phi::slice_ddim(out->dims(), 0, out_rank - 1);
+    preserved_dims =
+        phi::slice_ddim(out->dims(), 0, static_cast<int>(out_rank) - 1);
     x_resized_dims = {phi::product(preserved_dims), frame_length, n_frames};
     out_resized_dims = {phi::product(preserved_dims), seq_length};
   }
...
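When the value being narrowed is a conditional expression, these hunks hoist a single static_cast<int> around the whole ternary instead of casting each arm, which keeps both arms in int64_t until one final, visible narrowing. A small sketch with invented extents:

    #include <cstdint>
    #include <cstdio>

    int main() {
      const int axis = 0;
      const int64_t dim0 = 16, dim_last = 128;  // hypothetical dim values
      // One narrowing for the whole expression, not one per arm.
      const int n_frames = static_cast<int>((axis == 0) ? dim0 : dim_last);
      std::printf("n_frames = %d\n", n_frames);
      return 0;
    }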
paddle/phi/kernels/cpu/p_norm_grad_kernel.cc
View file @ b702d2ae
...
@@ -30,15 +30,15 @@ inline void GetDims(const phi::DDim& dim,
                     bool asvector) {
   *pre = 1;
   *post = 1;
-  *n = dim[axis];
+  *n = static_cast<int>(dim[axis]);
   if (asvector) {
-    *n = product(dim);
+    *n = static_cast<int>(product(dim));
   } else {
     for (int i = 0; i < axis; ++i) {
-      (*pre) *= dim[i];
+      (*pre) *= static_cast<int>(dim[i]);
     }
     for (int i = axis + 1; i < dim.size(); ++i) {
-      (*post) *= dim[i];
+      (*post) *= static_cast<int>(dim[i]);
     }
   }
 }
...
paddle/phi/kernels/cpu/p_norm_kernel.cc
View file @ b702d2ae
...
@@ -31,15 +31,15 @@ inline void GetDims(const phi::DDim& dim,
                     bool asvector) {
   *pre = 1;
   *post = 1;
-  *n = dim[axis];
+  *n = static_cast<int>(dim[axis]);
   if (asvector) {
-    *n = product(dim);
+    *n = static_cast<int>(product(dim));
   } else {
     for (int i = 0; i < axis; ++i) {
-      (*pre) *= dim[i];
+      (*pre) *= static_cast<int>(dim[i]);
     }
     for (int i = axis + 1; i < dim.size(); ++i) {
-      (*post) *= dim[i];
+      (*post) *= static_cast<int>(dim[i]);
     }
   }
 }
...
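GetDims accumulates products of int64_t extents into int outputs, so each factor is narrowed at the compound assignment. A self-contained sketch of the same loop structure (extents invented):

    #include <cstdint>
    #include <cstdio>

    int main() {
      const int64_t dim[4] = {2, 3, 4, 5};
      const int axis = 2;
      int pre = 1, post = 1, n = static_cast<int>(dim[axis]);
      for (int i = 0; i < axis; ++i) pre *= static_cast<int>(dim[i]);
      for (int i = axis + 1; i < 4; ++i) post *= static_cast<int>(dim[i]);
      std::printf("pre=%d n=%d post=%d\n", pre, n, post);
      return 0;
    }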
paddle/phi/kernels/cpu/pad3d_grad_kernel.cc
View file @ b702d2ae
...
@@ -377,18 +377,18 @@ void Pad3dGradKernel(const Context& dev_ctx,
   T* d_in_data = dev_ctx.template Alloc<T>(d_in);
   phi::funcs::SetConstant<Context, T>()(dev_ctx, d_in, static_cast<T>(0));
-  const int pad_left = pads[0];
-  const int pad_top = pads[2];
-  const int pad_front = pads[4];
-  const int num = d_in_dims[0];
+  const int pad_left = static_cast<int>(pads[0]);
+  const int pad_top = static_cast<int>(pads[2]);
+  const int pad_front = static_cast<int>(pads[4]);
+  const int num = static_cast<int>(d_in_dims[0]);
   if (data_format == "NCDHW") {
-    const int channels = d_in_dims[1];
-    const int in_depth = d_in_dims[2];
-    const int in_height = d_in_dims[3];
-    const int in_width = d_in_dims[4];
-    const int out_depth = d_out_dims[2];
-    const int out_height = d_out_dims[3];
-    const int out_width = d_out_dims[4];
+    const int channels = static_cast<int>(d_in_dims[1]);
+    const int in_depth = static_cast<int>(d_in_dims[2]);
+    const int in_height = static_cast<int>(d_in_dims[3]);
+    const int in_width = static_cast<int>(d_in_dims[4]);
+    const int out_depth = static_cast<int>(d_out_dims[2]);
+    const int out_height = static_cast<int>(d_out_dims[3]);
+    const int out_width = static_cast<int>(d_out_dims[4]);
     std::map<std::string, void (*)(T*,
...
@@ -427,13 +427,13 @@ void Pad3dGradKernel(const Context& dev_ctx,
                      d_out_data,
                      func_map[mode]);
   } else {
-    const int channels = d_in_dims[4];
-    const int in_depth = d_in_dims[1];
-    const int in_height = d_in_dims[2];
-    const int in_width = d_in_dims[3];
-    const int out_depth = d_out_dims[1];
-    const int out_height = d_out_dims[2];
-    const int out_width = d_out_dims[3];
+    const int channels = static_cast<int>(d_in_dims[4]);
+    const int in_depth = static_cast<int>(d_in_dims[1]);
+    const int in_height = static_cast<int>(d_in_dims[2]);
+    const int in_width = static_cast<int>(d_in_dims[3]);
+    const int out_depth = static_cast<int>(d_out_dims[1]);
+    const int out_height = static_cast<int>(d_out_dims[2]);
+    const int out_width = static_cast<int>(d_out_dims[3]);
     std::map<std::string, void (*)(T*,
...
paddle/phi/kernels/cpu/pad3d_kernel.cc
View file @ b702d2ae
...
@@ -407,21 +407,21 @@ void Pad3dKernel(const Context& dev_ctx,
   auto out_dims = out->dims();
   T* out_data = dev_ctx.template Alloc<T>(out);
-  int channels = in_dims[1];
-  int in_depth = in_dims[2];
-  int in_height = in_dims[3];
-  int in_width = in_dims[4];
-  int out_depth = out_dims[2];
-  int out_height = out_dims[3];
-  int out_width = out_dims[4];
+  int channels = static_cast<int>(in_dims[1]);
+  int in_depth = static_cast<int>(in_dims[2]);
+  int in_height = static_cast<int>(in_dims[3]);
+  int in_width = static_cast<int>(in_dims[4]);
+  int out_depth = static_cast<int>(out_dims[2]);
+  int out_height = static_cast<int>(out_dims[3]);
+  int out_width = static_cast<int>(out_dims[4]);
   if (data_format == "NDHWC") {
-    channels = in_dims[4];
-    in_depth = in_dims[1];
-    in_height = in_dims[2];
-    in_width = in_dims[3];
-    out_depth = out_dims[1];
-    out_height = out_dims[2];
-    out_width = out_dims[3];
+    channels = static_cast<int>(in_dims[4]);
+    in_depth = static_cast<int>(in_dims[1]);
+    in_height = static_cast<int>(in_dims[2]);
+    in_width = static_cast<int>(in_dims[3]);
+    out_depth = static_cast<int>(out_dims[1]);
+    out_height = static_cast<int>(out_dims[2]);
+    out_width = static_cast<int>(out_dims[3]);
   }
   if (mode == "reflect") {
...
@@ -489,10 +489,10 @@ void Pad3dKernel(const Context& dev_ctx,
                       "or replicate padding mode."));
   }
-  const int pad_left = pads[0];
-  const int pad_top = pads[2];
-  const int pad_front = pads[4];
-  const int num = in_dims[0];
+  const int pad_left = static_cast<int>(pads[0]);
+  const int pad_top = static_cast<int>(pads[2]);
+  const int pad_front = static_cast<int>(pads[4]);
+  const int num = static_cast<int>(in_dims[0]);
   if (data_format == "NCDHW") {
     std::map<std::string, void (*)(const T*,
...
paddle/phi/kernels/cpu/prelu_grad_kernel.cc
View file @ b702d2ae
...
@@ -31,7 +31,7 @@ void PReluGradKernel(const Context& dev_ctx,
   const T* alpha_ptr = alpha.data<T>();
   const T* x_ptr = x.data<T>();
   const T* out_grad_ptr = out_grad.data<T>();
-  int numel = x.numel();
+  int numel = static_cast<int>(x.numel());
   auto dim = x.dims();
   int index = 0;
   int i = 0;
...
@@ -41,16 +41,16 @@ void PReluGradKernel(const Context& dev_ctx,
     if (data_format == "NCHW") {
       int temp = 1;
       for (int j = 2; j < dim.size(); j++) {
-        temp *= dim[j];
+        temp *= static_cast<int>(dim[j]);
       }
       for (i = 0; i < numel; i++) {
-        index = (i / temp) % dim[1];
+        index = static_cast<int>((i / temp) % dim[1]);
         x_grad_ptr[i] =
             x_ptr[i] > 0 ? out_grad_ptr[i] : alpha_ptr[index] * out_grad_ptr[i];
       }
     } else {
       for (i = 0; i < numel; i++) {
-        index = i % dim[dim.size() - 1];
+        index = static_cast<int>(i % dim[dim.size() - 1]);
         x_grad_ptr[i] =
             x_ptr[i] > 0 ? out_grad_ptr[i] : alpha_ptr[index] * out_grad_ptr[i];
       }
...
@@ -58,7 +58,7 @@ void PReluGradKernel(const Context& dev_ctx,
   } else if (mode == "element") {
     int temp = 1;
     for (int j = 1; j < dim.size(); j++) {
-      temp *= dim[j];
+      temp *= static_cast<int>(dim[j]);
     }
     for (i = 0; i < numel; i++) {
       index = i % temp;
...
@@ -82,16 +82,16 @@ void PReluGradKernel(const Context& dev_ctx,
     if (data_format == "NCHW") {
       int temp = 1;
       for (int j = 2; j < dim.size(); j++) {
-        temp *= dim[j];
+        temp *= static_cast<int>(dim[j]);
       }
       for (i = 0; i < numel; i++) {
-        index = (i / temp) % dim[1];
+        index = static_cast<int>((i / temp) % dim[1]);
         alpha_grad_ptr[index] += x_ptr[i] > 0 ? 0 : x_ptr[i] * out_grad_ptr[i];
       }
     } else {
       for (i = 0; i < numel; i++) {
-        index = i % dim[dim.size() - 1];
+        index = static_cast<int>(i % dim[dim.size() - 1]);
         alpha_grad_ptr[index] += x_ptr[i] > 0 ? 0 : x_ptr[i] * out_grad_ptr[i];
       }
...
@@ -99,7 +99,7 @@ void PReluGradKernel(const Context& dev_ctx,
   } else if (mode == "element") {
     int temp = 1;
     for (int j = 1; j < dim.size(); j++) {
-      temp *= dim[j];
+      temp *= static_cast<int>(dim[j]);
     }
     for (i = 0; i < numel; i++) {
       index = i % temp;
...
paddle/phi/kernels/funcs/concat_and_split_functor.cc
View file @ b702d2ae
...
@@ -92,14 +92,14 @@ struct SplitFunctor<phi::CPUContext, T> {
     int input_rows = 1;
     auto dim_0 = ref_inputs[0]->dims();
     for (int i = 0; i < axis; ++i) {
-      input_rows *= dim_0[i];
+      input_rows *= static_cast<int>(dim_0[i]);
     }
     int input_cols = 0;
     std::vector<int64_t> output_cols(outputs->size());
     for (size_t i = 0; i < num; ++i) {
-      int t_cols = ref_inputs[i]->numel() / input_rows;
+      int t_cols = static_cast<int>(ref_inputs[i]->numel() / input_rows);
       input_cols += t_cols;
       output_cols[i] = t_cols;
     }
...
@@ -110,7 +110,7 @@ struct SplitFunctor<phi::CPUContext, T> {
       const T* src_ptr = input.data<T>() + k * input_cols;
       int col_idx = 0;
       for (size_t j = 0; j < num; ++j) {
-        int col_len = output_cols[j];
+        int col_len = static_cast<int>(output_cols[j]);
         auto* out_tensor = outputs->at(j);
         if (out_tensor != nullptr) {
           T* dst_ptr = out_tensor->data<T>() + k * col_len;
...
paddle/phi/kernels/funcs/cross_entropy.cc
View file @ b702d2ae
...
@@ -101,8 +101,8 @@ void CrossEntropyFunctor<DeviceContext, T>::operator()(
     const int ignore_index,
     const int axis_dim) {
   if (softLabel) {
-    const int batch_size = prob->dims()[0];
-    const int num_classes = prob->dims()[1];
+    const int batch_size = static_cast<const int>(prob->dims()[0]);
+    const int num_classes = static_cast<const int>(prob->dims()[1]);
     const int num_remain = num_classes / axis_dim;
     Eigen::DSizes<int, 3> batch_axis_remain(batch_size, axis_dim, num_remain);
...
paddle/phi/kernels/funcs/deformable_conv_functor.cc
View file @ b702d2ae
...
@@ -111,8 +111,10 @@ void ModulatedDeformableIm2col(const Context& dev_ctx UNUSED,
     const std::vector<int>& dilations,
     const int deformable_groups,
     T* data_col) {
-  int channel_per_deformable_group = im_shape[0] / deformable_groups;
-  int num_kernels = im_shape[0] * col_shape[1] * col_shape[2] * col_shape[3];
+  int channel_per_deformable_group =
+      static_cast<int>(im_shape[0] / deformable_groups);
+  int num_kernels = static_cast<int>(im_shape[0] * col_shape[1] *
+                                     col_shape[2] * col_shape[3]);
   // get outputs of im2col with offset by bilinear interpolation
   ModulatedDeformableIm2colCPUKernel(num_kernels,
...
paddle/phi/kernels/funcs/gather_scatter_functor.cc
View file @ b702d2ae
...
@@ -78,13 +78,13 @@ struct cpu_gather_scatter_functor {
           "self_size, src_size, index_size cannot be 0");
       return;
     }
-    int select_dim_size = index_dims[dim];
+    int64_t select_dim_size = index_dims[dim];
     // index matrix has different shape with self matrix or src matrix.
     int replaced_select_dim_size =
         is_scatter_like ? self_dims[dim] : src_dims[dim];
     int64_t inner_dim_size = 1;
     int64_t outer_dim_size = 1;
-    for (int64_t i = 0; i < dim; ++i) {
+    for (int i = 0; i < dim; ++i) {
       inner_dim_size *= index_dims[i];
     }
...
@@ -193,9 +193,9 @@ void cpu_scatter_input_grad_kernel(phi::DenseTensor self UNUSED,
   int64_t inner_dim_size = 1;
   int64_t outer_dim_size = 1;
-  int select_dim_size = index_dims[dim];
-  int output_select_dim_size = output_dims[dim];
-  for (int64_t i = 0; i < dim; ++i) {
+  int64_t select_dim_size = index_dims[dim];
+  int64_t output_select_dim_size = output_dims[dim];
+  for (int i = 0; i < dim; ++i) {
    inner_dim_size *= index_dims[i];
  }
...
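This file shows the alternative repair: widen the destination instead of narrowing the source. Extent variables become int64_t, matching what the dims arrays hold, and loop indices that only count up to a small rank drop back to int. Widening is the safer choice whenever the value could legitimately exceed what int can hold; a sketch:

    #include <cstdint>
    #include <cstdio>

    int main() {
      const int64_t index_dims[3] = {4, 1LL << 33, 8};  // hypothetical extents
      const int dim = 1;
      int64_t select_dim_size = index_dims[dim];  // was int: would overflow here
      std::printf("select_dim_size = %lld\n",
                  static_cast<long long>(select_dim_size));
      return 0;
    }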
paddle/phi/kernels/funcs/gpc.cc
View file @ b702d2ae
...
@@ -252,7 +252,7 @@ static edge_node *build_lmt(lmt_node **lmt,
   /* Create the entire input polygon edge table in one go */
   gpc_malloc<edge_node>(edge_table,
-                        total_vertices * sizeof(edge_node),
+                        total_vertices * static_cast<int>(sizeof(edge_node)),
                         const_cast<char *>("edge table creation"));
   for (c = 0; c < p->num_contours; c++) {
...
@@ -711,7 +711,7 @@ static bbox *create_contour_bboxes(gpc_polygon *p) {
   int v = 0;
   gpc_malloc<bbox>(box,
-                   p->num_contours * sizeof(bbox),
+                   p->num_contours * static_cast<int>(sizeof(bbox)),
                    const_cast<char *>("Bounding box creation"));
   PADDLE_ENFORCE_NOT_NULL(
       box, phi::errors::ResourceExhausted("Failed to malloc box memory."));
...
@@ -754,9 +754,10 @@ static void minimax_test(gpc_polygon *subj, gpc_polygon *clip, gpc_op op) {
   s_bbox = create_contour_bboxes(subj);
   c_bbox = create_contour_bboxes(clip);
-  gpc_malloc<int>(o_table,
-                  subj->num_contours * clip->num_contours * sizeof(int),
-                  const_cast<char *>("overlap table creation"));
+  gpc_malloc<int>(
+      o_table,
+      subj->num_contours * clip->num_contours * static_cast<int>(sizeof(int)),
+      const_cast<char *>("overlap table creation"));
   /* Check all subject contour bounding boxes against clip boxes */
   for (s = 0; s < subj->num_contours; s++) {
...
@@ -877,16 +878,17 @@ void gpc_add_contour(gpc_polygon *p, gpc_vertex_list *new_contour, int hole) {
   /* Create an extended hole array */
   gpc_malloc<int>(extended_hole,
-                  (p->num_contours + 1) * sizeof(int),
+                  (p->num_contours + 1) * static_cast<int>(sizeof(int)),
                   const_cast<char *>("contour hole addition"));
   PADDLE_ENFORCE_NOT_NULL(
       extended_hole,
       phi::errors::ResourceExhausted("Failed to malloc extended hole memory."));
   /* Create an extended contour array */
-  gpc_malloc<gpc_vertex_list>(extended_contour,
-                              (p->num_contours + 1) * sizeof(gpc_vertex_list),
-                              const_cast<char *>("contour addition"));
+  gpc_malloc<gpc_vertex_list>(extended_contour,
+                              (p->num_contours + 1) *
+                                  static_cast<int>(sizeof(gpc_vertex_list)),
+                              const_cast<char *>("contour addition"));
   /* Copy the old contour and hole data into the extended arrays */
   for (c = 0; c < p->num_contours; c++) {
...
@@ -898,9 +900,10 @@ void gpc_add_contour(gpc_polygon *p, gpc_vertex_list *new_contour, int hole) {
   c = p->num_contours;
   extended_hole[c] = hole;
   extended_contour[c].num_vertices = new_contour->num_vertices;
-  gpc_malloc<gpc_vertex>(extended_contour[c].vertex,
-                         new_contour->num_vertices * sizeof(gpc_vertex),
-                         const_cast<char *>("contour addition"));
+  gpc_malloc<gpc_vertex>(extended_contour[c].vertex,
+                         new_contour->num_vertices *
+                             static_cast<int>(sizeof(gpc_vertex)),
+                         const_cast<char *>("contour addition"));
   for (v = 0; v < new_contour->num_vertices; v++) {
     extended_contour[c].vertex[v] = new_contour->vertex[v];
   }
...
@@ -999,8 +1002,9 @@ void gpc_polygon_clip(gpc_op op,
   }
   /* Build scanbeam table from scanbeam tree */
-  gpc_malloc<double>(
-      sbt, sbt_entries * sizeof(double), const_cast<char *>("sbt creation"));
+  gpc_malloc<double>(sbt,
+                     sbt_entries * static_cast<int>(sizeof(double)),
+                     const_cast<char *>("sbt creation"));
   PADDLE_ENFORCE_NOT_NULL(
       sbt,
       phi::errors::ResourceExhausted("Failed to malloc scanbeam table memory."));
...
@@ -1496,11 +1500,12 @@ void gpc_polygon_clip(gpc_op op,
   result->num_contours = count_contours(out_poly);
   if (result->num_contours > 0) {
     gpc_malloc<int>(result->hole,
-                    result->num_contours * sizeof(int),
+                    result->num_contours * static_cast<int>(sizeof(int)),
                     const_cast<char *>("hole flag table creation"));
-    gpc_malloc<gpc_vertex_list>(result->contour,
-                                result->num_contours * sizeof(gpc_vertex_list),
-                                const_cast<char *>("contour creation"));
+    gpc_malloc<gpc_vertex_list>(
+        result->contour,
+        result->num_contours * static_cast<int>(sizeof(gpc_vertex_list)),
+        const_cast<char *>("contour creation"));
     c = 0;
     for (poly = out_poly; poly; poly = npoly) {
...
@@ -1508,10 +1513,10 @@ void gpc_polygon_clip(gpc_op op,
       if (poly->active) {
         result->hole[c] = poly->proxy->hole;
         result->contour[c].num_vertices = poly->active;
-        gpc_malloc<gpc_vertex>(
-            result->contour[c].vertex,
-            result->contour[c].num_vertices * sizeof(gpc_vertex),
-            const_cast<char *>("vertex creation"));
+        gpc_malloc<gpc_vertex>(result->contour[c].vertex,
+                               result->contour[c].num_vertices *
+                                   static_cast<int>(sizeof(gpc_vertex)),
+                               const_cast<char *>("vertex creation"));
         v = result->contour[c].num_vertices - 1;
         for (vtx = poly->proxy->v[LEFT]; vtx; vtx = nv) {
...
@@ -1644,8 +1649,9 @@ void gpc_tristrip_clip(gpc_op op,
   }
   /* Build scanbeam table from scanbeam tree */
-  gpc_malloc<double>(
-      sbt, sbt_entries * sizeof(double), const_cast<char *>("sbt creation"));
+  gpc_malloc<double>(sbt,
+                     sbt_entries * static_cast<int>(sizeof(double)),
+                     const_cast<char *>("sbt creation"));
   PADDLE_ENFORCE_NOT_NULL(
       sbt,
       phi::errors::ResourceExhausted("Failed to malloc scanbeam table memory."));
...
@@ -2181,9 +2187,10 @@ void gpc_tristrip_clip(gpc_op op,
   result->strip = nullptr;
   result->num_strips = count_tristrips(tlist);
   if (result->num_strips > 0) {
-    gpc_malloc<gpc_vertex_list>(result->strip,
-                                result->num_strips * sizeof(gpc_vertex_list),
-                                const_cast<char *>("tristrip list creation"));
+    gpc_malloc<gpc_vertex_list>(
+        result->strip,
+        result->num_strips * static_cast<int>(sizeof(gpc_vertex_list)),
+        const_cast<char *>("tristrip list creation"));
     s = 0;
     for (tn = tlist; tn; tn = tnn) {
...
@@ -2191,9 +2198,10 @@ void gpc_tristrip_clip(gpc_op op,
       if (tn->active > 2) {
         /* Valid tristrip: copy the vertices and free the heap */
         result->strip[s].num_vertices = tn->active;
-        gpc_malloc<gpc_vertex>(result->strip[s].vertex,
-                               tn->active * sizeof(gpc_vertex),
-                               const_cast<char *>("tristrip creation"));
+        gpc_malloc<gpc_vertex>(
+            result->strip[s].vertex,
+            tn->active * static_cast<int>(sizeof(gpc_vertex)),
+            const_cast<char *>("tristrip creation"));
         v = 0;
         if (false) {
           lt = tn->v[RIGHT];
...
paddle/phi/kernels/funcs/gpc.h
View file @ b702d2ae
...
@@ -139,6 +139,7 @@ typedef struct edge_shape {
 } edge_node;

 inline bool gpc_eq(float a, float b) { return (fabs(a - b) <= 1e-6); }
+inline bool gpc_eq(double a, double b) { return (fabs(a - b) <= 1e-6); }

 inline bool gpc_prev_index(float a, float b) { return (fabs(a - b) <= 1e-6); }
...
@@ -189,7 +190,7 @@ inline void gpc_n_edge(edge_node *d, edge_node *e, int p) {
 }

 template <typename T>
-void gpc_malloc(T *&p, int b, char *s) {
+void gpc_malloc(T *&p, int b, char *s) {  // NOLINT
   if (b > 0) {
     p = reinterpret_cast<T *>(malloc(b));
...
@@ -202,7 +203,7 @@ void gpc_malloc(T *&p, int b, char *s) {
 }

 template <typename T>
-void gpc_free(T *&p) {
+void gpc_free(T *&p) {  // NOLINT
   if (p) {
     free(p);
     p = NULL;
...
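Two things are going on in the gpc changes. First, sizeof yields size_t, so an expression like count * sizeof(T) silently promotes the signed count to unsigned before narrowing again at gpc_malloc's int byte-count parameter; the .cc hunks cast sizeof to int so the arithmetic stays signed throughout. Second, the header gains // NOLINT suppressions on gpc_malloc and gpc_free themselves, presumably to quiet lint on their C-style signatures rather than change a long-standing interface. Below is a simplified, self-contained stand-in for the helper and its call style; the error handling is invented for the sketch:

    #include <cstdio>
    #include <cstdlib>

    template <typename T>
    void gpc_malloc(T *&p, int b, const char *s) {  // int byte count, as in gpc.h
      p = (b > 0) ? static_cast<T *>(std::malloc(b)) : nullptr;
      if (b > 0 && p == nullptr)
        std::fprintf(stderr, "gpc malloc failure: %s\n", s);
    }

    int main() {
      double *sbt = nullptr;
      const int sbt_entries = 64;
      // Casting sizeof keeps the byte count signed, matching the parameter.
      gpc_malloc<double>(sbt, sbt_entries * static_cast<int>(sizeof(double)),
                         "sbt creation");
      std::free(sbt);
      return 0;
    }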
paddle/phi/kernels/funcs/im2col.cc
View file @ b702d2ae
...
@@ -94,16 +94,16 @@ class Col2ImFunctor<phi::funcs::ColFormat::kCFO, DeviceContext, T> {
             "The dimension of tensor 'col' should be 5. But got "
             "the dims of tensor 'col' is [%s].",
             col.dims()));
-    int im_channels =
-        (data_layout != DataLayout::kNHWC ? im->dims()[0] : im->dims()[2]);
-    int im_height =
-        (data_layout != DataLayout::kNHWC ? im->dims()[1] : im->dims()[0]);
-    int im_width =
-        (data_layout != DataLayout::kNHWC ? im->dims()[2] : im->dims()[1]);
-    int filter_height = col.dims()[1];
-    int filter_width = col.dims()[2];
-    int col_height = col.dims()[3];
-    int col_width = col.dims()[4];
+    int im_channels = static_cast<int>(
+        data_layout != DataLayout::kNHWC ? im->dims()[0] : im->dims()[2]);
+    int im_height = static_cast<int>(
+        data_layout != DataLayout::kNHWC ? im->dims()[1] : im->dims()[0]);
+    int im_width = static_cast<int>(
+        data_layout != DataLayout::kNHWC ? im->dims()[2] : im->dims()[1]);
+    int filter_height = static_cast<int>(col.dims()[1]);
+    int filter_width = static_cast<int>(col.dims()[2]);
+    int col_height = static_cast<int>(col.dims()[3]);
+    int col_width = static_cast<int>(col.dims()[4]);

     PADDLE_ENFORCE_EQ((im_height + padding[0] + padding[2] -
...
@@ -194,13 +194,13 @@ class Im2ColFunctor<phi::funcs::ColFormat::kOCF, DeviceContext, T> {
             "The dimension of tensor 'col' should be 5. But got "
             "the dims of tensor 'col' is [%s].",
             col->dims()));
-    int im_channels = im.dims()[0];
-    int im_height = im.dims()[1];
-    int im_width = im.dims()[2];
-    int filter_height = col->dims()[3];
-    int filter_width = col->dims()[4];
-    int col_height = col->dims()[0];
-    int col_width = col->dims()[1];
+    int im_channels = static_cast<int>(im.dims()[0]);
+    int im_height = static_cast<int>(im.dims()[1]);
+    int im_width = static_cast<int>(im.dims()[2]);
+    int filter_height = static_cast<int>(col->dims()[3]);
+    int filter_width = static_cast<int>(col->dims()[4]);
+    int col_height = static_cast<int>(col->dims()[0]);
+    int col_width = static_cast<int>(col->dims()[1]);
     const T* im_data = im.data<T>();
     T* col_data = col->data<T>();
...
@@ -267,13 +267,13 @@ class Col2ImFunctor<phi::funcs::ColFormat::kOCF, DeviceContext, T> {
             "The dimension of tensor 'col' should be 5. But got "
             "the dims of tensor 'col' is [%s].",
             col.dims()));
-    int im_channels = im->dims()[0];
-    int im_height = im->dims()[1];
-    int im_width = im->dims()[2];
-    int filter_height = col.dims()[3];
-    int filter_width = col.dims()[4];
-    int col_height = col.dims()[0];
-    int col_width = col.dims()[1];
+    int im_channels = static_cast<int>(im->dims()[0]);
+    int im_height = static_cast<int>(im->dims()[1]);
+    int im_width = static_cast<int>(im->dims()[2]);
+    int filter_height = static_cast<int>(col.dims()[3]);
+    int filter_width = static_cast<int>(col.dims()[4]);
+    int col_height = static_cast<int>(col.dims()[0]);
+    int col_width = static_cast<int>(col.dims()[1]);

     PADDLE_ENFORCE_EQ(
         (im_height + padding[0] + padding[2] - filter_height) / stride[0] + 1,
...
paddle/phi/kernels/funcs/jit/gen_base.cc
View file @ b702d2ae
...
@@ -39,7 +39,8 @@ void GenBase::dumpCode(const unsigned char* code) const {
     counter++;
     std::ofstream fout(filename.str(), std::ios::out);
     if (fout.is_open()) {
-      fout.write(reinterpret_cast<const char*>(code), this->getSize());
+      fout.write(reinterpret_cast<const char*>(code),
+                 static_cast<int>(this->getSize()));
       fout.close();
     }
   }
...
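std::ofstream::write takes std::streamsize, a signed type, while getSize() here returns a size_t-like unsigned value, so the argument is narrowed explicitly at the call. A runnable sketch of the same dump pattern (file name and bytes invented):

    #include <cstddef>
    #include <fstream>

    int main() {
      const unsigned char code[16] = {0x90};  // stand-in for JIT-generated bytes
      const std::size_t size = sizeof(code);
      std::ofstream fout("code.bin", std::ios::out | std::ios::binary);
      if (fout.is_open()) {
        fout.write(reinterpret_cast<const char *>(code),
                   static_cast<int>(size));  // signed streamsize expected
        fout.close();
      }
      return 0;
    }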
paddle/phi/kernels/funcs/jit/helper.cc
View file @ b702d2ae (diff collapsed)
paddle/phi/kernels/funcs/jit/kernel_key.cc
View file @ b702d2ae (diff collapsed)
paddle/phi/kernels/funcs/jit/more/intrinsic/layer_norm.cc
View file @ b702d2ae (diff collapsed)
paddle/phi/kernels/funcs/math_function.cc
View file @ b702d2ae (diff collapsed)
paddle/phi/kernels/funcs/matrix_reduce.cc
View file @ b702d2ae (diff collapsed)
paddle/phi/kernels/funcs/maxouting.cc
View file @ b702d2ae (diff collapsed)
paddle/phi/kernels/funcs/pooling.cc
View file @ b702d2ae (diff collapsed)
paddle/phi/kernels/funcs/segment_pooling.cc
View file @ b702d2ae (diff collapsed)
paddle/phi/kernels/funcs/selected_rows_functor.cc
View file @ b702d2ae (diff collapsed)
paddle/phi/kernels/funcs/sequence_padding.cc
View file @ b702d2ae (diff collapsed)
paddle/phi/kernels/funcs/sequence_pooling.cc
View file @ b702d2ae (diff collapsed)
paddle/phi/kernels/funcs/vol2col.cc
View file @ b702d2ae (diff collapsed)
paddle/utils/string/string_helper.cc
View file @ b702d2ae (diff collapsed)