diff --git a/doc/faq.md b/doc/faq.md
index d06edcb1332e275632ebd6b0e0752723ecf8cf84..028e90dd0d8618c9ac067cc3f3cbbcc74c047680 100644
--- a/doc/faq.md
+++ b/doc/faq.md
@@ -9,11 +9,11 @@ OpenPose - Frequently Asked Question (FAQ)
     4. [Profiling Speed and Estimating FPS without Display](#profiling-speed-and-estimating-fps-without-display)
     5. [Webcam Slower than Images](#webcam-slower-than-images)
     6. [Video/Webcam Not Working](#videowebcam-not-working)
-    7. [Cannot Find OpenPose.dll Error](#cannot-find-openpose.dll-error-windows)
+    7. [Cannot Find OpenPose.dll Error](#cannot-find-openposedll-error-windows)
     8. [Free Invalid Pointer Error](#free-invalid-pointer-error)
-    9. [Source Directory does not Contain CMakeLists.txt (Windows)](#source-directory-does-not-contain-cmakelists.txt-windows)
+    9. [Source Directory does not Contain CMakeLists.txt (Windows)](#source-directory-does-not-contain-cmakeliststxt-windows)
     10. [How Should I Link my IP Camera?](#how-should-i-link-my-ip-camera)
-    11. [Difference between BODY_25 vs. COCO vs. MPI](#difference-between-body_25-vs.-coco-vs.-mpi)
+    11. [Difference between BODY_25 vs. COCO vs. MPI](#difference-between-body_25-vs-coco-vs-mpi)
     12. [How to Measure the Latency Time?](#how-to-measure-the-latency-time)
     13. [Zero People Detected](#zero-people-detected)
     14. [Check Failed for ReadProtoFromBinaryFile (Failed to Parse NetParameter File)](#check-failed-for-readprotofrombinaryfile-failed-to-parse-netparameter-file)
@@ -21,7 +21,8 @@ OpenPose - Frequently Asked Question (FAQ)
     16. [Protobuf Clip Param Caffe Error](#protobuf-clip-param-caffe-error)
     17. [The Human Skeleton Looks like Dotted Lines Rather than Solid Lines](#the-human-skeleton-looks-like-dotted-lines-rather-than-solid-lines)
     18. [Huge RAM Usage](#huge-ram-usage)
-    19. [CUDA_cublas_device_LIBRARY Not Found](#cuda_cublas_device_library_not_found)
+    19. [CUDA_cublas_device_LIBRARY Not Found](#cuda_cublas_device_library-not-found)
+    20. [CMake-GUI Error While Getting Default Caffe](#cmake-gui-error-while-getting-default-caffe)
 
 
 
@@ -176,3 +177,17 @@ CUDA_cublas_device_LIBRARY (ADVANCED)
 ```
 
 **A**: Make sure to download and install CMake-GUI following the [doc/prerequisites.md](./prerequisites.md) section. This is a known problem with CMake-GUI versions from 3.8 to 3.11 (unfortunately, default Ubuntu 18 CMake-GUI uses 3.10). You will need a CMake version >= 3.12.
+
+
+
+### CMake-GUI Error While Getting Default Caffe
+**Q**: It seems to me CMake-GUI does not download Caffe at all. I tried to wipe everything and install OpenPose again, but got the same error. I also checked whether CMake follows the `if` statements in the CMakeLists.txt correctly and reaches the branch where it establishes that Caffe needs to be downloaded, and it seems to me it does so.
+
+**A**: There are 2 solutions to try. First, if you were using an old OpenPose version and you just updated it, you should simply remove that OpenPose folder completely, and then re-download and re-compile OpenPose. Second, and only if the `3rdparty/caffe/` folder stays empty after re-cloning master and running CMake-GUI, manually trigger the git submodules to update. So the clone step becomes:
+```
+git clone https://github.com/CMU-Perceptual-Computing-Lab/openpose
+cd openpose
+
+git submodule init
+git submodule update
+```
diff --git a/doc/standalone_face_or_hand_keypoint_detector.md b/doc/standalone_face_or_hand_keypoint_detector.md
index 8ce723121ca56602b3067aa879d9b519ec0db32d..8240786551c8fa98272f42daa9c752917dddc140 100644
--- a/doc/standalone_face_or_hand_keypoint_detector.md
+++ b/doc/standalone_face_or_hand_keypoint_detector.md
@@ -6,16 +6,16 @@ In case of hand camera views at which the hands are visible but not the rest of
 ## OpenCV-based Face Keypoint Detector
 Note that this method will be faster than the current system if there is few people in the image, but it is also much less accurate (OpenCV face detector only works with big and frontal faces, while OpenPose works with more scales and face rotations).
 ```
-./build/examples/openpose/openpose.bin --body_disable --face --face_detector 1
+./build/examples/openpose/openpose.bin --body 0 --face --face_detector 1
 ```
 
 ## Custom Standalone Face or Hand Keypoint Detector
 Check the examples in `examples/tutorial_api_cpp/`, in particular [examples/tutorial_api_cpp/06_face_from_image.cpp](https://github.com/CMU-Perceptual-Computing-Lab/openpose/blob/master/examples/tutorial_api_cpp/06_face_from_image.cpp) and [examples/tutorial_api_cpp/07_hand_from_image.cpp](https://github.com/CMU-Perceptual-Computing-Lab/openpose/blob/master/examples/tutorial_api_cpp/07_hand_from_image.cpp). The provide examples of face and/or hand keypoint detection given a known bounding box or rectangle for the face and/or hand locations. These examples are equivalent to use the following flags:
 ```
 # Face
-examples/tutorial_api_cpp/06_face_from_image.cpp --body_disable --face --face_detector 2
+examples/tutorial_api_cpp/06_face_from_image.cpp --body 0 --face --face_detector 2
 # Hands
-examples/tutorial_api_cpp/07_hand_from_image.cpp --body_disable --hand --hand_detector 2
+examples/tutorial_api_cpp/07_hand_from_image.cpp --body 0 --hand --hand_detector 2
 ```
 
 Note: both `FaceExtractor` and `HandExtractor` classes requires as input **squared rectangles**.
diff --git a/examples/tutorial_api_cpp/06_face_from_image.cpp b/examples/tutorial_api_cpp/06_face_from_image.cpp
index ec8aefb5e99bf8dec49ebb3aff063ada95ee9b35..362b798b96a36909f62ce708eacdc82b8cc06ad0 100644
--- a/examples/tutorial_api_cpp/06_face_from_image.cpp
+++ b/examples/tutorial_api_cpp/06_face_from_image.cpp
@@ -3,7 +3,7 @@
 // it includes all the OpenPose configuration flags.
 // Input: An image and the face rectangle locations.
 // Output: OpenPose face keypoint detection.
-// NOTE: This demo is auto-selecting the following flags: `--body_disable --face --face_detector 2`
+// NOTE: This demo is auto-selecting the following flags: `--body 0 --face --face_detector 2`
 
 // Command-line user intraface
 #define OPENPOSE_FLAGS_DISABLE_PRODUCER
@@ -202,7 +202,7 @@ int tutorialApiCpp()
 
         // Info
         op::log("NOTE: In addition with the user flags, this demo has auto-selected the following flags:\n"
-                "\t`--body_disable --face --face_detector 2`", op::Priority::High);
+                "\t`--body 0 --face --face_detector 2`", op::Priority::High);
 
         // Measuring total time
         op::printTime(opTimer, "OpenPose demo successfully finished. Total time: ", " seconds.", op::Priority::High);
diff --git a/examples/tutorial_api_cpp/07_hand_from_image.cpp b/examples/tutorial_api_cpp/07_hand_from_image.cpp
index 6cc9d794e77039f9efe15bff6f4491cd49a9094b..5d16f1417f7d8a36097cc58b2584569d64221aff 100644
--- a/examples/tutorial_api_cpp/07_hand_from_image.cpp
+++ b/examples/tutorial_api_cpp/07_hand_from_image.cpp
@@ -3,7 +3,7 @@
 // it includes all the OpenPose configuration flags.
 // Input: An image and the hand rectangle locations.
 // Output: OpenPose hand keypoint detection.
-// NOTE: This demo is auto-selecting the following flags: `--body_disable --hand --hand_detector 2`
+// NOTE: This demo is auto-selecting the following flags: `--body 0 --hand --hand_detector 2`
 
 // Command-line user intraface
 #define OPENPOSE_FLAGS_DISABLE_PRODUCER
@@ -211,7 +211,7 @@ int tutorialApiCpp()
 
         // Info
         op::log("NOTE: In addition with the user flags, this demo has auto-selected the following flags:\n"
-                "\t`--body_disable --hand --hand_detector 2`", op::Priority::High);
+                "\t`--body 0 --hand --hand_detector 2`", op::Priority::High);
 
         // Measuring total time
         op::printTime(opTimer, "OpenPose demo successfully finished. Total time: ", " seconds.", op::Priority::High);
diff --git a/include/openpose/gpu/cuda.hu b/include/openpose/gpu/cuda.hu
index 47adb7cff9b5c3c7095a975eb3283b0892c40475..a039791a54448c60bfd4554e7a49583d9a56d65b 100644
--- a/include/openpose/gpu/cuda.hu
+++ b/include/openpose/gpu/cuda.hu
@@ -1,6 +1,11 @@
 #ifndef OPENPOSE_GPU_CUDA_HU
 #define OPENPOSE_GPU_CUDA_HU
 
+// Note: This header should only be included if CUDA is enabled
+
+#include <cuda.h>
+#include <cuda_runtime.h>
+
 namespace op
 {
     // VERY IMPORTANT: These fast functions does NOT work for negative integer numbers.
diff --git a/include/openpose/pose/poseParametersRender.hpp b/include/openpose/pose/poseParametersRender.hpp
index f27653f65a8e2c9df372504dcc2885b2419c4674..8d3b942af6e55fef1aace04ba54c6f479b74e58b 100644
--- a/include/openpose/pose/poseParametersRender.hpp
+++ b/include/openpose/pose/poseParametersRender.hpp
@@ -180,8 +180,8 @@ namespace op
         4,45, 45,46, 46,47, 47,48, 4,49, 49,50, 50,51, 51,52, 4,53, 53,54, 54,55, 55,56, 4,57, 57,58, 58,59, 59,60, 4,61, 61,62, 62,63, 63,64
     #define POSE_BODY_65_SCALES_RENDER_GPU \
         1.f,1.f,1.f,1.f,1.f, 1.f,1.f,1.f,1.f,1.f, 1.f,1.f,1.f,1.f,1.f, 1.f,1.f,1.f,1.f,1.f,1.f,1.f,1.f,1.f,1.f, \
-        0.75f,0.75f,0.75f,0.75f,0.75f, 0.75f,0.75f,0.75f,0.75f,0.75f, 0.75f,0.75f,0.75f,0.75f,0.75f, 0.75f,0.75f,0.75f,0.75f,0.75f, \
-        0.75f,0.75f,0.75f,0.75f,0.75f, 0.75f,0.75f,0.75f,0.75f,0.75f, 0.75f,0.75f,0.75f,0.75f,0.75f, 0.75f,0.75f,0.75f,0.75f,0.75f
+        0.60f,0.60f,0.60f,0.60f,0.60f, 0.60f,0.60f,0.60f,0.60f,0.60f, 0.60f,0.60f,0.60f,0.60f,0.60f, 0.60f,0.60f,0.60f,0.60f,0.60f, \
+        0.60f,0.60f,0.60f,0.60f,0.60f, 0.60f,0.60f,0.60f,0.60f,0.60f, 0.60f,0.60f,0.60f,0.60f,0.60f, 0.60f,0.60f,0.60f,0.60f,0.60f
     #define POSE_BODY_65_COLORS_RENDER_GPU \
         255.f,     0.f,    85.f, \
         255.f,     0.f,     0.f, \
@@ -274,10 +274,10 @@ namespace op
         1.f,1.f,1.f,1.f,1.f, 1.f,1.f,1.f,1.f,1.f, 1.f,1.f,1.f,1.f,1.f, 1.f,1.f, \
         1.f,1.f, \
         1.f,1.f,1.f,1.f,1.f,1.f, \
-        0.55f,0.55f,0.55f,0.55f,0.55f, 0.55f,0.55f,0.55f,0.55f,0.55f, 0.55f,0.55f,0.55f,0.55f,0.55f, 0.55f,0.55f,0.55f,0.55f,0.55f, \
-        0.55f,0.55f,0.55f,0.55f,0.55f, 0.55f,0.55f,0.55f,0.55f,0.55f, 0.55f,0.55f,0.55f,0.55f,0.55f, 0.55f,0.55f,0.55f,0.55f,0.55f, \
-        0.55f,0.55f,0.55f,0.55f,0.55f, 0.55f,0.55f,0.55f,0.55f,0.55f, 0.55f,0.55f,0.55f,0.55f,0.55f, 0.55f,0.55f,0.55f,0.55f,0.55f, \
-        0.55f,0.55f,0.55f,0.55f,0.55f, 0.55f,0.55f,0.55f,0.55f,0.55f
+        0.45f,0.45f,0.45f,0.45f,0.45f, 0.45f,0.45f,0.45f,0.45f,0.45f, 0.45f,0.45f,0.45f,0.45f,0.45f, 0.45f,0.45f,0.45f,0.45f,0.45f, \
+        0.45f,0.45f,0.45f,0.45f,0.45f, 0.45f,0.45f,0.45f,0.45f,0.45f, 0.45f,0.45f,0.45f,0.45f,0.45f, 0.45f,0.45f,0.45f,0.45f,0.45f, \
+        0.45f,0.45f,0.45f,0.45f,0.45f, 0.45f,0.45f,0.45f,0.45f,0.45f, 0.45f,0.45f,0.45f,0.45f,0.45f, 0.45f,0.45f,0.45f,0.45f,0.45f, \
+        0.45f,0.45f,0.45f,0.45f,0.45f, 0.45f,0.45f,0.45f,0.45f,0.45f
     #define POSE_BODY_95_COLORS_RENDER_GPU \
         255.f,     0.f,    85.f, \
         170.f,     0.f,   255.f, \
@@ -396,16 +396,17 @@ namespace op
         F135+20,F135+21,  F135+22,F135+23,  F135+23,F135+24,  F135+24,F135+25,  F135+25,F135+26,  F135+27,F135+28,  F135+28,F135+29,  F135+29,F135+30,  F135+31,F135+32,  F135+32,F135+33,  F135+33,F135+34,  F135+34,F135+35,  F135+36,F135+37,  F135+37,F135+38,  F135+38,F135+39,  F135+39,F135+40,  F135+40,F135+41, \
         F135+41,F135+36,  F135+42,F135+43,  F135+43,F135+44,  F135+44,F135+45,  F135+45,F135+46,  F135+46,F135+47,  F135+47,F135+42,  F135+48,F135+49,  F135+49,F135+50,  F135+50,F135+51,  F135+51,F135+52,  F135+52,F135+53,  F135+53,F135+54,  F135+54,F135+55,  F135+55,F135+56,  F135+56,F135+57,  F135+57,F135+58, \
         F135+58,F135+59,  F135+59,F135+48,  F135+60,F135+61,  F135+61,F135+62,  F135+62,F135+63,  F135+63,F135+64,  F135+64,F135+65,  F135+65,F135+66,  F135+66,F135+67,  F135+67,F135+60
+    // Disabled really noisy values
     #define POSE_BODY_135_SCALES_RENDER_GPU \
         1.f,1.f,1.f,1.f,1.f, 1.f,1.f,1.f,1.f,1.f, 1.f,1.f,1.f,1.f,1.f, 1.f,1.f, \
-        1.f,1.f, \
+        1.f,0.00f, \
         1.f,1.f,1.f,1.f,1.f,1.f, \
-        0.75f,0.75f,0.75f,0.75f,0.75f, 0.75f,0.75f,0.75f,0.75f,0.75f, 0.75f,0.75f,0.75f,0.75f,0.75f, 0.75f,0.75f,0.75f,0.75f,0.75f, \
-        0.75f,0.75f,0.75f,0.75f,0.75f, 0.75f,0.75f,0.75f,0.75f,0.75f, 0.75f,0.75f,0.75f,0.75f,0.75f, 0.75f,0.75f,0.75f,0.75f,0.75f, \
-        0.55f,0.55f,0.55f,0.55f,0.55f, 0.55f,0.55f,0.55f,0.55f,0.55f, 0.55f,0.55f,0.55f,0.55f,0.55f, 0.55f,0.55f,0.55f,0.55f,0.55f, \
-        0.55f,0.55f,0.55f,0.55f,0.55f, 0.55f,0.55f,0.55f,0.55f,0.55f, 0.55f,0.55f,0.55f,0.55f,0.55f, 0.55f,0.55f,0.55f,0.55f,0.55f, \
-        0.55f,0.55f,0.55f,0.55f,0.55f, 0.55f,0.55f,0.55f,0.55f,0.55f, 0.55f,0.55f,0.55f,0.55f,0.55f, 0.55f,0.55f,0.55f,0.55f,0.55f, \
-        0.55f,0.55f,0.55f,0.55f,0.55f, 0.55f,0.55f,0.55f,0.55f,0.55f
+        0.60f,0.60f,0.60f,0.60f,0.60f, 0.60f,0.60f,0.60f,0.60f,0.60f, 0.60f,0.60f,0.60f,0.60f,0.60f, 0.60f,0.60f,0.60f,0.60f,0.60f, \
+        0.60f,0.60f,0.60f,0.60f,0.60f, 0.60f,0.60f,0.60f,0.60f,0.60f, 0.60f,0.60f,0.60f,0.60f,0.60f, 0.60f,0.60f,0.60f,0.60f,0.60f, \
+        0.00f,0.00f,0.00f,0.00f,0.00f, 0.00f,0.00f,0.00f,0.00f,0.00f, 0.00f,0.00f,0.00f,0.00f,0.00f, 0.00f,0.00f,0.45f,0.45f,0.45f, \
+        0.45f,0.45f,0.45f,0.45f,0.45f, 0.45f,0.45f,0.45f,0.45f,0.45f, 0.45f,0.45f,0.45f,0.45f,0.45f, 0.45f,0.45f,0.45f,0.45f,0.45f, \
+        0.45f,0.45f,0.45f,0.45f,0.45f, 0.45f,0.45f,0.45f,0.45f,0.45f, 0.45f,0.45f,0.45f,0.45f,0.45f, 0.45f,0.45f,0.45f,0.45f,0.45f, \
+        0.45f,0.45f,0.45f,0.45f,0.45f, 0.45f,0.45f,0.45f,0.45f,0.45f
     #define POSE_BODY_135_COLORS_RENDER_GPU \
         255.f,     0.f,    85.f, \
         170.f,     0.f,   255.f, \
diff --git a/include/openpose/utilities/avx.hpp b/include/openpose/utilities/avx.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..40638f78f08ebc2f26c669f4f63148f69a34dd43
--- /dev/null
+++ b/include/openpose/utilities/avx.hpp
@@ -0,0 +1,100 @@
+#ifndef OPENPOSE_UTILITIES_AVX_HPP
+#define OPENPOSE_UTILITIES_AVX_HPP
+
+// Warning:
+// This file contains auxiliary functions for AVX.
+// This file should only be included from cpp files.
+// Default #include <openpose/headers.hpp> does not include it.
+
+#ifdef WITH_AVX
+    #include <cstdint> // uintptr_t
+    #include <memory> // shared_ptr
+    #include <immintrin.h>
+    #include <openpose/utilities/errorAndLog.hpp>
+
+    namespace op
+    {
+        #ifdef __GNUC__ // ALIGN32(x): declares `x` (e.g., `float data[8]`) with 32-byte alignment
+            #define ALIGN32(x) x __attribute__((aligned(32)))
+        #elif defined(_MSC_VER) // defined(_WIN32)
+            #define ALIGN32(x) __declspec(align(32)) x
+        #else
+            #error Unknown environment!
+        #endif
+
+        // Functions
+        // Sources:
+        // - https://stackoverflow.com/questions/32612190/how-to-solve-the-32-byte-alignment-issue-for-avx-load-store-operations
+        // - https://embeddedartistry.com/blog/2017/2/20/implementing-aligned-malloc
+        // - https://embeddedartistry.com/blog/2017/2/23/c-smart-pointers-with-aligned-mallocfree
+        typedef unsigned long long offset_t;
+        #define PTR_OFFSET_SZ sizeof(offset_t)
+        #ifndef align_up
+        #define align_up(num, align) \
+            (((num) + ((align) - 1)) & ~((align) - 1))
+        #endif
+        inline void * aligned_malloc(const size_t align, const size_t size) // Returns `size` bytes aligned to `align`
+        {
+            void * ptr = nullptr;
+
+            // 2 conditions:
+            //  - We want both align and size to be greater than 0
+            //  - We want align to be a power of two since align_up operates on powers of two
+            if (align && size && (align & (align - 1)) == 0)
+            {
+                // We have to fit an offset_t value right before the aligned pointer (aligned_free reads it back)
+                // We also allocate (align - 1) extra bytes to ensure we can meet the alignment
+                const auto hdr_size = PTR_OFFSET_SZ + (align - 1);
+                void * p = malloc(size + hdr_size);
+
+                if (p)
+                {
+                    // Add the offset size to malloc's pointer (we will always store that)
+                    // Then align the resulting value to the target alignment
+                    ptr = (void *) align_up(((uintptr_t)p + PTR_OFFSET_SZ), align);
+
+                    // Calculate the offset (aligned pointer minus malloc's pointer) and store it behind our aligned pointer
+                    *((offset_t *)ptr - 1) = (offset_t)((uintptr_t)ptr - (uintptr_t)p);
+
+                } // else nullptr, could not malloc
+            } // else nullptr, invalid arguments
+
+            if (ptr == nullptr) // Covers both invalid arguments and malloc failure
+            {
+                error("Shared pointer could not be allocated for Array data storage.",
+                      __LINE__, __FUNCTION__, __FILE__);
+            }
+
+            return ptr;
+        }
+        inline void aligned_free(void * ptr) // Releases memory returned by aligned_malloc (relies on its stored offset)
+        {
+            if (ptr == nullptr) // Unlike free(), nullptr is reported; NOTE(review): assumes error() does not return
+                error("Received nullptr.", __LINE__, __FUNCTION__, __FILE__);
+
+            // Walk backwards from the passed-in pointer to get the offset that aligned_malloc stored there
+            // We convert to an offset_t pointer and rely on pointer math to get the data
+            offset_t offset = *((offset_t *)ptr - 1);
+
+            // Once we have the offset, we can get the original malloc pointer and call free
+            void * p = (void *)((uint8_t *)ptr - offset);
+            free(p);
+        }
+        template<class T> // Returns a shared_ptr over `size` uninitialized T elements, released through aligned_free
+        std::shared_ptr<T> aligned_shared_ptr(const size_t size)
+        {
+            try
+            {
+                return std::shared_ptr<T>(static_cast<T*>( // No T constructor is run on the returned storage
+                    aligned_malloc(8*sizeof(T), sizeof(T)*size)), &aligned_free); // Alignment: 8*sizeof(T) bytes
+            }
+            catch (const std::exception& e)
+            {
+                error(e.what(), __LINE__, __FUNCTION__, __FILE__);
+                return std::shared_ptr<T>{}; // Empty pointer in case error() returns
+            }
+        }
+    }
+#endif
+
+#endif // OPENPOSE_UTILITIES_AVX_HPP
diff --git a/include/openpose/utilities/profiler.hpp b/include/openpose/utilities/profiler.hpp
index c0e1cc2c031dbefff64f12046bf4a7e9d69898e4..c44ae90ac6b86b20556f141f511afa4aa495f24a 100644
--- a/include/openpose/utilities/profiler.hpp
+++ b/include/openpose/utilities/profiler.hpp
@@ -8,6 +8,11 @@
 
 namespace op
 {
+    // The following functions provide basic time measurement. Usage example:
+    //     const auto timerInit = getTimerInit();
+    //         // [Some code in here]
+    //     const auto timeSeconds = getTimeSeconds(timerInit);
+    //     printTime(timerInit, "Function X took ", " seconds.", Priority::High);
     OP_API std::chrono::time_point<std::chrono::high_resolution_clock> getTimerInit();
 
     OP_API double getTimeSeconds(const std::chrono::time_point<std::chrono::high_resolution_clock>& timerInit);
@@ -16,6 +21,47 @@ namespace op
         const std::chrono::time_point<std::chrono::high_resolution_clock>& timerInit, const std::string& firstMessage,
         const std::string& secondMessage, const Priority priority);
 
+    // The following macros run the enclosed code REPS times and store the average time per repetition in finalTime
+    // (seconds multiplied by factor). They must be used as a pair: INIT opens a scope that END closes. Usage example:
+    //     const auto REPS = 1000; double time = 0.;
+    //     OP_PROFILE_INIT(REPS);
+    //         // [Some code in here]
+    //     OP_PROFILE_END(time, 1e3, REPS); // Time in msec. 1 = sec, 1e3 = msec, 1e6 = usec, 1e9 = nsec, etc.
+    //     log("Function X took " + std::to_string(time) + " milliseconds.");
+    #define OP_PROFILE_INIT(REPS) \
+    { \
+        const auto timerInit = getTimerInit(); \
+        for (auto rep = 0 ; rep < (REPS) ; ++rep) \
+        {
+    #define OP_PROFILE_END(finalTime, factor, REPS) \
+        } \
+        (finalTime) = (factor)/(float)(REPS)*getTimeSeconds(timerInit); \
+    }
+
+    // The following macros run the enclosed code REPS times, call cudaDeviceSynchronize() before starting the timer
+    // and after the loop, and store the average time per repetition in finalTime (seconds * factor). Usage example:
+    //     const auto REPS = 1000;
+    //     double time = 0.;
+    //     OP_CUDA_PROFILE_INIT(REPS);
+    //         // [Some code with CUDA calls in here]
+    //     OP_CUDA_PROFILE_END(time, 1e3, REPS); // Time in msec. 1 = sec, 1e3 = msec, 1e6 = usec, 1e9 = nsec, etc.
+    //     log("Function X took " + std::to_string(time) + " milliseconds.");
+    // Analogous to OP_PROFILE_INIT, but also waits for CUDA kernels to finish their asynchronous operations
+    // It requires: #include <cuda_runtime.h>
+    #define OP_CUDA_PROFILE_INIT(REPS) \
+    { \
+        cudaDeviceSynchronize(); \
+        const auto timerInit = getTimerInit(); \
+        for (auto rep = 0 ; rep < (REPS) ; ++rep) \
+        {
+    // Analogous to OP_PROFILE_END, but also waits for CUDA kernels to finish their asynchronous operations
+    // It requires: #include <cuda_runtime.h> (and a matching OP_CUDA_PROFILE_INIT opening the scope closed here)
+    #define OP_CUDA_PROFILE_END(finalTime, factor, REPS) \
+        } \
+        cudaDeviceSynchronize(); \
+        (finalTime) = (factor)/(float)(REPS)*getTimeSeconds(timerInit); \
+    }
+
     // Enable PROFILER_ENABLED on Makefile.config or CMake in order to use this function. Otherwise nothing will be outputted.
     // How to use - example:
     // For GPU - It can only be applied in the main.cpp file:
diff --git a/scripts/tests/pose_accuracy_coco_val.sh b/scripts/tests/pose_accuracy_coco_val.sh
index e45d68485f044e587772111c544342eb5283afde..80db88d48cf9209ca7ffb311fb35c588aa381684 100755
--- a/scripts/tests/pose_accuracy_coco_val.sh
+++ b/scripts/tests/pose_accuracy_coco_val.sh
@@ -11,15 +11,19 @@ clear && clear
 
 # Parameters
 IMAGE_FOLDER=~/devel/images/val2017/
+IMAGE_FOOT_FOLDER=~/devel/images/val2017_foot/
 JSON_FOLDER=../evaluation/coco_val_jsons/
 # JSON_FOLDER=/media/posefs3b/Users/gines/openpose_train/training_results/2_23_51/best_702k/
 OP_BIN=./build/examples/openpose/openpose.bin
 
-    # 1 scale
-$OP_BIN --image_dir $IMAGE_FOLDER --display 0 --render_pose 0 --cli_verbose 0.2 --write_coco_json ${JSON_FOLDER}1.json --write_coco_json_variants 3
+    # 1 scale (body)
+$OP_BIN --image_dir $IMAGE_FOLDER --display 0 --render_pose 0 --cli_verbose 0.2 --write_coco_json ${JSON_FOLDER}1.json --write_coco_json_variants 1
 # $OP_BIN --image_dir $IMAGE_FOLDER --display 0 --render_pose 0 --cli_verbose 0.2 --write_coco_json ${JSON_FOLDER}1_max.json --write_coco_json_variants 3 \
 #     --maximize_positives
 
+    # 1 scale (foot)
+$OP_BIN --image_dir $IMAGE_FOOT_FOLDER --display 0 --render_pose 0 --cli_verbose 0.2 --write_coco_json ${JSON_FOLDER}1.json --write_coco_json_variants 2
+
 #     # 4 scales
 # $OP_BIN --image_dir $IMAGE_FOLDER --display 0 --render_pose 0 --cli_verbose 0.2 --write_coco_json ${JSON_FOLDER}1_4.json --write_coco_json_variants 3 \
 #     --scale_number 4 --scale_gap 0.25 --net_resolution "1312x736"
diff --git a/src/openpose/core/array.cpp b/src/openpose/core/array.cpp
index 776cd85b3338c5c734be7de9770e9f62262add11..dfd6ac33026f2e71ba572072757d16fc996ca943 100644
--- a/src/openpose/core/array.cpp
+++ b/src/openpose/core/array.cpp
@@ -1,5 +1,6 @@
 #include <typeinfo> // typeid
 #include <numeric> // std::accumulate
+#include <openpose/utilities/avx.hpp>
 #include <openpose/utilities/errorAndLog.hpp>
 #include <openpose/core/array.hpp>
 
@@ -637,8 +638,16 @@ namespace op
                 // Prepare shared_ptr
                 if (dataPtr == nullptr)
                 {
-                    spData.reset(new T[mVolume], std::default_delete<T[]>());
+                    #ifdef WITH_AVX
+                        spData = aligned_shared_ptr<T>(mVolume);
+                    #else
+                        spData.reset(new T[mVolume], std::default_delete<T[]>());
+                    #endif
                     pData = spData.get();
+                    // Sanity check
+                    if (pData == nullptr)
+                        error("Shared pointer could not be allocated for Array data storage.",
+                              __LINE__, __FUNCTION__, __FILE__);
                 }
                 else
                 {
diff --git a/src/openpose/filestream/jsonOfstream.cpp b/src/openpose/filestream/jsonOfstream.cpp
index e2282ddadd1b98ec8b57e039d7bbb0db0f7fb910..87bcc19d830aad953896c451b7b5fbcafdd99477 100644
--- a/src/openpose/filestream/jsonOfstream.cpp
+++ b/src/openpose/filestream/jsonOfstream.cpp
@@ -29,7 +29,7 @@ namespace op
         try
         {
             if (!filePath.empty() && !upOfstream->is_open())
-                error("Json file could not be opened.", __LINE__, __FUNCTION__, __FILE__);
+                error("Json file " + filePath + " could not be opened.", __LINE__, __FUNCTION__, __FILE__);
         }
         catch (const std::exception& e)
         {
diff --git a/src/openpose/filestream/videoSaver.cpp b/src/openpose/filestream/videoSaver.cpp
index 05742b1a458a91a3a012ee5ba291152e46471630..1ee26dca6fde0c10c3b0047653f338976d34a844 100644
--- a/src/openpose/filestream/videoSaver.cpp
+++ b/src/openpose/filestream/videoSaver.cpp
@@ -111,8 +111,13 @@ namespace op
             {
                 log("JPG images temporarily generated in " + upImpl->mTempImageFolder + ".", op::Priority::High);
                 // FFmpeg command: Save video from images (override if video with same name exists)
-                const std::string imageToVideoCommand = "ffmpeg -y -i " + upImpl->mTempImageFolder + "/%12d_rendered.jpg"
-                    + " -c:v libx264 -framerate " + std::to_string(upImpl->mFps) + " -pix_fmt yuv420p "
+                // Framerate works with both `-r` and `-framerate` for an image folder. Source:
+                //     https://stackoverflow.com/questions/51143100/framerate-vs-r-vs-filter-fps
+                // Very important: Either FPS flag must go before `-i`!!! Otherwise, it would either not work (`-r`)
+                // or do a weird resample (`-framerate`)
+                const std::string imageToVideoCommand = "ffmpeg -y -framerate " + std::to_string(upImpl->mFps)
+                    + " -i " + upImpl->mTempImageFolder + "/%12d_rendered.jpg"
+                    + " -c:v libx264 -pix_fmt yuv420p "
                     + upImpl->mVideoSaverPath;
                 log("Creating MP4 video out of JPG images by running:\n" + imageToVideoCommand + "\n",
                     op::Priority::High);
diff --git a/src/openpose/pose/poseExtractorCaffe.cpp b/src/openpose/pose/poseExtractorCaffe.cpp
index 42b35ae92d8361d9c155cd065699d4094a6a4209..0e67b033ce9a94ffaeec3479d520a1f6babbf890 100644
--- a/src/openpose/pose/poseExtractorCaffe.cpp
+++ b/src/openpose/pose/poseExtractorCaffe.cpp
@@ -1,5 +1,6 @@
 #include <limits> // std::numeric_limits
 #include <openpose/gpu/cuda.hpp>
+#include <openpose/gpu/cuda.hu>
 #include <openpose/pose/poseParameters.hpp>
 #include <openpose/utilities/check.hpp>
 #include <openpose/utilities/fastMath.hpp>
@@ -203,6 +204,12 @@ namespace op
         try
         {
             #ifdef USE_CAFFE
+                // const auto REPS = 1;
+                // double timeNormalize1 = 0.;
+                // double timeNormalize2 = 0.;
+                // double timeNormalize3 = 0.;
+                // double timeNormalize4 = 0.;
+                // OP_CUDA_PROFILE_INIT(REPS);
                 // Sanity checks
                 if (inputNetData.empty())
                     error("Empty inputNetData.", __LINE__, __FUNCTION__, __FILE__);
@@ -279,6 +286,8 @@ namespace op
                             positiveIntRound(ratio*mNetInput4DSizes[0][3]),
                             positiveIntRound(ratio*mNetInput4DSizes[0][2])};
                 }
+                // OP_CUDA_PROFILE_END(timeNormalize1, 1e3, REPS);
+                // OP_CUDA_PROFILE_INIT(REPS);
                 // 2. Resize heat maps + merge different scales
                 // ~5ms (GPU) / ~20ms (CPU)
                 const auto caffeNetOutputBlobs = arraySharedToPtr(spCaffeNetOutputBlobs);
@@ -295,12 +304,16 @@ namespace op
                 // mScaleNetToOutput = 1.f;
                 // 3. Get peaks by Non-Maximum Suppression
                 // ~2ms (GPU) / ~7ms (CPU)
+                // OP_CUDA_PROFILE_END(timeNormalize2, 1e3, REPS);
                 const auto nmsThreshold = (float)get(PoseProperty::NMSThreshold);
-                spNmsCaffe->setThreshold(nmsThreshold);
                 const auto nmsOffset = float(0.5/double(mScaleNetToOutput));
+                // OP_CUDA_PROFILE_INIT(REPS);
+                spNmsCaffe->setThreshold(nmsThreshold);
                 spNmsCaffe->setOffset(Point<float>{nmsOffset, nmsOffset});
                 spNmsCaffe->Forward({spHeatMapsBlob.get()}, {spPeaksBlob.get()});
                 // 4. Connecting body parts
+                // OP_CUDA_PROFILE_END(timeNormalize3, 1e3, REPS);
+                // OP_CUDA_PROFILE_INIT(REPS);
                 spBodyPartConnectorCaffe->setScaleNetToOutput(mScaleNetToOutput);
                 spBodyPartConnectorCaffe->setInterMinAboveThreshold(
                     (float)get(PoseProperty::ConnectInterMinAboveThreshold));
@@ -310,6 +323,11 @@ namespace op
                 // Note: BODY_25D will crash (only implemented for CPU version)
                 spBodyPartConnectorCaffe->Forward(
                     {spHeatMapsBlob.get(), spPeaksBlob.get()}, mPoseKeypoints, mPoseScores);
+                // OP_CUDA_PROFILE_END(timeNormalize4, 1e3, REPS);
+                // log("1 = " + std::to_string(timeNormalize1) + " msecs.");
+                // log("2 = " + std::to_string(timeNormalize2) + " msecs.");
+                // log("3 = " + std::to_string(timeNormalize3) + " msecs.");
+                // log("4 = " + std::to_string(timeNormalize4) + " msecs.");
                 // Re-run on each person
                 if (TOP_DOWN_REFINEMENT)
                 {
diff --git a/src/openpose/tracking/pyramidalLK.cpp b/src/openpose/tracking/pyramidalLK.cpp
index 4300342f1c4222a81a719a6be2085ec9c1c42c64..d6f3100a468a12a72654df3fd38e89f930a3e9d9 100644
--- a/src/openpose/tracking/pyramidalLK.cpp
+++ b/src/openpose/tracking/pyramidalLK.cpp
@@ -1,20 +1,19 @@
-// #include <iostream>
+#ifdef WITH_SSE4
+    #include <emmintrin.h>
+    #include "smmintrin.h"
+#endif
+
+#ifdef WITH_AVX
+    #include <immintrin.h>
+#endif
+
+#include <iostream>
 #include <opencv2/core/core.hpp> // cv::Point2f, cv::Mat
 #include <opencv2/imgproc/imgproc.hpp> // cv::pyrDown
 #include <opencv2/video/video.hpp> // cv::buildOpticalFlowPyramid
 #include <openpose/utilities/profiler.hpp>
 #include <openpose/tracking/pyramidalLK.hpp>
 
-#if defined (WITH_SSE4)
-#include <emmintrin.h>
-#include "smmintrin.h"
-#endif
-
-#if defined (WITH_AVX)
-#include <immintrin.h>
-#endif
-
-#include <iostream>
 //#define DEBUG
 // #ifdef DEBUG
 // // When debugging is enabled, these form aliases to useful functions
@@ -32,7 +31,7 @@
 
 namespace op
 {
-#if defined (WITH_SSE4)
+#ifdef WITH_SSE4
     float sse_dot_product(std::vector<float> &av, std::vector<float> &bv)
     {
 
@@ -51,7 +50,7 @@ namespace op
       /* Do SIMD dot product */
       for (unsigned int i = 0; i < niters; i++, ptrA++,ptrB++)
         res = _mm_add_ps(_mm_dp_ps(*ptrA, *ptrB, 255), res);
-      
+
 
       /* Get result back from the SIMD vector */
       float fres[4];
@@ -68,7 +67,7 @@ namespace op
     }
 #endif
 
-#if defined (WITH_AVX)
+#ifdef WITH_AVX
 
     float avx_dot_product(std::vector<float> &av, std::vector<float> &bv)
     {
@@ -100,7 +99,7 @@ namespace op
 
       return fres[0] + fres[4];
     }
-#endif 
+#endif
 
     char computeLK(cv::Point2f& delta,  std::vector<float>& ix,
                   std::vector<float>& iy, std::vector<float>& it)
@@ -114,7 +113,7 @@ namespace op
             auto sumYT = 0.f;
             auto sumXY = 0.f;
 
-#if defined (WITH_AVX)
+#ifdef WITH_AVX
             sumXX = avx_dot_product(ix,ix);
             sumYY = avx_dot_product(iy,iy);
             sumXY = avx_dot_product(ix,iy);
@@ -126,7 +125,7 @@ namespace op
             sumXY = sse_dot_product(ix,iy);
             sumXT = sse_dot_product(ix,it);
             sumYT = sse_dot_product(iy,it);
-#else            
+#else
             for (auto i = 0u; i < ix.size(); i++)
             {
               sumXX += ix[i] * ix[i];
@@ -134,8 +133,8 @@ namespace op
               sumXY += ix[i] * iy[i];
               sumXT += ix[i] * it[i];
               sumYT += iy[i] * it[i];
-            }            
-#endif            
+            }
+#endif
 
             // Get numerator and denominator of u and v
             const auto den = (sumXX*sumYY) - (sumXY * sumXY);
@@ -250,6 +249,7 @@ namespace op
             return UNDEFINED_ERROR;
         }
     }
+
     // Given an OpenCV image, build a gaussian pyramid of size 'levels'
     void buildGaussianPyramid(std::vector<cv::Mat>& pyramidImages, const cv::Mat& image, const int levels)
     {
@@ -271,8 +271,6 @@ namespace op
         }
     }
 
-
-
     cv::Point2f pyramidIteration(char& status, const cv::Point2f& pointI, const cv::Point2f& pointJ, const cv::Mat& I,
                                  const cv::Mat& J, const int patchSize = 5)
     {
@@ -335,12 +333,12 @@ namespace op
 
             coordJ.clear();
             coordJ.assign(I.begin(), I.end());
-            
+
             if (pyramidImagesPrevious.empty())
                 buildGaussianPyramid(pyramidImagesPrevious, imagePrevious, levels);
             if (pyramidImagesCurrent.empty())
                 buildGaussianPyramid(pyramidImagesCurrent, imageCurrent, levels);
-  
+
 
             // Process all pixel requests
             for (auto i = 0u; i < coordI.size(); i++)
@@ -364,8 +362,6 @@ namespace op
                     coordJ[i] *= 2.f;
                 }
             }
-     
-
         }
         catch (const std::exception& e)
         {
diff --git a/src/openpose/utilities/openCv.cpp b/src/openpose/utilities/openCv.cpp
index 85ef0b2e4b3acd413dfcd9e0e238dfc15defbd09..710f2a9d618e28d94ac480d7e469ce8fc97cc912 100644
--- a/src/openpose/utilities/openCv.cpp
+++ b/src/openpose/utilities/openCv.cpp
@@ -1,3 +1,4 @@
+#include <openpose/utilities/avx.hpp>
 #include <openpose/utilities/fastMath.hpp>
 #include <openpose/utilities/openCv.hpp>
 
@@ -95,20 +96,44 @@ namespace op
                     const auto floatPtrImageOffsetY = (floatPtrImageOffsetC + y) * width;
                     const auto originFramePtrOffsetY = y * width;
                     for (auto x = 0; x < width; x++)
-                        floatPtrImage[floatPtrImageOffsetY + x] = float(originFramePtr[(originFramePtrOffsetY + x)
-                                                                        * channels + c]);
+                        floatPtrImage[floatPtrImageOffsetY + x] = float(
+                            originFramePtr[(originFramePtrOffsetY + x) * channels + c]);
                 }
             }
             // Normalizing if desired
-            // floatPtrImage wrapped as cv::Mat
-                // Empirically tested - OpenCV is more efficient normalizing a whole matrix/image (it uses AVX and
-                // other optimized instruction sets).
-                // In addition, the following if statement does not copy the pointer to a cv::Mat, just wrapps it.
             // VGG
             if (normalize == 1)
             {
-                cv::Mat floatPtrImageCvWrapper(height, width, CV_32FC3, floatPtrImage);
-                floatPtrImageCvWrapper = floatPtrImageCvWrapper/256.f - 0.5f;
+                #ifdef WITH_AVX
+                    // // C++ code
+                    // const auto ratio = 1.f/256.f;
+                    // for (auto pixel = 0 ; pixel < width*height*channels ; ++pixel)
+                    //     floatPtrImage[pixel] = floatPtrImage[pixel]*ratio - 0.5f;
+                    // AVX code
+                    const auto volume = width*height*channels;
+                    int pixel;
+                    const __m256 mmRatio = _mm256_set1_ps(1.f/256.f);
+                    const __m256 mmBias = _mm256_set1_ps(-0.5f);
+                    for (pixel = 0 ; pixel < volume-7 ; pixel += 8)
+                    {
+                        const __m256 input = _mm256_load_ps(&floatPtrImage[pixel]);
+                        // const __m256 input = _mm256_loadu_ps(&floatPtrImage[pixel]); // If non-aligned pointer
+                        const __m256 output = _mm256_fmadd_ps(input, mmRatio, mmBias);
+                        _mm256_store_ps(&floatPtrImage[pixel], output);
+                        // _mm256_storeu_ps(&floatPtrImage[pixel], output); // If non-aligned pointer
+                    }
+                    const auto ratio = 1.f/256.f;
+                    for (; pixel < volume ; ++pixel)
+                        floatPtrImage[pixel] = floatPtrImage[pixel]*ratio - 0.5f;
+                // Non-optimized code
+                #else
+                    // floatPtrImage wrapped as cv::Mat
+                        // Empirically tested - OpenCV is more efficient normalizing a whole matrix/image (it uses AVX and
+                        // other optimized instruction sets).
+                        // In addition, the following statement does not copy the data into a cv::Mat, it just wraps the pointer.
+                    cv::Mat floatPtrImageCvWrapper(height, width, CV_32FC3, floatPtrImage);
+                    floatPtrImageCvWrapper = floatPtrImageCvWrapper*(1/256.f) - 0.5f;
+                #endif
             }
             // // ResNet
             // else if (normalize == 2)
diff --git a/src/openpose/wrapper/wrapperAuxiliary.cpp b/src/openpose/wrapper/wrapperAuxiliary.cpp
index 2fcc1b329b6f6c7dc6336fcdfb4121b5f8d7b79d..445693af5936c7f61912b18929893b7ad8428dd6 100644
--- a/src/openpose/wrapper/wrapperAuxiliary.cpp
+++ b/src/openpose/wrapper/wrapperAuxiliary.cpp
@@ -108,7 +108,7 @@ namespace op
             if (wrapperStructPose.poseMode == PoseMode::Disabled && !wrapperStructFace.enable
                 && !wrapperStructHand.enable)
                 error("Body, face, and hand keypoint detectors are disabled. You must enable at least one (i.e,"
-                      " unselect `--body_disable`, select `--face`, or select `--hand`.",
+                      " unselect `--body 0`, select `--face`, or select `--hand`.",
                       __LINE__, __FUNCTION__, __FILE__);
             const auto ownDetectorProvided = (wrapperStructFace.detector == Detector::Provided
                                               || wrapperStructHand.detector == Detector::Provided);
@@ -126,7 +126,7 @@ namespace op
                 log("Warning: Body keypoint estimation is enabled while you have also selected to provide your own"
                     " face and/or hand rectangle detections (`face_detector 2` and/or `hand_detector 2`). Therefore,"
                     " OpenPose will not detect face and/or hand keypoints based on the body keypoints. Are you sure"
-                    " you want to keep enabled the body keypoint detector? (disable it with `--body_disable`).",
+                    " you want to keep enabled the body keypoint detector? (disable it with `--body 0`).",
                     Priority::High);
             // If 3-D module, 1 person is the maximum
             if (wrapperStructExtra.reconstruct3d && wrapperStructPose.numberPeopleMax != 1)