diff --git a/tools/lib/api/fs/fs.c b/tools/lib/api/fs/fs.c
index 809c7721cd2451a5e1a8e3bbe5a3994965106346..a7ecf8f469f47921ec0734eb4142a4da64f6c5a7 100644
--- a/tools/lib/api/fs/fs.c
+++ b/tools/lib/api/fs/fs.c
@@ -387,6 +387,22 @@ int filename__read_str(const char *filename, char **buf, size_t *sizep)
 	return err;
 }
 
+int filename__write_int(const char *filename, int value)
+{
+	int fd = open(filename, O_WRONLY), err = -1;
+	char buf[64];
+
+	if (fd < 0)
+		return err;
+
+	sprintf(buf, "%d", value);
+	if (write(fd, buf, sizeof(buf)) == sizeof(buf))
+		err = 0;
+
+	close(fd);
+	return err;
+}
+
 int procfs__read_str(const char *entry, char **buf, size_t *sizep)
 {
 	char path[PATH_MAX];
@@ -480,3 +496,17 @@ int sysctl__read_int(const char *sysctl, int *value)
 
 	return filename__read_int(path, value);
 }
+
+int sysfs__write_int(const char *entry, int value)
+{
+	char path[PATH_MAX];
+	const char *sysfs = sysfs__mountpoint();
+
+	if (!sysfs)
+		return -1;
+
+	if (snprintf(path, sizeof(path), "%s/%s", sysfs, entry) >= PATH_MAX)
+		return -1;
+
+	return filename__write_int(path, value);
+}
diff --git a/tools/lib/api/fs/fs.h b/tools/lib/api/fs/fs.h
index 956c21127d1ef7d9817c0bbb8a9423ff4bdf1de8..45605348461e2f40e308c7a446ddfe32808c12ee 100644
--- a/tools/lib/api/fs/fs.h
+++ b/tools/lib/api/fs/fs.h
@@ -31,6 +31,8 @@ int filename__read_int(const char *filename, int *value);
 int filename__read_ull(const char *filename, unsigned long long *value);
 int filename__read_str(const char *filename, char **buf, size_t *sizep);
 
+int filename__write_int(const char *filename, int value);
+
 int procfs__read_str(const char *entry, char **buf, size_t *sizep);
 
 int sysctl__read_int(const char *sysctl, int *value);
@@ -38,4 +40,6 @@ int sysfs__read_int(const char *entry, int *value);
 int sysfs__read_ull(const char *entry, unsigned long long *value);
 int sysfs__read_str(const char *entry, char **buf, size_t *sizep);
 int sysfs__read_bool(const char *entry, bool *value);
+
+int sysfs__write_int(const char *entry, int value);
 #endif /* __API_FS__ */
diff --git a/tools/perf/Documentation/intel-pt.txt b/tools/perf/Documentation/intel-pt.txt
index b0b3007d3c9c0ff9b3288e8005d533efe48c0b57..d157dee7a4ec89151ba795c13f54063b189e39e1 100644
--- a/tools/perf/Documentation/intel-pt.txt
+++ b/tools/perf/Documentation/intel-pt.txt
@@ -364,6 +364,42 @@ cyc_thresh	Specifies how frequently CYC packets are produced - see cyc
 
 		CYC packets are not requested by default.
 
+pt		Specifies pass-through which enables the 'branch' config term.
+
+		The default config selects 'pt' if it is available, so a user will
+		never need to specify this term.
+
+branch		Enable branch tracing.  Branch tracing is enabled by default so to
+		disable branch tracing use 'branch=0'.
+
+		The default config selects 'branch' if it is available.
+
+ptw		Enable PTWRITE packets which are produced when a ptwrite instruction
+		is executed.
+
+		Support for this feature is indicated by:
+
+			/sys/bus/event_source/devices/intel_pt/caps/ptwrite
+
+		which contains "1" if the feature is supported and
+		"0" otherwise.
+
+fup_on_ptw	Enable a FUP packet to follow the PTWRITE packet.  The FUP packet
+		provides the address of the ptwrite instruction.  In the absence of
+		fup_on_ptw, the decoder will use the address of the previous branch
+		if branch tracing is enabled, otherwise the address will be zero.
+		Note that fup_on_ptw will work even when branch tracing is disabled.
+
+pwr_evt		Enable power events.  The power events provide information about
+		changes to the CPU C-state.
+
+		Support for this feature is indicated by:
+
+			/sys/bus/event_source/devices/intel_pt/caps/power_event_trace
+
+		which contains "1" if the feature is supported and
+		"0" otherwise.
+
 
 new snapshot option
 -------------------
diff --git a/tools/perf/Documentation/perf-stat.txt b/tools/perf/Documentation/perf-stat.txt
index bd0e4417f2be63f892a870ec5ad60ae0fd5d9994..698076313606a7e22df2bda7bfa365c42673a32a 100644
--- a/tools/perf/Documentation/perf-stat.txt
+++ b/tools/perf/Documentation/perf-stat.txt
@@ -239,6 +239,20 @@ taskset.
 --no-merge::
 Do not merge results from same PMUs.
 
+--smi-cost::
+Measure SMI cost if msr/aperf/ and msr/smi/ events are supported.
+
+During the measurement, the /sys/device/cpu/freeze_on_smi will be set to
+freeze core counters on SMI.
+The aperf counter will not be effected by the setting.
+The cost of SMI can be measured by (aperf - unhalted core cycles).
+
+In practice, the percentages of SMI cycles is very useful for performance
+oriented analysis. --metric_only will be applied by default.
+The output is SMI cycles%, equals to (aperf - unhalted core cycles) / aperf
+
+Users who wants to get the actual value can apply --no-metric-only.
+
 EXAMPLES
 --------
 
diff --git a/tools/perf/Makefile.config b/tools/perf/Makefile.config
index 1f4fbc9a3292e06b6ae48c47297e795ccbc45809..bdf0e87f9b2938c33dc94b0541c8a63d1965cb35 100644
--- a/tools/perf/Makefile.config
+++ b/tools/perf/Makefile.config
@@ -61,7 +61,7 @@ endif
 # Disable it on all other architectures in case libdw unwind
 # support is detected in system. Add supported architectures
 # to the check.
-ifneq ($(SRCARCH),$(filter $(SRCARCH),x86 arm))
+ifneq ($(SRCARCH),$(filter $(SRCARCH),x86 arm powerpc))
   NO_LIBDW_DWARF_UNWIND := 1
 endif
 
diff --git a/tools/perf/arch/powerpc/util/Build b/tools/perf/arch/powerpc/util/Build
index 90ad64b231cd821abe8756ed92d68d7c4c367804..2e6595310420104a40fe4ee813f41ee0bc213349 100644
--- a/tools/perf/arch/powerpc/util/Build
+++ b/tools/perf/arch/powerpc/util/Build
@@ -5,4 +5,6 @@ libperf-y += perf_regs.o
 
 libperf-$(CONFIG_DWARF) += dwarf-regs.o
 libperf-$(CONFIG_DWARF) += skip-callchain-idx.o
+
 libperf-$(CONFIG_LIBUNWIND) += unwind-libunwind.o
+libperf-$(CONFIG_LIBDW_DWARF_UNWIND) += unwind-libdw.o
diff --git a/tools/perf/arch/powerpc/util/unwind-libdw.c b/tools/perf/arch/powerpc/util/unwind-libdw.c
new file mode 100644
index 0000000000000000000000000000000000000000..3a24b3c4327386dd0d32b6e00863bdf3a02e0aa6
--- /dev/null
+++ b/tools/perf/arch/powerpc/util/unwind-libdw.c
@@ -0,0 +1,73 @@
+#include <elfutils/libdwfl.h>
+#include "../../util/unwind-libdw.h"
+#include "../../util/perf_regs.h"
+#include "../../util/event.h"
+
+/* See backends/ppc_initreg.c and backends/ppc_regs.c in elfutils.  */
+static const int special_regs[3][2] = {
+	{ 65, PERF_REG_POWERPC_LINK },
+	{ 101, PERF_REG_POWERPC_XER },
+	{ 109, PERF_REG_POWERPC_CTR },
+};
+
+bool libdw__arch_set_initial_registers(Dwfl_Thread *thread, void *arg)
+{
+	struct unwind_info *ui = arg;
+	struct regs_dump *user_regs = &ui->sample->user_regs;
+	Dwarf_Word dwarf_regs[32], dwarf_nip;
+	size_t i;
+
+#define REG(r) ({						\
+	Dwarf_Word val = 0;					\
+	perf_reg_value(&val, user_regs, PERF_REG_POWERPC_##r);	\
+	val;							\
+})
+
+	dwarf_regs[0]  = REG(R0);
+	dwarf_regs[1]  = REG(R1);
+	dwarf_regs[2]  = REG(R2);
+	dwarf_regs[3]  = REG(R3);
+	dwarf_regs[4]  = REG(R4);
+	dwarf_regs[5]  = REG(R5);
+	dwarf_regs[6]  = REG(R6);
+	dwarf_regs[7]  = REG(R7);
+	dwarf_regs[8]  = REG(R8);
+	dwarf_regs[9]  = REG(R9);
+	dwarf_regs[10] = REG(R10);
+	dwarf_regs[11] = REG(R11);
+	dwarf_regs[12] = REG(R12);
+	dwarf_regs[13] = REG(R13);
+	dwarf_regs[14] = REG(R14);
+	dwarf_regs[15] = REG(R15);
+	dwarf_regs[16] = REG(R16);
+	dwarf_regs[17] = REG(R17);
+	dwarf_regs[18] = REG(R18);
+	dwarf_regs[19] = REG(R19);
+	dwarf_regs[20] = REG(R20);
+	dwarf_regs[21] = REG(R21);
+	dwarf_regs[22] = REG(R22);
+	dwarf_regs[23] = REG(R23);
+	dwarf_regs[24] = REG(R24);
+	dwarf_regs[25] = REG(R25);
+	dwarf_regs[26] = REG(R26);
+	dwarf_regs[27] = REG(R27);
+	dwarf_regs[28] = REG(R28);
+	dwarf_regs[29] = REG(R29);
+	dwarf_regs[30] = REG(R30);
+	dwarf_regs[31] = REG(R31);
+	if (!dwfl_thread_state_registers(thread, 0, 32, dwarf_regs))
+		return false;
+
+	dwarf_nip = REG(NIP);
+	dwfl_thread_state_register_pc(thread, dwarf_nip);
+	for (i = 0; i < ARRAY_SIZE(special_regs); i++) {
+		Dwarf_Word val = 0;
+		perf_reg_value(&val, user_regs, special_regs[i][1]);
+		if (!dwfl_thread_state_registers(thread,
+						 special_regs[i][0], 1,
+						 &val))
+			return false;
+	}
+
+	return true;
+}
diff --git a/tools/perf/arch/x86/util/intel-pt.c b/tools/perf/arch/x86/util/intel-pt.c
index 6fe667b3269eed8e563502f120757749435380ba..9535be57033f0c02b12d4471b2b61a272d630de0 100644
--- a/tools/perf/arch/x86/util/intel-pt.c
+++ b/tools/perf/arch/x86/util/intel-pt.c
@@ -192,6 +192,7 @@ static u64 intel_pt_default_config(struct perf_pmu *intel_pt_pmu)
 	int psb_cyc, psb_periods, psb_period;
 	int pos = 0;
 	u64 config;
+	char c;
 
 	pos += scnprintf(buf + pos, sizeof(buf) - pos, "tsc");
 
@@ -225,6 +226,10 @@ static u64 intel_pt_default_config(struct perf_pmu *intel_pt_pmu)
 		}
 	}
 
+	if (perf_pmu__scan_file(intel_pt_pmu, "format/pt", "%c", &c) == 1 &&
+	    perf_pmu__scan_file(intel_pt_pmu, "format/branch", "%c", &c) == 1)
+		pos += scnprintf(buf + pos, sizeof(buf) - pos, ",pt,branch");
+
 	pr_debug2("%s default config: %s\n", intel_pt_pmu->name, buf);
 
 	intel_pt_parse_terms(&intel_pt_pmu->format, buf, &config);
diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c
index db5261c3f719977bd4517c934c8bf0a1e63226e5..4bce7d8679cbad191d97905e418ea24c2ad6407c 100644
--- a/tools/perf/builtin-script.c
+++ b/tools/perf/builtin-script.c
@@ -385,7 +385,7 @@ static int perf_session__check_output_opt(struct perf_session *session)
 		 */
 		if (!evsel && output[j].user_set && !output[j].wildcard_set) {
 			pr_err("%s events do not exist. "
-			       "Remove corresponding -f option to proceed.\n",
+			       "Remove corresponding -F option to proceed.\n",
 			       event_type(j));
 			return -1;
 		}
diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index ad9324d1daf9f29a990a0d8f903563873ac91ef9..324363054c3fe1b8380acba5899d75473626a189 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -86,6 +86,7 @@
 #define DEFAULT_SEPARATOR	" "
 #define CNTR_NOT_SUPPORTED	"<not supported>"
 #define CNTR_NOT_COUNTED	"<not counted>"
+#define FREEZE_ON_SMI_PATH	"devices/cpu/freeze_on_smi"
 
 static void print_counters(struct timespec *ts, int argc, const char **argv);
 
@@ -122,6 +123,14 @@ static const char * topdown_attrs[] = {
 	NULL,
 };
 
+static const char *smi_cost_attrs = {
+	"{"
+	"msr/aperf/,"
+	"msr/smi/,"
+	"cycles"
+	"}"
+};
+
 static struct perf_evlist	*evsel_list;
 
 static struct target target = {
@@ -137,6 +146,8 @@ static bool			null_run			=  false;
 static int			detailed_run			=  0;
 static bool			transaction_run;
 static bool			topdown_run			= false;
+static bool			smi_cost			= false;
+static bool			smi_reset			= false;
 static bool			big_num				=  true;
 static int			big_num_opt			=  -1;
 static const char		*csv_sep			= NULL;
@@ -1782,6 +1793,8 @@ static const struct option stat_options[] = {
 			"Only print computed metrics. No raw values", enable_metric_only),
 	OPT_BOOLEAN(0, "topdown", &topdown_run,
 			"measure topdown level 1 statistics"),
+	OPT_BOOLEAN(0, "smi-cost", &smi_cost,
+			"measure SMI cost"),
 	OPT_END()
 };
 
@@ -2160,6 +2173,39 @@ static int add_default_attributes(void)
 		return 0;
 	}
 
+	if (smi_cost) {
+		int smi;
+
+		if (sysfs__read_int(FREEZE_ON_SMI_PATH, &smi) < 0) {
+			fprintf(stderr, "freeze_on_smi is not supported.\n");
+			return -1;
+		}
+
+		if (!smi) {
+			if (sysfs__write_int(FREEZE_ON_SMI_PATH, 1) < 0) {
+				fprintf(stderr, "Failed to set freeze_on_smi.\n");
+				return -1;
+			}
+			smi_reset = true;
+		}
+
+		if (pmu_have_event("msr", "aperf") &&
+		    pmu_have_event("msr", "smi")) {
+			if (!force_metric_only)
+				metric_only = true;
+			err = parse_events(evsel_list, smi_cost_attrs, NULL);
+		} else {
+			fprintf(stderr, "To measure SMI cost, it needs "
+				"msr/aperf/, msr/smi/ and cpu/cycles/ support\n");
+			return -1;
+		}
+		if (err) {
+			fprintf(stderr, "Cannot set up SMI cost events\n");
+			return -1;
+		}
+		return 0;
+	}
+
 	if (topdown_run) {
 		char *str = NULL;
 		bool warn = false;
@@ -2742,6 +2788,9 @@ int cmd_stat(int argc, const char **argv)
 	perf_stat__exit_aggr_mode();
 	perf_evlist__free_stats(evsel_list);
 out:
+	if (smi_cost && smi_reset)
+		sysfs__write_int(FREEZE_ON_SMI_PATH, 0);
+
 	perf_evlist__delete(evsel_list);
 	return status;
 }
diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
index 7f78f27f53824f1f17dda10fb0d37224d557fc99..6f4882f8d61fb2e1f23000c581134f837722712c 100644
--- a/tools/perf/util/evsel.c
+++ b/tools/perf/util/evsel.c
@@ -11,6 +11,7 @@
 #include <errno.h>
 #include <inttypes.h>
 #include <linux/bitops.h>
+#include <api/fs/fs.h>
 #include <api/fs/tracing_path.h>
 #include <traceevent/event-parse.h>
 #include <linux/hw_breakpoint.h>
@@ -19,6 +20,8 @@
 #include <linux/err.h>
 #include <sys/ioctl.h>
 #include <sys/resource.h>
+#include <sys/types.h>
+#include <dirent.h>
 #include "asm/bug.h"
 #include "callchain.h"
 #include "cgroup.h"
@@ -2472,6 +2475,42 @@ bool perf_evsel__fallback(struct perf_evsel *evsel, int err,
 	return false;
 }
 
+static bool find_process(const char *name)
+{
+	size_t len = strlen(name);
+	DIR *dir;
+	struct dirent *d;
+	int ret = -1;
+
+	dir = opendir(procfs__mountpoint());
+	if (!dir)
+		return false;
+
+	/* Walk through the directory. */
+	while (ret && (d = readdir(dir)) != NULL) {
+		char path[PATH_MAX];
+		char *data;
+		size_t size;
+
+		if ((d->d_type != DT_DIR) ||
+		     !strcmp(".", d->d_name) ||
+		     !strcmp("..", d->d_name))
+			continue;
+
+		scnprintf(path, sizeof(path), "%s/%s/comm",
+			  procfs__mountpoint(), d->d_name);
+
+		if (filename__read_str(path, &data, &size))
+			continue;
+
+		ret = strncmp(name, data, len);
+		free(data);
+	}
+
+	closedir(dir);
+	return ret ? false : true;
+}
+
 int perf_evsel__open_strerror(struct perf_evsel *evsel, struct target *target,
 			      int err, char *msg, size_t size)
 {
diff --git a/tools/perf/util/intel-pt-decoder/intel-pt-decoder.c b/tools/perf/util/intel-pt-decoder/intel-pt-decoder.c
index 7cf7f7aca4d2e80cd3f39cf83bc515b26daaf497..5dea06289db591df192f22ced0d21ad1f50fe73a 100644
--- a/tools/perf/util/intel-pt-decoder/intel-pt-decoder.c
+++ b/tools/perf/util/intel-pt-decoder/intel-pt-decoder.c
@@ -64,6 +64,25 @@ enum intel_pt_pkt_state {
 	INTEL_PT_STATE_FUP_NO_TIP,
 };
 
+static inline bool intel_pt_sample_time(enum intel_pt_pkt_state pkt_state)
+{
+	switch (pkt_state) {
+	case INTEL_PT_STATE_NO_PSB:
+	case INTEL_PT_STATE_NO_IP:
+	case INTEL_PT_STATE_ERR_RESYNC:
+	case INTEL_PT_STATE_IN_SYNC:
+	case INTEL_PT_STATE_TNT:
+		return true;
+	case INTEL_PT_STATE_TIP:
+	case INTEL_PT_STATE_TIP_PGD:
+	case INTEL_PT_STATE_FUP:
+	case INTEL_PT_STATE_FUP_NO_TIP:
+		return false;
+	default:
+		return true;
+	};
+}
+
 #ifdef INTEL_PT_STRICT
 #define INTEL_PT_STATE_ERR1	INTEL_PT_STATE_NO_PSB
 #define INTEL_PT_STATE_ERR2	INTEL_PT_STATE_NO_PSB
@@ -87,11 +106,13 @@ struct intel_pt_decoder {
 	const unsigned char *buf;
 	size_t len;
 	bool return_compression;
+	bool branch_enable;
 	bool mtc_insn;
 	bool pge;
 	bool have_tma;
 	bool have_cyc;
 	bool fixup_last_mtc;
+	bool have_last_ip;
 	uint64_t pos;
 	uint64_t last_ip;
 	uint64_t ip;
@@ -99,6 +120,7 @@ struct intel_pt_decoder {
 	uint64_t timestamp;
 	uint64_t tsc_timestamp;
 	uint64_t ref_timestamp;
+	uint64_t sample_timestamp;
 	uint64_t ret_addr;
 	uint64_t ctc_timestamp;
 	uint64_t ctc_delta;
@@ -119,6 +141,7 @@ struct intel_pt_decoder {
 	int pkt_len;
 	int last_packet_type;
 	unsigned int cbr;
+	unsigned int cbr_seen;
 	unsigned int max_non_turbo_ratio;
 	double max_non_turbo_ratio_fp;
 	double cbr_cyc_to_tsc;
@@ -136,9 +159,18 @@ struct intel_pt_decoder {
 	bool continuous_period;
 	bool overflow;
 	bool set_fup_tx_flags;
+	bool set_fup_ptw;
+	bool set_fup_mwait;
+	bool set_fup_pwre;
+	bool set_fup_exstop;
 	unsigned int fup_tx_flags;
 	unsigned int tx_flags;
+	uint64_t fup_ptw_payload;
+	uint64_t fup_mwait_payload;
+	uint64_t fup_pwre_payload;
+	uint64_t cbr_payload;
 	uint64_t timestamp_insn_cnt;
+	uint64_t sample_insn_cnt;
 	uint64_t stuck_ip;
 	int no_progress;
 	int stuck_ip_prd;
@@ -192,6 +224,7 @@ struct intel_pt_decoder *intel_pt_decoder_new(struct intel_pt_params *params)
 	decoder->pgd_ip             = params->pgd_ip;
 	decoder->data               = params->data;
 	decoder->return_compression = params->return_compression;
+	decoder->branch_enable      = params->branch_enable;
 
 	decoder->period             = params->period;
 	decoder->period_type        = params->period_type;
@@ -398,6 +431,7 @@ static uint64_t intel_pt_calc_ip(const struct intel_pt_pkt *packet,
 static inline void intel_pt_set_last_ip(struct intel_pt_decoder *decoder)
 {
 	decoder->last_ip = intel_pt_calc_ip(&decoder->packet, decoder->last_ip);
+	decoder->have_last_ip = true;
 }
 
 static inline void intel_pt_set_ip(struct intel_pt_decoder *decoder)
@@ -635,6 +669,8 @@ static int intel_pt_calc_cyc_cb(struct intel_pt_pkt_info *pkt_info)
 	case INTEL_PT_PAD:
 	case INTEL_PT_VMCS:
 	case INTEL_PT_MNT:
+	case INTEL_PT_PTWRITE:
+	case INTEL_PT_PTWRITE_IP:
 		return 0;
 
 	case INTEL_PT_MTC:
@@ -733,6 +769,11 @@ static int intel_pt_calc_cyc_cb(struct intel_pt_pkt_info *pkt_info)
 
 	case INTEL_PT_TIP_PGD:
 	case INTEL_PT_TRACESTOP:
+	case INTEL_PT_EXSTOP:
+	case INTEL_PT_EXSTOP_IP:
+	case INTEL_PT_MWAIT:
+	case INTEL_PT_PWRE:
+	case INTEL_PT_PWRX:
 	case INTEL_PT_OVF:
 	case INTEL_PT_BAD: /* Does not happen */
 	default:
@@ -898,6 +939,7 @@ static int intel_pt_walk_insn(struct intel_pt_decoder *decoder,
 
 	decoder->tot_insn_cnt += insn_cnt;
 	decoder->timestamp_insn_cnt += insn_cnt;
+	decoder->sample_insn_cnt += insn_cnt;
 	decoder->period_insn_cnt += insn_cnt;
 
 	if (err) {
@@ -990,6 +1032,57 @@ static int intel_pt_walk_insn(struct intel_pt_decoder *decoder,
 	return err;
 }
 
+static bool intel_pt_fup_event(struct intel_pt_decoder *decoder)
+{
+	bool ret = false;
+
+	if (decoder->set_fup_tx_flags) {
+		decoder->set_fup_tx_flags = false;
+		decoder->tx_flags = decoder->fup_tx_flags;
+		decoder->state.type = INTEL_PT_TRANSACTION;
+		decoder->state.from_ip = decoder->ip;
+		decoder->state.to_ip = 0;
+		decoder->state.flags = decoder->fup_tx_flags;
+		return true;
+	}
+	if (decoder->set_fup_ptw) {
+		decoder->set_fup_ptw = false;
+		decoder->state.type = INTEL_PT_PTW;
+		decoder->state.flags |= INTEL_PT_FUP_IP;
+		decoder->state.from_ip = decoder->ip;
+		decoder->state.to_ip = 0;
+		decoder->state.ptw_payload = decoder->fup_ptw_payload;
+		return true;
+	}
+	if (decoder->set_fup_mwait) {
+		decoder->set_fup_mwait = false;
+		decoder->state.type = INTEL_PT_MWAIT_OP;
+		decoder->state.from_ip = decoder->ip;
+		decoder->state.to_ip = 0;
+		decoder->state.mwait_payload = decoder->fup_mwait_payload;
+		ret = true;
+	}
+	if (decoder->set_fup_pwre) {
+		decoder->set_fup_pwre = false;
+		decoder->state.type |= INTEL_PT_PWR_ENTRY;
+		decoder->state.type &= ~INTEL_PT_BRANCH;
+		decoder->state.from_ip = decoder->ip;
+		decoder->state.to_ip = 0;
+		decoder->state.pwre_payload = decoder->fup_pwre_payload;
+		ret = true;
+	}
+	if (decoder->set_fup_exstop) {
+		decoder->set_fup_exstop = false;
+		decoder->state.type |= INTEL_PT_EX_STOP;
+		decoder->state.type &= ~INTEL_PT_BRANCH;
+		decoder->state.flags |= INTEL_PT_FUP_IP;
+		decoder->state.from_ip = decoder->ip;
+		decoder->state.to_ip = 0;
+		ret = true;
+	}
+	return ret;
+}
+
 static int intel_pt_walk_fup(struct intel_pt_decoder *decoder)
 {
 	struct intel_pt_insn intel_pt_insn;
@@ -1003,15 +1096,8 @@ static int intel_pt_walk_fup(struct intel_pt_decoder *decoder)
 		if (err == INTEL_PT_RETURN)
 			return 0;
 		if (err == -EAGAIN) {
-			if (decoder->set_fup_tx_flags) {
-				decoder->set_fup_tx_flags = false;
-				decoder->tx_flags = decoder->fup_tx_flags;
-				decoder->state.type = INTEL_PT_TRANSACTION;
-				decoder->state.from_ip = decoder->ip;
-				decoder->state.to_ip = 0;
-				decoder->state.flags = decoder->fup_tx_flags;
+			if (intel_pt_fup_event(decoder))
 				return 0;
-			}
 			return err;
 		}
 		decoder->set_fup_tx_flags = false;
@@ -1360,7 +1446,9 @@ static void intel_pt_calc_mtc_timestamp(struct intel_pt_decoder *decoder)
 
 static void intel_pt_calc_cbr(struct intel_pt_decoder *decoder)
 {
-	unsigned int cbr = decoder->packet.payload;
+	unsigned int cbr = decoder->packet.payload & 0xff;
+
+	decoder->cbr_payload = decoder->packet.payload;
 
 	if (decoder->cbr == cbr)
 		return;
@@ -1417,6 +1505,13 @@ static int intel_pt_walk_psbend(struct intel_pt_decoder *decoder)
 		case INTEL_PT_TRACESTOP:
 		case INTEL_PT_BAD:
 		case INTEL_PT_PSB:
+		case INTEL_PT_PTWRITE:
+		case INTEL_PT_PTWRITE_IP:
+		case INTEL_PT_EXSTOP:
+		case INTEL_PT_EXSTOP_IP:
+		case INTEL_PT_MWAIT:
+		case INTEL_PT_PWRE:
+		case INTEL_PT_PWRX:
 			decoder->have_tma = false;
 			intel_pt_log("ERROR: Unexpected packet\n");
 			return -EAGAIN;
@@ -1446,7 +1541,8 @@ static int intel_pt_walk_psbend(struct intel_pt_decoder *decoder)
 
 		case INTEL_PT_FUP:
 			decoder->pge = true;
-			intel_pt_set_last_ip(decoder);
+			if (decoder->packet.count)
+				intel_pt_set_last_ip(decoder);
 			break;
 
 		case INTEL_PT_MODE_TSX:
@@ -1497,6 +1593,13 @@ static int intel_pt_walk_fup_tip(struct intel_pt_decoder *decoder)
 		case INTEL_PT_MODE_TSX:
 		case INTEL_PT_BAD:
 		case INTEL_PT_PSBEND:
+		case INTEL_PT_PTWRITE:
+		case INTEL_PT_PTWRITE_IP:
+		case INTEL_PT_EXSTOP:
+		case INTEL_PT_EXSTOP_IP:
+		case INTEL_PT_MWAIT:
+		case INTEL_PT_PWRE:
+		case INTEL_PT_PWRX:
 			intel_pt_log("ERROR: Missing TIP after FUP\n");
 			decoder->pkt_state = INTEL_PT_STATE_ERR3;
 			return -ENOENT;
@@ -1625,6 +1728,15 @@ static int intel_pt_walk_trace(struct intel_pt_decoder *decoder)
 				break;
 			}
 			intel_pt_set_last_ip(decoder);
+			if (!decoder->branch_enable) {
+				decoder->ip = decoder->last_ip;
+				if (intel_pt_fup_event(decoder))
+					return 0;
+				no_tip = false;
+				break;
+			}
+			if (decoder->set_fup_mwait)
+				no_tip = true;
 			err = intel_pt_walk_fup(decoder);
 			if (err != -EAGAIN) {
 				if (err)
@@ -1650,6 +1762,8 @@ static int intel_pt_walk_trace(struct intel_pt_decoder *decoder)
 			break;
 
 		case INTEL_PT_PSB:
+			decoder->last_ip = 0;
+			decoder->have_last_ip = true;
 			intel_pt_clear_stack(&decoder->stack);
 			err = intel_pt_walk_psbend(decoder);
 			if (err == -EAGAIN)
@@ -1696,6 +1810,16 @@ static int intel_pt_walk_trace(struct intel_pt_decoder *decoder)
 
 		case INTEL_PT_CBR:
 			intel_pt_calc_cbr(decoder);
+			if (!decoder->branch_enable &&
+			    decoder->cbr != decoder->cbr_seen) {
+				decoder->cbr_seen = decoder->cbr;
+				decoder->state.type = INTEL_PT_CBR_CHG;
+				decoder->state.from_ip = decoder->ip;
+				decoder->state.to_ip = 0;
+				decoder->state.cbr_payload =
+							decoder->packet.payload;
+				return 0;
+			}
 			break;
 
 		case INTEL_PT_MODE_EXEC:
@@ -1722,6 +1846,71 @@ static int intel_pt_walk_trace(struct intel_pt_decoder *decoder)
 		case INTEL_PT_PAD:
 			break;
 
+		case INTEL_PT_PTWRITE_IP:
+			decoder->fup_ptw_payload = decoder->packet.payload;
+			err = intel_pt_get_next_packet(decoder);
+			if (err)
+				return err;
+			if (decoder->packet.type == INTEL_PT_FUP) {
+				decoder->set_fup_ptw = true;
+				no_tip = true;
+			} else {
+				intel_pt_log_at("ERROR: Missing FUP after PTWRITE",
+						decoder->pos);
+			}
+			goto next;
+
+		case INTEL_PT_PTWRITE:
+			decoder->state.type = INTEL_PT_PTW;
+			decoder->state.from_ip = decoder->ip;
+			decoder->state.to_ip = 0;
+			decoder->state.ptw_payload = decoder->packet.payload;
+			return 0;
+
+		case INTEL_PT_MWAIT:
+			decoder->fup_mwait_payload = decoder->packet.payload;
+			decoder->set_fup_mwait = true;
+			break;
+
+		case INTEL_PT_PWRE:
+			if (decoder->set_fup_mwait) {
+				decoder->fup_pwre_payload =
+							decoder->packet.payload;
+				decoder->set_fup_pwre = true;
+				break;
+			}
+			decoder->state.type = INTEL_PT_PWR_ENTRY;
+			decoder->state.from_ip = decoder->ip;
+			decoder->state.to_ip = 0;
+			decoder->state.pwrx_payload = decoder->packet.payload;
+			return 0;
+
+		case INTEL_PT_EXSTOP_IP:
+			err = intel_pt_get_next_packet(decoder);
+			if (err)
+				return err;
+			if (decoder->packet.type == INTEL_PT_FUP) {
+				decoder->set_fup_exstop = true;
+				no_tip = true;
+			} else {
+				intel_pt_log_at("ERROR: Missing FUP after EXSTOP",
+						decoder->pos);
+			}
+			goto next;
+
+		case INTEL_PT_EXSTOP:
+			decoder->state.type = INTEL_PT_EX_STOP;
+			decoder->state.from_ip = decoder->ip;
+			decoder->state.to_ip = 0;
+			return 0;
+
+		case INTEL_PT_PWRX:
+			decoder->state.type = INTEL_PT_PWR_EXIT;
+			decoder->state.from_ip = decoder->ip;
+			decoder->state.to_ip = 0;
+			decoder->state.pwrx_payload = decoder->packet.payload;
+			return 0;
+
 		default:
 			return intel_pt_bug(decoder);
 		}
@@ -1730,8 +1919,9 @@ static int intel_pt_walk_trace(struct intel_pt_decoder *decoder)
 
 static inline bool intel_pt_have_ip(struct intel_pt_decoder *decoder)
 {
-	return decoder->last_ip || decoder->packet.count == 0 ||
-	       decoder->packet.count == 3 || decoder->packet.count == 6;
+	return decoder->packet.count &&
+	       (decoder->have_last_ip || decoder->packet.count == 3 ||
+		decoder->packet.count == 6);
 }
 
 /* Walk PSB+ packets to get in sync. */
@@ -1750,6 +1940,13 @@ static int intel_pt_walk_psb(struct intel_pt_decoder *decoder)
 			__fallthrough;
 		case INTEL_PT_TIP_PGE:
 		case INTEL_PT_TIP:
+		case INTEL_PT_PTWRITE:
+		case INTEL_PT_PTWRITE_IP:
+		case INTEL_PT_EXSTOP:
+		case INTEL_PT_EXSTOP_IP:
+		case INTEL_PT_MWAIT:
+		case INTEL_PT_PWRE:
+		case INTEL_PT_PWRX:
 			intel_pt_log("ERROR: Unexpected packet\n");
 			return -ENOENT;
 
@@ -1854,14 +2051,10 @@ static int intel_pt_walk_to_ip(struct intel_pt_decoder *decoder)
 			break;
 
 		case INTEL_PT_FUP:
-			if (decoder->overflow) {
-				if (intel_pt_have_ip(decoder))
-					intel_pt_set_ip(decoder);
-				if (decoder->ip)
-					return 0;
-			}
-			if (decoder->packet.count)
-				intel_pt_set_last_ip(decoder);
+			if (intel_pt_have_ip(decoder))
+				intel_pt_set_ip(decoder);
+			if (decoder->ip)
+				return 0;
 			break;
 
 		case INTEL_PT_MTC:
@@ -1910,6 +2103,9 @@ static int intel_pt_walk_to_ip(struct intel_pt_decoder *decoder)
 			break;
 
 		case INTEL_PT_PSB:
+			decoder->last_ip = 0;
+			decoder->have_last_ip = true;
+			intel_pt_clear_stack(&decoder->stack);
 			err = intel_pt_walk_psb(decoder);
 			if (err)
 				return err;
@@ -1925,6 +2121,13 @@ static int intel_pt_walk_to_ip(struct intel_pt_decoder *decoder)
 		case INTEL_PT_VMCS:
 		case INTEL_PT_MNT:
 		case INTEL_PT_PAD:
+		case INTEL_PT_PTWRITE:
+		case INTEL_PT_PTWRITE_IP:
+		case INTEL_PT_EXSTOP:
+		case INTEL_PT_EXSTOP_IP:
+		case INTEL_PT_MWAIT:
+		case INTEL_PT_PWRE:
+		case INTEL_PT_PWRX:
 		default:
 			break;
 		}
@@ -1935,6 +2138,19 @@ static int intel_pt_sync_ip(struct intel_pt_decoder *decoder)
 {
 	int err;
 
+	decoder->set_fup_tx_flags = false;
+	decoder->set_fup_ptw = false;
+	decoder->set_fup_mwait = false;
+	decoder->set_fup_pwre = false;
+	decoder->set_fup_exstop = false;
+
+	if (!decoder->branch_enable) {
+		decoder->pkt_state = INTEL_PT_STATE_IN_SYNC;
+		decoder->overflow = false;
+		decoder->state.type = 0; /* Do not have a sample */
+		return 0;
+	}
+
 	intel_pt_log("Scanning for full IP\n");
 	err = intel_pt_walk_to_ip(decoder);
 	if (err)
@@ -2043,6 +2259,7 @@ static int intel_pt_sync(struct intel_pt_decoder *decoder)
 
 	decoder->pge = false;
 	decoder->continuous_period = false;
+	decoder->have_last_ip = false;
 	decoder->last_ip = 0;
 	decoder->ip = 0;
 	intel_pt_clear_stack(&decoder->stack);
@@ -2051,6 +2268,7 @@ static int intel_pt_sync(struct intel_pt_decoder *decoder)
 	if (err)
 		return err;
 
+	decoder->have_last_ip = true;
 	decoder->pkt_state = INTEL_PT_STATE_NO_IP;
 
 	err = intel_pt_walk_psb(decoder);
@@ -2069,7 +2287,7 @@ static int intel_pt_sync(struct intel_pt_decoder *decoder)
 
 static uint64_t intel_pt_est_timestamp(struct intel_pt_decoder *decoder)
 {
-	uint64_t est = decoder->timestamp_insn_cnt << 1;
+	uint64_t est = decoder->sample_insn_cnt << 1;
 
 	if (!decoder->cbr || !decoder->max_non_turbo_ratio)
 		goto out;
@@ -2077,7 +2295,7 @@ static uint64_t intel_pt_est_timestamp(struct intel_pt_decoder *decoder)
 	est *= decoder->max_non_turbo_ratio;
 	est /= decoder->cbr;
 out:
-	return decoder->timestamp + est;
+	return decoder->sample_timestamp + est;
 }
 
 const struct intel_pt_state *intel_pt_decode(struct intel_pt_decoder *decoder)
@@ -2093,8 +2311,10 @@ const struct intel_pt_state *intel_pt_decode(struct intel_pt_decoder *decoder)
 			err = intel_pt_sync(decoder);
 			break;
 		case INTEL_PT_STATE_NO_IP:
+			decoder->have_last_ip = false;
 			decoder->last_ip = 0;
-			/* Fall through */
+			decoder->ip = 0;
+			__fallthrough;
 		case INTEL_PT_STATE_ERR_RESYNC:
 			err = intel_pt_sync_ip(decoder);
 			break;
@@ -2130,15 +2350,29 @@ const struct intel_pt_state *intel_pt_decode(struct intel_pt_decoder *decoder)
 		}
 	} while (err == -ENOLINK);
 
-	decoder->state.err = err ? intel_pt_ext_err(err) : 0;
-	decoder->state.timestamp = decoder->timestamp;
+	if (err) {
+		decoder->state.err = intel_pt_ext_err(err);
+		decoder->state.from_ip = decoder->ip;
+		decoder->sample_timestamp = decoder->timestamp;
+		decoder->sample_insn_cnt = decoder->timestamp_insn_cnt;
+	} else {
+		decoder->state.err = 0;
+		if (decoder->cbr != decoder->cbr_seen && decoder->state.type) {
+			decoder->cbr_seen = decoder->cbr;
+			decoder->state.type |= INTEL_PT_CBR_CHG;
+			decoder->state.cbr_payload = decoder->cbr_payload;
+		}
+		if (intel_pt_sample_time(decoder->pkt_state)) {
+			decoder->sample_timestamp = decoder->timestamp;
+			decoder->sample_insn_cnt = decoder->timestamp_insn_cnt;
+		}
+	}
+
+	decoder->state.timestamp = decoder->sample_timestamp;
 	decoder->state.est_timestamp = intel_pt_est_timestamp(decoder);
 	decoder->state.cr3 = decoder->cr3;
 	decoder->state.tot_insn_cnt = decoder->tot_insn_cnt;
 
-	if (err)
-		decoder->state.from_ip = decoder->ip;
-
 	return &decoder->state;
 }
 
diff --git a/tools/perf/util/intel-pt-decoder/intel-pt-decoder.h b/tools/perf/util/intel-pt-decoder/intel-pt-decoder.h
index e90619a43c0cefdd6edebb0369e77dbf46017c21..921b22e8ca0eb5a00a1172e1d10f89f4752d429f 100644
--- a/tools/perf/util/intel-pt-decoder/intel-pt-decoder.h
+++ b/tools/perf/util/intel-pt-decoder/intel-pt-decoder.h
@@ -25,11 +25,18 @@
 #define INTEL_PT_IN_TX		(1 << 0)
 #define INTEL_PT_ABORT_TX	(1 << 1)
 #define INTEL_PT_ASYNC		(1 << 2)
+#define INTEL_PT_FUP_IP		(1 << 3)
 
 enum intel_pt_sample_type {
 	INTEL_PT_BRANCH		= 1 << 0,
 	INTEL_PT_INSTRUCTION	= 1 << 1,
 	INTEL_PT_TRANSACTION	= 1 << 2,
+	INTEL_PT_PTW		= 1 << 3,
+	INTEL_PT_MWAIT_OP	= 1 << 4,
+	INTEL_PT_PWR_ENTRY	= 1 << 5,
+	INTEL_PT_EX_STOP	= 1 << 6,
+	INTEL_PT_PWR_EXIT	= 1 << 7,
+	INTEL_PT_CBR_CHG	= 1 << 8,
 };
 
 enum intel_pt_period_type {
@@ -63,6 +70,11 @@ struct intel_pt_state {
 	uint64_t timestamp;
 	uint64_t est_timestamp;
 	uint64_t trace_nr;
+	uint64_t ptw_payload;
+	uint64_t mwait_payload;
+	uint64_t pwre_payload;
+	uint64_t pwrx_payload;
+	uint64_t cbr_payload;
 	uint32_t flags;
 	enum intel_pt_insn_op insn_op;
 	int insn_len;
@@ -87,6 +99,7 @@ struct intel_pt_params {
 	bool (*pgd_ip)(uint64_t ip, void *data);
 	void *data;
 	bool return_compression;
+	bool branch_enable;
 	uint64_t period;
 	enum intel_pt_period_type period_type;
 	unsigned max_non_turbo_ratio;
diff --git a/tools/perf/util/intel-pt-decoder/intel-pt-pkt-decoder.c b/tools/perf/util/intel-pt-decoder/intel-pt-pkt-decoder.c
index 7528ae4f7e28e1d419c699759125c979e8dc1397..ba4c9dd186434a33c8c33a59ab8884fd7c679dd3 100644
--- a/tools/perf/util/intel-pt-decoder/intel-pt-pkt-decoder.c
+++ b/tools/perf/util/intel-pt-decoder/intel-pt-pkt-decoder.c
@@ -64,6 +64,13 @@ static const char * const packet_name[] = {
 	[INTEL_PT_PIP]		= "PIP",
 	[INTEL_PT_OVF]		= "OVF",
 	[INTEL_PT_MNT]		= "MNT",
+	[INTEL_PT_PTWRITE]	= "PTWRITE",
+	[INTEL_PT_PTWRITE_IP]	= "PTWRITE",
+	[INTEL_PT_EXSTOP]	= "EXSTOP",
+	[INTEL_PT_EXSTOP_IP]	= "EXSTOP",
+	[INTEL_PT_MWAIT]	= "MWAIT",
+	[INTEL_PT_PWRE]		= "PWRE",
+	[INTEL_PT_PWRX]		= "PWRX",
 };
 
 const char *intel_pt_pkt_name(enum intel_pt_pkt_type type)
@@ -123,7 +130,7 @@ static int intel_pt_get_cbr(const unsigned char *buf, size_t len,
 	if (len < 4)
 		return INTEL_PT_NEED_MORE_BYTES;
 	packet->type = INTEL_PT_CBR;
-	packet->payload = buf[2];
+	packet->payload = le16_to_cpu(*(uint16_t *)(buf + 2));
 	return 4;
 }
 
@@ -217,12 +224,80 @@ static int intel_pt_get_3byte(const unsigned char *buf, size_t len,
 	}
 }
 
+static int intel_pt_get_ptwrite(const unsigned char *buf, size_t len,
+				struct intel_pt_pkt *packet)
+{
+	packet->count = (buf[1] >> 5) & 0x3;
+	packet->type = buf[1] & BIT(7) ? INTEL_PT_PTWRITE_IP :
+					 INTEL_PT_PTWRITE;
+
+	switch (packet->count) {
+	case 0:
+		if (len < 6)
+			return INTEL_PT_NEED_MORE_BYTES;
+		packet->payload = le32_to_cpu(*(uint32_t *)(buf + 2));
+		return 6;
+	case 1:
+		if (len < 10)
+			return INTEL_PT_NEED_MORE_BYTES;
+		packet->payload = le64_to_cpu(*(uint64_t *)(buf + 2));
+		return 10;
+	default:
+		return INTEL_PT_BAD_PACKET;
+	}
+}
+
+static int intel_pt_get_exstop(struct intel_pt_pkt *packet)
+{
+	packet->type = INTEL_PT_EXSTOP;
+	return 2;
+}
+
+static int intel_pt_get_exstop_ip(struct intel_pt_pkt *packet)
+{
+	packet->type = INTEL_PT_EXSTOP_IP;
+	return 2;
+}
+
+static int intel_pt_get_mwait(const unsigned char *buf, size_t len,
+			      struct intel_pt_pkt *packet)
+{
+	if (len < 10)
+		return INTEL_PT_NEED_MORE_BYTES;
+	packet->type = INTEL_PT_MWAIT;
+	packet->payload = le64_to_cpu(*(uint64_t *)(buf + 2));
+	return 10;
+}
+
+static int intel_pt_get_pwre(const unsigned char *buf, size_t len,
+			     struct intel_pt_pkt *packet)
+{
+	if (len < 4)
+		return INTEL_PT_NEED_MORE_BYTES;
+	packet->type = INTEL_PT_PWRE;
+	memcpy_le64(&packet->payload, buf + 2, 2);
+	return 4;
+}
+
+static int intel_pt_get_pwrx(const unsigned char *buf, size_t len,
+			     struct intel_pt_pkt *packet)
+{
+	if (len < 7)
+		return INTEL_PT_NEED_MORE_BYTES;
+	packet->type = INTEL_PT_PWRX;
+	memcpy_le64(&packet->payload, buf + 2, 5);
+	return 7;
+}
+
 static int intel_pt_get_ext(const unsigned char *buf, size_t len,
 			    struct intel_pt_pkt *packet)
 {
 	if (len < 2)
 		return INTEL_PT_NEED_MORE_BYTES;
 
+	if ((buf[1] & 0x1f) == 0x12)
+		return intel_pt_get_ptwrite(buf, len, packet);
+
 	switch (buf[1]) {
 	case 0xa3: /* Long TNT */
 		return intel_pt_get_long_tnt(buf, len, packet);
@@ -244,6 +319,16 @@ static int intel_pt_get_ext(const unsigned char *buf, size_t len,
 		return intel_pt_get_tma(buf, len, packet);
 	case 0xC3: /* 3-byte header */
 		return intel_pt_get_3byte(buf, len, packet);
+	case 0x62: /* EXSTOP no IP */
+		return intel_pt_get_exstop(packet);
+	case 0xE2: /* EXSTOP with IP */
+		return intel_pt_get_exstop_ip(packet);
+	case 0xC2: /* MWAIT */
+		return intel_pt_get_mwait(buf, len, packet);
+	case 0x22: /* PWRE */
+		return intel_pt_get_pwre(buf, len, packet);
+	case 0xA2: /* PWRX */
+		return intel_pt_get_pwrx(buf, len, packet);
 	default:
 		return INTEL_PT_BAD_PACKET;
 	}
@@ -522,6 +607,29 @@ int intel_pt_pkt_desc(const struct intel_pt_pkt *packet, char *buf,
 		ret = snprintf(buf, buf_len, "%s 0x%llx (NR=%d)",
 			       name, payload, nr);
 		return ret;
+	case INTEL_PT_PTWRITE:
+		return snprintf(buf, buf_len, "%s 0x%llx IP:0", name, payload);
+	case INTEL_PT_PTWRITE_IP:
+		return snprintf(buf, buf_len, "%s 0x%llx IP:1", name, payload);
+	case INTEL_PT_EXSTOP:
+		return snprintf(buf, buf_len, "%s IP:0", name);
+	case INTEL_PT_EXSTOP_IP:
+		return snprintf(buf, buf_len, "%s IP:1", name);
+	case INTEL_PT_MWAIT:
+		return snprintf(buf, buf_len, "%s 0x%llx Hints 0x%x Extensions 0x%x",
+				name, payload, (unsigned int)(payload & 0xff),
+				(unsigned int)((payload >> 32) & 0x3));
+	case INTEL_PT_PWRE:
+		return snprintf(buf, buf_len, "%s 0x%llx HW:%u CState:%u Sub-CState:%u",
+				name, payload, !!(payload & 0x80),
+				(unsigned int)((payload >> 12) & 0xf),
+				(unsigned int)((payload >> 8) & 0xf));
+	case INTEL_PT_PWRX:
+		return snprintf(buf, buf_len, "%s 0x%llx Last CState:%u Deepest CState:%u Wake Reason 0x%x",
+				name, payload,
+				(unsigned int)((payload >> 4) & 0xf),
+				(unsigned int)(payload & 0xf),
+				(unsigned int)((payload >> 8) & 0xf));
 	default:
 		break;
 	}
diff --git a/tools/perf/util/intel-pt-decoder/intel-pt-pkt-decoder.h b/tools/perf/util/intel-pt-decoder/intel-pt-pkt-decoder.h
index 781bb79883bd612bb803318daff7248710f6c107..73ddc3a88d0749eddc31b4a7a98982921c4edc78 100644
--- a/tools/perf/util/intel-pt-decoder/intel-pt-pkt-decoder.h
+++ b/tools/perf/util/intel-pt-decoder/intel-pt-pkt-decoder.h
@@ -52,6 +52,13 @@ enum intel_pt_pkt_type {
 	INTEL_PT_PIP,
 	INTEL_PT_OVF,
 	INTEL_PT_MNT,
+	INTEL_PT_PTWRITE,
+	INTEL_PT_PTWRITE_IP,
+	INTEL_PT_EXSTOP,
+	INTEL_PT_EXSTOP_IP,
+	INTEL_PT_MWAIT,
+	INTEL_PT_PWRE,
+	INTEL_PT_PWRX,
 };
 
 struct intel_pt_pkt {
diff --git a/tools/perf/util/intel-pt.c b/tools/perf/util/intel-pt.c
index 4c7718f87a0890ee64e2f78e0b97291b67ecf33e..6df836469f2b4987651c8197f4bfafc823819c36 100644
--- a/tools/perf/util/intel-pt.c
+++ b/tools/perf/util/intel-pt.c
@@ -668,6 +668,19 @@ static bool intel_pt_return_compression(struct intel_pt *pt)
 	return true;
 }
 
+static bool intel_pt_branch_enable(struct intel_pt *pt)
+{
+	struct perf_evsel *evsel;
+	u64 config;
+
+	evlist__for_each_entry(pt->session->evlist, evsel) {
+		if (intel_pt_get_config(pt, &evsel->attr, &config) &&
+		    (config & 1) && !(config & 0x2000))
+			return false;
+	}
+	return true;
+}
+
 static unsigned int intel_pt_mtc_period(struct intel_pt *pt)
 {
 	struct perf_evsel *evsel;
@@ -799,6 +812,7 @@ static struct intel_pt_queue *intel_pt_alloc_queue(struct intel_pt *pt,
 	params.walk_insn = intel_pt_walk_next_insn;
 	params.data = ptq;
 	params.return_compression = intel_pt_return_compression(pt);
+	params.branch_enable = intel_pt_branch_enable(pt);
 	params.max_non_turbo_ratio = pt->max_non_turbo_ratio;
 	params.mtc_period = intel_pt_mtc_period(pt);
 	params.tsc_ctc_ratio_n = pt->tsc_ctc_ratio_n;
@@ -1308,18 +1322,14 @@ static int intel_pt_sample(struct intel_pt_queue *ptq)
 	ptq->have_sample = false;
 
 	if (pt->sample_instructions &&
-	    (state->type & INTEL_PT_INSTRUCTION) &&
-	    (!pt->synth_opts.initial_skip ||
-	     pt->num_events++ >= pt->synth_opts.initial_skip)) {
+	    (state->type & INTEL_PT_INSTRUCTION)) {
 		err = intel_pt_synth_instruction_sample(ptq);
 		if (err)
 			return err;
 	}
 
 	if (pt->sample_transactions &&
-	    (state->type & INTEL_PT_TRANSACTION) &&
-	    (!pt->synth_opts.initial_skip ||
-	     pt->num_events++ >= pt->synth_opts.initial_skip)) {
+	    (state->type & INTEL_PT_TRANSACTION)) {
 		err = intel_pt_synth_transaction_sample(ptq);
 		if (err)
 			return err;
@@ -2025,6 +2035,7 @@ static int intel_pt_synth_events(struct intel_pt *pt,
 			return err;
 		}
 		pt->sample_transactions = true;
+		pt->transactions_sample_type = attr.sample_type;
 		pt->transactions_id = id;
 		id += 1;
 		evlist__for_each_entry(evlist, evsel) {
diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c
index 7dc1096264c575cbbd1cd41d07f041585a997453..d19c40a8104027117a2d30ad2a6aad6af8207025 100644
--- a/tools/perf/util/session.c
+++ b/tools/perf/util/session.c
@@ -2035,7 +2035,7 @@ int perf_session__cpu_bitmap(struct perf_session *session,
 
 		if (!(evsel->attr.sample_type & PERF_SAMPLE_CPU)) {
 			pr_err("File does not contain CPU events. "
-			       "Remove -c option to proceed.\n");
+			       "Remove -C option to proceed.\n");
 			return -1;
 		}
 	}
diff --git a/tools/perf/util/stat-shadow.c b/tools/perf/util/stat-shadow.c
index ac10cc675d39579bfca249abbc4353e7c9accc6e..719d6cb86952e5c6482e7a09070370360efe2030 100644
--- a/tools/perf/util/stat-shadow.c
+++ b/tools/perf/util/stat-shadow.c
@@ -44,6 +44,8 @@ static struct stats runtime_topdown_slots_issued[NUM_CTX][MAX_NR_CPUS];
 static struct stats runtime_topdown_slots_retired[NUM_CTX][MAX_NR_CPUS];
 static struct stats runtime_topdown_fetch_bubbles[NUM_CTX][MAX_NR_CPUS];
 static struct stats runtime_topdown_recovery_bubbles[NUM_CTX][MAX_NR_CPUS];
+static struct stats runtime_smi_num_stats[NUM_CTX][MAX_NR_CPUS];
+static struct stats runtime_aperf_stats[NUM_CTX][MAX_NR_CPUS];
 static struct rblist runtime_saved_values;
 static bool have_frontend_stalled;
 
@@ -157,6 +159,8 @@ void perf_stat__reset_shadow_stats(void)
 	memset(runtime_topdown_slots_issued, 0, sizeof(runtime_topdown_slots_issued));
 	memset(runtime_topdown_fetch_bubbles, 0, sizeof(runtime_topdown_fetch_bubbles));
 	memset(runtime_topdown_recovery_bubbles, 0, sizeof(runtime_topdown_recovery_bubbles));
+	memset(runtime_smi_num_stats, 0, sizeof(runtime_smi_num_stats));
+	memset(runtime_aperf_stats, 0, sizeof(runtime_aperf_stats));
 
 	next = rb_first(&runtime_saved_values.entries);
 	while (next) {
@@ -217,6 +221,10 @@ void perf_stat__update_shadow_stats(struct perf_evsel *counter, u64 *count,
 		update_stats(&runtime_dtlb_cache_stats[ctx][cpu], count[0]);
 	else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_ITLB))
 		update_stats(&runtime_itlb_cache_stats[ctx][cpu], count[0]);
+	else if (perf_stat_evsel__is(counter, SMI_NUM))
+		update_stats(&runtime_smi_num_stats[ctx][cpu], count[0]);
+	else if (perf_stat_evsel__is(counter, APERF))
+		update_stats(&runtime_aperf_stats[ctx][cpu], count[0]);
 
 	if (counter->collect_stat) {
 		struct saved_value *v = saved_value_lookup(counter, cpu, ctx,
@@ -592,6 +600,29 @@ static double td_be_bound(int ctx, int cpu)
 	return sanitize_val(1.0 - sum);
 }
 
+static void print_smi_cost(int cpu, struct perf_evsel *evsel,
+			   struct perf_stat_output_ctx *out)
+{
+	double smi_num, aperf, cycles, cost = 0.0;
+	int ctx = evsel_context(evsel);
+	const char *color = NULL;
+
+	smi_num = avg_stats(&runtime_smi_num_stats[ctx][cpu]);
+	aperf = avg_stats(&runtime_aperf_stats[ctx][cpu]);
+	cycles = avg_stats(&runtime_cycles_stats[ctx][cpu]);
+
+	if ((cycles == 0) || (aperf == 0))
+		return;
+
+	if (smi_num)
+		cost = (aperf - cycles) / aperf * 100.00;
+
+	if (cost > 10)
+		color = PERF_COLOR_RED;
+	out->print_metric(out->ctx, color, "%8.1f%%", "SMI cycles%", cost);
+	out->print_metric(out->ctx, NULL, "%4.0f", "SMI#", smi_num);
+}
+
 void perf_stat__print_shadow_stats(struct perf_evsel *evsel,
 				   double avg, int cpu,
 				   struct perf_stat_output_ctx *out)
@@ -825,6 +856,8 @@ void perf_stat__print_shadow_stats(struct perf_evsel *evsel,
 		}
 		snprintf(unit_buf, sizeof(unit_buf), "%c/sec", unit);
 		print_metric(ctxp, NULL, "%8.3f", unit_buf, ratio);
+	} else if (perf_stat_evsel__is(evsel, SMI_NUM)) {
+		print_smi_cost(cpu, evsel, out);
 	} else {
 		print_metric(ctxp, NULL, NULL, NULL, 0);
 	}
diff --git a/tools/perf/util/stat.c b/tools/perf/util/stat.c
index c58174443dc12c7fad840d8b67a34223e5283f9f..53b9a994a3dc9e50aca6da360d4781d93f52dfc9 100644
--- a/tools/perf/util/stat.c
+++ b/tools/perf/util/stat.c
@@ -86,6 +86,8 @@ static const char *id_str[PERF_STAT_EVSEL_ID__MAX] = {
 	ID(TOPDOWN_SLOTS_RETIRED, topdown-slots-retired),
 	ID(TOPDOWN_FETCH_BUBBLES, topdown-fetch-bubbles),
 	ID(TOPDOWN_RECOVERY_BUBBLES, topdown-recovery-bubbles),
+	ID(SMI_NUM, msr/smi/),
+	ID(APERF, msr/aperf/),
 };
 #undef ID
 
diff --git a/tools/perf/util/stat.h b/tools/perf/util/stat.h
index 0a65ae23f49504874bf82134d4e9ed40d5408a7f..7522bf10b03e2fcbf26b9c67bebd41e56bff310b 100644
--- a/tools/perf/util/stat.h
+++ b/tools/perf/util/stat.h
@@ -22,6 +22,8 @@ enum perf_stat_evsel_id {
 	PERF_STAT_EVSEL_ID__TOPDOWN_SLOTS_RETIRED,
 	PERF_STAT_EVSEL_ID__TOPDOWN_FETCH_BUBBLES,
 	PERF_STAT_EVSEL_ID__TOPDOWN_RECOVERY_BUBBLES,
+	PERF_STAT_EVSEL_ID__SMI_NUM,
+	PERF_STAT_EVSEL_ID__APERF,
 	PERF_STAT_EVSEL_ID__MAX,
 };
 
diff --git a/tools/perf/util/util.c b/tools/perf/util/util.c
index 28c9f335006c962a5df924e30d9a45687775c201..988111e0bab592369ecf197d43127dab0dc57975 100644
--- a/tools/perf/util/util.c
+++ b/tools/perf/util/util.c
@@ -343,43 +343,6 @@ int perf_event_paranoid(void)
 
 	return value;
 }
-
-bool find_process(const char *name)
-{
-	size_t len = strlen(name);
-	DIR *dir;
-	struct dirent *d;
-	int ret = -1;
-
-	dir = opendir(procfs__mountpoint());
-	if (!dir)
-		return false;
-
-	/* Walk through the directory. */
-	while (ret && (d = readdir(dir)) != NULL) {
-		char path[PATH_MAX];
-		char *data;
-		size_t size;
-
-		if ((d->d_type != DT_DIR) ||
-		     !strcmp(".", d->d_name) ||
-		     !strcmp("..", d->d_name))
-			continue;
-
-		scnprintf(path, sizeof(path), "%s/%s/comm",
-			  procfs__mountpoint(), d->d_name);
-
-		if (filename__read_str(path, &data, &size))
-			continue;
-
-		ret = strncmp(name, data, len);
-		free(data);
-	}
-
-	closedir(dir);
-	return ret ? false : true;
-}
-
 static int
 fetch_ubuntu_kernel_version(unsigned int *puint)
 {
@@ -387,8 +350,12 @@ fetch_ubuntu_kernel_version(unsigned int *puint)
 	size_t line_len = 0;
 	char *ptr, *line = NULL;
 	int version, patchlevel, sublevel, err;
-	FILE *vsig = fopen("/proc/version_signature", "r");
+	FILE *vsig;
+
+	if (!puint)
+		return 0;
 
+	vsig = fopen("/proc/version_signature", "r");
 	if (!vsig) {
 		pr_debug("Open /proc/version_signature failed: %s\n",
 			 strerror(errno));
@@ -418,8 +385,7 @@ fetch_ubuntu_kernel_version(unsigned int *puint)
 		goto errout;
 	}
 
-	if (puint)
-		*puint = (version << 16) + (patchlevel << 8) + sublevel;
+	*puint = (version << 16) + (patchlevel << 8) + sublevel;
 	err = 0;
 errout:
 	free(line);
@@ -446,6 +412,9 @@ fetch_kernel_version(unsigned int *puint, char *str,
 		str[str_size - 1] = '\0';
 	}
 
+	if (!puint || int_ver_ready)
+		return 0;
+
 	err = sscanf(utsname.release, "%d.%d.%d",
 		     &version, &patchlevel, &sublevel);
 
@@ -455,8 +424,7 @@ fetch_kernel_version(unsigned int *puint, char *str,
 		return -1;
 	}
 
-	if (puint && !int_ver_ready)
-		*puint = (version << 16) + (patchlevel << 8) + sublevel;
+	*puint = (version << 16) + (patchlevel << 8) + sublevel;
 	return 0;
 }
 
diff --git a/tools/perf/util/util.h b/tools/perf/util/util.h
index 21c6db173bcc45d331f803b7e2da3fd6d39fedd3..978572dfeb14351494bf0a939e8ccada7b278f30 100644
--- a/tools/perf/util/util.h
+++ b/tools/perf/util/util.h
@@ -1,7 +1,6 @@
 #ifndef GIT_COMPAT_UTIL_H
 #define GIT_COMPAT_UTIL_H
 
-#define _ALL_SOURCE 1
 #define _BSD_SOURCE 1
 /* glibc 2.20 deprecates _BSD_SOURCE in favour of _DEFAULT_SOURCE */
 #define _DEFAULT_SOURCE 1
@@ -49,8 +48,6 @@ int hex2u64(const char *ptr, u64 *val);
 extern unsigned int page_size;
 extern int cacheline_size;
 
-bool find_process(const char *name);
-
 int fetch_kernel_version(unsigned int *puint,
 			 char *str, size_t str_sz);
 #define KVER_VERSION(x)		(((x) >> 16) & 0xff)