init commit

4797dc34 · wangxiaohui.neo · 4797dc34 · 4797dc34 · 4797dc34 · 4797dc34
189 changed file
--- a/.clang-format
+++ b/.clang-format
+---
+BasedOnStyle: Google
+---
+Language: Cpp
+ColumnLimit: 80
--- a/.gitignore
+++ b/.gitignore
+test_case/
--- a/3rdparty/cub-1.8.0/.cproject
+++ b/3rdparty/cub-1.8.0/.cproject
--- a/3rdparty/cub-1.8.0/.project
+++ b/3rdparty/cub-1.8.0/.project
+<?xml version="1.0" encoding="UTF-8"?>
+<projectDescription>
+	<name>GIT_CUB</name>
+	<comment></comment>
+	<projects>
+	</projects>
+	<buildSpec>
+		<buildCommand>
+			<name>org.eclipse.cdt.managedbuilder.core.genmakebuilder</name>
+			<triggers>clean,full,incremental,</triggers>
+			<arguments>
+			</arguments>
+		</buildCommand>
+		<buildCommand>
+			<name>org.eclipse.cdt.managedbuilder.core.ScannerConfigBuilder</name>
+			<triggers>full,incremental,</triggers>
+			<arguments>
+			</arguments>
+		</buildCommand>
+	</buildSpec>
+	<natures>
+		<nature>org.eclipse.cdt.core.cnature</nature>
+		<nature>org.eclipse.cdt.managedbuilder.core.managedBuildNature</nature>
+		<nature>org.eclipse.cdt.managedbuilder.core.ScannerConfigNature</nature>
+		<nature>org.eclipse.cdt.core.ccnature</nature>
+	</natures>
+</projectDescription>
--- a/3rdparty/cub-1.8.0/.settings/.gitignore
+++ b/3rdparty/cub-1.8.0/.settings/.gitignore
+/language.settings.xml
--- a/3rdparty/cub-1.8.0/.settings/org.eclipse.cdt.codan.core.prefs
+++ b/3rdparty/cub-1.8.0/.settings/org.eclipse.cdt.codan.core.prefs
+eclipse.preferences.version=1
+org.eclipse.cdt.codan.checkers.errnoreturn=Warning
+org.eclipse.cdt.codan.checkers.errnoreturn.params={launchModes\=>{RUN_ON_FULL_BUILD\=>true,RUN_ON_INC_BUILD\=>true,RUN_ON_FILE_OPEN\=>false,RUN_ON_FILE_SAVE\=>false,RUN_AS_YOU_TYPE\=>true,RUN_ON_DEMAND\=>true},implicit\=>false}
+org.eclipse.cdt.codan.checkers.errreturnvalue=Error
+org.eclipse.cdt.codan.checkers.errreturnvalue.params={launchModes\=>{RUN_ON_FULL_BUILD\=>true,RUN_ON_INC_BUILD\=>true,RUN_ON_FILE_OPEN\=>false,RUN_ON_FILE_SAVE\=>false,RUN_AS_YOU_TYPE\=>true,RUN_ON_DEMAND\=>true}}
+org.eclipse.cdt.codan.checkers.nocommentinside=-Error
+org.eclipse.cdt.codan.checkers.nocommentinside.params={launchModes\=>{RUN_ON_FULL_BUILD\=>true,RUN_ON_INC_BUILD\=>true,RUN_ON_FILE_OPEN\=>false,RUN_ON_FILE_SAVE\=>false,RUN_AS_YOU_TYPE\=>true,RUN_ON_DEMAND\=>true}}
+org.eclipse.cdt.codan.checkers.nolinecomment=-Error
+org.eclipse.cdt.codan.checkers.nolinecomment.params={launchModes\=>{RUN_ON_FULL_BUILD\=>true,RUN_ON_INC_BUILD\=>true,RUN_ON_FILE_OPEN\=>false,RUN_ON_FILE_SAVE\=>false,RUN_AS_YOU_TYPE\=>true,RUN_ON_DEMAND\=>true}}
+org.eclipse.cdt.codan.checkers.noreturn=Error
+org.eclipse.cdt.codan.checkers.noreturn.params={launchModes\=>{RUN_ON_FULL_BUILD\=>true,RUN_ON_INC_BUILD\=>true,RUN_ON_FILE_OPEN\=>false,RUN_ON_FILE_SAVE\=>false,RUN_AS_YOU_TYPE\=>true,RUN_ON_DEMAND\=>true},implicit\=>false}
+org.eclipse.cdt.codan.internal.checkers.AbstractClassCreation=Error
+org.eclipse.cdt.codan.internal.checkers.AbstractClassCreation.params={launchModes\=>{RUN_ON_FULL_BUILD\=>true,RUN_ON_INC_BUILD\=>true,RUN_ON_FILE_OPEN\=>false,RUN_ON_FILE_SAVE\=>false,RUN_AS_YOU_TYPE\=>true,RUN_ON_DEMAND\=>true}}
+org.eclipse.cdt.codan.internal.checkers.AmbiguousProblem=Error
+org.eclipse.cdt.codan.internal.checkers.AmbiguousProblem.params={launchModes\=>{RUN_ON_FULL_BUILD\=>true,RUN_ON_INC_BUILD\=>true,RUN_ON_FILE_OPEN\=>false,RUN_ON_FILE_SAVE\=>false,RUN_AS_YOU_TYPE\=>true,RUN_ON_DEMAND\=>true}}
+org.eclipse.cdt.codan.internal.checkers.AssignmentInConditionProblem=Warning
+org.eclipse.cdt.codan.internal.checkers.AssignmentInConditionProblem.params={launchModes\=>{RUN_ON_FULL_BUILD\=>true,RUN_ON_INC_BUILD\=>true,RUN_ON_FILE_OPEN\=>false,RUN_ON_FILE_SAVE\=>false,RUN_AS_YOU_TYPE\=>true,RUN_ON_DEMAND\=>true}}
+org.eclipse.cdt.codan.internal.checkers.AssignmentToItselfProblem=Error
+org.eclipse.cdt.codan.internal.checkers.AssignmentToItselfProblem.params={launchModes\=>{RUN_ON_FULL_BUILD\=>true,RUN_ON_INC_BUILD\=>true,RUN_ON_FILE_OPEN\=>false,RUN_ON_FILE_SAVE\=>false,RUN_AS_YOU_TYPE\=>true,RUN_ON_DEMAND\=>true}}
+org.eclipse.cdt.codan.internal.checkers.CaseBreakProblem=Warning
+org.eclipse.cdt.codan.internal.checkers.CaseBreakProblem.params={launchModes\=>{RUN_ON_FULL_BUILD\=>true,RUN_ON_INC_BUILD\=>true,RUN_ON_FILE_OPEN\=>false,RUN_ON_FILE_SAVE\=>false,RUN_AS_YOU_TYPE\=>true,RUN_ON_DEMAND\=>true},no_break_comment\=>"no break",last_case_param\=>true,empty_case_param\=>false}
+org.eclipse.cdt.codan.internal.checkers.CatchByReference=Warning
+org.eclipse.cdt.codan.internal.checkers.CatchByReference.params={launchModes\=>{RUN_ON_FULL_BUILD\=>true,RUN_ON_INC_BUILD\=>true,RUN_ON_FILE_OPEN\=>false,RUN_ON_FILE_SAVE\=>false,RUN_AS_YOU_TYPE\=>true,RUN_ON_DEMAND\=>true},unknown\=>false,exceptions\=>()}
+org.eclipse.cdt.codan.internal.checkers.CircularReferenceProblem=Error
+org.eclipse.cdt.codan.internal.checkers.CircularReferenceProblem.params={launchModes\=>{RUN_ON_FULL_BUILD\=>true,RUN_ON_INC_BUILD\=>true,RUN_ON_FILE_OPEN\=>false,RUN_ON_FILE_SAVE\=>false,RUN_AS_YOU_TYPE\=>true,RUN_ON_DEMAND\=>true}}
+org.eclipse.cdt.codan.internal.checkers.ClassMembersInitialization=Warning
+org.eclipse.cdt.codan.internal.checkers.ClassMembersInitialization.params={launchModes\=>{RUN_ON_FULL_BUILD\=>true,RUN_ON_INC_BUILD\=>true,RUN_ON_FILE_OPEN\=>false,RUN_ON_FILE_SAVE\=>false,RUN_AS_YOU_TYPE\=>true,RUN_ON_DEMAND\=>true},skip\=>true}
+org.eclipse.cdt.codan.internal.checkers.FieldResolutionProblem=Error
+org.eclipse.cdt.codan.internal.checkers.FieldResolutionProblem.params={launchModes\=>{RUN_ON_FULL_BUILD\=>true,RUN_ON_INC_BUILD\=>true,RUN_ON_FILE_OPEN\=>false,RUN_ON_FILE_SAVE\=>false,RUN_AS_YOU_TYPE\=>true,RUN_ON_DEMAND\=>true}}
+org.eclipse.cdt.codan.internal.checkers.FunctionResolutionProblem=Error
+org.eclipse.cdt.codan.internal.checkers.FunctionResolutionProblem.params={launchModes\=>{RUN_ON_FULL_BUILD\=>true,RUN_ON_INC_BUILD\=>true,RUN_ON_FILE_OPEN\=>false,RUN_ON_FILE_SAVE\=>false,RUN_AS_YOU_TYPE\=>true,RUN_ON_DEMAND\=>true}}
+org.eclipse.cdt.codan.internal.checkers.InvalidArguments=Error
+org.eclipse.cdt.codan.internal.checkers.InvalidArguments.params={launchModes\=>{RUN_ON_FULL_BUILD\=>true,RUN_ON_INC_BUILD\=>true,RUN_ON_FILE_OPEN\=>false,RUN_ON_FILE_SAVE\=>false,RUN_AS_YOU_TYPE\=>true,RUN_ON_DEMAND\=>true}}
+org.eclipse.cdt.codan.internal.checkers.InvalidTemplateArgumentsProblem=Error
+org.eclipse.cdt.codan.internal.checkers.InvalidTemplateArgumentsProblem.params={launchModes\=>{RUN_ON_FULL_BUILD\=>true,RUN_ON_INC_BUILD\=>true,RUN_ON_FILE_OPEN\=>false,RUN_ON_FILE_SAVE\=>false,RUN_AS_YOU_TYPE\=>true,RUN_ON_DEMAND\=>true}}
+org.eclipse.cdt.codan.internal.checkers.LabelStatementNotFoundProblem=Error
+org.eclipse.cdt.codan.internal.checkers.LabelStatementNotFoundProblem.params={launchModes\=>{RUN_ON_FULL_BUILD\=>true,RUN_ON_INC_BUILD\=>true,RUN_ON_FILE_OPEN\=>false,RUN_ON_FILE_SAVE\=>false,RUN_AS_YOU_TYPE\=>true,RUN_ON_DEMAND\=>true}}
+org.eclipse.cdt.codan.internal.checkers.MemberDeclarationNotFoundProblem=Error
+org.eclipse.cdt.codan.internal.checkers.MemberDeclarationNotFoundProblem.params={launchModes\=>{RUN_ON_FULL_BUILD\=>true,RUN_ON_INC_BUILD\=>true,RUN_ON_FILE_OPEN\=>false,RUN_ON_FILE_SAVE\=>false,RUN_AS_YOU_TYPE\=>true,RUN_ON_DEMAND\=>true}}
+org.eclipse.cdt.codan.internal.checkers.MethodResolutionProblem=Error
+org.eclipse.cdt.codan.internal.checkers.MethodResolutionProblem.params={launchModes\=>{RUN_ON_FULL_BUILD\=>true,RUN_ON_INC_BUILD\=>true,RUN_ON_FILE_OPEN\=>false,RUN_ON_FILE_SAVE\=>false,RUN_AS_YOU_TYPE\=>true,RUN_ON_DEMAND\=>true}}
+org.eclipse.cdt.codan.internal.checkers.NamingConventionFunctionChecker=-Info
+org.eclipse.cdt.codan.internal.checkers.NamingConventionFunctionChecker.params={launchModes\=>{RUN_ON_FULL_BUILD\=>true,RUN_ON_INC_BUILD\=>true,RUN_ON_FILE_OPEN\=>false,RUN_ON_FILE_SAVE\=>false,RUN_AS_YOU_TYPE\=>true,RUN_ON_DEMAND\=>true},pattern\=>"^[a-z]",macro\=>true,exceptions\=>()}
+org.eclipse.cdt.codan.internal.checkers.NonVirtualDestructorProblem=Warning
+org.eclipse.cdt.codan.internal.checkers.NonVirtualDestructorProblem.params={launchModes\=>{RUN_ON_FULL_BUILD\=>true,RUN_ON_INC_BUILD\=>true,RUN_ON_FILE_OPEN\=>false,RUN_ON_FILE_SAVE\=>false,RUN_AS_YOU_TYPE\=>true,RUN_ON_DEMAND\=>true}}
+org.eclipse.cdt.codan.internal.checkers.OverloadProblem=Error
+org.eclipse.cdt.codan.internal.checkers.OverloadProblem.params={launchModes\=>{RUN_ON_FULL_BUILD\=>true,RUN_ON_INC_BUILD\=>true,RUN_ON_FILE_OPEN\=>false,RUN_ON_FILE_SAVE\=>false,RUN_AS_YOU_TYPE\=>true,RUN_ON_DEMAND\=>true}}
+org.eclipse.cdt.codan.internal.checkers.RedeclarationProblem=Error
+org.eclipse.cdt.codan.internal.checkers.RedeclarationProblem.params={launchModes\=>{RUN_ON_FULL_BUILD\=>true,RUN_ON_INC_BUILD\=>true,RUN_ON_FILE_OPEN\=>false,RUN_ON_FILE_SAVE\=>false,RUN_AS_YOU_TYPE\=>true,RUN_ON_DEMAND\=>true}}
+org.eclipse.cdt.codan.internal.checkers.RedefinitionProblem=Error
+org.eclipse.cdt.codan.internal.checkers.RedefinitionProblem.params={launchModes\=>{RUN_ON_FULL_BUILD\=>true,RUN_ON_INC_BUILD\=>true,RUN_ON_FILE_OPEN\=>false,RUN_ON_FILE_SAVE\=>false,RUN_AS_YOU_TYPE\=>true,RUN_ON_DEMAND\=>true}}
+org.eclipse.cdt.codan.internal.checkers.ReturnStyleProblem=-Warning
+org.eclipse.cdt.codan.internal.checkers.ReturnStyleProblem.params={launchModes\=>{RUN_ON_FULL_BUILD\=>true,RUN_ON_INC_BUILD\=>true,RUN_ON_FILE_OPEN\=>false,RUN_ON_FILE_SAVE\=>false,RUN_AS_YOU_TYPE\=>true,RUN_ON_DEMAND\=>true}}
+org.eclipse.cdt.codan.internal.checkers.ScanfFormatStringSecurityProblem=-Warning
+org.eclipse.cdt.codan.internal.checkers.ScanfFormatStringSecurityProblem.params={launchModes\=>{RUN_ON_FULL_BUILD\=>true,RUN_ON_INC_BUILD\=>true,RUN_ON_FILE_OPEN\=>false,RUN_ON_FILE_SAVE\=>false,RUN_AS_YOU_TYPE\=>true,RUN_ON_DEMAND\=>true}}
+org.eclipse.cdt.codan.internal.checkers.StatementHasNoEffectProblem=Warning
+org.eclipse.cdt.codan.internal.checkers.StatementHasNoEffectProblem.params={launchModes\=>{RUN_ON_FULL_BUILD\=>true,RUN_ON_INC_BUILD\=>true,RUN_ON_FILE_OPEN\=>false,RUN_ON_FILE_SAVE\=>false,RUN_AS_YOU_TYPE\=>true,RUN_ON_DEMAND\=>true},macro\=>true,exceptions\=>()}
+org.eclipse.cdt.codan.internal.checkers.SuggestedParenthesisProblem=Warning
+org.eclipse.cdt.codan.internal.checkers.SuggestedParenthesisProblem.params={launchModes\=>{RUN_ON_FULL_BUILD\=>true,RUN_ON_INC_BUILD\=>true,RUN_ON_FILE_OPEN\=>false,RUN_ON_FILE_SAVE\=>false,RUN_AS_YOU_TYPE\=>true,RUN_ON_DEMAND\=>true},paramNot\=>false}
+org.eclipse.cdt.codan.internal.checkers.SuspiciousSemicolonProblem=Warning
+org.eclipse.cdt.codan.internal.checkers.SuspiciousSemicolonProblem.params={launchModes\=>{RUN_ON_FULL_BUILD\=>true,RUN_ON_INC_BUILD\=>true,RUN_ON_FILE_OPEN\=>false,RUN_ON_FILE_SAVE\=>false,RUN_AS_YOU_TYPE\=>true,RUN_ON_DEMAND\=>true},else\=>false,afterelse\=>false}
+org.eclipse.cdt.codan.internal.checkers.TypeResolutionProblem=Error
+org.eclipse.cdt.codan.internal.checkers.TypeResolutionProblem.params={launchModes\=>{RUN_ON_FULL_BUILD\=>true,RUN_ON_INC_BUILD\=>true,RUN_ON_FILE_OPEN\=>false,RUN_ON_FILE_SAVE\=>false,RUN_AS_YOU_TYPE\=>true,RUN_ON_DEMAND\=>true}}
+org.eclipse.cdt.codan.internal.checkers.UnusedFunctionDeclarationProblem=Warning
+org.eclipse.cdt.codan.internal.checkers.UnusedFunctionDeclarationProblem.params={launchModes\=>{RUN_ON_FULL_BUILD\=>true,RUN_ON_INC_BUILD\=>true,RUN_ON_FILE_OPEN\=>false,RUN_ON_FILE_SAVE\=>false,RUN_AS_YOU_TYPE\=>true,RUN_ON_DEMAND\=>true},macro\=>true}
+org.eclipse.cdt.codan.internal.checkers.UnusedStaticFunctionProblem=Warning
+org.eclipse.cdt.codan.internal.checkers.UnusedStaticFunctionProblem.params={launchModes\=>{RUN_ON_FULL_BUILD\=>true,RUN_ON_INC_BUILD\=>true,RUN_ON_FILE_OPEN\=>false,RUN_ON_FILE_SAVE\=>false,RUN_AS_YOU_TYPE\=>true,RUN_ON_DEMAND\=>true},macro\=>true}
+org.eclipse.cdt.codan.internal.checkers.UnusedVariableDeclarationProblem=Warning
+org.eclipse.cdt.codan.internal.checkers.UnusedVariableDeclarationProblem.params={launchModes\=>{RUN_ON_FULL_BUILD\=>true,RUN_ON_INC_BUILD\=>true,RUN_ON_FILE_OPEN\=>false,RUN_ON_FILE_SAVE\=>false,RUN_AS_YOU_TYPE\=>true,RUN_ON_DEMAND\=>true},macro\=>true,exceptions\=>("@(\#)","$Id")}
+org.eclipse.cdt.codan.internal.checkers.VariableResolutionProblem=Error
+org.eclipse.cdt.codan.internal.checkers.VariableResolutionProblem.params={launchModes\=>{RUN_ON_FULL_BUILD\=>true,RUN_ON_INC_BUILD\=>true,RUN_ON_FILE_OPEN\=>false,RUN_ON_FILE_SAVE\=>false,RUN_AS_YOU_TYPE\=>true,RUN_ON_DEMAND\=>true}}
+useParentScope=false
--- a/3rdparty/cub-1.8.0/.settings/org.eclipse.cdt.core.prefs
+++ b/3rdparty/cub-1.8.0/.settings/org.eclipse.cdt.core.prefs
+eclipse.preferences.version=1
+indexer/indexAllFiles=true
+indexer/indexAllHeaderVersions=false
+indexer/indexAllVersionsSpecificHeaders=
+indexer/indexOnOpen=false
+indexer/indexUnusedHeadersWithAlternateLang=false
+indexer/indexUnusedHeadersWithDefaultLang=true
+indexer/indexerId=org.eclipse.cdt.core.fastIndexer
+indexer/skipFilesLargerThanMB=8
+indexer/skipImplicitReferences=false
+indexer/skipIncludedFilesLargerThanMB=16
+indexer/skipMacroReferences=false
+indexer/skipReferences=false
+indexer/skipTypeReferences=false
+indexer/useHeuristicIncludeResolution=true
+org.eclipse.cdt.core.formatter.alignment_for_arguments_in_method_invocation=16
+org.eclipse.cdt.core.formatter.alignment_for_assignment=16
+org.eclipse.cdt.core.formatter.alignment_for_base_clause_in_type_declaration=48
+org.eclipse.cdt.core.formatter.alignment_for_binary_expression=16
+org.eclipse.cdt.core.formatter.alignment_for_compact_if=0
+org.eclipse.cdt.core.formatter.alignment_for_conditional_expression=48
+org.eclipse.cdt.core.formatter.alignment_for_conditional_expression_chain=18
+org.eclipse.cdt.core.formatter.alignment_for_constructor_initializer_list=0
+org.eclipse.cdt.core.formatter.alignment_for_declarator_list=16
+org.eclipse.cdt.core.formatter.alignment_for_enumerator_list=48
+org.eclipse.cdt.core.formatter.alignment_for_expression_list=0
+org.eclipse.cdt.core.formatter.alignment_for_expressions_in_array_initializer=16
+org.eclipse.cdt.core.formatter.alignment_for_member_access=0
+org.eclipse.cdt.core.formatter.alignment_for_overloaded_left_shift_chain=16
+org.eclipse.cdt.core.formatter.alignment_for_parameters_in_method_declaration=48
+org.eclipse.cdt.core.formatter.alignment_for_throws_clause_in_method_declaration=48
+org.eclipse.cdt.core.formatter.brace_position_for_array_initializer=next_line
+org.eclipse.cdt.core.formatter.brace_position_for_block=next_line
+org.eclipse.cdt.core.formatter.brace_position_for_block_in_case=end_of_line
+org.eclipse.cdt.core.formatter.brace_position_for_method_declaration=next_line
+org.eclipse.cdt.core.formatter.brace_position_for_namespace_declaration=end_of_line
+org.eclipse.cdt.core.formatter.brace_position_for_switch=end_of_line
+org.eclipse.cdt.core.formatter.brace_position_for_type_declaration=next_line
+org.eclipse.cdt.core.formatter.comment.min_distance_between_code_and_line_comment=1
+org.eclipse.cdt.core.formatter.comment.never_indent_line_comments_on_first_column=true
+org.eclipse.cdt.core.formatter.comment.preserve_white_space_between_code_and_line_comments=true
+org.eclipse.cdt.core.formatter.compact_else_if=true
+org.eclipse.cdt.core.formatter.continuation_indentation=1
+org.eclipse.cdt.core.formatter.continuation_indentation_for_array_initializer=1
+org.eclipse.cdt.core.formatter.format_guardian_clause_on_one_line=false
+org.eclipse.cdt.core.formatter.indent_access_specifier_compare_to_type_header=false
+org.eclipse.cdt.core.formatter.indent_access_specifier_extra_spaces=0
+org.eclipse.cdt.core.formatter.indent_body_declarations_compare_to_access_specifier=true
+org.eclipse.cdt.core.formatter.indent_body_declarations_compare_to_namespace_header=false
+org.eclipse.cdt.core.formatter.indent_breaks_compare_to_cases=true
+org.eclipse.cdt.core.formatter.indent_declaration_compare_to_template_header=false
+org.eclipse.cdt.core.formatter.indent_empty_lines=false
+org.eclipse.cdt.core.formatter.indent_statements_compare_to_block=true
+org.eclipse.cdt.core.formatter.indent_statements_compare_to_body=true
+org.eclipse.cdt.core.formatter.indent_switchstatements_compare_to_cases=true
+org.eclipse.cdt.core.formatter.indent_switchstatements_compare_to_switch=false
+org.eclipse.cdt.core.formatter.indentation.size=4
+org.eclipse.cdt.core.formatter.insert_new_line_after_opening_brace_in_array_initializer=do not insert
+org.eclipse.cdt.core.formatter.insert_new_line_after_template_declaration=do not insert
+org.eclipse.cdt.core.formatter.insert_new_line_at_end_of_file_if_missing=do not insert
+org.eclipse.cdt.core.formatter.insert_new_line_before_catch_in_try_statement=insert
+org.eclipse.cdt.core.formatter.insert_new_line_before_closing_brace_in_array_initializer=do not insert
+org.eclipse.cdt.core.formatter.insert_new_line_before_colon_in_constructor_initializer_list=do not insert
+org.eclipse.cdt.core.formatter.insert_new_line_before_else_in_if_statement=insert
+org.eclipse.cdt.core.formatter.insert_new_line_before_identifier_in_function_declaration=do not insert
+org.eclipse.cdt.core.formatter.insert_new_line_before_while_in_do_statement=do not insert
+org.eclipse.cdt.core.formatter.insert_new_line_in_empty_block=insert
+org.eclipse.cdt.core.formatter.insert_space_after_assignment_operator=insert
+org.eclipse.cdt.core.formatter.insert_space_after_binary_operator=insert
+org.eclipse.cdt.core.formatter.insert_space_after_closing_angle_bracket_in_template_arguments=insert
+org.eclipse.cdt.core.formatter.insert_space_after_closing_angle_bracket_in_template_parameters=insert
+org.eclipse.cdt.core.formatter.insert_space_after_closing_brace_in_block=insert
+org.eclipse.cdt.core.formatter.insert_space_after_closing_paren_in_cast=insert
+org.eclipse.cdt.core.formatter.insert_space_after_colon_in_base_clause=insert
+org.eclipse.cdt.core.formatter.insert_space_after_colon_in_case=insert
+org.eclipse.cdt.core.formatter.insert_space_after_colon_in_conditional=insert
+org.eclipse.cdt.core.formatter.insert_space_after_colon_in_labeled_statement=insert
+org.eclipse.cdt.core.formatter.insert_space_after_comma_in_array_initializer=insert
+org.eclipse.cdt.core.formatter.insert_space_after_comma_in_base_types=insert
+org.eclipse.cdt.core.formatter.insert_space_after_comma_in_declarator_list=insert
+org.eclipse.cdt.core.formatter.insert_space_after_comma_in_enum_declarations=insert
+org.eclipse.cdt.core.formatter.insert_space_after_comma_in_expression_list=insert
+org.eclipse.cdt.core.formatter.insert_space_after_comma_in_method_declaration_parameters=insert
+org.eclipse.cdt.core.formatter.insert_space_after_comma_in_method_declaration_throws=insert
+org.eclipse.cdt.core.formatter.insert_space_after_comma_in_method_invocation_arguments=insert
+org.eclipse.cdt.core.formatter.insert_space_after_comma_in_template_arguments=insert
+org.eclipse.cdt.core.formatter.insert_space_after_comma_in_template_parameters=insert
+org.eclipse.cdt.core.formatter.insert_space_after_opening_angle_bracket_in_template_arguments=do not insert
+org.eclipse.cdt.core.formatter.insert_space_after_opening_angle_bracket_in_template_parameters=do not insert
+org.eclipse.cdt.core.formatter.insert_space_after_opening_brace_in_array_initializer=insert
+org.eclipse.cdt.core.formatter.insert_space_after_opening_bracket=do not insert
+org.eclipse.cdt.core.formatter.insert_space_after_opening_paren_in_cast=do not insert
+org.eclipse.cdt.core.formatter.insert_space_after_opening_paren_in_catch=do not insert
+org.eclipse.cdt.core.formatter.insert_space_after_opening_paren_in_exception_specification=do not insert
+org.eclipse.cdt.core.formatter.insert_space_after_opening_paren_in_for=do not insert
+org.eclipse.cdt.core.formatter.insert_space_after_opening_paren_in_if=do not insert
+org.eclipse.cdt.core.formatter.insert_space_after_opening_paren_in_method_declaration=do not insert
+org.eclipse.cdt.core.formatter.insert_space_after_opening_paren_in_method_invocation=do not insert
+org.eclipse.cdt.core.formatter.insert_space_after_opening_paren_in_parenthesized_expression=do not insert
+org.eclipse.cdt.core.formatter.insert_space_after_opening_paren_in_switch=do not insert
+org.eclipse.cdt.core.formatter.insert_space_after_opening_paren_in_while=do not insert
+org.eclipse.cdt.core.formatter.insert_space_after_postfix_operator=do not insert
+org.eclipse.cdt.core.formatter.insert_space_after_prefix_operator=do not insert
+org.eclipse.cdt.core.formatter.insert_space_after_question_in_conditional=insert
+org.eclipse.cdt.core.formatter.insert_space_after_semicolon_in_for=insert
+org.eclipse.cdt.core.formatter.insert_space_after_unary_operator=do not insert
+org.eclipse.cdt.core.formatter.insert_space_before_assignment_operator=insert
+org.eclipse.cdt.core.formatter.insert_space_before_binary_operator=insert
+org.eclipse.cdt.core.formatter.insert_space_before_closing_angle_bracket_in_template_arguments=do not insert
+org.eclipse.cdt.core.formatter.insert_space_before_closing_angle_bracket_in_template_parameters=do not insert
+org.eclipse.cdt.core.formatter.insert_space_before_closing_brace_in_array_initializer=insert
+org.eclipse.cdt.core.formatter.insert_space_before_closing_bracket=do not insert
+org.eclipse.cdt.core.formatter.insert_space_before_closing_paren_in_cast=do not insert
+org.eclipse.cdt.core.formatter.insert_space_before_closing_paren_in_catch=do not insert
+org.eclipse.cdt.core.formatter.insert_space_before_closing_paren_in_exception_specification=do not insert
+org.eclipse.cdt.core.formatter.insert_space_before_closing_paren_in_for=do not insert
+org.eclipse.cdt.core.formatter.insert_space_before_closing_paren_in_if=do not insert
+org.eclipse.cdt.core.formatter.insert_space_before_closing_paren_in_method_declaration=do not insert
+org.eclipse.cdt.core.formatter.insert_space_before_closing_paren_in_method_invocation=do not insert
+org.eclipse.cdt.core.formatter.insert_space_before_closing_paren_in_parenthesized_expression=do not insert
+org.eclipse.cdt.core.formatter.insert_space_before_closing_paren_in_switch=do not insert
+org.eclipse.cdt.core.formatter.insert_space_before_closing_paren_in_while=do not insert
+org.eclipse.cdt.core.formatter.insert_space_before_colon_in_base_clause=insert
+org.eclipse.cdt.core.formatter.insert_space_before_colon_in_case=do not insert
+org.eclipse.cdt.core.formatter.insert_space_before_colon_in_conditional=insert
+org.eclipse.cdt.core.formatter.insert_space_before_colon_in_default=do not insert
+org.eclipse.cdt.core.formatter.insert_space_before_colon_in_labeled_statement=do not insert
+org.eclipse.cdt.core.formatter.insert_space_before_comma_in_array_initializer=do not insert
+org.eclipse.cdt.core.formatter.insert_space_before_comma_in_base_types=do not insert
+org.eclipse.cdt.core.formatter.insert_space_before_comma_in_declarator_list=do not insert
+org.eclipse.cdt.core.formatter.insert_space_before_comma_in_enum_declarations=do not insert
+org.eclipse.cdt.core.formatter.insert_space_before_comma_in_expression_list=do not insert
+org.eclipse.cdt.core.formatter.insert_space_before_comma_in_method_declaration_parameters=do not insert
+org.eclipse.cdt.core.formatter.insert_space_before_comma_in_method_declaration_throws=do not insert
+org.eclipse.cdt.core.formatter.insert_space_before_comma_in_method_invocation_arguments=do not insert
+org.eclipse.cdt.core.formatter.insert_space_before_comma_in_template_arguments=do not insert
+org.eclipse.cdt.core.formatter.insert_space_before_comma_in_template_parameters=do not insert
+org.eclipse.cdt.core.formatter.insert_space_before_opening_angle_bracket_in_template_arguments=do not insert
+org.eclipse.cdt.core.formatter.insert_space_before_opening_angle_bracket_in_template_parameters=do not insert
+org.eclipse.cdt.core.formatter.insert_space_before_opening_brace_in_array_initializer=insert
+org.eclipse.cdt.core.formatter.insert_space_before_opening_brace_in_block=insert
+org.eclipse.cdt.core.formatter.insert_space_before_opening_brace_in_method_declaration=insert
+org.eclipse.cdt.core.formatter.insert_space_before_opening_brace_in_namespace_declaration=insert
+org.eclipse.cdt.core.formatter.insert_space_before_opening_brace_in_switch=insert
+org.eclipse.cdt.core.formatter.insert_space_before_opening_brace_in_type_declaration=insert
+org.eclipse.cdt.core.formatter.insert_space_before_opening_bracket=do not insert
+org.eclipse.cdt.core.formatter.insert_space_before_opening_paren_in_catch=insert
+org.eclipse.cdt.core.formatter.insert_space_before_opening_paren_in_exception_specification=insert
+org.eclipse.cdt.core.formatter.insert_space_before_opening_paren_in_for=insert
+org.eclipse.cdt.core.formatter.insert_space_before_opening_paren_in_if=insert
+org.eclipse.cdt.core.formatter.insert_space_before_opening_paren_in_method_declaration=do not insert
+org.eclipse.cdt.core.formatter.insert_space_before_opening_paren_in_method_invocation=do not insert
+org.eclipse.cdt.core.formatter.insert_space_before_opening_paren_in_parenthesized_expression=do not insert
+org.eclipse.cdt.core.formatter.insert_space_before_opening_paren_in_switch=insert
+org.eclipse.cdt.core.formatter.insert_space_before_opening_paren_in_while=insert
+org.eclipse.cdt.core.formatter.insert_space_before_postfix_operator=do not insert
+org.eclipse.cdt.core.formatter.insert_space_before_prefix_operator=do not insert
+org.eclipse.cdt.core.formatter.insert_space_before_question_in_conditional=insert
+org.eclipse.cdt.core.formatter.insert_space_before_semicolon=do not insert
+org.eclipse.cdt.core.formatter.insert_space_before_semicolon_in_for=do not insert
+org.eclipse.cdt.core.formatter.insert_space_before_unary_operator=do not insert
+org.eclipse.cdt.core.formatter.insert_space_between_empty_braces_in_array_initializer=do not insert
+org.eclipse.cdt.core.formatter.insert_space_between_empty_brackets=do not insert
+org.eclipse.cdt.core.formatter.insert_space_between_empty_parens_in_exception_specification=do not insert
+org.eclipse.cdt.core.formatter.insert_space_between_empty_parens_in_method_declaration=do not insert
+org.eclipse.cdt.core.formatter.insert_space_between_empty_parens_in_method_invocation=do not insert
+org.eclipse.cdt.core.formatter.join_wrapped_lines=true
+org.eclipse.cdt.core.formatter.keep_else_statement_on_same_line=false
+org.eclipse.cdt.core.formatter.keep_empty_array_initializer_on_one_line=false
+org.eclipse.cdt.core.formatter.keep_imple_if_on_one_line=true
+org.eclipse.cdt.core.formatter.keep_then_statement_on_same_line=false
+org.eclipse.cdt.core.formatter.lineSplit=80
+org.eclipse.cdt.core.formatter.number_of_empty_lines_to_preserve=1
+org.eclipse.cdt.core.formatter.put_empty_statement_on_new_line=true
+org.eclipse.cdt.core.formatter.tabulation.char=space
+org.eclipse.cdt.core.formatter.tabulation.size=4
+org.eclipse.cdt.core.formatter.use_tabs_only_for_leading_indentations=false
--- a/3rdparty/cub-1.8.0/.settings/org.eclipse.cdt.ui.prefs
+++ b/3rdparty/cub-1.8.0/.settings/org.eclipse.cdt.ui.prefs
+eclipse.preferences.version=1
+formatter_profile=_B40C
+formatter_settings_version=1
--- a/3rdparty/cub-1.8.0/.settings/org.eclipse.core.runtime.prefs
+++ b/3rdparty/cub-1.8.0/.settings/org.eclipse.core.runtime.prefs
+content-types/enabled=true
+content-types/org.eclipse.cdt.core.cxxHeader/file-extensions=cuh
+content-types/org.eclipse.cdt.core.cxxSource/file-extensions=cu
+eclipse.preferences.version=1
--- a/3rdparty/cub-1.8.0/CHANGE_LOG.TXT
+++ b/3rdparty/cub-1.8.0/CHANGE_LOG.TXT
--- a/3rdparty/cub-1.8.0/LICENSE.TXT
+++ b/3rdparty/cub-1.8.0/LICENSE.TXT
+Copyright (c) 2010-2011, Duane Merrill.  All rights reserved.
+Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+   *  Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+   *  Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+   *  Neither the name of the NVIDIA CORPORATION nor the
+      names of its contributors may be used to endorse or promote products
+      derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
\ No newline at end of file
--- a/3rdparty/cub-1.8.0/README.md
+++ b/3rdparty/cub-1.8.0/README.md
+<hr>
+<h3>About CUB</h3>
+
+Current release: v1.8.0 (02/16/2018)
+
+We recommend the [CUB Project Website](http://nvlabs.github.com/cub) for further information and examples.
+
+CUB provides state-of-the-art, reusable software components for every layer 
+of the CUDA programming model:
+- [<b><em>Device-wide primitives</em></b>] (https://nvlabs.github.com/cub/group___device_module.html) 
+  - Sort, prefix scan, reduction, histogram, etc.  
+  - Compatible with CUDA dynamic parallelism
+- [<b><em>Block-wide "collective" primitives</em></b>] (https://nvlabs.github.com/cub/group___block_module.html)
+  - I/O, sort, prefix scan, reduction, histogram, etc.  
+  - Compatible with arbitrary thread block sizes and types 
+- [<b><em>Warp-wide "collective" primitives</em></b>] (https://nvlabs.github.com/cub/group___warp_module.html)
+  - Warp-wide prefix scan, reduction, etc.
+  - Safe and architecture-specific
+- [<b><em>Thread and resource utilities</em></b>](https://nvlabs.github.com/cub/group___thread_module.html)
+  - PTX intrinsics, device reflection, texture-caching iterators, caching memory allocators, etc. 
+
+![Orientation of collective primitives within the CUDA software stack](http://nvlabs.github.com/cub/cub_overview.png)
+
+<br><hr>
+<h3>A Simple Example</h3>
+
+```C++
+#include <cub/cub.cuh>
+ 
+// Block-sorting CUDA kernel
+__global__ void BlockSortKernel(int *d_in, int *d_out)
+{
+     using namespace cub;
+
+     // Specialize BlockRadixSort, BlockLoad, and BlockStore for 128 threads 
+     // owning 16 integer items each
+     typedef BlockRadixSort<int, 128, 16>                     BlockRadixSort;
+     typedef BlockLoad<int, 128, 16, BLOCK_LOAD_TRANSPOSE>   BlockLoad;
+     typedef BlockStore<int, 128, 16, BLOCK_STORE_TRANSPOSE> BlockStore;
+ 
+     // Allocate shared memory
+     __shared__ union {
+         typename BlockRadixSort::TempStorage  sort;
+         typename BlockLoad::TempStorage       load; 
+         typename BlockStore::TempStorage      store; 
+     } temp_storage; 
+
+     int block_offset = blockIdx.x * (128 * 16);	  // OffsetT for this block's ment
+
+     // Obtain a segment of 2048 consecutive keys that are blocked across threads
+     int thread_keys[16];
+     BlockLoad(temp_storage.load).Load(d_in + block_offset, thread_keys);
+     __syncthreads();
+
+     // Collectively sort the keys
+     BlockRadixSort(temp_storage.sort).Sort(thread_keys);
+     __syncthreads();
+
+     // Store the sorted segment 
+     BlockStore(temp_storage.store).Store(d_out + block_offset, thread_keys);
+}
+```
+
+Each thread block uses cub::BlockRadixSort to collectively sort 
+its own input segment.  The class is specialized by the 
+data type being sorted, by the number of threads per block, by the number of 
+keys per thread, and implicitly by the targeted compilation architecture.  
+
+The cub::BlockLoad and cub::BlockStore classes are similarly specialized.    
+Furthermore, to provide coalesced accesses to device memory, these primitives are 
+configured to access memory using a striped access pattern (where consecutive threads 
+simultaneously access consecutive items) and then <em>transpose</em> the keys into 
+a [<em>blocked arrangement</em>](index.html#sec4sec3) of elements across threads. 
+
+Once specialized, these classes expose opaque \p TempStorage member types.  
+The thread block uses these storage types to statically allocate the union of 
+shared memory needed by the thread block.  (Alternatively these storage types 
+could be aliased to global memory allocations).
+
+<br><hr>
+<h3>Stable Releases</h3>
+
+CUB releases are labeled using version identifiers having three fields: 
+*epoch.feature.update*.  The *epoch* field corresponds to support for
+a major change in the CUDA programming model.  The *feature* field 
+corresponds to a stable set of features, functionality, and interface.  The
+*update* field corresponds to a bug-fix or performance update for that
+feature set.  At the moment, we do not publicly provide non-stable releases 
+such as development snapshots, beta releases or rolling releases.  (Feel free
+to contact us if you would like such things.)  See the 
+[CUB Project Website](http://nvlabs.github.com/cub) for more information.
+
+<br><hr>
+<h3>Contributors</h3>
+
+CUB is developed as an open-source project by [NVIDIA Research](http://research.nvidia.com).  The primary contributor is [Duane Merrill](http://github.com/dumerrill).
+
+<br><hr>
+<h3>Open Source License</h3>
+
+CUB is available under the "New BSD" open-source license:
+
+```
+Copyright (c) 2010-2011, Duane Merrill.  All rights reserved.
+Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+   *  Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+   *  Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+   *  Neither the name of the NVIDIA CORPORATION nor the
+      names of its contributors may be used to endorse or promote products
+      derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+```
--- a/3rdparty/cub-1.8.0/common.mk
+++ b/3rdparty/cub-1.8.0/common.mk
+#/******************************************************************************
+# * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+# * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+# * 
+# * Redistribution and use in source and binary forms, with or without
+# * modification, are permitted provided that the following conditions are met:
+# *	 * Redistributions of source code must retain the above copyright
+# *	   notice, this list of conditions and the following disclaimer.
+# *	 * Redistributions in binary form must reproduce the above copyright
+# *	   notice, this list of conditions and the following disclaimer in the
+# *	   documentation and/or other materials provided with the distribution.
+# *	 * Neither the name of the NVIDIA CORPORATION nor the
+# *	   names of its contributors may be used to endorse or promote products
+# *	   derived from this software without specific prior written permission.
+# * 
+# * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+# * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+# * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+# * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+# * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+# * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# *
+#******************************************************************************/
+
+
+#-------------------------------------------------------------------------------
+# Commandline Options
+#-------------------------------------------------------------------------------
+
+# [sm=<XXX,...>] Compute-capability to compile for, e.g., "sm=200,300,350" (SM20 by default).
+  
+COMMA = ,
+ifdef sm
+	SM_ARCH = $(subst $(COMMA),-,$(sm))
+else 
+    SM_ARCH = 200
+endif
+
+ifeq (700, $(findstring 700, $(SM_ARCH)))
+    SM_TARGETS 	+= -gencode=arch=compute_70,code=\"sm_70,compute_70\" 
+    SM_DEF 		+= -DSM700
+    TEST_ARCH 	= 700
+endif
+ifeq (620, $(findstring 620, $(SM_ARCH)))
+    SM_TARGETS 	+= -gencode=arch=compute_62,code=\"sm_62,compute_62\" 
+    SM_DEF 		+= -DSM620
+    TEST_ARCH 	= 620
+endif
+ifeq (610, $(findstring 610, $(SM_ARCH)))
+    SM_TARGETS 	+= -gencode=arch=compute_61,code=\"sm_61,compute_61\" 
+    SM_DEF 		+= -DSM610
+    TEST_ARCH 	= 610
+endif
+ifeq (600, $(findstring 600, $(SM_ARCH)))
+    SM_TARGETS 	+= -gencode=arch=compute_60,code=\"sm_60,compute_60\" 
+    SM_DEF 		+= -DSM600
+    TEST_ARCH 	= 600
+endif
+ifeq (520, $(findstring 520, $(SM_ARCH)))
+    SM_TARGETS 	+= -gencode=arch=compute_52,code=\"sm_52,compute_52\" 
+    SM_DEF 		+= -DSM520
+    TEST_ARCH 	= 520
+endif
+ifeq (370, $(findstring 370, $(SM_ARCH)))
+    SM_TARGETS 	+= -gencode=arch=compute_37,code=\"sm_37,compute_37\" 
+    SM_DEF 		+= -DSM370
+    TEST_ARCH 	= 370
+endif
+ifeq (350, $(findstring 350, $(SM_ARCH)))
+    SM_TARGETS 	+= -gencode=arch=compute_35,code=\"sm_35,compute_35\" 
+    SM_DEF 		+= -DSM350
+    TEST_ARCH 	= 350
+endif
+ifeq (300, $(findstring 300, $(SM_ARCH)))
+    SM_TARGETS 	+= -gencode=arch=compute_30,code=\"sm_30,compute_30\"
+    SM_DEF 		+= -DSM300
+    TEST_ARCH 	= 300
+endif
+ifeq (210, $(findstring 210, $(SM_ARCH)))
+    SM_TARGETS 	+= -gencode=arch=compute_20,code=\"sm_21,compute_20\"
+    SM_DEF 		+= -DSM210
+    TEST_ARCH 	= 210
+endif
+ifeq (200, $(findstring 200, $(SM_ARCH)))
+    SM_TARGETS 	+= -gencode=arch=compute_20,code=\"sm_20,compute_20\"
+    SM_DEF 		+= -DSM200
+    TEST_ARCH 	= 200
+endif
+ifeq (130, $(findstring 130, $(SM_ARCH)))
+    SM_TARGETS 	+= -gencode=arch=compute_13,code=\"sm_13,compute_13\" 
+    SM_DEF 		+= -DSM130
+    TEST_ARCH 	= 130
+endif
+ifeq (120, $(findstring 120, $(SM_ARCH)))
+    SM_TARGETS 	+= -gencode=arch=compute_12,code=\"sm_12,compute_12\" 
+    SM_DEF 		+= -DSM120
+    TEST_ARCH 	= 120
+endif
+ifeq (110, $(findstring 110, $(SM_ARCH)))
+    SM_TARGETS 	+= -gencode=arch=compute_11,code=\"sm_11,compute_11\" 
+    SM_DEF 		+= -DSM110
+    TEST_ARCH 	= 110
+endif
+ifeq (100, $(findstring 100, $(SM_ARCH)))
+    SM_TARGETS 	+= -gencode=arch=compute_10,code=\"sm_10,compute_10\" 
+    SM_DEF 		+= -DSM100
+    TEST_ARCH 	= 100
+endif
+
+
+# [cdp=<0|1>] CDP enable option (default: no)
+ifeq ($(cdp), 1)
+	DEFINES += -DCUB_CDP
+	CDP_SUFFIX = cdp
+    NVCCFLAGS += -rdc=true -lcudadevrt
+else
+	CDP_SUFFIX = nocdp
+endif
+
+
+# [force32=<0|1>] Device addressing mode option (64-bit device pointers by default) 
+ifeq ($(force32), 1)
+	CPU_ARCH = -m32
+	CPU_ARCH_SUFFIX = i386
+else
+	CPU_ARCH = -m64
+	CPU_ARCH_SUFFIX = x86_64
+    NPPI = -lnppist
+endif
+
+
+# [abi=<0|1>] CUDA ABI option (enabled by default) 
+ifneq ($(abi), 0)
+	ABI_SUFFIX = abi
+else 
+	NVCCFLAGS += -Xptxas -abi=no
+	ABI_SUFFIX = noabi
+endif
+
+
+# [open64=<0|1>] Middle-end compiler option (nvvm by default)
+ifeq ($(open64), 1)
+	NVCCFLAGS += -open64
+	PTX_SUFFIX = open64
+else 
+	PTX_SUFFIX = nvvm
+endif
+
+
+# [verbose=<0|1>] Verbose toolchain output from nvcc option
+ifeq ($(verbose), 1)
+	NVCCFLAGS += -v
+endif
+
+
+# [keep=<0|1>] Keep intermediate compilation artifacts option
+ifeq ($(keep), 1)
+	NVCCFLAGS += -keep
+endif
+
+# [debug=<0|1>] Generate debug mode code
+ifeq ($(debug), 1)
+	NVCCFLAGS += -G
+endif
+
+
+#-------------------------------------------------------------------------------
+# Compiler and compilation platform
+#-------------------------------------------------------------------------------
+
+CUB_DIR = $(dir $(lastword $(MAKEFILE_LIST)))
+
+NVCC = "$(shell which nvcc)"
+ifdef nvccver
+    NVCC_VERSION = $(nvccver)
+else
+    NVCC_VERSION = $(strip $(shell nvcc --version | grep release | sed 's/.*release //' |  sed 's/,.*//'))
+endif
+
+# detect OS
+OSUPPER = $(shell uname -s 2>/dev/null | tr [:lower:] [:upper:])
+
+# Default flags: verbose kernel properties (regs, smem, cmem, etc.); runtimes for compilation phases 
+NVCCFLAGS += $(SM_DEF) -Xptxas -v -Xcudafe -\# 
+
+ifeq (WIN_NT, $(findstring WIN_NT, $(OSUPPER)))
+    # For MSVC
+    # Enable more warnings and treat as errors
+    NVCCFLAGS += -Xcompiler /W3 -Xcompiler /WX
+    # Disable excess x86 floating point precision that can lead to results being labeled incorrectly
+    NVCCFLAGS += -Xcompiler /fp:strict
+    # Help the compiler/linker work with huge numbers of kernels on Windows
+	NVCCFLAGS += -Xcompiler /bigobj -Xcompiler /Zm500
+	CC = cl
+	
+	# Multithreaded runtime
+	NVCCFLAGS += -Xcompiler /MT
+	
+ifneq ($(force32), 1)
+	CUDART_CYG = "$(shell dirname $(NVCC))/../lib/Win32/cudart.lib"
+else
+	CUDART_CYG = "$(shell dirname $(NVCC))/../lib/x64/cudart.lib"
+endif
+	CUDART = "$(shell cygpath -w $(CUDART_CYG))"
+else
+    # For g++
+    # Disable excess x86 floating point precision that can lead to results being labeled incorrectly
+    NVCCFLAGS += -Xcompiler -ffloat-store
+    CC = g++
+ifneq ($(force32), 1)
+    CUDART = "$(shell dirname $(NVCC))/../lib/libcudart_static.a"
+else
+    CUDART = "$(shell dirname $(NVCC))/../lib64/libcudart_static.a"
+endif
+endif
+
+# Suffix to append to each binary
+BIN_SUFFIX = sm$(SM_ARCH)_$(PTX_SUFFIX)_$(NVCC_VERSION)_$(ABI_SUFFIX)_$(CDP_SUFFIX)_$(CPU_ARCH_SUFFIX)
+
+
+#-------------------------------------------------------------------------------
+# Dependency Lists
+#-------------------------------------------------------------------------------
+
+rwildcard=$(foreach d,$(wildcard $1*),$(call rwildcard,$d/,$2) $(filter $(subst *,%,$2),$d))
+
+CUB_DEPS = 	$(call rwildcard, $(CUB_DIR),*.cuh) \
+			$(CUB_DIR)common.mk
+		
--- a/3rdparty/cub-1.8.0/cub/agent/agent_histogram.cuh
+++ b/3rdparty/cub-1.8.0/cub/agent/agent_histogram.cuh
--- a/3rdparty/cub-1.8.0/cub/agent/agent_radix_sort_downsweep.cuh
+++ b/3rdparty/cub-1.8.0/cub/agent/agent_radix_sort_downsweep.cuh
--- a/3rdparty/cub-1.8.0/cub/agent/agent_radix_sort_upsweep.cuh
+++ b/3rdparty/cub-1.8.0/cub/agent/agent_radix_sort_upsweep.cuh
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * AgentRadixSortUpsweep implements a stateful abstraction of CUDA thread blocks for participating in device-wide radix sort upsweep .
+ */
+
+#pragma once
+
+#include "../thread/thread_reduce.cuh"
+#include "../thread/thread_load.cuh"
+#include "../warp/warp_reduce.cuh"
+#include "../block/block_load.cuh"
+#include "../util_type.cuh"
+#include "../iterator/cache_modified_input_iterator.cuh"
+#include "../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+/******************************************************************************
+ * Tuning policy types
+ ******************************************************************************/
+
+/**
+ * Parameterizable tuning policy type for AgentRadixSortUpsweep
+ */
+template <
+    int                 _BLOCK_THREADS,     ///< Threads per thread block
+    int                 _ITEMS_PER_THREAD,  ///< Items per thread (per tile of input)
+    CacheLoadModifier   _LOAD_MODIFIER,     ///< Cache load modifier for reading keys
+    int                 _RADIX_BITS>        ///< The number of radix bits, i.e., log2(bins)
+struct AgentRadixSortUpsweepPolicy
+{
+    enum
+    {
+        BLOCK_THREADS       = _BLOCK_THREADS,       ///< Threads per thread block
+        ITEMS_PER_THREAD    = _ITEMS_PER_THREAD,    ///< Items per thread (per tile of input)
+        RADIX_BITS          = _RADIX_BITS,          ///< The number of radix bits, i.e., log2(bins)
+    };
+
+    static const CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER;      ///< Cache load modifier for reading keys
+};
+
+
+/******************************************************************************
+ * Thread block abstractions
+ ******************************************************************************/
+
+/**
+ * \brief AgentRadixSortUpsweep implements a stateful abstraction of CUDA thread blocks for participating in device-wide radix sort upsweep .
+ */
+template <
+    typename AgentRadixSortUpsweepPolicy,   ///< Parameterized AgentRadixSortUpsweepPolicy tuning policy type
+    typename KeyT,                          ///< KeyT type
+    typename OffsetT>                       ///< Signed integer type for global offsets
+struct AgentRadixSortUpsweep
+{
+
+    //---------------------------------------------------------------------
+    // Type definitions and constants
+    //---------------------------------------------------------------------
+
+    typedef typename Traits<KeyT>::UnsignedBits UnsignedBits;
+
+    // Integer type for digit counters (to be packed into words of PackedCounters)
+    typedef unsigned char DigitCounter;
+
+    // Integer type for packing DigitCounters into columns of shared memory banks
+    typedef unsigned int PackedCounter;
+
+    static const CacheLoadModifier LOAD_MODIFIER = AgentRadixSortUpsweepPolicy::LOAD_MODIFIER;
+
+    enum
+    {
+        RADIX_BITS              = AgentRadixSortUpsweepPolicy::RADIX_BITS,
+        BLOCK_THREADS           = AgentRadixSortUpsweepPolicy::BLOCK_THREADS,
+        KEYS_PER_THREAD         = AgentRadixSortUpsweepPolicy::ITEMS_PER_THREAD,
+
+        RADIX_DIGITS            = 1 << RADIX_BITS,
+
+        LOG_WARP_THREADS        = CUB_PTX_LOG_WARP_THREADS,
+        WARP_THREADS            = 1 << LOG_WARP_THREADS,
+        WARPS                   = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS,
+
+        TILE_ITEMS              = BLOCK_THREADS * KEYS_PER_THREAD,
+
+        BYTES_PER_COUNTER       = sizeof(DigitCounter),
+        LOG_BYTES_PER_COUNTER   = Log2<BYTES_PER_COUNTER>::VALUE,
+
+        PACKING_RATIO           = sizeof(PackedCounter) / sizeof(DigitCounter),
+        LOG_PACKING_RATIO       = Log2<PACKING_RATIO>::VALUE,
+
+        LOG_COUNTER_LANES       = CUB_MAX(0, RADIX_BITS - LOG_PACKING_RATIO),
+        COUNTER_LANES           = 1 << LOG_COUNTER_LANES,
+
+        // To prevent counter overflow, we must periodically unpack and aggregate the
+        // digit counters back into registers.  Each counter lane is assigned to a
+        // warp for aggregation.
+
+        LANES_PER_WARP          = CUB_MAX(1, (COUNTER_LANES + WARPS - 1) / WARPS),
+
+        // Unroll tiles in batches without risk of counter overflow
+        UNROLL_COUNT            = CUB_MIN(64, 255 / KEYS_PER_THREAD),
+        UNROLLED_ELEMENTS       = UNROLL_COUNT * TILE_ITEMS,
+    };
+
+
+    // Input iterator wrapper type (for applying cache modifier)s
+    typedef CacheModifiedInputIterator<LOAD_MODIFIER, UnsignedBits, OffsetT> KeysItr;
+
+    /**
+     * Shared memory storage layout
+     */
+    union __align__(16) _TempStorage
+    {
+        DigitCounter    thread_counters[COUNTER_LANES][BLOCK_THREADS][PACKING_RATIO];
+        PackedCounter   packed_thread_counters[COUNTER_LANES][BLOCK_THREADS];
+        OffsetT         block_counters[WARP_THREADS][RADIX_DIGITS];
+    };
+
+
+    /// Alias wrapper allowing storage to be unioned
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+    //---------------------------------------------------------------------
+    // Thread fields (aggregate state bundle)
+    //---------------------------------------------------------------------
+
+    // Shared storage for this CTA
+    _TempStorage    &temp_storage;
+
+    // Thread-local counters for periodically aggregating composite-counter lanes
+    OffsetT         local_counts[LANES_PER_WARP][PACKING_RATIO];
+
+    // Input and output device pointers
+    KeysItr         d_keys_in;
+
+    // The least-significant bit position of the current digit to extract
+    int             current_bit;
+
+    // Number of bits in current digit
+    int             num_bits;
+
+
+
+    //---------------------------------------------------------------------
+    // Helper structure for templated iteration
+    //---------------------------------------------------------------------
+
+    // Iterate
+    template <int COUNT, int MAX>
+    struct Iterate
+    {
+        // BucketKeys
+        static __device__ __forceinline__ void BucketKeys(
+            AgentRadixSortUpsweep       &cta,
+            UnsignedBits                keys[KEYS_PER_THREAD])
+        {
+            cta.Bucket(keys[COUNT]);
+
+            // Next
+            Iterate<COUNT + 1, MAX>::BucketKeys(cta, keys);
+        }
+    };
+
+    // Terminate
+    template <int MAX>
+    struct Iterate<MAX, MAX>
+    {
+        // BucketKeys
+        static __device__ __forceinline__ void BucketKeys(AgentRadixSortUpsweep &/*cta*/, UnsignedBits /*keys*/[KEYS_PER_THREAD]) {}
+    };
+
+
+    //---------------------------------------------------------------------
+    // Utility methods
+    //---------------------------------------------------------------------
+
+    /**
+     * Decode a key and increment corresponding smem digit counter
+     */
+    __device__ __forceinline__ void Bucket(UnsignedBits key)
+    {
+        // Perform transform op
+        UnsignedBits converted_key = Traits<KeyT>::TwiddleIn(key);
+
+        // Extract current digit bits
+        UnsignedBits digit = BFE(converted_key, current_bit, num_bits);
+
+        // Get sub-counter offset
+        UnsignedBits sub_counter = digit & (PACKING_RATIO - 1);
+
+        // Get row offset
+        UnsignedBits row_offset = digit >> LOG_PACKING_RATIO;
+
+        // Increment counter
+        temp_storage.thread_counters[row_offset][threadIdx.x][sub_counter]++;
+    }
+
+
+    /**
+     * Reset composite counters
+     */
+    __device__ __forceinline__ void ResetDigitCounters()
+    {
+        #pragma unroll
+        for (int LANE = 0; LANE < COUNTER_LANES; LANE++)
+        {
+            temp_storage.packed_thread_counters[LANE][threadIdx.x] = 0;
+        }
+    }
+
+
+    /**
+     * Reset the unpacked counters in each thread
+     */
+    __device__ __forceinline__ void ResetUnpackedCounters()
+    {
+        #pragma unroll
+        for (int LANE = 0; LANE < LANES_PER_WARP; LANE++)
+        {
+            #pragma unroll
+            for (int UNPACKED_COUNTER = 0; UNPACKED_COUNTER < PACKING_RATIO; UNPACKED_COUNTER++)
+            {
+                local_counts[LANE][UNPACKED_COUNTER] = 0;
+            }
+        }
+    }
+
+
+    /**
+     * Extracts and aggregates the digit counters for each counter lane
+     * owned by this warp
+     */
+    __device__ __forceinline__ void UnpackDigitCounts()
+    {
+        unsigned int warp_id = threadIdx.x >> LOG_WARP_THREADS;
+        unsigned int warp_tid = LaneId();
+
+        #pragma unroll
+        for (int LANE = 0; LANE < LANES_PER_WARP; LANE++)
+        {
+            const int counter_lane = (LANE * WARPS) + warp_id;
+            if (counter_lane < COUNTER_LANES)
+            {
+                #pragma unroll
+                for (int PACKED_COUNTER = 0; PACKED_COUNTER < BLOCK_THREADS; PACKED_COUNTER += WARP_THREADS)
+                {
+                    #pragma unroll
+                    for (int UNPACKED_COUNTER = 0; UNPACKED_COUNTER < PACKING_RATIO; UNPACKED_COUNTER++)
+                    {
+                        OffsetT counter = temp_storage.thread_counters[counter_lane][warp_tid + PACKED_COUNTER][UNPACKED_COUNTER];
+                        local_counts[LANE][UNPACKED_COUNTER] += counter;
+                    }
+                }
+            }
+        }
+    }
+
+
+    /**
+     * Processes a single, full tile
+     */
+    __device__ __forceinline__ void ProcessFullTile(OffsetT block_offset)
+    {
+        // Tile of keys
+        UnsignedBits keys[KEYS_PER_THREAD];
+
+        LoadDirectStriped<BLOCK_THREADS>(threadIdx.x, d_keys_in + block_offset, keys);
+
+        // Prevent hoisting
+        CTA_SYNC();
+
+        // Bucket tile of keys
+        Iterate<0, KEYS_PER_THREAD>::BucketKeys(*this, keys);
+    }
+
+
+    /**
+     * Processes a single load (may have some threads masked off)
+     */
+    __device__ __forceinline__ void ProcessPartialTile(
+        OffsetT block_offset,
+        const OffsetT &block_end)
+    {
+        // Process partial tile if necessary using single loads
+        block_offset += threadIdx.x;
+        while (block_offset < block_end)
+        {
+            // Load and bucket key
+            UnsignedBits key = d_keys_in[block_offset];
+            Bucket(key);
+            block_offset += BLOCK_THREADS;
+        }
+    }
+
+
+    //---------------------------------------------------------------------
+    // Interface
+    //---------------------------------------------------------------------
+
+    /**
+     * Constructor
+     */
+    __device__ __forceinline__ AgentRadixSortUpsweep(
+        TempStorage &temp_storage,
+        const KeyT  *d_keys_in,
+        int         current_bit,
+        int         num_bits)
+    :
+        temp_storage(temp_storage.Alias()),
+        d_keys_in(reinterpret_cast<const UnsignedBits*>(d_keys_in)),
+        current_bit(current_bit),
+        num_bits(num_bits)
+    {}
+
+
+    /**
+     * Compute radix digit histograms from a segment of input tiles.
+     */
+    __device__ __forceinline__ void ProcessRegion(
+        OffsetT          block_offset,
+        const OffsetT    &block_end)
+    {
+        // Reset digit counters in smem and unpacked counters in registers
+        ResetDigitCounters();
+        ResetUnpackedCounters();
+
+        // Unroll batches of full tiles
+        while (block_offset + UNROLLED_ELEMENTS <= block_end)
+        {
+            for (int i = 0; i < UNROLL_COUNT; ++i)
+            {
+                ProcessFullTile(block_offset);
+                block_offset += TILE_ITEMS;
+            }
+
+            CTA_SYNC();
+
+            // Aggregate back into local_count registers to prevent overflow
+            UnpackDigitCounts();
+
+            CTA_SYNC();
+
+            // Reset composite counters in lanes
+            ResetDigitCounters();
+        }
+
+        // Unroll single full tiles
+        while (block_offset + TILE_ITEMS <= block_end)
+        {
+            ProcessFullTile(block_offset);
+            block_offset += TILE_ITEMS;
+        }
+
+        // Process partial tile if necessary
+        ProcessPartialTile(
+            block_offset,
+            block_end);
+
+        CTA_SYNC();
+
+        // Aggregate back into local_count registers
+        UnpackDigitCounts();
+    }
+
+
+    /**
+     * Extract counts (saving them to the external array)
+     */
+    template <bool IS_DESCENDING>
+    __device__ __forceinline__ void ExtractCounts(
+        OffsetT     *counters,
+        int         bin_stride = 1,
+        int         bin_offset = 0)
+    {
+        unsigned int warp_id    = threadIdx.x >> LOG_WARP_THREADS;
+        unsigned int warp_tid   = LaneId();
+
+        // Place unpacked digit counters in shared memory
+        #pragma unroll
+        for (int LANE = 0; LANE < LANES_PER_WARP; LANE++)
+        {
+            int counter_lane = (LANE * WARPS) + warp_id;
+            if (counter_lane < COUNTER_LANES)
+            {
+                int digit_row = counter_lane << LOG_PACKING_RATIO;
+
+                #pragma unroll
+                for (int UNPACKED_COUNTER = 0; UNPACKED_COUNTER < PACKING_RATIO; UNPACKED_COUNTER++)
+                {
+                    int bin_idx = digit_row + UNPACKED_COUNTER;
+
+                    temp_storage.block_counters[warp_tid][bin_idx] =
+                        local_counts[LANE][UNPACKED_COUNTER];
+                }
+            }
+        }
+
+        CTA_SYNC();
+
+        // Rake-reduce bin_count reductions
+
+        // Whole blocks
+        #pragma unroll
+        for (int BIN_BASE   = RADIX_DIGITS % BLOCK_THREADS;
+            (BIN_BASE + BLOCK_THREADS) <= RADIX_DIGITS;
+            BIN_BASE += BLOCK_THREADS)
+        {
+            int bin_idx = BIN_BASE + threadIdx.x;
+
+            OffsetT bin_count = 0;
+            #pragma unroll
+            for (int i = 0; i < WARP_THREADS; ++i)
+                bin_count += temp_storage.block_counters[i][bin_idx];
+
+            if (IS_DESCENDING)
+                bin_idx = RADIX_DIGITS - bin_idx - 1;
+
+            counters[(bin_stride * bin_idx) + bin_offset] = bin_count;
+        }
+
+        // Remainder
+        if ((RADIX_DIGITS % BLOCK_THREADS != 0) && (threadIdx.x < RADIX_DIGITS))
+        {
+            int bin_idx = threadIdx.x;
+
+            OffsetT bin_count = 0;
+            #pragma unroll
+            for (int i = 0; i < WARP_THREADS; ++i)
+                bin_count += temp_storage.block_counters[i][bin_idx];
+
+            if (IS_DESCENDING)
+                bin_idx = RADIX_DIGITS - bin_idx - 1;
+
+            counters[(bin_stride * bin_idx) + bin_offset] = bin_count;
+        }
+    }
+
+
+    /**
+     * Extract counts
+     */
+    template <int BINS_TRACKED_PER_THREAD>
+    __device__ __forceinline__ void ExtractCounts(
+        OffsetT (&bin_count)[BINS_TRACKED_PER_THREAD])  ///< [out] The exclusive prefix sum for the digits [(threadIdx.x * BINS_TRACKED_PER_THREAD) ... (threadIdx.x * BINS_TRACKED_PER_THREAD) + BINS_TRACKED_PER_THREAD - 1]
+    {
+        unsigned int warp_id    = threadIdx.x >> LOG_WARP_THREADS;
+        unsigned int warp_tid   = LaneId();
+
+        // Place unpacked digit counters in shared memory
+        #pragma unroll
+        for (int LANE = 0; LANE < LANES_PER_WARP; LANE++)
+        {
+            int counter_lane = (LANE * WARPS) + warp_id;
+            if (counter_lane < COUNTER_LANES)
+            {
+                int digit_row = counter_lane << LOG_PACKING_RATIO;
+
+                #pragma unroll
+                for (int UNPACKED_COUNTER = 0; UNPACKED_COUNTER < PACKING_RATIO; UNPACKED_COUNTER++)
+                {
+                    int bin_idx = digit_row + UNPACKED_COUNTER;
+
+                    temp_storage.block_counters[warp_tid][bin_idx] =
+                        local_counts[LANE][UNPACKED_COUNTER];
+                }
+            }
+        }
+
+        CTA_SYNC();
+
+        // Rake-reduce bin_count reductions
+        #pragma unroll
+        for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track)
+        {
+            int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track;
+
+            if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS))
+            {
+                bin_count[track] = 0;
+
+                #pragma unroll
+                for (int i = 0; i < WARP_THREADS; ++i)
+                    bin_count[track] += temp_storage.block_counters[i][bin_idx];
+            }
+        }
+    }
+
+};
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
--- a/3rdparty/cub-1.8.0/cub/agent/agent_reduce.cuh
+++ b/3rdparty/cub-1.8.0/cub/agent/agent_reduce.cuh
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::AgentReduce implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduction .
+ */
+
+#pragma once
+
+#include <iterator>
+
+#include "../block/block_load.cuh"
+#include "../block/block_reduce.cuh"
+#include "../grid/grid_mapping.cuh"
+#include "../grid/grid_even_share.cuh"
+#include "../util_type.cuh"
+#include "../iterator/cache_modified_input_iterator.cuh"
+#include "../util_namespace.cuh"
+
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/******************************************************************************
+ * Tuning policy types
+ ******************************************************************************/
+
+/**
+ * Parameterizable tuning policy type for AgentReduce
+ */
+template <
+    int                     _BLOCK_THREADS,         ///< Threads per thread block
+    int                     _ITEMS_PER_THREAD,      ///< Items per thread (per tile of input)
+    int                     _VECTOR_LOAD_LENGTH,    ///< Number of items per vectorized load
+    BlockReduceAlgorithm    _BLOCK_ALGORITHM,       ///< Cooperative block-wide reduction algorithm to use
+    CacheLoadModifier       _LOAD_MODIFIER>         ///< Cache load modifier for reading input elements
+struct AgentReducePolicy
+{
+    enum
+    {
+        BLOCK_THREADS       = _BLOCK_THREADS,       ///< Threads per thread block
+        ITEMS_PER_THREAD    = _ITEMS_PER_THREAD,    ///< Items per thread (per tile of input)
+        VECTOR_LOAD_LENGTH  = _VECTOR_LOAD_LENGTH,  ///< Number of items per vectorized load
+    };
+
+    static const BlockReduceAlgorithm  BLOCK_ALGORITHM      = _BLOCK_ALGORITHM;     ///< Cooperative block-wide reduction algorithm to use
+    static const CacheLoadModifier     LOAD_MODIFIER        = _LOAD_MODIFIER;       ///< Cache load modifier for reading input elements
+};
+
+
+
+/******************************************************************************
+ * Thread block abstractions
+ ******************************************************************************/
+
+/**
+ * \brief AgentReduce implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduction .
+ *
+ * Each thread reduces only the values it loads. If \p FIRST_TILE, this
+ * partial reduction is stored into \p thread_aggregate.  Otherwise it is
+ * accumulated into \p thread_aggregate.
+ */
+template <
+    typename AgentReducePolicy,        ///< Parameterized AgentReducePolicy tuning policy type
+    typename InputIteratorT,           ///< Random-access iterator type for input
+    typename OutputIteratorT,          ///< Random-access iterator type for output
+    typename OffsetT,                  ///< Signed integer type for global offsets
+    typename ReductionOp>              ///< Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt>
+struct AgentReduce
+{
+
+    //---------------------------------------------------------------------
+    // Types and constants
+    //---------------------------------------------------------------------
+
+    /// The input value type
+    typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;
+
+    /// The output value type
+    typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),  // OutputT =  (if output iterator's value type is void) ?
+        typename std::iterator_traits<InputIteratorT>::value_type,                                          // ... then the input iterator's value type,
+        typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT;                          // ... else the output iterator's value type
+
+    /// Vector type of InputT for data movement
+    typedef typename CubVector<InputT, AgentReducePolicy::VECTOR_LOAD_LENGTH>::Type VectorT;
+
+    /// Input iterator wrapper type (for applying cache modifier)
+    typedef typename If<IsPointer<InputIteratorT>::VALUE,
+            CacheModifiedInputIterator<AgentReducePolicy::LOAD_MODIFIER, InputT, OffsetT>,      // Wrap the native input pointer with CacheModifiedInputIterator
+            InputIteratorT>::Type                                                               // Directly use the supplied input iterator type
+        WrappedInputIteratorT;
+
+    /// Constants
+    enum
+    {
+        BLOCK_THREADS       = AgentReducePolicy::BLOCK_THREADS,
+        ITEMS_PER_THREAD    = AgentReducePolicy::ITEMS_PER_THREAD,
+        VECTOR_LOAD_LENGTH  = CUB_MIN(ITEMS_PER_THREAD, AgentReducePolicy::VECTOR_LOAD_LENGTH),
+        TILE_ITEMS          = BLOCK_THREADS * ITEMS_PER_THREAD,
+
+        // Can vectorize according to the policy if the input iterator is a native pointer to a primitive type
+        ATTEMPT_VECTORIZATION   = (VECTOR_LOAD_LENGTH > 1) &&
+                                    (ITEMS_PER_THREAD % VECTOR_LOAD_LENGTH == 0) &&
+                                    (IsPointer<InputIteratorT>::VALUE) && Traits<InputT>::PRIMITIVE,
+
+    };
+
+    static const CacheLoadModifier    LOAD_MODIFIER   = AgentReducePolicy::LOAD_MODIFIER;
+    static const BlockReduceAlgorithm BLOCK_ALGORITHM = AgentReducePolicy::BLOCK_ALGORITHM;
+
+    /// Parameterized BlockReduce primitive
+    typedef BlockReduce<OutputT, BLOCK_THREADS, AgentReducePolicy::BLOCK_ALGORITHM> BlockReduceT;
+
+    /// Shared memory type required by this thread block
+    struct _TempStorage
+    {
+        typename BlockReduceT::TempStorage  reduce;
+    };
+
+    /// Alias wrapper allowing storage to be unioned
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+    //---------------------------------------------------------------------
+    // Per-thread fields
+    //---------------------------------------------------------------------
+
+    _TempStorage&           temp_storage;       ///< Reference to temp_storage
+    InputIteratorT          d_in;               ///< Input data to reduce
+    WrappedInputIteratorT   d_wrapped_in;       ///< Wrapped input data to reduce
+    ReductionOp             reduction_op;       ///< Binary reduction operator
+
+
+    //---------------------------------------------------------------------
+    // Utility
+    //---------------------------------------------------------------------
+
+
+    // Whether or not the input is aligned with the vector type (specialized for types we can vectorize)
+    template <typename Iterator>
+    static __device__ __forceinline__ bool IsAligned(
+        Iterator        d_in,
+        Int2Type<true>  /*can_vectorize*/)
+    {
+        return (size_t(d_in) & (sizeof(VectorT) - 1)) == 0;
+    }
+
+    // Whether or not the input is aligned with the vector type (specialized for types we cannot vectorize)
+    template <typename Iterator>
+    static __device__ __forceinline__ bool IsAligned(
+        Iterator        /*d_in*/,
+        Int2Type<false> /*can_vectorize*/)
+    {
+        return false;
+    }
+
+
+    //---------------------------------------------------------------------
+    // Constructor
+    //---------------------------------------------------------------------
+
+    /**
+     * Constructor
+     */
+    __device__ __forceinline__ AgentReduce(
+        TempStorage&            temp_storage,       ///< Reference to temp_storage
+        InputIteratorT          d_in,               ///< Input data to reduce
+        ReductionOp             reduction_op)       ///< Binary reduction operator
+    :
+        temp_storage(temp_storage.Alias()),
+        d_in(d_in),
+        d_wrapped_in(d_in),
+        reduction_op(reduction_op)
+    {}
+
+
+    //---------------------------------------------------------------------
+    // Tile consumption
+    //---------------------------------------------------------------------
+
+    /**
+     * Consume a full tile of input (non-vectorized)
+     */
+    template <int IS_FIRST_TILE>
+    __device__ __forceinline__ void ConsumeTile(
+        OutputT                 &thread_aggregate,
+        OffsetT                 block_offset,       ///< The offset the tile to consume
+        int                     /*valid_items*/,    ///< The number of valid items in the tile
+        Int2Type<true>          /*is_full_tile*/,   ///< Whether or not this is a full tile
+        Int2Type<false>         /*can_vectorize*/)  ///< Whether or not we can vectorize loads
+    {
+        OutputT items[ITEMS_PER_THREAD];
+
+        // Load items in striped fashion
+        LoadDirectStriped<BLOCK_THREADS>(threadIdx.x, d_wrapped_in + block_offset, items);
+
+        // Reduce items within each thread stripe
+        thread_aggregate = (IS_FIRST_TILE) ?
+            internal::ThreadReduce(items, reduction_op) :
+            internal::ThreadReduce(items, reduction_op, thread_aggregate);
+    }
+
+
+    /**
+     * Consume a full tile of input (vectorized)
+     */
+    template <int IS_FIRST_TILE>
+    __device__ __forceinline__ void ConsumeTile(
+        OutputT                 &thread_aggregate,
+        OffsetT                 block_offset,       ///< The offset the tile to consume
+        int                     /*valid_items*/,    ///< The number of valid items in the tile
+        Int2Type<true>          /*is_full_tile*/,   ///< Whether or not this is a full tile
+        Int2Type<true>          /*can_vectorize*/)  ///< Whether or not we can vectorize loads
+    {
+        // Alias items as an array of VectorT and load it in striped fashion
+        enum { WORDS =  ITEMS_PER_THREAD / VECTOR_LOAD_LENGTH };
+
+        // Fabricate a vectorized input iterator
+        InputT *d_in_unqualified = const_cast<InputT*>(d_in) + block_offset + (threadIdx.x * VECTOR_LOAD_LENGTH);
+        CacheModifiedInputIterator<AgentReducePolicy::LOAD_MODIFIER, VectorT, OffsetT> d_vec_in(
+            reinterpret_cast<VectorT*>(d_in_unqualified));
+
+        // Load items as vector items
+        InputT input_items[ITEMS_PER_THREAD];
+        VectorT *vec_items = reinterpret_cast<VectorT*>(input_items);
+        #pragma unroll
+        for (int i = 0; i < WORDS; ++i)
+            vec_items[i] = d_vec_in[BLOCK_THREADS * i];
+
+        // Convert from input type to output type
+        OutputT items[ITEMS_PER_THREAD];
+        #pragma unroll
+        for (int i = 0; i < ITEMS_PER_THREAD; ++i)
+            items[i] = input_items[i];
+
+        // Reduce items within each thread stripe
+        thread_aggregate = (IS_FIRST_TILE) ?
+            internal::ThreadReduce(items, reduction_op) :
+            internal::ThreadReduce(items, reduction_op, thread_aggregate);
+    }
+
+
+    /**
+     * Consume a partial tile of input
+     */
+    template <int IS_FIRST_TILE, int CAN_VECTORIZE>
+    __device__ __forceinline__ void ConsumeTile(
+        OutputT                 &thread_aggregate,
+        OffsetT                 block_offset,       ///< The offset the tile to consume
+        int                     valid_items,        ///< The number of valid items in the tile
+        Int2Type<false>         /*is_full_tile*/,   ///< Whether or not this is a full tile
+        Int2Type<CAN_VECTORIZE> /*can_vectorize*/)  ///< Whether or not we can vectorize loads
+    {
+        // Partial tile
+        int thread_offset = threadIdx.x;
+
+        // Read first item
+        if ((IS_FIRST_TILE) && (thread_offset < valid_items))
+        {
+            thread_aggregate = d_wrapped_in[block_offset + thread_offset];
+            thread_offset += BLOCK_THREADS;
+        }
+
+        // Continue reading items (block-striped)
+        while (thread_offset < valid_items)
+        {
+            OutputT item        = d_wrapped_in[block_offset + thread_offset];
+            thread_aggregate    = reduction_op(thread_aggregate, item);
+            thread_offset       += BLOCK_THREADS;
+        }
+    }
+
+
+    //---------------------------------------------------------------
+    // Consume a contiguous segment of tiles
+    //---------------------------------------------------------------------
+
+    /**
+     * \brief Reduce a contiguous segment of input tiles
+     */
+    template <int CAN_VECTORIZE>
+    __device__ __forceinline__ OutputT ConsumeRange(
+        GridEvenShare<OffsetT> &even_share,          ///< GridEvenShare descriptor
+        Int2Type<CAN_VECTORIZE> can_vectorize)      ///< Whether or not we can vectorize loads
+    {
+        OutputT thread_aggregate;
+
+        if (even_share.block_offset + TILE_ITEMS > even_share.block_end)
+        {
+            // First tile isn't full (not all threads have valid items)
+            int valid_items = even_share.block_end - even_share.block_offset;
+            ConsumeTile<true>(thread_aggregate, even_share.block_offset, valid_items, Int2Type<false>(), can_vectorize);
+            return BlockReduceT(temp_storage.reduce).Reduce(thread_aggregate, reduction_op, valid_items);
+        }
+
+        // At least one full block
+        ConsumeTile<true>(thread_aggregate, even_share.block_offset, TILE_ITEMS, Int2Type<true>(), can_vectorize);
+        even_share.block_offset += even_share.block_stride;
+
+        // Consume subsequent full tiles of input
+        while (even_share.block_offset + TILE_ITEMS <= even_share.block_end)
+        {
+            ConsumeTile<false>(thread_aggregate, even_share.block_offset, TILE_ITEMS, Int2Type<true>(), can_vectorize);
+            even_share.block_offset += even_share.block_stride;
+        }
+
+        // Consume a partially-full tile
+        if (even_share.block_offset < even_share.block_end)
+        {
+            int valid_items = even_share.block_end - even_share.block_offset;
+            ConsumeTile<false>(thread_aggregate, even_share.block_offset, valid_items, Int2Type<false>(), can_vectorize);
+        }
+
+        // Compute block-wide reduction (all threads have valid items)
+        return BlockReduceT(temp_storage.reduce).Reduce(thread_aggregate, reduction_op);
+    }
+
+
+    /**
+     * \brief Reduce a contiguous segment of input tiles
+     */
+    __device__ __forceinline__ OutputT ConsumeRange(
+        OffsetT block_offset,                       ///< [in] Threadblock begin offset (inclusive)
+        OffsetT block_end)                          ///< [in] Threadblock end offset (exclusive)
+    {
+        GridEvenShare<OffsetT> even_share;
+        even_share.template BlockInit<TILE_ITEMS>(block_offset, block_end);
+
+        return (IsAligned(d_in + block_offset, Int2Type<ATTEMPT_VECTORIZATION>())) ?
+            ConsumeRange(even_share, Int2Type<true && ATTEMPT_VECTORIZATION>()) :
+            ConsumeRange(even_share, Int2Type<false && ATTEMPT_VECTORIZATION>());
+    }
+
+
+    /**
+     * Reduce a contiguous segment of input tiles
+     */
+    __device__ __forceinline__ OutputT ConsumeTiles(
+        GridEvenShare<OffsetT> &even_share)        ///< [in] GridEvenShare descriptor
+    {
+        // Initialize GRID_MAPPING_STRIP_MINE even-share descriptor for this thread block
+        even_share.template BlockInit<TILE_ITEMS, GRID_MAPPING_STRIP_MINE>();
+
+        return (IsAligned(d_in, Int2Type<ATTEMPT_VECTORIZATION>())) ?
+            ConsumeRange(even_share, Int2Type<true && ATTEMPT_VECTORIZATION>()) :
+            ConsumeRange(even_share, Int2Type<false && ATTEMPT_VECTORIZATION>());
+
+    }
+
+};
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
--- a/3rdparty/cub-1.8.0/cub/agent/agent_reduce_by_key.cuh
+++ b/3rdparty/cub-1.8.0/cub/agent/agent_reduce_by_key.cuh
--- a/3rdparty/cub-1.8.0/cub/agent/agent_rle.cuh
+++ b/3rdparty/cub-1.8.0/cub/agent/agent_rle.cuh
--- a/3rdparty/cub-1.8.0/cub/agent/agent_scan.cuh
+++ b/3rdparty/cub-1.8.0/cub/agent/agent_scan.cuh
--- a/3rdparty/cub-1.8.0/cub/agent/agent_segment_fixup.cuh
+++ b/3rdparty/cub-1.8.0/cub/agent/agent_segment_fixup.cuh
--- a/3rdparty/cub-1.8.0/cub/agent/agent_select_if.cuh
+++ b/3rdparty/cub-1.8.0/cub/agent/agent_select_if.cuh
--- a/3rdparty/cub-1.8.0/cub/agent/agent_spmv_orig.cuh
+++ b/3rdparty/cub-1.8.0/cub/agent/agent_spmv_orig.cuh
--- a/3rdparty/cub-1.8.0/cub/agent/single_pass_scan_operators.cuh
+++ b/3rdparty/cub-1.8.0/cub/agent/single_pass_scan_operators.cuh
--- a/3rdparty/cub-1.8.0/cub/block/block_adjacent_difference.cuh
+++ b/3rdparty/cub-1.8.0/cub/block/block_adjacent_difference.cuh
--- a/3rdparty/cub-1.8.0/cub/block/block_discontinuity.cuh
+++ b/3rdparty/cub-1.8.0/cub/block/block_discontinuity.cuh
--- a/3rdparty/cub-1.8.0/cub/block/block_exchange.cuh
+++ b/3rdparty/cub-1.8.0/cub/block/block_exchange.cuh
--- a/3rdparty/cub-1.8.0/cub/block/block_histogram.cuh
+++ b/3rdparty/cub-1.8.0/cub/block/block_histogram.cuh
--- a/3rdparty/cub-1.8.0/cub/block/block_load.cuh
+++ b/3rdparty/cub-1.8.0/cub/block/block_load.cuh
--- a/3rdparty/cub-1.8.0/cub/block/block_radix_rank.cuh
+++ b/3rdparty/cub-1.8.0/cub/block/block_radix_rank.cuh
--- a/3rdparty/cub-1.8.0/cub/block/block_radix_sort.cuh
+++ b/3rdparty/cub-1.8.0/cub/block/block_radix_sort.cuh
--- a/3rdparty/cub-1.8.0/cub/block/block_raking_layout.cuh
+++ b/3rdparty/cub-1.8.0/cub/block/block_raking_layout.cuh
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::BlockRakingLayout provides a conflict-free shared memory layout abstraction for warp-raking across thread block data.
+ */
+
+
+#pragma once
+
+#include "../util_macro.cuh"
+#include "../util_arch.cuh"
+#include "../util_type.cuh"
+#include "../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+/**
+ * \brief BlockRakingLayout provides a conflict-free shared memory layout abstraction for 1D raking across thread block data.    ![](raking.png)
+ * \ingroup BlockModule
+ *
+ * \par Overview
+ * This type facilitates a shared memory usage pattern where a block of CUDA
+ * threads places elements into shared memory and then reduces the active
+ * parallelism to one "raking" warp of threads for serially aggregating consecutive
+ * sequences of shared items.  Padding is inserted to eliminate bank conflicts
+ * (for most data types).
+ *
+ * \tparam T                        The data type to be exchanged.
+ * \tparam BLOCK_THREADS            The thread block size in threads.
+ * \tparam PTX_ARCH                 <b>[optional]</b> \ptxversion
+ */
+template <
+    typename    T,
+    int         BLOCK_THREADS,
+    int         PTX_ARCH = CUB_PTX_ARCH>
+struct BlockRakingLayout
+{
+    //---------------------------------------------------------------------
+    // Constants and type definitions
+    //---------------------------------------------------------------------
+
+    enum
+    {
+        /// The total number of elements that need to be cooperatively reduced
+        SHARED_ELEMENTS = BLOCK_THREADS,
+
+        /// Maximum number of warp-synchronous raking threads
+        MAX_RAKING_THREADS = CUB_MIN(BLOCK_THREADS, CUB_WARP_THREADS(PTX_ARCH)),
+
+        /// Number of raking elements per warp-synchronous raking thread (rounded up)
+        SEGMENT_LENGTH = (SHARED_ELEMENTS + MAX_RAKING_THREADS - 1) / MAX_RAKING_THREADS,
+
+        /// Never use a raking thread that will have no valid data (e.g., when BLOCK_THREADS is 62 and SEGMENT_LENGTH is 2, we should only use 31 raking threads)
+        RAKING_THREADS = (SHARED_ELEMENTS + SEGMENT_LENGTH - 1) / SEGMENT_LENGTH,
+
+        /// Whether we will have bank conflicts (technically we should find out if the GCD is > 1)
+        HAS_CONFLICTS = (CUB_SMEM_BANKS(PTX_ARCH) % SEGMENT_LENGTH == 0),
+
+        /// Degree of bank conflicts (e.g., 4-way)
+        CONFLICT_DEGREE = (HAS_CONFLICTS) ?
+            (MAX_RAKING_THREADS * SEGMENT_LENGTH) / CUB_SMEM_BANKS(PTX_ARCH) :
+            1,
+
+        /// Pad each segment length with one element if segment length is not relatively prime to warp size and can't be optimized as a vector load
+        USE_SEGMENT_PADDING = ((SEGMENT_LENGTH & 1) == 0) && (SEGMENT_LENGTH > 2),
+
+        /// Total number of elements in the raking grid
+        GRID_ELEMENTS = RAKING_THREADS * (SEGMENT_LENGTH + USE_SEGMENT_PADDING),
+
+        /// Whether or not we need bounds checking during raking (the number of reduction elements is not a multiple of the number of raking threads)
+        UNGUARDED = (SHARED_ELEMENTS % RAKING_THREADS == 0),
+    };
+
+
+    /**
+     * \brief Shared memory storage type
+     */
+    struct __align__(16) _TempStorage
+    {
+        T buff[BlockRakingLayout::GRID_ELEMENTS];
+    };
+
+    /// Alias wrapper allowing storage to be unioned
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+    /**
+     * \brief Returns the location for the calling thread to place data into the grid
+     */
+    static __device__ __forceinline__ T* PlacementPtr(
+        TempStorage &temp_storage,
+        unsigned int linear_tid)
+    {
+        // Offset for partial
+        unsigned int offset = linear_tid;
+
+        // Add in one padding element for every segment
+        if (USE_SEGMENT_PADDING > 0)
+        {
+            offset += offset / SEGMENT_LENGTH;
+        }
+
+        // Incorporating a block of padding partials every shared memory segment
+        return temp_storage.Alias().buff + offset;
+    }
+
+
+    /**
+     * \brief Returns the location for the calling thread to begin sequential raking
+     */
+    static __device__ __forceinline__ T* RakingPtr(
+        TempStorage &temp_storage,
+        unsigned int linear_tid)
+    {
+        return temp_storage.Alias().buff + (linear_tid * (SEGMENT_LENGTH + USE_SEGMENT_PADDING));
+    }
+};
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
--- a/3rdparty/cub-1.8.0/cub/block/block_reduce.cuh
+++ b/3rdparty/cub-1.8.0/cub/block/block_reduce.cuh
--- a/3rdparty/cub-1.8.0/cub/block/block_scan.cuh
+++ b/3rdparty/cub-1.8.0/cub/block/block_scan.cuh
--- a/3rdparty/cub-1.8.0/cub/block/block_shuffle.cuh
+++ b/3rdparty/cub-1.8.0/cub/block/block_shuffle.cuh
--- a/3rdparty/cub-1.8.0/cub/block/block_store.cuh
+++ b/3rdparty/cub-1.8.0/cub/block/block_store.cuh
--- a/3rdparty/cub-1.8.0/cub/block/specializations/block_histogram_atomic.cuh
+++ b/3rdparty/cub-1.8.0/cub/block/specializations/block_histogram_atomic.cuh
--- a/3rdparty/cub-1.8.0/cub/block/specializations/block_histogram_sort.cuh
+++ b/3rdparty/cub-1.8.0/cub/block/specializations/block_histogram_sort.cuh
--- a/3rdparty/cub-1.8.0/cub/block/specializations/block_reduce_raking.cuh
+++ b/3rdparty/cub-1.8.0/cub/block/specializations/block_reduce_raking.cuh
--- a/3rdparty/cub-1.8.0/cub/block/specializations/block_reduce_raking_commutative_only.cuh
+++ b/3rdparty/cub-1.8.0/cub/block/specializations/block_reduce_raking_commutative_only.cuh
--- a/3rdparty/cub-1.8.0/cub/block/specializations/block_reduce_warp_reductions.cuh
+++ b/3rdparty/cub-1.8.0/cub/block/specializations/block_reduce_warp_reductions.cuh
--- a/3rdparty/cub-1.8.0/cub/block/specializations/block_scan_raking.cuh
+++ b/3rdparty/cub-1.8.0/cub/block/specializations/block_scan_raking.cuh
--- a/3rdparty/cub-1.8.0/cub/block/specializations/block_scan_warp_scans.cuh
+++ b/3rdparty/cub-1.8.0/cub/block/specializations/block_scan_warp_scans.cuh
--- a/3rdparty/cub-1.8.0/cub/block/specializations/block_scan_warp_scans2.cuh
+++ b/3rdparty/cub-1.8.0/cub/block/specializations/block_scan_warp_scans2.cuh
--- a/3rdparty/cub-1.8.0/cub/block/specializations/block_scan_warp_scans3.cuh
+++ b/3rdparty/cub-1.8.0/cub/block/specializations/block_scan_warp_scans3.cuh
--- a/3rdparty/cub-1.8.0/cub/cub.cuh
+++ b/3rdparty/cub-1.8.0/cub/cub.cuh
--- a/3rdparty/cub-1.8.0/cub/device/device_histogram.cuh
+++ b/3rdparty/cub-1.8.0/cub/device/device_histogram.cuh
--- a/3rdparty/cub-1.8.0/cub/device/device_partition.cuh
+++ b/3rdparty/cub-1.8.0/cub/device/device_partition.cuh
--- a/3rdparty/cub-1.8.0/cub/device/device_radix_sort.cuh
+++ b/3rdparty/cub-1.8.0/cub/device/device_radix_sort.cuh
--- a/3rdparty/cub-1.8.0/cub/device/device_reduce.cuh
+++ b/3rdparty/cub-1.8.0/cub/device/device_reduce.cuh
--- a/3rdparty/cub-1.8.0/cub/device/device_run_length_encode.cuh
+++ b/3rdparty/cub-1.8.0/cub/device/device_run_length_encode.cuh
--- a/3rdparty/cub-1.8.0/cub/device/device_scan.cuh
+++ b/3rdparty/cub-1.8.0/cub/device/device_scan.cuh
--- a/3rdparty/cub-1.8.0/cub/device/device_segmented_radix_sort.cuh
+++ b/3rdparty/cub-1.8.0/cub/device/device_segmented_radix_sort.cuh
--- a/3rdparty/cub-1.8.0/cub/device/device_segmented_reduce.cuh
+++ b/3rdparty/cub-1.8.0/cub/device/device_segmented_reduce.cuh
--- a/3rdparty/cub-1.8.0/cub/device/device_select.cuh
+++ b/3rdparty/cub-1.8.0/cub/device/device_select.cuh
--- a/3rdparty/cub-1.8.0/cub/device/device_spmv.cuh
+++ b/3rdparty/cub-1.8.0/cub/device/device_spmv.cuh
--- a/3rdparty/cub-1.8.0/cub/device/dispatch/dispatch_histogram.cuh
+++ b/3rdparty/cub-1.8.0/cub/device/dispatch/dispatch_histogram.cuh
--- a/3rdparty/cub-1.8.0/cub/device/dispatch/dispatch_radix_sort.cuh
+++ b/3rdparty/cub-1.8.0/cub/device/dispatch/dispatch_radix_sort.cuh
--- a/3rdparty/cub-1.8.0/cub/device/dispatch/dispatch_reduce.cuh
+++ b/3rdparty/cub-1.8.0/cub/device/dispatch/dispatch_reduce.cuh
--- a/3rdparty/cub-1.8.0/cub/device/dispatch/dispatch_reduce_by_key.cuh
+++ b/3rdparty/cub-1.8.0/cub/device/dispatch/dispatch_reduce_by_key.cuh
--- a/3rdparty/cub-1.8.0/cub/device/dispatch/dispatch_rle.cuh
+++ b/3rdparty/cub-1.8.0/cub/device/dispatch/dispatch_rle.cuh
--- a/3rdparty/cub-1.8.0/cub/device/dispatch/dispatch_scan.cuh
+++ b/3rdparty/cub-1.8.0/cub/device/dispatch/dispatch_scan.cuh
--- a/3rdparty/cub-1.8.0/cub/device/dispatch/dispatch_select_if.cuh
+++ b/3rdparty/cub-1.8.0/cub/device/dispatch/dispatch_select_if.cuh
--- a/3rdparty/cub-1.8.0/cub/device/dispatch/dispatch_spmv_orig.cuh
+++ b/3rdparty/cub-1.8.0/cub/device/dispatch/dispatch_spmv_orig.cuh
--- a/3rdparty/cub-1.8.0/cub/grid/grid_barrier.cuh
+++ b/3rdparty/cub-1.8.0/cub/grid/grid_barrier.cuh
--- a/3rdparty/cub-1.8.0/cub/grid/grid_even_share.cuh
+++ b/3rdparty/cub-1.8.0/cub/grid/grid_even_share.cuh
--- a/3rdparty/cub-1.8.0/cub/grid/grid_mapping.cuh
+++ b/3rdparty/cub-1.8.0/cub/grid/grid_mapping.cuh
--- a/3rdparty/cub-1.8.0/cub/grid/grid_queue.cuh
+++ b/3rdparty/cub-1.8.0/cub/grid/grid_queue.cuh
--- a/3rdparty/cub-1.8.0/cub/host/mutex.cuh
+++ b/3rdparty/cub-1.8.0/cub/host/mutex.cuh
--- a/3rdparty/cub-1.8.0/cub/iterator/arg_index_input_iterator.cuh
+++ b/3rdparty/cub-1.8.0/cub/iterator/arg_index_input_iterator.cuh
--- a/3rdparty/cub-1.8.0/cub/iterator/cache_modified_input_iterator.cuh
+++ b/3rdparty/cub-1.8.0/cub/iterator/cache_modified_input_iterator.cuh
--- a/3rdparty/cub-1.8.0/cub/iterator/cache_modified_output_iterator.cuh
+++ b/3rdparty/cub-1.8.0/cub/iterator/cache_modified_output_iterator.cuh
--- a/3rdparty/cub-1.8.0/cub/iterator/constant_input_iterator.cuh
+++ b/3rdparty/cub-1.8.0/cub/iterator/constant_input_iterator.cuh
--- a/3rdparty/cub-1.8.0/cub/iterator/counting_input_iterator.cuh
+++ b/3rdparty/cub-1.8.0/cub/iterator/counting_input_iterator.cuh
--- a/3rdparty/cub-1.8.0/cub/iterator/discard_output_iterator.cuh
+++ b/3rdparty/cub-1.8.0/cub/iterator/discard_output_iterator.cuh
--- a/3rdparty/cub-1.8.0/cub/iterator/tex_obj_input_iterator.cuh
+++ b/3rdparty/cub-1.8.0/cub/iterator/tex_obj_input_iterator.cuh
--- a/3rdparty/cub-1.8.0/cub/iterator/tex_ref_input_iterator.cuh
+++ b/3rdparty/cub-1.8.0/cub/iterator/tex_ref_input_iterator.cuh
--- a/3rdparty/cub-1.8.0/cub/iterator/transform_input_iterator.cuh
+++ b/3rdparty/cub-1.8.0/cub/iterator/transform_input_iterator.cuh
--- a/3rdparty/cub-1.8.0/cub/thread/thread_load.cuh
+++ b/3rdparty/cub-1.8.0/cub/thread/thread_load.cuh
--- a/3rdparty/cub-1.8.0/cub/thread/thread_operators.cuh
+++ b/3rdparty/cub-1.8.0/cub/thread/thread_operators.cuh
--- a/3rdparty/cub-1.8.0/cub/thread/thread_reduce.cuh
+++ b/3rdparty/cub-1.8.0/cub/thread/thread_reduce.cuh
--- a/3rdparty/cub-1.8.0/cub/thread/thread_scan.cuh
+++ b/3rdparty/cub-1.8.0/cub/thread/thread_scan.cuh
--- a/3rdparty/cub-1.8.0/cub/thread/thread_search.cuh
+++ b/3rdparty/cub-1.8.0/cub/thread/thread_search.cuh
--- a/3rdparty/cub-1.8.0/cub/thread/thread_store.cuh
+++ b/3rdparty/cub-1.8.0/cub/thread/thread_store.cuh
--- a/3rdparty/cub-1.8.0/cub/util_allocator.cuh
+++ b/3rdparty/cub-1.8.0/cub/util_allocator.cuh
--- a/3rdparty/cub-1.8.0/cub/util_arch.cuh
+++ b/3rdparty/cub-1.8.0/cub/util_arch.cuh
--- a/3rdparty/cub-1.8.0/cub/util_debug.cuh
+++ b/3rdparty/cub-1.8.0/cub/util_debug.cuh
--- a/3rdparty/cub-1.8.0/cub/util_device.cuh
+++ b/3rdparty/cub-1.8.0/cub/util_device.cuh
--- a/3rdparty/cub-1.8.0/cub/util_macro.cuh
+++ b/3rdparty/cub-1.8.0/cub/util_macro.cuh
--- a/3rdparty/cub-1.8.0/cub/util_namespace.cuh
+++ b/3rdparty/cub-1.8.0/cub/util_namespace.cuh
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * Place-holder for prefixing the cub namespace
+ */
+
+#pragma once
+
+// For example:
+//#define CUB_NS_PREFIX namespace thrust{ namespace detail {
+//#define CUB_NS_POSTFIX } }
+
+#ifndef CUB_NS_PREFIX
+#define CUB_NS_PREFIX
+#endif
+
+#ifndef CUB_NS_POSTFIX
+#define CUB_NS_POSTFIX
+#endif
--- a/3rdparty/cub-1.8.0/cub/util_ptx.cuh
+++ b/3rdparty/cub-1.8.0/cub/util_ptx.cuh
--- a/3rdparty/cub-1.8.0/cub/util_type.cuh
+++ b/3rdparty/cub-1.8.0/cub/util_type.cuh
--- a/3rdparty/cub-1.8.0/cub/warp/specializations/warp_reduce_shfl.cuh
+++ b/3rdparty/cub-1.8.0/cub/warp/specializations/warp_reduce_shfl.cuh
--- a/3rdparty/cub-1.8.0/cub/warp/specializations/warp_reduce_smem.cuh
+++ b/3rdparty/cub-1.8.0/cub/warp/specializations/warp_reduce_smem.cuh
--- a/3rdparty/cub-1.8.0/cub/warp/specializations/warp_scan_shfl.cuh
+++ b/3rdparty/cub-1.8.0/cub/warp/specializations/warp_scan_shfl.cuh
--- a/3rdparty/cub-1.8.0/cub/warp/specializations/warp_scan_smem.cuh
+++ b/3rdparty/cub-1.8.0/cub/warp/specializations/warp_scan_smem.cuh
--- a/3rdparty/cub-1.8.0/cub/warp/warp_reduce.cuh
+++ b/3rdparty/cub-1.8.0/cub/warp/warp_reduce.cuh
--- a/3rdparty/cub-1.8.0/cub/warp/warp_scan.cuh
+++ b/3rdparty/cub-1.8.0/cub/warp/warp_scan.cuh
--- a/3rdparty/cub-1.8.0/eclipse code style profile.xml
+++ b/3rdparty/cub-1.8.0/eclipse code style profile.xml
--- a/3rdparty/cub-1.8.0/examples/block/.gitignore
+++ b/3rdparty/cub-1.8.0/examples/block/.gitignore
+/bin
+/Debug
+/Release
+/cuda55.sdf
+/cuda55.suo
+/cuda60.sdf
+/cuda60.suo
--- a/3rdparty/cub-1.8.0/examples/block/Makefile
+++ b/3rdparty/cub-1.8.0/examples/block/Makefile
--- a/3rdparty/cub-1.8.0/examples/block/example_block_radix_sort.cu
+++ b/3rdparty/cub-1.8.0/examples/block/example_block_radix_sort.cu
--- a/3rdparty/cub-1.8.0/examples/block/example_block_reduce.cu
+++ b/3rdparty/cub-1.8.0/examples/block/example_block_reduce.cu
--- a/3rdparty/cub-1.8.0/examples/block/example_block_scan.cu
+++ b/3rdparty/cub-1.8.0/examples/block/example_block_scan.cu
--- a/3rdparty/cub-1.8.0/examples/block/reduce_by_key.cu
+++ b/3rdparty/cub-1.8.0/examples/block/reduce_by_key.cu
--- a/3rdparty/cub-1.8.0/examples/device/.gitignore
+++ b/3rdparty/cub-1.8.0/examples/device/.gitignore
--- a/3rdparty/cub-1.8.0/examples/device/Makefile
+++ b/3rdparty/cub-1.8.0/examples/device/Makefile
--- a/3rdparty/cub-1.8.0/examples/device/example_device_partition_flagged.cu
+++ b/3rdparty/cub-1.8.0/examples/device/example_device_partition_flagged.cu
--- a/3rdparty/cub-1.8.0/examples/device/example_device_partition_if.cu
+++ b/3rdparty/cub-1.8.0/examples/device/example_device_partition_if.cu
--- a/3rdparty/cub-1.8.0/examples/device/example_device_radix_sort.cu
+++ b/3rdparty/cub-1.8.0/examples/device/example_device_radix_sort.cu
--- a/3rdparty/cub-1.8.0/examples/device/example_device_reduce.cu
+++ b/3rdparty/cub-1.8.0/examples/device/example_device_reduce.cu
--- a/3rdparty/cub-1.8.0/examples/device/example_device_scan.cu
+++ b/3rdparty/cub-1.8.0/examples/device/example_device_scan.cu
--- a/3rdparty/cub-1.8.0/examples/device/example_device_select_flagged.cu
+++ b/3rdparty/cub-1.8.0/examples/device/example_device_select_flagged.cu
--- a/3rdparty/cub-1.8.0/examples/device/example_device_select_if.cu
+++ b/3rdparty/cub-1.8.0/examples/device/example_device_select_if.cu
--- a/3rdparty/cub-1.8.0/examples/device/example_device_select_unique.cu
+++ b/3rdparty/cub-1.8.0/examples/device/example_device_select_unique.cu
--- a/3rdparty/cub-1.8.0/examples/device/example_device_sort_find_non_trivial_runs.cu
+++ b/3rdparty/cub-1.8.0/examples/device/example_device_sort_find_non_trivial_runs.cu
--- a/3rdparty/cub-1.8.0/experimental/.gitignore
+++ b/3rdparty/cub-1.8.0/experimental/.gitignore
--- a/3rdparty/cub-1.8.0/experimental/Makefile
+++ b/3rdparty/cub-1.8.0/experimental/Makefile
--- a/3rdparty/cub-1.8.0/experimental/defunct/example_coo_spmv.cu
+++ b/3rdparty/cub-1.8.0/experimental/defunct/example_coo_spmv.cu
--- a/3rdparty/cub-1.8.0/experimental/defunct/test_device_seg_reduce.cu
+++ b/3rdparty/cub-1.8.0/experimental/defunct/test_device_seg_reduce.cu
--- a/3rdparty/cub-1.8.0/experimental/histogram/histogram_cub.h
+++ b/3rdparty/cub-1.8.0/experimental/histogram/histogram_cub.h
--- a/3rdparty/cub-1.8.0/experimental/histogram/histogram_gmem_atomics.h
+++ b/3rdparty/cub-1.8.0/experimental/histogram/histogram_gmem_atomics.h
--- a/3rdparty/cub-1.8.0/experimental/histogram/histogram_smem_atomics.h
+++ b/3rdparty/cub-1.8.0/experimental/histogram/histogram_smem_atomics.h
--- a/3rdparty/cub-1.8.0/experimental/histogram_compare.cu
+++ b/3rdparty/cub-1.8.0/experimental/histogram_compare.cu
--- a/3rdparty/cub-1.8.0/experimental/sparse_matrix.h
+++ b/3rdparty/cub-1.8.0/experimental/sparse_matrix.h
--- a/3rdparty/cub-1.8.0/experimental/spmv_compare.cu
+++ b/3rdparty/cub-1.8.0/experimental/spmv_compare.cu
--- a/3rdparty/cub-1.8.0/experimental/spmv_script.sh
+++ b/3rdparty/cub-1.8.0/experimental/spmv_script.sh
--- a/3rdparty/cub-1.8.0/test/.gitignore
+++ b/3rdparty/cub-1.8.0/test/.gitignore
--- a/3rdparty/cub-1.8.0/test/Makefile
+++ b/3rdparty/cub-1.8.0/test/Makefile
--- a/3rdparty/cub-1.8.0/test/half.h
+++ b/3rdparty/cub-1.8.0/test/half.h
--- a/3rdparty/cub-1.8.0/test/link_a.cu
+++ b/3rdparty/cub-1.8.0/test/link_a.cu
--- a/3rdparty/cub-1.8.0/test/link_b.cu
+++ b/3rdparty/cub-1.8.0/test/link_b.cu
--- a/3rdparty/cub-1.8.0/test/link_main.cpp
+++ b/3rdparty/cub-1.8.0/test/link_main.cpp
--- a/3rdparty/cub-1.8.0/test/mersenne.h
+++ b/3rdparty/cub-1.8.0/test/mersenne.h
--- a/3rdparty/cub-1.8.0/test/test_allocator.cu
+++ b/3rdparty/cub-1.8.0/test/test_allocator.cu
--- a/3rdparty/cub-1.8.0/test/test_block_histogram.cu
+++ b/3rdparty/cub-1.8.0/test/test_block_histogram.cu
--- a/3rdparty/cub-1.8.0/test/test_block_load_store.cu
+++ b/3rdparty/cub-1.8.0/test/test_block_load_store.cu
--- a/3rdparty/cub-1.8.0/test/test_block_radix_sort.cu
+++ b/3rdparty/cub-1.8.0/test/test_block_radix_sort.cu
--- a/3rdparty/cub-1.8.0/test/test_block_reduce.cu
+++ b/3rdparty/cub-1.8.0/test/test_block_reduce.cu
--- a/3rdparty/cub-1.8.0/test/test_block_scan.cu
+++ b/3rdparty/cub-1.8.0/test/test_block_scan.cu
--- a/3rdparty/cub-1.8.0/test/test_device_histogram.cu
+++ b/3rdparty/cub-1.8.0/test/test_device_histogram.cu
--- a/3rdparty/cub-1.8.0/test/test_device_radix_sort.cu
+++ b/3rdparty/cub-1.8.0/test/test_device_radix_sort.cu
--- a/3rdparty/cub-1.8.0/test/test_device_reduce.cu
+++ b/3rdparty/cub-1.8.0/test/test_device_reduce.cu
--- a/3rdparty/cub-1.8.0/test/test_device_reduce_by_key.cu
+++ b/3rdparty/cub-1.8.0/test/test_device_reduce_by_key.cu
--- a/3rdparty/cub-1.8.0/test/test_device_run_length_encode.cu
+++ b/3rdparty/cub-1.8.0/test/test_device_run_length_encode.cu
--- a/3rdparty/cub-1.8.0/test/test_device_scan.cu
+++ b/3rdparty/cub-1.8.0/test/test_device_scan.cu
--- a/3rdparty/cub-1.8.0/test/test_device_select_if.cu
+++ b/3rdparty/cub-1.8.0/test/test_device_select_if.cu
--- a/3rdparty/cub-1.8.0/test/test_device_select_unique.cu
+++ b/3rdparty/cub-1.8.0/test/test_device_select_unique.cu
--- a/3rdparty/cub-1.8.0/test/test_grid_barrier.cu
+++ b/3rdparty/cub-1.8.0/test/test_grid_barrier.cu
--- a/3rdparty/cub-1.8.0/test/test_iterator.cu
+++ b/3rdparty/cub-1.8.0/test/test_iterator.cu
--- a/3rdparty/cub-1.8.0/test/test_util.h
+++ b/3rdparty/cub-1.8.0/test/test_util.h
--- a/3rdparty/cub-1.8.0/test/test_warp_reduce.cu
+++ b/3rdparty/cub-1.8.0/test/test_warp_reduce.cu
--- a/3rdparty/cub-1.8.0/test/test_warp_scan.cu
+++ b/3rdparty/cub-1.8.0/test/test_warp_scan.cu
--- a/3rdparty/cub-1.8.0/tt
+++ b/3rdparty/cub-1.8.0/tt
--- a/3rdparty/cub-1.8.0/tune/.gitignore
+++ b/3rdparty/cub-1.8.0/tune/.gitignore
--- a/3rdparty/cub-1.8.0/tune/Makefile
+++ b/3rdparty/cub-1.8.0/tune/Makefile
--- a/3rdparty/cub-1.8.0/tune/tune_device_reduce.cu
+++ b/3rdparty/cub-1.8.0/tune/tune_device_reduce.cu
--- a/BUILD
+++ b/BUILD
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
--- a/LICENSE
+++ b/LICENSE
--- a/NOTICE
+++ b/NOTICE
--- a/README.md
+++ b/README.md
--- a/docs/build.md
+++ b/docs/build.md
--- a/docs/export_model.md
+++ b/docs/export_model.md
--- a/docs/performance.md
+++ b/docs/performance.md
--- a/example/gptlm_example.cu.cc
+++ b/example/gptlm_example.cu.cc
--- a/example/transformer_example.cu.cc
+++ b/example/transformer_example.cu.cc
--- a/kernels/common.h
+++ b/kernels/common.h
--- a/kernels/gptKernels.cu.cc
+++ b/kernels/gptKernels.cu.cc
--- a/kernels/gptKernels.h
+++ b/kernels/gptKernels.h
--- a/kernels/transformerKernels.cu.cc
+++ b/kernels/transformerKernels.cu.cc
--- a/kernels/transformerKernels.h
+++ b/kernels/transformerKernels.h
--- a/model/decoder.cu.cc
+++ b/model/decoder.cu.cc
--- a/model/decoder.h
+++ b/model/decoder.h
--- a/model/encoder.cu.cc
+++ b/model/encoder.cu.cc
--- a/model/encoder.h
+++ b/model/encoder.h
--- a/model/gpt_encoder.cu.cc
+++ b/model/gpt_encoder.cu.cc
--- a/model/gpt_encoder.h
+++ b/model/gpt_encoder.h
--- a/proto/gpt.proto
+++ b/proto/gpt.proto
--- a/proto/gpt_weight.cu.cc
+++ b/proto/gpt_weight.cu.cc
--- a/proto/gpt_weight.h
+++ b/proto/gpt_weight.h
--- a/proto/transformer.proto
+++ b/proto/transformer.proto
--- a/proto/transformer_weight.cu.cc
+++ b/proto/transformer_weight.cu.cc
--- a/proto/transformer_weight.h
+++ b/proto/transformer_weight.h
--- a/server/generate_server.cu.cc
+++ b/server/generate_server.cu.cc
--- a/server/gptlm_server.cu.cc
+++ b/server/gptlm_server.cu.cc
--- a/server/transformer_server.cu.cc
+++ b/server/transformer_server.cu.cc
--- a/tools/util.cu.cc
+++ b/tools/util.cu.cc
--- a/tools/util.h
+++ b/tools/util.h