1. 21 5月, 2016 1 次提交
    • Z
      lib/GCD.c: use binary GCD algorithm instead of Euclidean · fff7fb0b
      Zhaoxiu Zeng 提交于
      The binary GCD algorithm is based on the following facts:
      	1. If a and b are all evens, then gcd(a,b) = 2 * gcd(a/2, b/2)
      	2. If a is even and b is odd, then gcd(a,b) = gcd(a/2, b)
      	3. If a and b are all odds, then gcd(a,b) = gcd((a-b)/2, b) = gcd((a+b)/2, b)
      
      Even on x86 machines with reasonable division hardware, the binary
      algorithm runs about 25% faster (80% the execution time) than the
      division-based Euclidian algorithm.
      
      On platforms like Alpha and ARMv6 where division is a function call to
      emulation code, it's even more significant.
      
      There are two variants of the code here, depending on whether a fast
      __ffs (find least significant set bit) instruction is available.  This
      allows the unpredictable branches in the bit-at-a-time shifting loop to
      be eliminated.
      
      If fast __ffs is not available, the "even/odd" GCD variant is used.
      
      I use the following code to benchmark:
      
      	#include <stdio.h>
      	#include <stdlib.h>
      	#include <stdint.h>
      	#include <string.h>
      	#include <time.h>
      	#include <unistd.h>
      
      	#define swap(a, b) \
      		do { \
      			a ^= b; \
      			b ^= a; \
      			a ^= b; \
      		} while (0)
      
      	unsigned long gcd0(unsigned long a, unsigned long b)
      	{
      		unsigned long r;
      
      		if (a < b) {
      			swap(a, b);
      		}
      
      		if (b == 0)
      			return a;
      
      		while ((r = a % b) != 0) {
      			a = b;
      			b = r;
      		}
      
      		return b;
      	}
      
      	unsigned long gcd1(unsigned long a, unsigned long b)
      	{
      		unsigned long r = a | b;
      
      		if (!a || !b)
      			return r;
      
      		b >>= __builtin_ctzl(b);
      
      		for (;;) {
      			a >>= __builtin_ctzl(a);
      			if (a == b)
      				return a << __builtin_ctzl(r);
      
      			if (a < b)
      				swap(a, b);
      			a -= b;
      		}
      	}
      
      	unsigned long gcd2(unsigned long a, unsigned long b)
      	{
      		unsigned long r = a | b;
      
      		if (!a || !b)
      			return r;
      
      		r &= -r;
      
      		while (!(b & r))
      			b >>= 1;
      
      		for (;;) {
      			while (!(a & r))
      				a >>= 1;
      			if (a == b)
      				return a;
      
      			if (a < b)
      				swap(a, b);
      			a -= b;
      			a >>= 1;
      			if (a & r)
      				a += b;
      			a >>= 1;
      		}
      	}
      
      	unsigned long gcd3(unsigned long a, unsigned long b)
      	{
      		unsigned long r = a | b;
      
      		if (!a || !b)
      			return r;
      
      		b >>= __builtin_ctzl(b);
      		if (b == 1)
      			return r & -r;
      
      		for (;;) {
      			a >>= __builtin_ctzl(a);
      			if (a == 1)
      				return r & -r;
      			if (a == b)
      				return a << __builtin_ctzl(r);
      
      			if (a < b)
      				swap(a, b);
      			a -= b;
      		}
      	}
      
      	unsigned long gcd4(unsigned long a, unsigned long b)
      	{
      		unsigned long r = a | b;
      
      		if (!a || !b)
      			return r;
      
      		r &= -r;
      
      		while (!(b & r))
      			b >>= 1;
      		if (b == r)
      			return r;
      
      		for (;;) {
      			while (!(a & r))
      				a >>= 1;
      			if (a == r)
      				return r;
      			if (a == b)
      				return a;
      
      			if (a < b)
      				swap(a, b);
      			a -= b;
      			a >>= 1;
      			if (a & r)
      				a += b;
      			a >>= 1;
      		}
      	}
      
      	static unsigned long (*gcd_func[])(unsigned long a, unsigned long b) = {
      		gcd0, gcd1, gcd2, gcd3, gcd4,
      	};
      
      	#define TEST_ENTRIES (sizeof(gcd_func) / sizeof(gcd_func[0]))
      
      	#if defined(__x86_64__)
      
      	#define rdtscll(val) do { \
      		unsigned long __a,__d; \
      		__asm__ __volatile__("rdtsc" : "=a" (__a), "=d" (__d)); \
      		(val) = ((unsigned long long)__a) | (((unsigned long long)__d)<<32); \
      	} while(0)
      
      	static unsigned long long benchmark_gcd_func(unsigned long (*gcd)(unsigned long, unsigned long),
      								unsigned long a, unsigned long b, unsigned long *res)
      	{
      		unsigned long long start, end;
      		unsigned long long ret;
      		unsigned long gcd_res;
      
      		rdtscll(start);
      		gcd_res = gcd(a, b);
      		rdtscll(end);
      
      		if (end >= start)
      			ret = end - start;
      		else
      			ret = ~0ULL - start + 1 + end;
      
      		*res = gcd_res;
      		return ret;
      	}
      
      	#else
      
      	static inline struct timespec read_time(void)
      	{
      		struct timespec time;
      		clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &time);
      		return time;
      	}
      
      	static inline unsigned long long diff_time(struct timespec start, struct timespec end)
      	{
      		struct timespec temp;
      
      		if ((end.tv_nsec - start.tv_nsec) < 0) {
      			temp.tv_sec = end.tv_sec - start.tv_sec - 1;
      			temp.tv_nsec = 1000000000ULL + end.tv_nsec - start.tv_nsec;
      		} else {
      			temp.tv_sec = end.tv_sec - start.tv_sec;
      			temp.tv_nsec = end.tv_nsec - start.tv_nsec;
      		}
      
      		return temp.tv_sec * 1000000000ULL + temp.tv_nsec;
      	}
      
      	static unsigned long long benchmark_gcd_func(unsigned long (*gcd)(unsigned long, unsigned long),
      								unsigned long a, unsigned long b, unsigned long *res)
      	{
      		struct timespec start, end;
      		unsigned long gcd_res;
      
      		start = read_time();
      		gcd_res = gcd(a, b);
      		end = read_time();
      
      		*res = gcd_res;
      		return diff_time(start, end);
      	}
      
      	#endif
      
      	static inline unsigned long get_rand()
      	{
      		if (sizeof(long) == 8)
      			return (unsigned long)rand() << 32 | rand();
      		else
      			return rand();
      	}
      
      	int main(int argc, char **argv)
      	{
      		unsigned int seed = time(0);
      		int loops = 100;
      		int repeats = 1000;
      		unsigned long (*res)[TEST_ENTRIES];
      		unsigned long long elapsed[TEST_ENTRIES];
      		int i, j, k;
      
      		for (;;) {
      			int opt = getopt(argc, argv, "n:r:s:");
      			/* End condition always first */
      			if (opt == -1)
      				break;
      
      			switch (opt) {
      			case 'n':
      				loops = atoi(optarg);
      				break;
      			case 'r':
      				repeats = atoi(optarg);
      				break;
      			case 's':
      				seed = strtoul(optarg, NULL, 10);
      				break;
      			default:
      				/* You won't actually get here. */
      				break;
      			}
      		}
      
      		res = malloc(sizeof(unsigned long) * TEST_ENTRIES * loops);
      		memset(elapsed, 0, sizeof(elapsed));
      
      		srand(seed);
      		for (j = 0; j < loops; j++) {
      			unsigned long a = get_rand();
      			/* Do we have args? */
      			unsigned long b = argc > optind ? strtoul(argv[optind], NULL, 10) : get_rand();
      			unsigned long long min_elapsed[TEST_ENTRIES];
      			for (k = 0; k < repeats; k++) {
      				for (i = 0; i < TEST_ENTRIES; i++) {
      					unsigned long long tmp = benchmark_gcd_func(gcd_func[i], a, b, &res[j][i]);
      					if (k == 0 || min_elapsed[i] > tmp)
      						min_elapsed[i] = tmp;
      				}
      			}
      			for (i = 0; i < TEST_ENTRIES; i++)
      				elapsed[i] += min_elapsed[i];
      		}
      
      		for (i = 0; i < TEST_ENTRIES; i++)
      			printf("gcd%d: elapsed %llu\n", i, elapsed[i]);
      
      		k = 0;
      		srand(seed);
      		for (j = 0; j < loops; j++) {
      			unsigned long a = get_rand();
      			unsigned long b = argc > optind ? strtoul(argv[optind], NULL, 10) : get_rand();
      			for (i = 1; i < TEST_ENTRIES; i++) {
      				if (res[j][i] != res[j][0])
      					break;
      			}
      			if (i < TEST_ENTRIES) {
      				if (k == 0) {
      					k = 1;
      					fprintf(stderr, "Error:\n");
      				}
      				fprintf(stderr, "gcd(%lu, %lu): ", a, b);
      				for (i = 0; i < TEST_ENTRIES; i++)
      					fprintf(stderr, "%ld%s", res[j][i], i < TEST_ENTRIES - 1 ? ", " : "\n");
      			}
      		}
      
      		if (k == 0)
      			fprintf(stderr, "PASS\n");
      
      		free(res);
      
      		return 0;
      	}
      
      Compiled with "-O2", on "VirtualBox 4.4.0-22-generic #38-Ubuntu x86_64" got:
      
        zhaoxiuzeng@zhaoxiuzeng-VirtualBox:~/develop$ ./gcd -r 500000 -n 10
        gcd0: elapsed 10174
        gcd1: elapsed 2120
        gcd2: elapsed 2902
        gcd3: elapsed 2039
        gcd4: elapsed 2812
        PASS
        zhaoxiuzeng@zhaoxiuzeng-VirtualBox:~/develop$ ./gcd -r 500000 -n 10
        gcd0: elapsed 9309
        gcd1: elapsed 2280
        gcd2: elapsed 2822
        gcd3: elapsed 2217
        gcd4: elapsed 2710
        PASS
        zhaoxiuzeng@zhaoxiuzeng-VirtualBox:~/develop$ ./gcd -r 500000 -n 10
        gcd0: elapsed 9589
        gcd1: elapsed 2098
        gcd2: elapsed 2815
        gcd3: elapsed 2030
        gcd4: elapsed 2718
        PASS
        zhaoxiuzeng@zhaoxiuzeng-VirtualBox:~/develop$ ./gcd -r 500000 -n 10
        gcd0: elapsed 9914
        gcd1: elapsed 2309
        gcd2: elapsed 2779
        gcd3: elapsed 2228
        gcd4: elapsed 2709
        PASS
      
      [akpm@linux-foundation.org: avoid #defining a CONFIG_ variable]
      Signed-off-by: NZhaoxiu Zeng <zhaoxiu.zeng@gmail.com>
      Signed-off-by: NGeorge Spelvin <linux@horizon.com>
      Signed-off-by: NAndrew Morton <akpm@linux-foundation.org>
      Signed-off-by: NLinus Torvalds <torvalds@linux-foundation.org>
      fff7fb0b
  2. 09 5月, 2016 4 次提交
  3. 05 5月, 2016 1 次提交
    • V
      ARC: support HIGHMEM even without PAE40 · 26f9d5fd
      Vineet Gupta 提交于
      Initial HIGHMEM support on ARC was introduced for PAE40 where the low
      memory (0x8000_0000 based) and high memory (0x1_0000_0000) were
      physically contiguous. So CONFIG_FLATMEM sufficed (despite a peipheral
      hole in the middle, which wasted a bit of struct page memory, but things
      worked).
      
      However w/o PAE, highmem was not possible and we could only reach
      ~1.75GB of DDR. Now there is a use case to access ~4GB of DDR w/o PAE40
      The idea is to have low memory at canonical 0x8000_0000 and highmem
      at 0 so enire 4GB address space is available for physical addressing
      This needs additional platform/interconnect mapping to convert
      the non contiguous physical addresses into linear bus adresses.
      
      From Linux point of view, non contiguous divide means FLATMEM no
      longer works and DISCONTIGMEM is needed to track the pfns in the 2
      regions.
      
      This scheme would also work for PAE40, only better in that we don't
      waste struct page memory for the peripheral hole.
      
      The DT description will be something like
      
          memory {
              ...
              reg = <0x80000000 0x200000000   /* 512MB: lowmem */
                     0x00000000 0x10000000>;  /* 256MB: highmem */
         }
      Signed-off-by: NNoam Camus <noamc@ezchip.com>
      Signed-off-by: NVineet Gupta <vgupta@synopsys.com>
      26f9d5fd
  4. 27 4月, 2016 2 次提交
  5. 07 4月, 2016 1 次提交
    • A
      ARC: Don't source drivers/pci/pcie/Kconfig ourselves · 732dc97b
      Andreas Ziegler 提交于
      Commit 5f8fc432 ("PCI: Include pci/pcie/Kconfig directly from
      pci/Kconfig") in linux-next changed drivers/pci/Kconfig to include
      drivers/pci/pcie/Kconfig itself, so that architectures do not need
      to source both files themselves. ARC just recently gained PCI support
      through commit 6b3fb77998dd ("ARC: Add PCI support"), but this change
      was based on the old behaviour of the Kconfig files. This makes
      Kconfig now spit out the following warnings:
      
      drivers/pci/pcie/Kconfig:61:warning: choice value used outside its choice group
      drivers/pci/pcie/Kconfig:67:warning: choice value used outside its choice group
      drivers/pci/pcie/Kconfig:74:warning: choice value used outside its choice group
      
      This change updates the Kconfig file for ARC, dropping the now
      unnecessary 'source' statement, which makes the warning disappear.
      Signed-off-by: NAndreas Ziegler <andreas.ziegler@fau.de>
      Signed-off-by: NVineet Gupta <vgupta@synopsys.com>
      732dc97b
  6. 19 3月, 2016 1 次提交
  7. 15 3月, 2016 1 次提交
  8. 12 3月, 2016 1 次提交
  9. 11 3月, 2016 1 次提交
  10. 24 2月, 2016 1 次提交
  11. 23 2月, 2016 1 次提交
    • A
      arc: get rid of DEVTMPFS dependency on INITRAMFS_SOURCE · 3e5177c1
      Alexey Brodkin 提交于
      Even though DEVTMPFS is required when our pre-built initramfs
      is used it is not the case in general. It is perfectly possible
      to use initramfs with device nodes already populated or there
      could be other usages, see discussion below for more detials:
      http://thread.gmane.org/gmane.comp.embedded.openwrt.devel/37819/focus=37821
      
      This change removes mentioned dependency from arch/arc/Kconfig
      updating instead those defconfigs that are usually used with this
      kind of pre-build initramfs.
      
      And while at it all touched defconfigs were regenerated via
      savedefconfig and some options were removed:
       * USB is selected by other options implicitly
       * VGA_CONSOLE is disableb for ARC since
         031e29b5
       * EXT3_FS automatically selects EXT4_FS
       * MTDxxx and JFFS2_FS make no sense for AXS because
         AXS NAND controller is not upstreamed
       * NET_OSCI_LAN is not in upstream as well
       * ARCPGU_xxx options make no sense because ARC PGU is not yet
         in upstream and when it gets there all config options would
         be taken from devicetree
      Signed-off-by: NAlexey Brodkin <abrodkin@synopsys.com>
      Signed-off-by: NVineet Gupta <vgupta@synopsys.com>
      3e5177c1
  12. 18 2月, 2016 1 次提交
  13. 12 2月, 2016 1 次提交
    • V
      ARC: mm: Introduce explicit super page size support · 37eda9df
      Vineet Gupta 提交于
      MMUv4 supports 2 concurrent page sizes: Normal and Super [4K to 16M]
      
      So far Linux supported a single super page size for a given Normal page,
      depending on the software page walking address split.
      e.g. we had 11:8:13 address split for 8K page, which meant super page
      was 2 ^(8+13) = 2M (given that THP size has to be PMD_SHIFT)
      
      Now we turn this around, by allowing multiple Super Pages in Kconfig
      (currently 2M and 16M only) and forcing page walker address split to
      PGDIR_SHIFT and PAGE_SHIFT
      
      For configs without Super page, things are same as before and
      PGDIR_SHIFT can be hacked to get non default address split
      
      The motivation for this change is a customer who needs 16M super page
      and a 8K Normal page combo.
      Signed-off-by: NVineet Gupta <vgupta@synopsys.com>
      37eda9df
  14. 29 1月, 2016 1 次提交
  15. 21 1月, 2016 2 次提交
  16. 17 1月, 2016 1 次提交
  17. 17 12月, 2015 1 次提交
  18. 29 10月, 2015 1 次提交
  19. 28 10月, 2015 2 次提交
    • V
      ARC: mm: HIGHMEM: kmap API implementation · 45890f6d
      Vineet Gupta 提交于
      Implement kmap* API for ARC.
      
      This enables
       - permanent kernel maps (pkmaps): :kmap() API
       - fixmap : kmap_atomic()
      
      We use a very simple/uniform approach for both (unlike some of the other
      arches). So fixmap doesn't use the customary compile time address stuff.
      The important semantic is sleep'ability (pkmap) vs. not (fixmap) which
      the API guarantees.
      
      Note that this patch only enables highmem for subsequent PAE40 support
      as there is no real highmem for ARC in pure 32-bit paradigm as explained
      below.
      
      ARC has 2:2 address split of the 32-bit address space with lower half
      being translated (virtual) while upper half unstranslated
      (0x8000_0000 to 0xFFFF_FFFF). kernel itself is linked at base of
      unstranslated space (i.e. 0x8000_0000 onwards), which is mapped to say
      DDR 0x0 by external Bus Glue logic (outside the core). So kernel can
      potentially access 1.75G worth of memory directly w/o need for highmem.
      (the top 256M is taken by uncached peripheral space from 0xF000_0000 to
      0xFFFF_FFFF)
      
      In PAE40, hardware can address memory beyond 4G (0x1_0000_0000) while
      the logical/virtual addresses remain 32-bits. Thus highmem is required
      for kernel proper to be able to access these pages for it's own purposes
      (user space is agnostic to this anyways).
      Signed-off-by: NAlexey Brodkin <abrodkin@synopsys.com>
      Signed-off-by: NVineet Gupta <vgupta@synopsys.com>
      45890f6d
    • V
      ARC: boot: Support Halt-on-reset and Run-on-reset SMP booting modes · 3971cdc2
      Vineet Gupta 提交于
      For Run-on-reset, non masters need to spin wait. For Halt-on-reset they
      can jump to entry point directly.
      
      Also while at it, made reset vector handler as "the" entry point for
      kernel including host debugger based boot (which uses the ELF header
      entry point)
      Signed-off-by: NVineet Gupta <vgupta@synopsys.com>
      3971cdc2
  20. 17 10月, 2015 2 次提交
    • V
    • V
      ARCv2: mm: THP support · fe6c1b86
      Vineet Gupta 提交于
      MMUv4 in HS38x cores supports Super Pages which are basis for Linux THP
      support.
      
      Normal and Super pages can co-exist (ofcourse not overlap) in TLB with a
      new bit "SZ" in TLB page desciptor to distinguish between them.
      Super Page size is configurable in hardware (4K to 16M), but fixed once
      RTL builds.
      
      The exact THP size a Linx configuration will support is a function of:
       - MMU page size (typical 8K, RTL fixed)
       - software page walker address split between PGD:PTE:PFN (typical
         11:8:13, but can be changed with 1 line)
      
      So for above default, THP size supported is 8K * 256 = 2M
      
      Default Page Walker is 2 levels, PGD:PTE:PFN, which in THP regime
      reduces to 1 level (as PTE is folded into PGD and canonically referred
      to as PMD).
      
      Thus thp PMD accessors are implemented in terms of PTE (just like sparc)
      Signed-off-by: NVineet Gupta <vgupta@synopsys.com>
      fe6c1b86
  21. 20 8月, 2015 1 次提交
  22. 11 8月, 2015 1 次提交
  23. 04 8月, 2015 1 次提交
  24. 23 7月, 2015 1 次提交
  25. 20 7月, 2015 1 次提交
  26. 06 7月, 2015 1 次提交
  27. 25 6月, 2015 1 次提交
  28. 22 6月, 2015 6 次提交