
Wednesday, October 5, 2011

Performance and memory alignment to cache line boundaries

Here I would like to show a small program demonstrating that data aligned to cache line boundaries is accessed much faster than unaligned data:
#include "timer.h"

#include <cstddef>
#include <iostream>

// A is 76 bytes on a 32-bit build (72 bytes of data plus a 4-byte pointer),
// so consecutive elements straddle 64-byte cache line boundaries.
class A
{
public:
    int x[18];
    A* next;
};

// B is exactly 64 bytes on a 32-bit build (60 bytes of data plus a 4-byte pointer),
// so each element fits into a single cache line.
class B
{
public:
    int x[15];
    B* next;
};

template<class T>
void TestFunc(size_t len, size_t cycles)
{
    std::cout << "Size = " << sizeof(T) << std::endl;

    // Build a singly linked list over a contiguous array of T.
    T* m = new T[len];
    for (size_t i = 0; i < len; ++i)
    {
        m[i].next = 0;
        if (i > 0)
        {
            m[i - 1].next = &m[i];
        }
    }

    // Walk the whole list 'cycles' times, touching one int in every node.
    parallel::Timer timer;
    timer.Start();
    for (size_t i = 0; i < cycles; ++i)
    {
        T* item = &m[0];
        while (item != 0)
        {
            item->x[4] += 5;
            item = item->next;
        }
    }
    std::cout << cycles << " cycles in " << timer.End() << " ms\n";
    delete[] m;
}

int main(int /*argc*/, char* /*argv*/[])
{
    size_t len = 1024;
    size_t cycles = 5000;

    TestFunc<A>(len, cycles);
    TestFunc<B>(len, cycles);
    return 0;
}
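The timer.h header itself is not shown in the post. For anyone who wants to run the listing, here is a minimal sketch of a compatible parallel::Timer built on C++11 std::chrono; the Start()/End() interface returning milliseconds is my assumption from how it is used above, not the original implementation:

// timer.h -- hypothetical stand-in for the header used in the listing above.
// Assumes a C++11 compiler; the real class from the post may differ.
#pragma once

#include <chrono>

namespace parallel
{
    class Timer
    {
    public:
        void Start()
        {
            start_ = std::chrono::steady_clock::now();
        }

        // Elapsed time in milliseconds since the last Start().
        double End() const
        {
            std::chrono::duration<double, std::milli> elapsed =
                std::chrono::steady_clock::now() - start_;
            return elapsed.count();
        }

    private:
        std::chrono::steady_clock::time_point start_;
    };
}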
Here I assume that the cache line size is 64 bytes: with 4-byte ints and a 4-byte pointer (a 32-bit build), sizeof(B) is exactly 64 bytes, so every B object occupies a single cache line, while sizeof(A) is 76 bytes, so A objects regularly straddle two cache lines. I have also found a clear dependency between structure size and access speed: smaller structures are processed faster. If you run this example under AMD CodeAnalyst, you can see that there are no data cache misses for the second call (TestFunc<B>).
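On compilers with C++11 support, the alignment can also be requested explicitly instead of relying on the structure size happening to equal the cache line. The following is only a sketch under my 64-byte assumption, not part of the original test; note that new[] is only guaranteed to honor such over-alignment since C++17:

// Hypothetical variant with an explicit alignment request (C++11 alignas).
// alignas(64) pads and aligns the type so that sizeof(C) is a multiple of 64,
// letting every node start on a cache line boundary regardless of pointer size.
class alignas(64) C
{
public:
    int x[10];
    C* next;
};

static_assert(sizeof(C) == 64, "C should occupy exactly one cache line");

With that in place, TestFunc<C>(len, cycles) traverses nodes that each sit in their own cache line, which should behave like the B case on both 32-bit and 64-bit builds.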