Search in the blog

середа, 5 жовтня 2011 р.

Performance and memory alignment to cache line size boundaries

Here i would like to show an example of the application where you can see that data aligned to the boundaries of cache line size is accessed much more faster then unaligned:
  1. #include "timer.h"
  2.  
  3. #include <algorithm>
  4. #include <iostream>
  5.  
  6. class A
  7. {
  8. public:
  9.     int x[18];
  10.     A* next;
  11. };
  12.  
  13. class B
  14. {
  15. public:
  16.     int x[15];
  17.     B* next;
  18. };
  19.  
  20. template<class T>
  21. void TestFunc(size_t len, size_t cycles)
  22. {
  23.     size_t l = sizeof(T);        
  24.     std::cout << "Size = " << l << std::endl;
  25.  
  26.     T* m = new T[len];
  27.     for(size_t i = 0; i < len; ++i)
  28.     {
  29.         m[i].next = 0;
  30.         if (i > 0)
  31.         {
  32.             m[i - 1].next = &m[i];
  33.         }
  34.     }
  35.     parallel::Timer timer;
  36.     timer.Start();
  37.     for (size_t i = 0; i < cycles; ++i)
  38.     {
  39.         T* item = &m[0];
  40.         while (item != 0)
  41.         {
  42.             item->x[4] += 5;
  43.             item = item->next;
  44.         }
  45.     }
  46.     std::cout << cycles << " cycles in " << timer.End() << " ms\n";
  47.     delete[] m;
  48. }
  49.  
  50.  
  51. int main(int /*argc*/, char* /*argv*/[])
  52. {      
  53.     size_t len = 1024;
  54.     size_t cycles = 5000;
  55.  
  56.     TestFunc<A>(len, cycles);
  57.     TestFunc<B>(len, cycles);
  58.     return 0;
  59. }
  60.  
Here i assume that the cache line size is 64 bytes. Also I have founded for myself dependency between structure size and access speed - Smaller structures are processed quickly. You can look at this example under ADM CodeAnalyst to see that there are no data cache misses for second function.

субота, 23 квітня 2011 р.

Active object and future result TBB implementation

After the reading articles about future results and active objects i have implemented a class which realize these concepts, as threading library i used TBB. Here is a code:
Active.h:
#ifndef _ACTIVE_OBJECT_H_
#define _ACTIVE_OBJECT_H_

#include "Types.h"

#include <functional>
#include <memory>

#include <tbb/tbb.h>

namespace Measurements
{

template <class T>
class FutureResult
{
public:
    FutureResult()
    {
        isAvailable = false;
        haveError = false;
    }

    void SetValue(const T& val)
    {
        tbb::mutex::scoped_lock lock(guard);
        value = val;
        isAvailable = true;
        haveError = false;
    }

    T GetValue() const
    {
        tbb::mutex::scoped_lock lock(guard);
        isAvailable = false;
        haveError = false;
        return value;
    }

    bool IsAvailable() const
    {
        return isAvailable;
    }

    bool HaveError() const
    {
        return haveError;
    }

    std::string GetErrorReport() const
    {
        tbb::mutex::scoped_lock lock(guard);
        return errorReport;
    }

    void SetErrorReport(const std::string& error)
    {
        tbb::mutex::scoped_lock lock(guard);
        errorReport = error;
    }

private:
    T value;
    std::string errorReport;
    mutable tbb::atomic<bool> isAvailable;
    mutable tbb::atomic<bool> haveError;
    mutable tbb::mutex guard;
};

template <class R, class F, class O>
class FunctionRunner : public tbb::task
{
public:
    FunctionRunner(R r, F f, O* p)
        : result(r)
        , func(f)
        , ptr(p)
    {}
    virtual task* execute()  
    {
        try
        {
            result->SetValue((ptr->*func)());
        }
        catch(std::exception& e)
        {
            result->SetErrorReport(e.what());
        }
        return 0;
    }
    R result;
    F func;
    O* ptr;
};

class Active

{

public:

    Active();



    ~Active();



    template<class R, class F, class O>

    std::shared_ptr<FutureResult<R> > ExecuteParallel(F f, O* p)

    {

        std::shared_ptr<FutureResult<R> > result(new FutureResult<R>());

        

        tbb::task& tbbTask = *(new(parent->allocate_child())
                               FunctionRunner<decltype(result),F,O>(result, f, p));  

        parent->increment_ref_count();  

        parent->spawn(tbbTask);  



        return result;

    }



    void WaitAll() const;



private:



    Active(const Active&);

    void operator= (const Active&);



private:

    tbb::empty_task* parent;

};

}

#endif

Active.cpp:
#include "Active.h"

namespace Measurements
{
/*******************************************************************************/
Active::Active()

{

    parent = new( tbb::task::allocate_root() ) tbb::empty_task;  

    parent->set_ref_count(1);

}

/*******************************************************************************/

Active::~Active() 

{

    parent->wait_for_all();
    parent->destroy(*parent);

}
/*******************************************************************************/
void Active::WaitAll() const
{
    parent->wait_for_all();
    parent->set_ref_count(1);
}
/*******************************************************************************/
}

середа, 26 січня 2011 р.

Debug STL with GDB

I've found very useful script for viewing STL containers under GDB here is link. Also i duplicated them here :

##########################################
# #
# STL GDB evaluators/views/utilities #
# #
##########################################
#
# The new GDB commands:
# are entirely non instrumental
# do not depend on any "inline"(s) - e.g. size(), [], etc
# are extremely tolerant to debugger settings
#
# This file should be "included" in .gdbinit as following:
# source stl-views.gdb or just paste it into your .gdbinit file
#
# The following STL containers are currently supported:
#
# std::vector -- via pvector command
# std::list -- via plist command
# std::map -- via pmap command
# std::multimap -- via pmap command
# std::set -- via pset command
# std::multiset -- via pset command
# std::deque -- via pdequeue command
# std::stack -- via pstack command
# std::queue -- via pqueue command
# std::priority_queue -- via ppqueue command
# std::bitset -- via pbitset command
# std::string -- via pstring command
# std::widestring -- via pwstring command
#
# The end of this file contains (optional) C++ beautifiers
#
##########################################################################
# #
# CopyRight @ 2008 - Dan C Marinescu - All Rights Reserved under GPL V3. #
# #
# Email: dan_c_marinescu.RemoveThis@yahoo.com #
# #
##########################################################################



#
# std::vector<>
#

define pvector
if $argc == 0
help pvector
else
set $size = $arg0._M_impl._M_finish - $arg0._M_impl._M_start
set $capacity = $arg0._M_impl._M_end_of_storage - $arg0._M_impl._M_start
set $size_max = $size - 1
end
if $argc == 1
set $i = 0
while $i < $size printf "elem[%u]: ", $i p *($arg0._M_impl._M_start + $i) set $i++ end end if $argc == 2 set $idx = $arg1 if $idx < 0 || $idx > $size_max
printf "idx1, idx2 are not in acceptable range: [0..%u].\n", $size_max
else
printf "elem[%u]: ", $idx
p *($arg0._M_impl._M_start + $idx)
end
end
if $argc == 3
set $start_idx = $arg1
set $stop_idx = $arg2
if $start_idx > $stop_idx
set $tmp_idx = $start_idx
set $start_idx = $stop_idx
set $stop_idx = $tmp_idx
end
if $start_idx < 0 || $stop_idx < 0 || $start_idx > $size_max || $stop_idx > $size_max
printf "idx1, idx2 are not in acceptable range: [0..%u].\n", $size_max
else
set $i = $start_idx
while $i <= $stop_idx printf "elem[%u]: ", $i p *($arg0._M_impl._M_start + $i) set $i++ end end end if $argc > 0
printf "Vector size = %u\n", $size
printf "Vector capacity = %u\n", $capacity
printf "Element "
whatis $arg0._M_impl._M_start
end
end

document pvector
Prints std::vector information.
Syntax: pvector
Note: idx, idx1 and idx2 must be in acceptable range [0...size()-1].
Examples:
pvector v - Prints vector content, size, capacity and T typedef
pvector v 0 - Prints element[idx] from vector
pvector v 1 2 - Prints elements in range [idx1..idx2] from vector
end



#
# std::list<>
#

define plist
if $argc == 0
help plist
else
set $head = &$arg0._M_impl._M_node
set $current = $arg0->_M_impl->_M_node->_M_next
set $size = 0
while $current != $head
if $argc == 2
printf "elem[%u]: ", $size
p *($arg1*)($current + 1)
end
if $argc == 3
if $size == $arg2
printf "elem[%u]: ", $size
p *($arg1*)($current + 1)
end
end
set $current = $current->_M_next
set $size++
end
printf "List size = %u \n", $size
if $argc == 1
printf "List "
whatis $arg0
printf "Use plist to see the elements in the list.\n"
end
end
end

document plist
Prints std::list information.
Syntax: plist : Prints list size, if T defined all elements or just element at idx
Examples:
plist l - prints list size and definition
plist l int - prints all elements and list size
plist l int 2 - prints the third element in the list (if exists) and list size
end



#
# std::map and std::multimap
#

define pmap
if $argc == 0
help pmap
else
set $tree = $arg0
set $i = 0
set $node = $tree->_M_t->_M_impl->_M_header->_M_left
set $end = $tree->_M_t->_M_impl->_M_header
set $tree_size = $tree->_M_t->_M_impl->_M_node_count
if $argc == 1
printf "Map "
whatis $tree
printf "Use pmap to see the elements in the map.\n"
end
if $argc == 3
while $i < $tree_size set $value = (void *)($node + 1) printf "elem[%u]->left: ", $i
p *($arg1*)$value
set $value = $value + 4
printf "elem[%u]->right: ", $i
p *($arg2*)$value
if $node->_M_right != 0
set $node = $node->_M_right
while $node->_M_left != 0
set $node = $node->_M_left
end
else
set $tmp_node = $node->_M_parent
while $node == $tmp_node->_M_right
set $node = $tmp_node
set $tmp_node = $tmp_node->_M_parent
end
if $node->_M_right != $tmp_node
set $node = $tmp_node
end
end
set $i++
end
end
if $argc == 4
set $idx = $arg3
set $ElementsFound = 0
while $i < $tree_size set $value = (void *)($node + 1) if *($arg1*)$value == $idx printf "elem[%u]->left: ", $i
p *($arg1*)$value
set $value = $value + 4
printf "elem[%u]->right: ", $i
p *($arg2*)$value
set $ElementsFound++
end
if $node->_M_right != 0
set $node = $node->_M_right
while $node->_M_left != 0
set $node = $node->_M_left
end
else
set $tmp_node = $node->_M_parent
while $node == $tmp_node->_M_right
set $node = $tmp_node
set $tmp_node = $tmp_node->_M_parent
end
if $node->_M_right != $tmp_node
set $node = $tmp_node
end
end
set $i++
end
printf "Number of elements found = %u\n", $ElementsFound
end
if $argc == 5
set $idx1 = $arg3
set $idx2 = $arg4
set $ElementsFound = 0
while $i < $tree_size set $value = (void *)($node + 1) set $valueLeft = *($arg1*)$value set $valueRight = *($arg2*)($value + 4) if $valueLeft == $idx1 && $valueRight == $idx2 printf "elem[%u]->left: ", $i
p $valueLeft
printf "elem[%u]->right: ", $i
p $valueRight
set $ElementsFound++
end
if $node->_M_right != 0
set $node = $node->_M_right
while $node->_M_left != 0
set $node = $node->_M_left
end
else
set $tmp_node = $node->_M_parent
while $node == $tmp_node->_M_right
set $node = $tmp_node
set $tmp_node = $tmp_node->_M_parent
end
if $node->_M_right != $tmp_node
set $node = $tmp_node
end
end
set $i++
end
printf "Number of elements found = %u\n", $ElementsFound
end
printf "Map size = %u\n", $tree_size
end
end

document pmap
Prints std::map or std::multimap information. Works for std::multimap as well.
Syntax: pmap : Prints map size, if T defined all elements or just element(s) with val(s)
Examples:
pmap m - prints map size and definition
pmap m int int - prints all elements and map size
pmap m int int 20 - prints the element(s) with left-value = 20 (if any) and map size
pmap m int int 20 200 - prints the element(s) with left-value = 20 and right-value = 200 (if any) and map size
end



#
# std::set and std::multiset
#

define pset
if $argc == 0
help pset
else
set $tree = $arg0
set $i = 0
set $node = $tree->_M_t->_M_impl->_M_header->_M_left
set $end = $tree->_M_t->_M_impl->_M_header
set $tree_size = $tree->_M_t->_M_impl->_M_node_count
if $argc == 1
printf "Set "
whatis $tree
printf "Use pset to see the elements in the set.\n"
end
if $argc == 2
while $i < $tree_size set $value = (void *)($node + 1) printf "elem[%u]: ", $i p *($arg1*)$value if $node->_M_right != 0
set $node = $node->_M_right
while $node->_M_left != 0
set $node = $node->_M_left
end
else
set $tmp_node = $node->_M_parent
while $node == $tmp_node->_M_right
set $node = $tmp_node
set $tmp_node = $tmp_node->_M_parent
end
if $node->_M_right != $tmp_node
set $node = $tmp_node
end
end
set $i++
end
end
if $argc == 3
set $idx = $arg2
set $ElementsFound = 0
while $i < $tree_size set $value = (void *)($node + 1) if *($arg1*)$value == $idx printf "elem[%u]: ", $i p *($arg1*)$value set $ElementsFound++ end if $node->_M_right != 0
set $node = $node->_M_right
while $node->_M_left != 0
set $node = $node->_M_left
end
else
set $tmp_node = $node->_M_parent
while $node == $tmp_node->_M_right
set $node = $tmp_node
set $tmp_node = $tmp_node->_M_parent
end
if $node->_M_right != $tmp_node
set $node = $tmp_node
end
end
set $i++
end
printf "Number of elements found = %u\n", $ElementsFound
end
printf "Set size = %u\n", $tree_size
end
end

document pset
Prints std::set or std::multiset information. Works for std::multiset as well.
Syntax: pset : Prints set size, if T defined all elements or just element(s) having val
Examples:
pset s - prints set size and definition
pset s int - prints all elements and the size of s
pset s int 20 - prints the element(s) with value = 20 (if any) and the size of s
end



#
# std::dequeue
#

define pdequeue
if $argc == 0
help pdequeue
else
set $size = 0
set $start_cur = $arg0._M_impl._M_start._M_cur
set $start_last = $arg0._M_impl._M_start._M_last
set $start_stop = $start_last
while $start_cur != $start_stop
p *$start_cur
set $start_cur++
set $size++
end
set $finish_first = $arg0._M_impl._M_finish._M_first
set $finish_cur = $arg0._M_impl._M_finish._M_cur
set $finish_last = $arg0._M_impl._M_finish._M_last
if $finish_cur < $finish_last set $finish_stop = $finish_cur else set $finish_stop = $finish_last end while $finish_first != $finish_stop p *$finish_first set $finish_first++ set $size++ end printf "Dequeue size = %u\n", $size end end document pdequeue Prints std::dequeue information.
Syntax: pdequeue : Prints dequeue size, if T defined all elements
Deque elements are listed "left to right" (left-most stands for front and right-most stands for back)
Example:
pdequeue d - prints all elements and size of d
end



#
# std::stack
#

define pstack
if $argc == 0
help pstack
else
set $start_cur = $arg0.c._M_impl._M_start._M_cur
set $finish_cur = $arg0.c._M_impl._M_finish._M_cur
set $size = $finish_cur - $start_cur
set $i = $size - 1
while $i >= 0
p *($start_cur + $i)
set $i--
end
printf "Stack size = %u\n", $size
end
end

document pstack
Prints std::stack information.
Syntax: pstack : Prints all elements and size of the stack
Stack elements are listed "top to buttom" (top-most element is the first to come on pop)
Example:
pstack s - prints all elements and the size of s
end



#
# std::queue
#

define pqueue
if $argc == 0
help pqueue
else
set $start_cur = $arg0.c._M_impl._M_start._M_cur
set $finish_cur = $arg0.c._M_impl._M_finish._M_cur
set $size = $finish_cur - $start_cur
set $i = 0
while $i < $size p *($start_cur + $i) set $i++ end printf "Queue size = %u\n", $size end end document pqueue Prints std::queue information.
Syntax: pqueue : Prints all elements and the size of the queue
Queue elements are listed "top to bottom" (top-most element is the first to come on pop)
Example:
pqueue q - prints all elements and the size of q
end



#
# std::priority_queue
#

define ppqueue
if $argc == 0
help ppqueue
else
set $size = $arg0.c._M_impl._M_finish - $arg0.c._M_impl._M_start
set $capacity = $arg0.c._M_impl._M_end_of_storage - $arg0.c._M_impl._M_start
set $i = $size - 1
while $i >= 0
p *($arg0.c._M_impl._M_start + $i)
set $i--
end
printf "Priority queue size = %u\n", $size
printf "Priority queue capacity = %u\n", $capacity
end
end

document ppqueue
Prints std::priority_queue information.
Syntax: ppqueue : Prints all elements, size and capacity of the priority_queue
Priority_queue elements are listed "top to buttom" (top-most element is the first to come on pop)
Example:
ppqueue pq - prints all elements, size and capacity of pq
end



#
# std::bitset
#

define pbitset
if $argc == 0
help pbitset
else
p /t $arg0._M_w
end
end

document pbitset
Prints std::bitset information.
Syntax: pbitset : Prints all bits in bitset
Example:
pbitset b - prints all bits in b
end



#
# std::string
#

define pstring
if $argc == 0
help pstring
else
printf "String \t\t\t= \"%s\"\n", $arg0._M_data()
printf "String size/length \t= %u\n", $arg0._M_rep()->_M_length
printf "String capacity \t= %u\n", $arg0._M_rep()->_M_capacity
printf "String ref-count \t= %d\n", $arg0._M_rep()->_M_refcount
end
end

document pstring
Prints std::string information.
Syntax: pstring
Example:
pstring s - Prints content, size/length, capacity and ref-count of string s
end



#
# std::wstring
#

define pwstring
if $argc == 0
help pwstring
else
call printf("WString \t\t= \"%ls\"\n", $arg0._M_data())
printf "WString size/length \t= %u\n", $arg0._M_rep()->_M_length
printf "WString capacity \t= %u\n", $arg0._M_rep()->_M_capacity
printf "WString ref-count \t= %d\n", $arg0._M_rep()->_M_refcount
end
end

document pwstring
Prints std::wstring information.
Syntax: pwstring
Example:
pwstring s - Prints content, size/length, capacity and ref-count of wstring s
end



#
# C++ related beautifiers
#

set print pretty on
set print object on
set print static-members on
set print vtbl on
set print demangle on
set demangle-style gnu-v3
set print sevenbit-strings off

вівторок, 23 листопада 2010 р.

Here is "False sharing" example, you can try it in VS with enabled OpenMP flag. Also if you enable speed optimization for compiler single thread version will be faster, but you can see false sharing effect too.

#include <iostream>
#include <ctime>
#include <omp.h>

using std::cout;
using std::endl;

const double M=8e8;

//single thread work to do
void one_thread() 
{
    long int n1=1, n2=1;
    for(; n1<M; n1+=n1%3);
    for(; n2<M; n2+=n2%3);
    cout << "Sum = " << n1 + n2 << endl;
}


//multi thread work to do
void multi_thread() 
{
    struct T
    {
        long int n1;
        char chuck[64];//should be commented to see false sharing effect
        long int n2;
    } t;
    t.n1 = 1;
    t.n2 = 1;
    #pragma omp parallel num_threads(2) shared(M,t)
    {
        #pragma omp sections
        {
            #pragma omp section
            for(; t.n1<M; t.n1+=t.n1%3);

            #pragma omp section
            for(; t.n2<M; t.n2+=t.n2%3);
        }
    }
    cout << "Sum = " << t.n1 + t.n2 << endl;
}

time_t start, end;
double diff;

int main(int argc, char* argv[]) 
{
    time(&start);
    multi_thread();
    time(&end);
    diff=difftime(end, start);
    cout<<diff<<" seconds elapsed for multi thread calculation."<<endl;

    time(&start);
    one_thread();
    time(&end);
    diff=difftime(end, start);
    cout<<diff<<" seconds elapsed for 1 thread calculation."<<endl;
    return 0;
}

четвер, 12 серпня 2010 р.

Again about rvalues

You can use rvalues for creating move semantics - remove copy constructors for temporaries. Example:

#include <iostream>

class A
{
public:
    A()
    {
        x = new int[1];
        x[0] = 1;
        std::cout << "new\n";
    }
    ~A()
    {
        delete[] x;
    }

    A(const A& a)
    {
        x = new int[1];
        x[0] = a.x[0];
        std::cout << "new\n";
    }

   /* A(A&& a) //move constructor
    {
        x = a.x;
        a.x = 0;
    }*/
private:
    int *x;
};

A GetA()
{
    A a;
    return a;
}

//if you want to use movable object as member in another object you should declare move constructor
//and use std::move
class B
{
public:
    B(){}
    B(B&& b) //move constructor
    {
        a = std::move(b.a);
    }
    A a;
};

B GetB()
{
    B b;
    return b;
}

int main(int, char*[])
{
    B b = GetB();
    {
        A a = GetA();
        A a2(a);
        std::cout << "scope\n";
    }
    return 0;
}

понеділок, 7 червня 2010 р.

New web page

I have updated my personal page. You can find updated demo section there.

четвер, 3 червня 2010 р.

C++0x multi-threading

If you want to play with new functionality for multi-threading in C++ for free you need gcc-4.5 and Linux, MinGW currently doesn't support this functionality (but it has all required headers). To compile programm you need to specify complier flag -std=c++0x and linker flag -pthread. VC++ currently have not this functionality too. Here is small example:

#include <thread>
#include <mutex>
#include <iostream>
#include <unistd.h>

class MyJob
{
public:
    MyJob(std::mutex* m) : m(m){}
    void operator()()
    {
        for (int i = 0; i < 5; ++i)
        {
            {
                std::lock_guard<std::mutex> lock(*m);
                std::thread::id id = std::this_thread::get_id();
                std::cout << "Thread " << id << std::endl;
            }
            sleep(1);
        }
    }
private:
    std::mutex* m;
};

int main()
{
    std::mutex m;
    std::thread t1{MyJob(&m)};
    std::thread t2{MyJob(&m)};

    t1.join();
    t2.join();

    return 0;
}