"""	
Lisandro Dalcin:

Below is a scalable, point-to-point based implementation of barrier()
with the sleep() trick you need. A standard implementation would just
merge send() and recv() into a single sendrecv() call. You may just need
to tweak the sleep interval, and perhaps use a different tag value to
avoid interfering with previous on-going communication.
"""

# Public API of this module: only barrier() is exported by a star-import;
# _test() is a private demo helper.
__all__ = ['barrier']

import time

from numpy.linalg import svd
from numpy.random import randn

from mpi4py import MPI

def barrier(comm, tag=0, sleep=0.01):
    """
    Block until every process in ``comm`` has entered the barrier, sleeping
    between polls instead of busy-waiting.

    The barrier is built from point-to-point messages. In each round a
    process sends an empty message to ``(rank + mask) % size`` and waits
    for one from ``(rank - mask) % size``; ``mask`` starts at 1 and doubles
    every round until it reaches the communicator size.

    Parameters
    ----------
    comm
        Communicator object providing ``Get_size``, ``Get_rank``, ``isend``,
        ``Iprobe`` and ``recv`` (e.g. an mpi4py communicator).
    tag
        Message tag used for the barrier traffic. Pick a value that does
        not collide with other messages in flight on ``comm``.
    sleep
        Seconds to sleep between two successive polls.

    Returns
    -------
    None
    """
    size = comm.Get_size()
    if size == 1:
        # Nothing to synchronize with.
        return
    rank = comm.Get_rank()
    mask = 1
    while mask < size:
        dst = (rank + mask) % size
        # "+ size" keeps the left operand non-negative before the modulo.
        src = (rank - mask + size) % size
        req = comm.isend(None, dst, tag)
        # Poll for the incoming message, yielding the CPU between probes.
        while not comm.Iprobe(src, tag):
            time.sleep(sleep)
        comm.recv(None, src, tag)
        # Complete the send by polling as well: a blocking Wait() could
        # spin inside MPI if the send has not completed yet, which is
        # exactly what this barrier is meant to avoid.
        while not req.Test():
            time.sleep(sleep)
        mask <<= 1

def _test():
    """
    Small demo comparing barrier(comm) with comm.barrier().

    Rank 0 times an SVD of a 1000x1000 random matrix twice: first while
    the other ranks wait in barrier(comm), then while they wait in
    comm.barrier(). The intent is to show that the SVD on rank 0 runs
    faster when the idle ranks sleep-poll instead of waiting inside
    comm.barrier(). Both timings are printed by rank 0 only.
    """

    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()

    # Parenthesised single-argument print works on both Python 2 and 3.
    if rank == 0:
        print("Using barrier(comm):")
        A = randn(1000, 1000)
        t_svd = time.time()
        svd(A)
        dt_svd = time.time() - t_svd
        print('svd time: %f' % (dt_svd,))

    # Every rank other than 0 arrives here immediately and waits for rank 0.
    barrier(comm)

    if rank == 0:
        print("Using comm.barrier():")
        t_svd = time.time()
        svd(A)
        dt_svd = time.time() - t_svd
        print('svd time: %f' % (dt_svd,))

    comm.barrier()

# Run the timing demo only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    _test()
