subroutine kfilt_new(A,z,H,ns,no,varmes,nrsamp)

c***  A is the matrix of ensemble forecasts, z is the observation,
c***  H is the observation operator, ns is the size of the model
c***  state, no is the number of observation locations, varmes is
c***  the observational uncertainty, and nrsamp is the ensemble size.

        include 'mpif.h'
#include "parallel.h"
	
	integer   ns, no, nrsamp, MASTER, FROM_MASTER, FROM_WORKER
        parameter (MASTER = 0)
        parameter (FROM_MASTER = 1)
        parameter (FROM_WORKER = 2)

        integer   taskid, ierr, numtasks, numworkers, snumworkers
	integer   source, dest, mtype, num, avenum, extra, offset
	integer   ijim
        integer   status(MPI_STATUS_SIZE)

	real, intent(inout)         :: A(ns,nrsamp)
	real, intent(in)            :: z(no)

   	integer i,j,m,mm,info,idum,ibawk
        real    rcond
   	logical zlz,svd,lud
	real    H(no,ns)
	real    vave(ns), vvar(ns), varmes(no)
	real    temp

   	real, allocatable    :: K(:,:)
   	real, allocatable    :: RR(:,:)
	real, allocatable    :: RRR(:,:)
   	real, allocatable    :: w(:)
   	real, allocatable    :: tmp(:,:)
   	real, allocatable    :: d(:)
	real, allocatable    :: pd(:,:)
	real, allocatable    :: transA(:,:)
	real, allocatable    :: transtmp(:,:)
	real, allocatable    :: tmp2(:,:)

   	zlz=.true.
   	svd=.false.
   	lud=.false.

   	allocate (K(no,ns), RR(no,no), RRR(no,no))
	allocate (transA(nrsamp,ns),transtmp(nrsamp,no))
   	allocate (w(ns), tmp(no,nrsamp))
   	allocate (d(no), pd(1,ns), tmp2(1,ns))

        call MPI_COMM_RANK(MPI_COMM_WORLD,taskid,ierr)
        call MPI_COMM_SIZE(MPI_COMM_WORLD,numtasks,ierr)

        numworkers = numtasks-1
        temp=sqrt(float(numworkers))
        snumworkers=int(temp)

        if(taskid.eq.MASTER)then

c***  initialising the observational error covariance matrix and
c***  the projector from model space to observational space.
   	 RR=0.0

c***  define obs error covariance matrix as a diagonal matrix
   	 do m=1,no
	  RR(m,m)=varmes(m)
   	 enddo

c***  calculate ensemble mean
   	 call ranmean2(A,w,ns,nrsamp)

c***  remove ensemble mean from the ensemble members
   	 do i=1,nrsamp
   	  A(:,i)=A(:,i)-w(:)
   	 enddo

        endif                          ! MASTER if

c***  tmp takes on the product of the observation projector and 
c***  the ensemble anomalies.
#ifdef pmatmul
	if(taskid.eq.MASTER) write(6,*) 'first pmm'
	call pmm(H,A,tmp,no,ns,nrsamp)
#else
        if(taskid.eq.MASTER)then
	 write(6,*) 'first matmul'
   	 tmp=matmul(H,A)
	 write(6,*) 'completed first matmul'
        endif                          ! MASTER if
#endif

c***  K takes on the value of H(A-ABAR)(A-ABAR)^T
#ifdef pmatmul
        if(taskid.eq.MASTER)then
	 transA=transpose(A)
        endif                          ! MASTER if
	
	if(taskid.eq.MASTER) write(6,*) 'second pmm'
        call pmm(tmp,transA,K,no,nrsamp,ns)
#else
        if(taskid.eq.MASTER)then
	 write(6,*) 'second matmul'
	 K=matmul(tmp,transpose(A))
	 write(6,*) 'completed second matmul'
	endif                          ! MASTER if
#endif

        if(taskid.eq.MASTER)then

c***  K is what is called B in eqn (6)
   	 K=(1.0/(float(nrsamp)-1.0))*K

c***  restore A by adding back ensemble mean value.
   	 do i=1,nrsamp
   	  A(:,i)=A(:,i)+w(:)
   	 enddo  

c***  RR is obs error cov matrix, and the forecast error cov is added to
c***  it, making it the first half of the right hand side of equation (8).
c***  if one was to wipe out spurious long distance correlations, it would
c***  happen here.
#ifdef pmatmul
	 transtmp=transpose(tmp)

        endif                          ! MASTER if

	if(taskid.eq.MASTER) write(6,*) 'third pmm'
        call pmm(tmp,transtmp,RRR,no,nrsamp,no)

        if(taskid.eq.MASTER)then
         RR=RR+(1./(float(nrsamp)-1.))*RRR
	endif                          ! MASTER if
#else
	write(6,*) 'third matmul'
   	RR=RR+(1./(float(nrsamp)-1.))*matmul(tmp,transpose(tmp))
	write(6,*) 'completed third matmul'        
        endif                          ! MASTER if
#endif

        if(taskid.eq.MASTER)then

c***  standard solver for coefficients beta
   	 call spoco(RR,no,no,rcond,w,info)


#ifdef pcorrect

	endif           ! MASTER if

	if(taskid.eq.MASTER) then

	write(6,*) 'updating ensemble members'

c***  figure out how many ens members go to each processor and look for extras
         avenum = nrsamp/numworkers
         extra = mod(nrsamp, numworkers)
         offset = 1
         mtype = FROM_MASTER

c***  loop over the worker nodes
         do dest=1, numworkers

c***  pad in the extra columns until they've all been accounted for
          if (dest .le. extra) then
           num = avenum + 1
          else
           num = avenum
          endif

c***  send worker nodes the offset, number of cols, a and b
	  call MPI_SEND(offset, 1, MPI_INTEGER, dest, mtype, 
     >                  MPI_COMM_WORLD, ierr)
          call MPI_SEND(num, 1, MPI_INTEGER, dest, mtype, 
     >                  MPI_COMM_WORLD, ierr)
          call MPI_SEND(z, no, MPI_REAL, dest, mtype, 
     >                  MPI_COMM_WORLD, ierr)
          call MPI_SEND(varmes, no, MPI_REAL, dest, 
     >                  mtype, MPI_COMM_WORLD, ierr)
          call MPI_SEND(RR, no*no, MPI_REAL, dest, 
     >                  mtype, MPI_COMM_WORLD, ierr)
	  call MPI_SEND(H, no*ns, MPI_REAL, dest, 
     >                  mtype, MPI_COMM_WORLD, ierr)
	  call MPI_SEND(K, no*ns, MPI_REAL, dest, 
     >                  mtype, MPI_COMM_WORLD, ierr)
          call MPI_SEND(A(1,offset), num*ns, MPI_REAL,
     >                  dest, mtype, MPI_COMM_WORLD, ierr)

c***  increment the offset
          offset = offset + num

	 enddo

c***  receive results from worker tasks
         mtype = FROM_WORKER
         do i=1, numworkers
          source = i
          call MPI_RECV(offset, 1, MPI_INTEGER, source,
     >                  mtype, MPI_COMM_WORLD, status, ierr)
          call MPI_RECV(num, 1, MPI_INTEGER, source,
     >                  mtype, MPI_COMM_WORLD, status, ierr)
          call MPI_RECV(A(1,offset), num*ns, MPI_REAL, source, 
     >                  mtype, MPI_COMM_WORLD, status, ierr)
         enddo

        endif              ! end of MASTER bits

	if(taskid.gt.MASTER)then

c***  receive matrix data from master task
         mtype = FROM_MASTER
         call MPI_RECV(offset, 1, MPI_INTEGER, MASTER,
     >                 mtype, MPI_COMM_WORLD, status, ierr)
         call MPI_RECV(num, 1, MPI_INTEGER, MASTER,
     >                 mtype, MPI_COMM_WORLD, status, ierr)
	 call MPI_RECV(z, no, MPI_REAL, MASTER, mtype,
     >                 MPI_COMM_WORLD, status, ierr)
	 call MPI_RECV(varmes, no, MPI_REAL, MASTER, mtype,
     >                 MPI_COMM_WORLD, status, ierr)
	 call MPI_RECV(RR, no*no, MPI_REAL, MASTER, mtype,
     >                 MPI_COMM_WORLD, status, ierr)
	 call MPI_RECV(H, no*ns, MPI_REAL, MASTER, mtype,
     >                 MPI_COMM_WORLD, status, ierr)
	 call MPI_RECV(K, no*ns, MPI_REAL, MASTER, mtype,
     >                 MPI_COMM_WORLD, status, ierr)
	 call MPI_RECV(A, num*ns, MPI_REAL, MASTER, mtype,
     >                 MPI_COMM_WORLD, status, ierr)


         do j=1,num
	  call random2(d,no)
	  d=z+sqrt(varmes)*d-matmul(H,A(:,j))
          call sposl(RR,no,no,d)
	  A(:,j)=A(:,j)+matmul(d,K)
         enddo

c***  send results back to master task
         mtype = FROM_WORKER
         call MPI_SEND(offset, 1, MPI_INTEGER, MASTER, mtype, 
     >                 MPI_COMM_WORLD, ierr)
         call MPI_SEND(num, 1, MPI_INTEGER, MASTER, mtype, 
     >                 MPI_COMM_WORLD, ierr)
         call MPI_SEND(A, num*ns, MPI_REAL, MASTER, mtype,
     >                 MPI_COMM_WORLD, ierr)

        endif                   ! end of SLAVE bits

#else
	write(6,*) 'in correction bit'
c***  correct each member of the ensemble
   	 do j=1,nrsamp

   	  call random2(d,no)
          d=z+sqrt(varmes)*d-matmul(H,A(:,j))
   	  call sposl(RR,no,no,d)
   	  A(:,j)=A(:,j)+matmul(d,K) 

   	 enddo  		!A has now been corrected by the data.

	write(6,*) 'end of correction bit'
	
	endif               ! MASTER if

#endif

   	deallocate (K, RR, RRR)
	deallocate (transA, transtmp)
   	deallocate (w, tmp)
   	deallocate (d,pd,tmp2)

   	return
   	end