Exporting parallelized functions within DLL using C++

 

I have a function within a DLL that uses OpenMP to parallelize a few loops. The functions have been tested/exported by calling them from an app built in C++, and everything works nicely (method used for linking). I then export the functions to MQL4 (method used for linking), where the code also works but is substantially slower. I am using Visual Studio 2015 to create the DLL. I strongly suspect that parallelization is not working when the functions are called from MQL4. Is it something to do with MQL4 or with the DLL itself?

Loops that contribute to the overall slowdown:

            double dtime;
            dtime = omp_get_wtime();

            ofstream fopen("C:\\output.txt", 'a');
            ofstream fout("C:\\output.txt", 'a');

            dtime = omp_get_wtime();


#pragma omp parallel for num_threads(num)
            // First rotation sweep: one iteration per entry I1[1..r1].
            // Each iteration pairs index i = I1[p] with j = i + l, accumulates the three
            // inner products alpha = <U_t[i],U_t[i]>, beta = <U_t[j],U_t[j]> and
            // gamma = <U_t[i],U_t[j]> over N entries, records the ratio
            // |gamma| / sqrt(alpha*beta) in the slot of C that belongs to the executing
            // thread, and then rotates U_t[i]/U_t[j] and V_t[i]/V_t[j] in place by (c, s).
            // NOTE(review): U_t and V_t are updated in place by every iteration, so this is
            // only race-free if no two values of p touch the same i or j -- confirm against
            // the code that fills I1.
            for (int p = 1; p <= r1; p++) {
                // Thread id, used below as the index of this thread's slot in C.
                int k = omp_get_thread_num();
                int i = I1[p], j = i + l;
                // Declared inside the loop body, so every iteration has its own copies.
                double alpha = 0, beta = 0, gamma = 0;
                double zeta, t, c, s;
                // This inner k shadows the thread id declared above for the duration of the loop.
                for (int k = 0; k < N; k++) {
                    alpha = alpha + (U_t[i][k] * U_t[i][k]);
                    beta = beta + (U_t[j][k] * U_t[j][k]);
                    gamma = gamma + (U_t[i][k] * U_t[j][k]);
                }
                // Here k is the thread id again: keep the largest ratio seen by this thread.
                // NOTE(review): alpha*beta == 0 makes this a division by zero -- confirm the
                // inputs rule that out.
                C[k] = max(C[k], abs(gamma) / sqrt(alpha*beta));
                //converge = max(converge, abs(gamma)/sqrt(alpha*beta));    //compute convergence
                //basically this is the angle
                //between column i and j


                // NOTE(review): gamma == 0 divides by zero here -- confirm how that case is
                // meant to be handled.
                zeta = (beta - alpha) / (2.0 * gamma);
                t = sgn(zeta) / (abs(zeta) + sqrt(1.0 + (zeta*zeta)));        //compute tan of angle
                c = 1.0 / (sqrt(1.0 + (t*t)));              //extract cos
                s = c*t;                            //extract sin


                // Apply the rotation entry by entry. t is reused as a temporary holding the
                // old [i][k] value, because [i][k] is overwritten before [j][k] is computed.
                for (int k = 0; k<N; k++) {

                    t = U_t[i][k];

                    U_t[i][k] = c*t - s*U_t[j][k];
                    // True when the value is neither negative nor positive, i.e. it is zero
                    // or NaN (both comparisons are false for NaN); such entries become 0.
                    if (!(U_t[i][k] < 0 || U_t[i][k] > 0)) {
                        U_t[i][k] = 0;
                    }

                    U_t[j][k] = s*t + c*U_t[j][k];
                    if (!(U_t[j][k] < 0 || U_t[j][k] > 0)) {
                        U_t[j][k] = 0;
                    }


                    // Same rotation, applied to V_t.
                    t = V_t[i][k];

                    V_t[i][k] = c*t - s*V_t[j][k];
                    if (!(V_t[i][k] < 0 || V_t[i][k] > 0)) {
                        V_t[i][k] = 0;
                    }

                    V_t[j][k] = s*t + c*V_t[j][k];
                    if (!(V_t[j][k] < 0 || V_t[j][k] > 0)) {
                        V_t[j][k] = 0;
                    }

                }
            }
            // endl writes a newline and flushes the stream.
            fout << endl;
#pragma omp parallel for num_threads(num)
            // Second rotation sweep: identical to the sweep over I1 except that the index
            // list is I2 and the iteration count is r2.
            // Each iteration pairs i = I2[p] with j = i + l, accumulates alpha, beta and
            // gamma over N entries of U_t[i] and U_t[j], stores the ratio
            // |gamma| / sqrt(alpha*beta) in the executing thread's slot of C, and rotates
            // U_t[i]/U_t[j] and V_t[i]/V_t[j] in place by (c, s).
            // NOTE(review): race-free only if no two values of p share an i or j -- confirm
            // against the code that fills I2.
            for (int p = 1; p <= r2; p++) {
                // Thread id, used below as the index of this thread's slot in C.
                int k = omp_get_thread_num();
                int i = I2[p], j = i + l;
                // Declared inside the loop body, so every iteration has its own copies.
                double alpha = 0, beta = 0, gamma = 0;
                double zeta, t, c, s;
                // This inner k shadows the thread id declared above for the duration of the loop.
                for (int k = 0; k < N; k++) {
                    alpha = alpha + (U_t[i][k] * U_t[i][k]);
                    beta = beta + (U_t[j][k] * U_t[j][k]);
                    gamma = gamma + (U_t[i][k] * U_t[j][k]);
                }
                // Here k is the thread id again: keep the largest ratio seen by this thread.
                // NOTE(review): alpha*beta == 0 makes this a division by zero -- confirm the
                // inputs rule that out.
                C[k] = max(C[k], abs(gamma) / sqrt(alpha*beta));
                //converge = max(converge, abs(gamma)/sqrt(alpha*beta));    //compute convergence
                //basically this is the angle
                //between column i and j

                // NOTE(review): gamma == 0 divides by zero here -- confirm how that case is
                // meant to be handled.
                zeta = (beta - alpha) / (2.0 * gamma);
                t = sgn(zeta) / (abs(zeta) + sqrt(1.0 + (zeta*zeta)));        //compute tan of angle
                c = 1.0 / (sqrt(1.0 + (t*t)));              //extract cos
                s = c*t;                            //extract sin


                // Apply the rotation entry by entry. t is reused as a temporary holding the
                // old [i][k] value, because [i][k] is overwritten before [j][k] is computed.
                for (int k = 0; k<N; k++) {


                    t = U_t[i][k];

                    U_t[i][k] = c*t - s*U_t[j][k];
                    // True when the value is neither negative nor positive, i.e. it is zero
                    // or NaN (both comparisons are false for NaN); such entries become 0.
                    if (!(U_t[i][k] < 0 || U_t[i][k] > 0)) {
                        U_t[i][k] = 0;
                    }

                    U_t[j][k] = s*t + c*U_t[j][k];
                    if (!(U_t[j][k] < 0 || U_t[j][k] > 0)) {
                        U_t[j][k] = 0;
                    }



                    // Same rotation, applied to V_t.
                    t = V_t[i][k];

                    V_t[i][k] = c*t - s*V_t[j][k];
                    if (!(V_t[i][k] < 0 || V_t[i][k] > 0)) {
                        V_t[i][k] = 0;
                    }

                    V_t[j][k] = s*t + c*V_t[j][k];
                    if (!(V_t[j][k] < 0 || V_t[j][k] > 0)) {
                        V_t[j][k] = 0;
                    }


                }

            }
            // Epilogue: fold the per-thread maxima in C into `converge`, and, when l == M,
            // write the convergence value and the elapsed time to the trace file.
            // Every `fout << endl` writes a newline AND flushes the stream. The flushes that
            // come before the `dtime` subtraction below are part of the measured interval.
            fout << endl;
            // Serial reduction over the first `num` slots of C.
            for (int k = 0; k < num; k++)
                converge = max(converge, C[k]);
            fout << endl;
            if (l == M) {
                fout << converge << '\t';

                fout << endl;
                // Elapsed wall-clock time since dtime was last set.
                dtime = omp_get_wtime() - dtime;
                fout << "\n" << "dtime: " << dtime << " ";
                // Restart the stopwatch.
                dtime = omp_get_wtime();
                fout << endl;

            }
            fout << endl; 
            fout.close();


These are outputs for convergence and time taken:

C++ app:

0.999996    dtime: 7.91817e-05
0.954982    dtime: 8.01593e-05
0.964351    dtime: 0.000116817
0.934475    dtime: 7.86929e-05
0.781737    dtime: 7.77154e-05
0.812496    dtime: 7.96705e-05
0.500925    dtime: 7.77154e-05
0.174739    dtime: 7.77154e-05
0.0407444   dtime: 7.86929e-05 
0.0137945   dtime: 8.01593e-05 
0.0039458   dtime: 0.000136857 
0.000550945 dtime: 7.86929e-05 
0.000149865 dtime: 7.96705e-05 
3.76775e-05 dtime: 7.96705e-05 
6.86001e-06 dtime: 8.0648e-05 
2.04005e-06 dtime: 7.82042e-05 
5.6817e-07  dtime: 8.84685e-05 
2.70614e-07 dtime: 7.96705e-05
5.78656e-08 dtime: 7.86929e-05
1.90527e-08 dtime: 8.01593e-05
1.00316e-09 dtime: 7.96705e-05


From MetaTrader4:

0.999997    dtime: 0.222026
0.917038    dtime: 0.219041
0.982879    dtime: 0.215614
0.723091    dtime: 0.219034
0.295653    dtime: 0.215915
0.097825    dtime: 0.21803
0.0350881   dtime: 0.21804
0.00654856  dtime: 0.219009
0.00188476  dtime: 0.217366
0.000435981 dtime: 0.223172
9.50818e-05 dtime: 0.21804
2.27348e-05 dtime: 0.260625
1.39124e-05 dtime: 0.219027
1.72161e-06 dtime: 0.218035
3.18178e-07 dtime: 0.218927
1.77708e-07 dtime: 0.218026
3.81575e-08 dtime: 0.204294
9.53867e-09 dtime: 0.221036


MQL4 function declarations:

#property copyright "Adrijus"
#property version   "1.00"
#property strict
#import  "LMBRDLL.dll"

// Prototypes of the two functions imported from LMBRDLL.dll.
// Array parameters are passed by reference (&name[]); the scalar parameters are passed by value.
// NOTE(review): the DLL definition of getWeights returns `double *`, while this prototype
// declares a `double` return value -- confirm that the two sides agree on the return type
// (the call site ignores the returned value and reads the result from weights[]).
double getWeights(double &data[], int &topology[], int topSize, double &TV[], double validationDifference, int vSize, int timeSteps, int nVabs, double &weights[]);
// Returns a double; the call site comments it as the predicted percentage change.
double testWeights(double &weights[], double &currentData[], int &topology[], int topSize, int timeSteps, int nVabs);

#import
#include <stdlib.mqh>


MQL4 function calls:

getWeights(data, topology, topSize, TV, validationDifference, vSize, timeSteps, nVabs, weights);
double output = testWeights(weights, currentData, topology, topSize, timeSteps, nVabs);    //DLL function returns predicted percentage change


Definitions in the DLL:

#include "stdafx.h"

    double *getWeights(double const *idata, int const *aTopology, int topSize, double const *aTV, double validationDifference, int vSize, int tSteps, int nVabs, double *T)
{
    vector<unsigned> topology(topSize);
    for (i = 0; i < topSize; i++) {
        topology[i] = aTopology[i];
    }

    vector<double> TV(2);
    TV[0] = aTV[0];
    TV[1] = aTV[1];

    Matrix inputVals = buildInputs(idata, vSize, tSteps, nVabs);
    inputVals = sortInputs(inputVals, TV);
    sortTargets(TV);
    Matrix targetVals = getTargets();
    Net myNet(topology, inputVals);

    double currentValidationError = 1000000000777;
    double previousValidationError = 1000000000000000;
    double difference = 1;

    while (currentValidationError < previousValidationError) {

        difference = previousValidationError - currentValidationError;

        if (difference < validationDifference)
            break;

        FeedForward(&topology, &myNet);

        Backpropagation(&topology, &myNet, &targetVals);

        BuildJacobian(&topology, &myNet);

        LevenberMarquardtBeyesianRegularization(&topology, &myNet, &targetVals);

        previousValidationError = currentValidationError;

        currentValidationError = Validation(&myNet.allLMweights, topology);

        //UpdateSynapses(topology, myNet);
    }

    vector<double> rowWeights = buildRWeights(myNet.allSynapses);

    for (i = 0; i < rowWeights.size(); i++) {
        T[i] = rowWeights[i];
    }

    return T;
}

double testWeights(double const *rowWeights, double const *testData, int const *aTopology, int topSize, int timeSteps, int nVariables) {
    vector<unsigned> topology(topSize);
    for (i = 0; i < topSize; i++) {
        topology[i] = aTopology[i];
    }

    vector<Matrix> testWeights = buildWeightMatrices(rowWeights, topology);

    vector<double> input = buildTestInputs(testData, timeSteps, nVariables);

    double output = getOutput(testWeights, input);

    return output;
}


LMBRDLL.def for exporting to MetaTrader platform:

LIBRARY "LMBRDLL"
EXPORTS
getWeights
testWeights


HeaderDLL.h for exporting to C++ app:

#include "stdafx.h"
using namespace std;

    // Declarations of the two functions exported from the DLL with __declspec(dllexport).
    // getWeights receives an output buffer T and returns a double pointer.
    __declspec(dllexport) double *getWeights(double const *idata, int const *aTopology, int topSize, double const *aTV, double validationDifference, int vSize, int tSteps, int nVabs, double *T);
    // testWeights returns a single double.
    __declspec(dllexport) double testWeights(double const *rowWeights, double const *testData, int const *aTopology, int topSize, int timeSteps, int nVariables);
Walkthrough: Creating and Using a Dynamic Link Library (C++)
Walkthrough: Creating and Using a Dynamic Link Library (C++)
  • msdn.microsoft.com
This step-by-step walkthrough shows how to create a dynamic link library (DLL) for use with a C++ app. Using a library is a great way to reuse code. Rather than re-implementing the same routines in every program that you create, you write them one time and then reference them from apps that require the functionality. By putting code in the DLL...