libsvm源码凝视+算法描写叙述:svm_train

(I will try my best to make this note clearer. We mainly focus on solve_c_svc in this note)

We mainly focus on solve_c_svc in this note.

Our goal:

mindB 12dTB∇2f(αk)dB+∇f(αk)TBdB

s.t. yTBdB=0

Core function of training: solve_c_svc.

It will call the function:

s.Solve(l, SVC_Q(*prob,*param,y), minus_ones, y, alpha, Cp, Cn, param->eps, si, param->shrinking);

Algotirhm 2 of WSS3 (An SMO-type decomposition method using WSS3)

After the initialization (trivial, we will discuss later), the first step is to select the working set B={i,j}. The chosen i and j most violate the optimally condition. Thus it is a natural choice . We called it “maximal violating pair”.

The pair satisfies in WSS1:

i∈argmaxt{−yt∇f(αk)t|t∈Iup(αk)}

j∈argmint{−yt∇f(αk)t|t∈Ilow(αk)}

where

Iup(α)≡{t|αt<C,yt=1orαt>0,yt=−1}, and

Ilow(α)≡{t|αt<C,yt=−1orαt>0,yt=1}

while in WSS3 j is different:

j∈argmint{−b2ita¯it|t∈Ilow(αk),−yt∇f(αk)t<−yi∇f(αk)i}

Detailed proof of it is shown in Fan(2005) Theorem 3.

if(select_working_set(i,j)!=0)

{

    // reconstruct the whole gradient

    reconstruct_gradient();

    // reset active set size and check

    active_size = l;

    info("*");

    if(select_working_set(i,j)!=0)  //select_working_set: 1 optimal

        break;

    else

        counter = 1;    // do shrinking next iteration

}

Working set selection subroutine (WSS3):

Note that we only discuss the case of yj=1, while the other case is similar. Also note that there is no yj because it is 1 already which is reflected by the sign.

Note that:

i∈argmaxt{−yt∇f(αk)t|t∈Iup(αk)}

where Iup(α)≡{t|αt<C,yt=1orαt>0,yt=−1}

We wanna find a t satisfying its −yt∇f(αk)t is maximum.

Select i:

// return i,j such that

// i: maximizes -y_i * grad(f)_i, i in I_up(\alpha)

// j: minimizes the decrease of obj value

//    (if quadratic coefficeint <= 0, replace it with tau)

//    -y_j*grad(f)_j < -y_i*grad(f)_i, j in I_low(\alpha)

for(int t=0;t<active_size;t++)

    if(y[t]==+1)

    {

        if(!is_upper_bound(t))

            if(-G[t] >= Gmax)

            {

                Gmax = -G[t];

                Gmax_idx = t;

            }

    }

    else

    {

        if(!is_lower_bound(t))

            if(G[t] >= Gmax)

            {

                Gmax = G[t];

                Gmax_idx = t;

            }

    }

Select j:

quad_coef is aij. grad_diff is bij.

if (!is_lower_bound(j))

{

    double grad_diff=Gmax+G[j]; //y[t]==1

    if (G[j] >= Gmax2)

        Gmax2 = G[j];

    if (grad_diff > 0)

    {

        double obj_diff;

        double quad_coef = QD[i]+QD[j]-2.0*y[i]*Q_i[j];

        //HERE y[j]==1 and quad_coef is "a" in (Fan2005) Q_ii+Q_tt-yiyjQ_it

        if (quad_coef > 0)

            obj_diff = -(grad_diff*grad_diff)/quad_coef;

        else

            obj_diff = -(grad_diff*grad_diff)/TAU;

            if (obj_diff <= obj_diff_min)

        {

            Gmin_idx=j;

            obj_diff_min = obj_diff;

        }

    }

}

After choosing the i and j for the set B={i,j}, we update the αi and αj:

const Qfloat *Q_i = Q.get_Q(i,active_size);

const Qfloat *Q_j = Q.get_Q(j,active_size);

double C_i = get_C(i);

double C_j = get_C(j);

double old_alpha_i = alpha[i];

double old_alpha_j = alpha[j];

if(y[i]!=y[j])

{

    double quad_coef = QD[i]+QD[j]+2*Q_i[j];

    if (quad_coef <= 0)

        quad_coef = TAU;

    double delta = (-G[i]-G[j])/quad_coef;

    double diff = alpha[i] - alpha[j];

    alpha[i] += delta;

    alpha[j] += delta;

    if(diff > 0)

    {

        if(alpha[j] < 0)

        {

            alpha[j] = 0;

            alpha[i] = diff;

        }

    }

    else

    {

        if(alpha[i] < 0)

        {

            alpha[i] = 0;

            alpha[j] = -diff;

        }

    }

    if(diff > C_i - C_j)

    {

        if(alpha[i] > C_i)

        {

            alpha[i] = C_i;

            alpha[j] = C_i - diff;

        }

    }

    else

    {

        if(alpha[j] > C_j)

        {

            alpha[j] = C_j;

            alpha[i] = C_j + diff;

        }

    }

}

else

{

    double quad_coef = QD[i]+QD[j]-2*Q_i[j];

    if (quad_coef <= 0)

        quad_coef = TAU;

    double delta = (G[i]-G[j])/quad_coef;

    double sum = alpha[i] + alpha[j];

    alpha[i] -= delta;

    alpha[j] += delta;

    if(sum > C_i)

    {

        if(alpha[i] > C_i)

        {

            alpha[i] = C_i;

            alpha[j] = sum - C_i;

        }

    }

    else

    {

        if(alpha[j] < 0)

        {

            alpha[j] = 0;

            alpha[i] = sum;

        }

    }

    if(sum > C_j)

    {

        if(alpha[j] > C_j)

        {

            alpha[j] = C_j;

            alpha[i] = sum - C_j;

        }

    }

    else

    {

        if(alpha[i] < 0)

        {

            alpha[i] = 0;

            alpha[j] = sum;

        }

    }

}

Then updating the gradient G[i]: ∇f(αk+1)i:

double delta_alpha_i = alpha[i] - old_alpha_i;

double delta_alpha_j = alpha[j] - old_alpha_j;

for(int k=0;k<active_size;k++)

{

    G[k] += Q_i[k]*delta_alpha_i + Q_j[k]*delta_alpha_j;

}

QBNαkN+pB=∇Bf(αk)−QBBαkB

∇f(αk+1)=∇f(αk)+Q:,B(αk+1B−αkB)

where Q:,B is the sub-matrix of Q including columns in B. After the sub-problem is solved, ∇f(αk+1) is acquired. Therefore, LIBSVM maintains the gradient throughout the decomposition method.

bool ui = is_upper_bound(i);

bool uj = is_upper_bound(j);

update_alpha_status(i);

update_alpha_status(j);

int k;

if(ui != is_upper_bound(i))

{

    Q_i = Q.get_Q(i,l);

    if(ui)

        for(k=0;k<l;k++)

            G_bar[k] -= C_i * Q_i[k];

    else

        for(k=0;k<l;k++)

            G_bar[k] += C_i * Q_i[k];

}

if(uj != is_upper_bound(j))

{

    Q_j = Q.get_Q(j,l);

    if(uj)

        for(k=0;k<l;k++)

            G_bar[k] -= C_j * Q_j[k];

    else

        for(k=0;k<l;k++)

            G_bar[k] += C_j * Q_j[k];

}

Opssss, we are almost there:

    // calculate rho

    si->rho = calculate_rho();

    // calculate objective value

    {

        double v = 0;

        int i;

        for(i=0;i<l;i++)

            v += alpha[i] * (G[i] + p[i]);

        si->obj = v/2;

    }

    // put back the solution

    {

        for(int i=0;i<l;i++)

            alpha_[active_set[i]] = alpha[i];

    }

    si->upper_bound_p = Cp;

    si->upper_bound_n = Cn;

INPUT & OUTPUT

After the training by svm-train, the file “*.model” will show up. In the svm.cpp, where almost all useful functions locate, the function of “svm_save_model” is responsible for the output. The file looks like:

svm_type c_svc

kernel_type rbf

gamma 0.25

nr_class 2

total_sv 3053

rho -0.495266

label 1 0

nr_sv 1996 1057

SV

0.2759747847329309 1:26.173 2:58.867 3:-0.1894697 4:125.1225

0.5042408472321596 1:57.07397 2:221.404 3:0.08607959 4:122.9114

0.5047343497502841 1:17.259 2:173.436 3:-0.1298053 4:125.0318

0.450868920775192 1:21.7794 2:124.9531 3:0.1538853 4:152.715

0.5049158714985423 1:91.33997 2:293.5699 3:0.1423918 4:160.5402

0.5048770796985661 1:55.375 2:179.222 3:0.1654953 4:111.2273

0.5047914121992919 1:29.562 2:191.357 3:0.09901439 4:103.4076

The first column: y*alpha where α are dual solution (#-1 coefficients).

Data following: each line represents a support vector.

    for(int i=0;i<l;i++)

    {

        for(int j=0;j<nr_class-1;j++)

            fprintf(fp, "%.16g ",sv_coef[j][i]); //the first column

        const svm_node *p = SV[i];

        if(param.kernel_type == PRECOMPUTED)

            fprintf(fp,"0:%d ",(int)(p->value));

        else

            while(p->index != -1)

            {

                fprintf(fp,"%d:%.8g ",p->index,p->value); // index: 1-4; sv value

                p++;

            }

        fprintf(fp, "\n");

    }

One part from the svm_train function:

    decision_function f = svm_train_one(prob,param,0,0);

    //receive the f from svm_train_one

    int i;

    for(i=0;i<prob->l;i++)

        if(fabs(f.alpha[i]) > 0) ++nSV;

    int j = 0;

    for(i=0;i<prob->l;i++)

        if(fabs(f.alpha[i]) > 0)

        {

            model->SV[j] = prob->x[i];

            model->sv_coef[0][j] = f.alpha[i];

            model->sv_indices[j] = i+1;

            ++j;

        }

In svm_train, the function first declare a new svm_model and at last return it.

    svm_model *model = Malloc(svm_model,1);

In the main(), model will receive it and save its data to the file:

    model = svm_train(&prob,&param);

    if(svm_save_model(model_file_name,model))

Read problem from the file. Mainly are the

void read_problem(const char *filename)

{

    //...

    prob.y = Malloc(double,prob.l);

    prob.x = Malloc(struct svm_node *,prob.l);

    x_space = Malloc(struct svm_node,elements);

    // elements are # of values

    prob.x[i] = &x_space[j];

    // x_space is a svm_node (index,value); prob.x is a svm_node *.

    // prob.x[i][0].value is x_space[0].value

    // prob.y is lable

    //...

    x_space[j].index = (int) strtol(idx,&endptr,10);

    x_space[j].value = strtod(val,&endptr);

}

2 IMPORTANT classes, actually 2+2：

struct svm_node

{

    int index;

    double value;

};

struct svm_problem

{

    int l;

    double *y;

    struct svm_node **x;

};

struct svm_parameter

{

    int svm_type;

    int kernel_type;

    int degree; /* for poly */

    double gamma;   /* for poly/rbf/sigmoid */

    double coef0;   /* for poly/sigmoid */

    /* these are for training only */

    double cache_size; /* in MB */

    double eps; /* stopping criteria */

    double C;   /* for C_SVC, EPSILON_SVR and NU_SVR */

    int nr_weight;      /* for C_SVC */

    int *weight_label;  /* for C_SVC */

    double* weight;     /* for C_SVC */

    double nu;  /* for NU_SVC, ONE_CLASS, and NU_SVR */

    double p;   /* for EPSILON_SVR */

    int shrinking;  /* use the shrinking heuristics */

    int probability; /* do probability estimates */

};

struct svm_model

{

    struct svm_parameter param; /* parameter */

    int nr_class;       /* number of classes, = 2 in regression/one class svm */

    int l;          /* total #SV */

    struct svm_node **SV;       /* SVs (SV[l]) */

    double **sv_coef;   /* coefficients for SVs in decision functions (sv_coef[k-1][l]) */

    double *rho;        /* constants in decision functions (rho[k*(k-1)/2]) */

    double *probA;      /* pariwise probability information */

    double *probB;

    int *sv_indices;        /* sv_indices[0,...,nSV-1] are values in [1,...,num_traning_data] to indicate SVs in the training set */

    /* for classification only */

    int *label;     /* label of each class (label[k]) */

    int *nSV;       /* number of SVs for each class (nSV[k]) */

                /* nSV[0] + nSV[1] + ... + nSV[k-1] = l */

    /* XXX */

    int free_sv;        /* 1 if svm_model is created by svm_load_model*/

                /* 0 if svm_model is created by svm_train */

};

Citations:

[1] http://www.csie.ntu.edu.tw/~cjlin/papers/libsvm.pdf

[2]“Working set selection using second order information for traning support vector machines”, Rong-En Fan, etc., 2005.

秒客网

libsvm源码凝视+算法描写叙述:svm_train

相关文章