sklearn_classify.py

# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

random_seed = 67


def train_lr_l2_model(args, data):
    """
    The main function to train lr model with l2 regularization.
    """
    acc_list = []
    data = np.array(data)
    data = data[data[:, 0].argsort()]
    x_data = data[:, 1:-1]
    y_data = data[:, -1]
    for random_num in range(0, 10):
        X_train, X_test, y_train, y_test = train_test_split(
            x_data,
            y_data,
            test_size=0.2,
            random_state=random_num + random_seed)

        # use the one vs rest to train the lr model with l2
        pred_test = []
        for i in range(0, args.num_class):
            y_train_relabel = np.where(y_train == i, 1, 0)
            y_test_relabel = np.where(y_test == i, 1, 0)
            lr = LogisticRegression(C=10.0, random_state=0, max_iter=100)
            lr.fit(X_train, y_train_relabel)
            pred = lr.predict_proba(X_test)
            pred_test.append(pred[:, -1].tolist())
        pred_test = np.array(pred_test)
        pred_test = np.transpose(pred_test)
        c_index = np.argmax(pred_test, axis=1)
        acc = accuracy_score(y_test.flatten(), c_index)
        acc_list.append(acc)
        print("pass:{}-acc:{}".format(random_num, acc))
    print("the avg acc is {}".format(np.mean(acc_list)))