Filter - Filters for Train-Validation-Test Split

From Q
Jump to navigation Jump to search

This QScript creates 3 new filters based on a random 50%/25%/25% split of the selected data. These filters can then be applied to predictive models in order to separate a training data set from a validation data set and a testing set. The QScript can be amended to adjust the split ratio.

Example

The result of running this script is shown below. The first 3 variables are the new filters created.

3way new6.png

Technical details

The values of trainPercentage and validationPercentage in the QScript code below control the split ratio. The defaults of 50 and 25 mean that 50% of the data (rounded to the nearest whole number of instances) is selected as part of the Training split, 25% of the data (also rounded to the nearest whole number of instances) forms part of the Validation split and the remaining 25% is in the Testing split filter.

By adjusting these values, as described below in Customizing the QScript, you can control the percentage of the data that goes into each filter.

How to apply this QScript

  • Start typing the name of the QScript into the Search features and data box in the top right of the Q window.
  • Click on the QScript when it appears in the QScripts and Rules section of the search results.

OR

  • Select Automate > Browse Online Library.
  • Select this QScript from the list.

Customizing the QScript

This QScript is written in JavaScript and can be customized by copying and modifying the JavaScript.

Customizing QScripts in Q4.11 and more recent versions

  • Start typing the name of the QScript into the Search features and data box in the top right of the Q window.
  • Hover your mouse over the QScript when it appears in the QScripts and Rules section of the search results.
  • Press Edit a Copy (bottom-left corner of the preview).
  • Modify the JavaScript (see QScripts for more detail on this).
  • Either:
    • Run the QScript, by pressing the blue triangle button.
    • Save the QScript and run it at a later time, using Automate > Run QScript (Macro) from File.

Customizing QScripts in older versions

  • Copy the JavaScript shown on this page.
  • Create a new text file, giving it a file extension of .QScript. See here for more information about how to do this.
  • Modify the JavaScript (see QScripts for more detail on this).
  • Run the file using Automate > Run QScript (Macro) from File.

JavaScript

// This script creates 3 new filters based upon a random split of the data.

// Load the shared QScript libraries; presumably these supply the helper
// functions used by the script (verify against the library pages).
includeWeb("QScript Selection Functions");
includeWeb("QScript Functions to Generate Outputs");

// Entry point. The function is declared further down the script; function
// declarations are hoisted, so calling it here is valid.
filtersForTrainTestValidationSplit();
 
 
/**
 * Creates three filter variables (training, validation and testing samples)
 * from a random split of one data set and combines them into a single
 * "Pick Any" question that is flagged as a filter.
 *
 * The user is prompted for the training and validation percentages; every
 * case selected by neither of those two filters ends up in the testing filter.
 * The split itself is computed by R code attached to the new variables, using
 * a fixed seed so that the randomization is identical each time.
 *
 * @returns {boolean} true when the three filters were created; false when a
 *     percentage is invalid, no single data set could be determined, or the
 *     creation of one of the R variables threw (a message is logged in each
 *     of these cases).
 */
function filtersForTrainTestValidationSplit() {

    const is_displayr = inDisplayr();

    // Written as a positive range test on purpose: every comparison with NaN
    // is false, so non-numeric or cancelled input (parseFloat gives NaN) is
    // rejected instead of slipping through a "< 0 || > 100" check.
    const isPercentage = (value) => value >= 0 && value <= 100;

    // Set percentage of data used for training and validation set
    const trainPercentage = parseFloat(prompt("What percentage of the data set should be used as the training set?", 50));
    if (!isPercentage(trainPercentage)) {
        log("Invalid split.  Please ensure that the training data percentage is between 0 and 100.");
        return false;
    }
    const validationPercentage = parseFloat(prompt("What percentage of the data set should be used as the validation set?", 25));
    if (!isPercentage(validationPercentage)) {
        log("Invalid split.  Please ensure that the validation data percentage is between 0 and 100.");
        return false;
    }
    if (trainPercentage + validationPercentage > 100) {
        log("The percentages of training and validation data should sum to no more than 100: " + (trainPercentage + validationPercentage));
        return false;
    }

    // Get the data: prefer the data set of the current selection, then the
    // only data set in the project, then (outside Displayr) ask the user.
    let dataFile;
    const user_selections = getAllUserSelections();
    const selected_questions = user_selections.selected_questions;
    if (selected_questions.length > 0) {
        dataFile = project.report.selectedQuestions()[0].dataFile;
    } else if (project.dataFiles.length === 1) {
        dataFile = project.dataFiles[0];
    } else if (project.dataFiles.length === 0) {
        log("Please add a data set.");
        return false;
    } else if (!is_displayr) {
        dataFile = dataFileSelection()[0];
    } else {
        log("Please select data from a single data set.");
        return false;
    }

    // Create a training filter based on a random sample
    const trainRText = [
        "percentage <- " + trainPercentage + " # Change this number to change the percentage in the training sample",
        "set.seed(123) # This ensures that the randomization is identical each time",
        "n <- " + dataFile.totalN + " # This is the total sample size",
        "indices <- sample.int(n, round(percentage * n / 100))",
        "filter <- rep(0, n)",
        "filter[indices] <- 1",
        "filter"
    ].join("\n");

    let train;
    try {
        train = dataFile.newRVariable(trainRText, preventDuplicateVariableName(dataFile, "training"), "Training sample", null);
    } catch (e) {
        log("Could not create train filter: " + e);
        return false;
    }
    // R reference to the training variable, reused by the two filters below.
    const trainFullName = "`" + dataFile.name + "`$Variables$" + train.name;

    // Create a validation filter based on those not selected in training filter.
    // The two sample sizes are rounded independently, so the requested
    // validation size can exceed the cases left over (e.g. n = 3 with a
    // 50/50 split); sample.int() would then fail, hence the min() cap.
    const validationRText = [
        "percentage <- " + validationPercentage + " # Change this number to change the percentage in the validation sample",
        "set.seed(123) # This ensures that the randomization is identical each time",
        "n <- " + dataFile.totalN + " # This is the total sample size",
        "n.remaining <- n - sum(" + trainFullName + ")",
        "indices <- sample.int(n.remaining, min(n.remaining, round(percentage * n / 100)))",
        "filter <- rep(0, n)",
        "filter[" + trainFullName + " == 0][indices] <- 1",
        "filter"
    ].join("\n");

    let validation;
    try {
        validation = dataFile.newRVariable(validationRText, preventDuplicateVariableName(dataFile, "validation"), "Validation sample", null);
    } catch (e) {
        log("Could not create validation filter: " + e);
        return false;
    }

    // Create test filter from those not selected in either training or validation filters
    const testRText = "as.numeric(!(" + trainFullName + " + `" + dataFile.name + "`$Variables$" + validation.name + "))";
    let test;
    try {
        test = dataFile.newRVariable(testRText, preventDuplicateVariableName(dataFile, "testing"), "Testing sample", null);
    } catch (e) {
        log("Could not create test filter: " + e);
        return false;
    }

    // Combine the 3 new variables into a Pick-Any question.
    // Declared with const so the script no longer leaks an implicit global.
    const trainValidateTest = dataFile.setQuestion(preventDuplicateQuestionName(dataFile, "Train validate test split"),
                                                   "Pick Any", [train, validation, test]);
    // Whatever preventDuplicateQuestionName appended to the base name is
    // carried over to the variable labels.
    const suffix = trainValidateTest.name.replace(/^Train validate test split/, "");
    trainValidateTest.variables[0].label = "Training sample" + suffix;
    trainValidateTest.variables[1].label = "Validation sample" + suffix;
    trainValidateTest.variables[2].label = "Testing sample" + suffix;
    trainValidateTest.isFilter = true;
    // Presumably marks the value 1 as the counted ("selected") category of
    // each variable - confirm against the helper's definition.
    setCountThisValueForVariablesInQuestion(trainValidateTest, 1, true);
    trainValidateTest.needsCheckValuesToCount = false;
    insertAtHoverButtonIfShown(trainValidateTest);
    reportNewRQuestion(trainValidateTest, "Train validate test split");
    return true;
}

See also