# Filter - Filters for Train-Validation-Test Split

Create 3 new filters based on a random 50%/25%/25% split of the selected data

This QScript creates 3 new filters based on a random 50%/25%/25% split of the selected data. These filters can then be applied to predictive models in order to separate a training data set from a validation data set and a testing set. The QScript can be amended to adjust the split ratio.

## Example

The result of running this script is shown below. The first 3 variables are the new filters created.

## Technical details

The values of trainPercentage and validationPercentage in the QScript code below control the split ratio. When the QScript runs it prompts for both values, offering defaults of 50 and 25; each value must be between 0 and 100 and the two must not sum to more than 100. The defaults of 50 and 25 mean that 50% of the data (rounded to the nearest whole number of instances) is selected as part of the Training split, 25% of the data (also rounded to the nearest whole number of instances) forms part of the Validation split and the remaining 25% is in the Testing split filter.

By adjusting these values, as described below in Customizing the QScript, the percentages in the filters can be controlled.

## How to apply this QScript

• Start typing the name of the QScript into the Search features and data box in the top right of the Q window.
• Click on the QScript when it appears in the QScripts and Rules section of the search results.

OR

• Select Automate > Browse Online Library.
• Select this QScript from the list.

## Customizing the QScript

This QScript is written in JavaScript and can be customized by copying and modifying the JavaScript.

• Start typing the name of the QScript into the Search features and data box in the top right of the Q window.
• Hover your mouse over the QScript when it appears in the QScripts and Rules section of the search results.
• Press Edit a Copy (bottom-left corner of the preview).
• Modify the JavaScript (see QScripts for more detail on this).
• Either:
• Run the QScript, by pressing the blue triangle button.
• Save the QScript and run it at a later time, using Automate > Run QScript (Macro) from File.

### Customizing QScripts in older versions

• Create a new text file, giving it a file extension of .QScript. See here for more information about how to do this.
• Modify the JavaScript (see QScripts for more detail on this).
• Run the file using Automate > Run QScript (Macro) from File.

## JavaScript

// This script creates 3 new filters (training, validation and testing)
// based upon a random split of the data.

// Pull in the shared library scripts; presumably these provide the helper
// functions called by filtersForTrainTestValidationSplit -- confirm against
// the library pages if a helper cannot be found.
includeWeb("QScript Selection Functions");
includeWeb("QScript Functions to Generate Outputs");

// Entry point. Function declarations are hoisted, so calling the function
// before its definition appears in the file is valid.
filtersForTrainTestValidationSplit();

/**
 * Creates three 0/1 filter variables (training, validation, testing) from a
 * random split of one data set and combines them into a single "Pick Any"
 * filter question.
 *
 * The user is prompted for the training and validation percentages; whatever
 * is left over becomes the testing sample. The random sampling itself is done
 * by R code attached to the new variables (fixed seed, so it is repeatable).
 *
 * @returns {boolean} true when the filters were created; false when the input
 *     was invalid, no single data set could be determined, or a variable
 *     could not be created (a message is logged in each of these cases).
 */
function filtersForTrainTestValidationSplit() {

    const is_displayr = inDisplayr();

    // Set percentage of data used for training and validation set.
    // Each range check is written as a negated "is within range" test so that
    // a non-numeric answer (parseFloat returns NaN) is rejected too; with
    // "x < 0 || x > 100" a NaN would slip through into the R code.
    const trainPercentage = parseFloat(prompt("What percentage of the data set should be used as the training set?", 50));
    if (!(trainPercentage >= 0 && trainPercentage <= 100)) {
        log("Invalid split.  Please ensure that the training data percentage is between 0 and 100.");
        return false;
    }
    const validationPercentage = parseFloat(prompt("What percentage of the data set should be used as the validation set?", 25));
    if (!(validationPercentage >= 0 && validationPercentage <= 100)) {
        log("Invalid split.  Please ensure that the validation data percentage is between 0 and 100.");
        return false;
    }
    // A sum of exactly 100 is allowed; the testing sample is then empty.
    if (trainPercentage + validationPercentage > 100) {
        log("The percentage of training and validation data should not sum to more than 100: " + (trainPercentage + validationPercentage));
        return false;
    }

    // Get the data: prefer the data set of the current selection, otherwise
    // the only data set in the project, otherwise ask (not available in Displayr).
    let dataFile;
    const user_selections = getAllUserSelections();
    const selected_questions = user_selections.selected_questions;
    if (selected_questions.length > 0) {
        dataFile = project.report.selectedQuestions()[0].dataFile;
    } else if (project.dataFiles.length === 1) {
        dataFile = project.dataFiles[0];
    } else if (project.dataFiles.length === 0) {
        log("No data sets were found.  Please add a data set before running this script.");
        return false;
    } else if (!is_displayr) {
        dataFile = dataFileSelection()[0];
    } else {
        log("Please select data from a single data set.");
        return false;
    }

    let train;
    let validation;
    let test;

    // Create a training filter based on a random sample
    const trainRText = "percentage <- " + trainPercentage + " # Change this number to change the percentage in the training sample\n" +
        "set.seed(123) # This ensures that the randomization is identical each time\n" +
        "n <- " + dataFile.totalN + " # This is the total sample size\n" +
        "indices <- sample.int(n, round(percentage * n / 100))\n" +
        "filter <- rep(0, n)\n" +
        "filter[indices] <- 1\n" +
        "filter";
    try {
        // The third argument is presumably the question name (TODO confirm);
        // it is de-duplicated so an existing "Training sample" question, e.g.
        // one left behind by an earlier failed run, does not block creation.
        train = dataFile.newRVariable(trainRText,
                                      preventDuplicateVariableName(dataFile, "training"),
                                      preventDuplicateQuestionName(dataFile, "Training sample"),
                                      null);
    } catch (e) {
        log("Could not create train filter: " + e);
        return false;
    }
    const trainFullName = "" + dataFile.name + "$Variables$" + train.name;

    // Create a validation filter based on those not selected in training filter.
    // The sample size is capped at n.remaining: the training and validation
    // counts are rounded independently, so near a 100% total the rounded
    // validation count can exceed what is left (e.g. 70/30 with n = 5), which
    // would otherwise make sample.int fail.
    const validationRText = "percentage <- " + validationPercentage + " # Change this number to change the percentage in the validation sample\n" +
        "set.seed(123) # This ensures that the randomization is identical each time\n" +
        "n <- " + dataFile.totalN + " # This is the total sample size\n" +
        "n.remaining <- n - sum(" + trainFullName + ")\n" +
        "indices <- sample.int(n.remaining, min(n.remaining, round(percentage * n / 100)))\n" +
        "filter <- rep(0, n)\n" +
        "filter[" + trainFullName + " == 0][indices] <- 1\n" +
        "filter";
    try {
        validation = dataFile.newRVariable(validationRText,
                                           preventDuplicateVariableName(dataFile, "validation"),
                                           preventDuplicateQuestionName(dataFile, "Validation sample"),
                                           null);
    } catch (e) {
        log("Could not create validation filter: " + e);
        return false;
    }
    const validationFullName = dataFile.name + "$Variables$" + validation.name;

    // Create test filter from those not selected in either training or validation filters
    const testRText = "as.numeric(!(" + trainFullName + " + " + validationFullName + "))";
    try {
        test = dataFile.newRVariable(testRText,
                                     preventDuplicateVariableName(dataFile, "testing"),
                                     preventDuplicateQuestionName(dataFile, "Testing sample"),
                                     null);
    } catch (e) {
        log("Could not create test filter: " + e);
        return false;
    }

    // Combine the 3 new variables into a Pick-Any question.
    // Declared with const: a bare assignment would create an implicit global.
    const trainValidateTest = dataFile.setQuestion(preventDuplicateQuestionName(dataFile, "Train validate test split"),
                                                   "Pick Any", [train, validation, test]);
    // Whatever preventDuplicateQuestionName appended to the base name is
    // carried over to the variable labels so repeated runs stay distinguishable.
    const suffix = trainValidateTest.name.replace(/^Train validate test split/, "");
    trainValidateTest.variables[0].label = "Training sample" + suffix;
    trainValidateTest.variables[1].label = "Validation sample" + suffix;
    trainValidateTest.variables[2].label = "Testing sample" + suffix;
    trainValidateTest.isFilter = true;
    setCountThisValueForVariablesInQuestion(trainValidateTest, 1, true);
    trainValidateTest.needsCheckValuesToCount = false;
    insertAtHoverButtonIfShown(trainValidateTest);
    reportNewRQuestion(trainValidateTest, "Train validate test split");
    return true;
}