Text Analysis - Advanced - Save Variable(s) - Categories
This QScript saves the categories from a selected Automatic Categorization - List of Items, Automatic Categorization - Unstructured Text, Automatic Categorization - Entity Extraction or Automatic Categorization - Advanced - Setup Text Analysis output as a Pick One or Pick Any - Compact question.
Technical Details
The variables created by this QScript may become invalid, and need to be deleted and recreated, if the output from Automatic Categorization - List of Items, Automatic Categorization - Unstructured Text, Automatic Categorization - Entity Extraction or Automatic Categorization - Advanced - Setup Text Analysis has changed, either because the input text variable was modified or because the input settings were modified.
This QScript requires Q version 5.5.1 or later to run.
Code
// Load the shared "QScript R Output Functions" library.
// NOTE(review): presumably this supplies the helpers main() calls that are not
// defined in this file (e.g. getSelectedROutputFromPage, stringToRName) - confirm.
includeWeb("QScript R Output Functions");
// Run the script; the function declaration of main() is hoisted, so it can be
// called before its definition. The boolean it returns is not used here.
main();
/**
 * Saves the categories of the selected Text Analysis R output as new R
 * question(s) in the data file that the output depends on.
 *
 * What is created depends on the class of the selected output:
 *  - "categorizedlist": one question per entry of the output's
 *    "variable.names", finally set to "Pick Any - Compact";
 *  - "AutomaticCategorization": a single question, "Pick Any" when the
 *    "formExistingCat" input is set and the output's "predicted" element has
 *    class "data.frame", otherwise "Pick One";
 *  - "EntityExtraction": one question per entity type ("Pick Any - Compact"
 *    for entity types whose data type is "list");
 *  - "wordBag": a single "Pick Any - Compact" question.
 *
 * When not running on the web, a table showing each new question is appended
 * to the group of the selected output. Every early exit logs a message.
 *
 * @returns {boolean} true when the script ran to completion, false when it
 *     stopped early (old Q version, bad selection, no data file, nothing to
 *     save, or the R question could not be created).
 */
function main() {
    // require Q 5.5.1 or after in order to call selected_item.data
    if (Q.fileFormatVersion() < 14.11) {
        log("Q 5.5.1 or later is required to run this script, please update your version of Q.");
        return false;
    }

    const script_name = "Save Variable(s) - Categories";
    const analysis_name = "Categories";
    const bad_selection_message = "Select a List Categorization, Automatic Categorization, Entity Extraction or Setup Text Analysis output.";
    // Q.isOnTheWeb may not exist, so test for it before calling it.
    const web_mode = (!!Q.isOnTheWeb && Q.isOnTheWeb());

    const selected_item = getSelectedROutputFromPage(["categorizedlist", "AutomaticCategorization", "EntityExtraction", "wordBag"]);
    if (selected_item == null) {
        log(bad_selection_message);
        return false;
    }

    const data_file = getDataFileFromItemDependants(selected_item);
    if (data_file == null) {
        log("'Save variables' cannot be applied to an output with no data file.");
        return false;
    }

    // Find last variable, which we will place the generated variables after
    let last_variable = getLastVariable(getVariables(selected_item.dependants(false)));

    // Turns a question name into a variable-name stem (characters outside the
    // allowed set become '_') and passes it to randomVariableName.
    function makeVariableName(question_name) {
        const stem = question_name.replace(" from ", " ").replace(/[^a-zA-Z0-9_@\#\$\\]/g, '_').toLowerCase() + "_";
        return randomVariableName(16, stem);
    }

    // In Q (not on the web), appends a table showing the question to the
    // selected output's group and optionally selects that table.
    function showInNewTable(question, select_table) {
        if (web_mode) {
            return;
        }
        const t = selected_item.group.appendTable();
        t.primary = question;
        if (select_table) {
            project.report.setSelectedRaw([t]);
        }
    }

    if (selected_item.outputClasses.indexOf("categorizedlist") > -1) {
        const variable_names = selected_item.data.get("variable.names");
        for (let j = 0; j < variable_names.length; j++) {
            // The R code stops with an explanatory error when the output's
            // hash no longer equals the hash recorded at creation time.
            const expression = "categorizedlist = " +
                stringToRName(selected_item.referenceName) +
                "\nif (categorizedlist$hash != '" + selected_item.data.get("hash") + "')\n" +
                " stop('The Text Analysis output used to create these variables has changed " +
                "and as a result these variables are no longer valid. Please delete these variables and rerun the " +
                script_name + " script on the Text Analysis output.')\n" +
                "var.i <- " + j + "\n" +
                "ind <- (1:categorizedlist$n.cases) + var.i * categorizedlist$n.cases\n" +
                "max.splits <- max(max(sapply(categorizedlist $transformed.text[ind], length)), 1)\n" +
                "result <- matrix(NA, nrow = length(categorizedlist$subset), ncol = max.splits)\n" +
                "for (i in 1:max.splits)\n" +
                " result[categorizedlist$subset, i] <- match(sapply(categorizedlist$transformed.text[ind],\n" +
                " function(x) if (is.null(x) || length(x) < i) NA else x[i]), categorizedlist$final.tokens)\n" +
                "result";
            const new_q_name = preventDuplicateQuestionName(data_file, analysis_name + " from " + variable_names[j]);
            const new_var_name = makeVariableName(new_q_name);
            let new_r_question;
            try {
                new_r_question = data_file.newRQuestion(expression, new_q_name, new_var_name, last_variable);
            } catch (e) {
                log(analysis_name + " could not be created from this item: " + e);
                return false;
            }
            for (let i = 0; i < new_r_question.variables.length; i++) {
                new_r_question.variables[i].label = "Phrase " + (i + 1);
                new_r_question.variables[i].variableType = "Categorical";
            }
            // Set value attributes (categories): the R code returns 1-based
            // indices into final.tokens, so value i + 1 is labelled with token i.
            new_r_question.questionType = "Pick One - Multi";
            const categories = selected_item.data.get("final.tokens");
            for (let i = 0; i < categories.length; i++) {
                new_r_question.valueAttributes.setLabel(i + 1, categories[i]);
            }
            new_r_question.questionType = "Pick Any - Compact";
            // In Q, create a table showing the new question (left unselected in this branch)
            showInNewTable(new_r_question, false);
            // The next generated question goes after the one just created.
            last_variable = new_r_question.variables[new_r_question.variables.length - 1];
        }
    } else if (selected_item.outputClasses.indexOf("AutomaticCategorization") > -1) {
        // Every apostrophe is stripped from the label on both the R side
        // (gsub) and the JavaScript side, so the single-quoted R string
        // literal stays well formed and both sides compare the same text.
        const valid_check = "categorization = " + stringToRName(selected_item.referenceName) +
            "\nif (gsub('\\'', '', categorization$text.label, fixed = TRUE) != '" + selected_item.data.get("text.label").toString().replace(/'/g, "") + "')\n" +
            " stop('The input text variable used for the Text Analysis has changed " +
            "and as a result this variable is no longer valid. Please delete this variable and rerun the " +
            script_name + " script on the Text Analysis output.')\n";
        const existing_cat = selected_item.getInput("formExistingCat");
        let pick_any_output = false;
        let expression;
        if (existing_cat !== null) {
            const predicted_type = selected_item.data.getAttribute("predicted", "class");
            pick_any_output = predicted_type == "data.frame";
            expression = valid_check +
                "categorization$predicted";
        } else {
            expression = valid_check +
                "categorization$categorization";
        }
        const new_q_name = preventDuplicateQuestionName(data_file, analysis_name + " from " +
            selected_item.data.get("text.label"));
        const new_var_name = makeVariableName(new_q_name);
        let new_r_question;
        try {
            new_r_question = data_file.newRQuestion(expression, new_q_name, new_var_name, last_variable);
        } catch (e) {
            log(analysis_name + " could not be created from this item: " + e);
            return false;
        }
        new_r_question.variables.forEach(function (v) {
            v.variableType = "Categorical";
        });
        new_r_question.questionType = pick_any_output ? "Pick Any" : "Pick One";
        if (pick_any_output) {
            new_r_question.needsCheckValuesToCount = false;
        }
        // In Q, create a table showing the new question
        showInNewTable(new_r_question, true);
    } else if (selected_item.outputClasses.indexOf("EntityExtraction") > -1) {
        const entity_variables = selected_item.data.get("entity.variables");
        // A single boolean in place of the variables means there is nothing to
        // save; "entity.variables.empty.reason" says why.
        if (entity_variables.length === 1 && typeof entity_variables[0] === "boolean") {
            const empty_reason = selected_item.data.get("entity.variables.empty.reason");
            let reason;
            if (empty_reason == "output") {
                reason = "there are no extracted entities in the output.";
            } else if (empty_reason == "min") {
                const entity_counts = selected_item.data.get("entity.counts");
                const entity_names = selected_item.data.getAttribute("entity.counts", "names");
                const min_cases_to_save = selected_item.data.get("min.cases.to.save");
                const largest_count = entity_counts.reduce(function (x, y) {
                    return (x > y) ? x : y;
                });
                const largest_name = entity_names[entity_counts.indexOf(largest_count)];
                reason = "the entity type with the most number of extractions, '" + largest_name + "', contains " + largest_count + " named entities. However," +
                    " the minimum number of entities required to save is set at " + min_cases_to_save + ". Set 'Minimum number of cases to save' to " +
                    largest_count + " or lower to save the extracted named entities as variables.";
            } else if (empty_reason == "remove") {
                reason = "the only entities in the output have been removed with the user specified remove entities from extraction settings.";
            } else {
                // Stop here: there is no reason text to report, so the
                // "No entity variables..." message must not be logged.
                log("Error: unknown reason for no variables to save");
                return false;
            }
            log("No entity variables have been saved since " + reason);
            return false;
        }
        const entity_R_type = selected_item.data.getAttribute("entity.variables", "data.type");
        const entity_type_names = selected_item.data.getAttribute("entity.variables", "names");
        // Iterate backwards: every question is inserted directly after the
        // same last_variable, so the questions end up in the original order.
        for (let i = entity_variables.length - 1; i >= 0; i--) {
            const r_entity_variables = entity_type_names[i].replace(/ /g, ".").toLowerCase() + ".variables";
            const expression = r_entity_variables + " = " + stringToRName(selected_item.referenceName) + '$entity.variables[["' + entity_type_names[i] + '"]]\n' +
                r_entity_variables;
            const new_q_name = preventDuplicateQuestionName(data_file, entity_type_names[i] + " Entities from " +
                selected_item.data.get("text.label"));
            const new_var_name = makeVariableName(new_q_name);
            let new_r_question;
            try {
                new_r_question = data_file.newRQuestion(expression, new_q_name, new_var_name, last_variable);
                if (entity_R_type[i] === "list") {
                    new_r_question.questionType = "Pick Any - Compact";
                }
            } catch (e) {
                log("Categories could not be created from this item: " + e);
                return false;
            }
            showInNewTable(new_r_question, true);
        }
    } else if (selected_item.outputClasses.indexOf("wordBag") > -1) {
        const tokenized_text = selected_item.data.get("transformed.tokenized");
        const flattened_tokenized = tokenized_text.flat();
        if (flattened_tokenized.length === 0) {
            const base_message = "Variables could not be created from this item, there are no terms in the text after transformation.";
            const min_frequency = selected_item.getInput("formminfreq");
            let extra_message = "";
            if (min_frequency > 1) {
                extra_message = " Terms might be found if the minimum frequency control is reduced from " + min_frequency + " to a smaller value.";
            }
            log(base_message + extra_message);
            return false;
        }
        // The R code drops tokens below the output's minimum frequency and
        // returns a data frame with one factor column per token position.
        const expression = "wordBag <- " + stringToRName(selected_item.referenceName) + "\n" +
            "tokenized <- wordBag$transformed.tokenized\n" +
            "below.min.frequency <- which(table(unlist(lapply(tokenized, unique))) < wordBag$min.frequency)\n" +
            "if (length(below.min.frequency) != 0)\n" +
            "{\n" +
            "\tbelow.min.frequency <- names(below.min.frequency)\n" +
            "\ttokenized <- lapply(tokenized, function(x) {\n" +
            "\t\tremove.token <- x %in% below.min.frequency\n" +
            "\t\tif(any(remove.token)) x <- x[!remove.token]\n" +
            "\t\t\treturn(x)\n" +
            "\t\t})\n" +
            "}\n" +
            "unique.tokens <- sort(unique(unlist(tokenized)))\n" +
            "max.tokens <- max(sapply(tokenized, length))\n" +
            "list.template <- lapply(1:max.tokens, function(i) sapply(tokenized, function(x) x[i]))\n" +
            "list.as.factor <- lapply(list.template, function(x) factor(x, levels = unique.tokens))\n" +
            "pick.any.data.frame <- data.frame(list.as.factor)\n" +
            "names(pick.any.data.frame) <- paste0('Term ', 1:max.tokens)\n" +
            "pick.any.data.frame";
        const new_q_name = preventDuplicateQuestionName(data_file, analysis_name + " from " +
            selected_item.data.getAttribute("original.text", "label"));
        const new_var_name = makeVariableName(new_q_name);
        let new_r_question;
        try {
            new_r_question = data_file.newRQuestion(expression, new_q_name, new_var_name, last_variable);
        } catch (e) {
            log(analysis_name + " could not be created from this item: " + e);
            return false;
        }
        new_r_question.questionType = "Pick Any - Compact";
        // In Q, create a table showing the new question
        showInNewTable(new_r_question, true);
    }
    return true;
}