Text Analysis - Advanced - Save Variable(s) - Categories

From Q
Jump to: navigation, search

This QScript saves the categories from a selected Automatic Categorization - List of Items, Automatic Categorization - Unstructured Text, Automatic Categorization - Entity Extraction or Text Analysis - Advanced - Setup Text Analysis output as a Pick One or Pick Any - Compact (Nominal or Binary Multi - Compact) question.

Technical Details

The variables created by this QScript may become invalid, and need to be deleted and recreated, if the output from Automatic Categorization - List of Items, Automatic Categorization - Unstructured Text, Automatic Categorization - Entity Extraction or Text Analysis - Advanced - Setup Text Analysis has changed, either because the input text variable has been modified or because the input settings have been modified.

This QScript requires Q version 5.5.1 or later to run.

Code

// Load the shared "QScript R Output Functions" library.
// NOTE(review): presumably this is what supplies the helpers main() calls
// (e.g. getSelectedROutputFromPage, getDataFileFromItemDependants) - confirm.
includeWeb("QScript R Output Functions");

// Run the script. main is a function declaration, so it is hoisted and may be
// called here even though its definition appears further down the file.
main();

/**
 * Saves the categories of the selected text analysis R output as new R
 * question(s) in the data file that the output depends on.
 *
 * The selected output must have one of the R classes "categorizedlist",
 * "AutomaticCategorization", "TextClassifier", "EntityExtraction" or
 * "wordBag"; each class is handled by its own branch below. Every problem
 * that prevents saving is reported through log().
 *
 * @returns {boolean} true when the branch for the selected output ran to
 *     completion, false when nothing (more) could be saved.
 */
function main() {
    if (Q.fileFormatVersion() < 14.11) // require Q 5.5.1 or after in order to call selected_item.data
    {
        log("Q 5.5.1 or later is required to run this script, please update your version of Q.");
        return false;
    }

    var script_name = "Save Variable(s) - Categories";
    var analysis_name = "Categories";
    var variable_prefix = "categ";

    var bad_selection_message = "Select a List Categorization, Automatic Categorization, Entity Extraction or Setup Text Analysis output.";
    // Q.isOnTheWeb may not exist, so test for it before calling it.
    var web_mode = (!!Q.isOnTheWeb && Q.isOnTheWeb());

    var selected_item = getSelectedROutputFromPage(["categorizedlist", "AutomaticCategorization", "EntityExtraction", "TextClassifier", "wordBag"]);
    if (selected_item === null) {
        log(bad_selection_message);
        return false;
    }

    var data_file = getDataFileFromItemDependants(selected_item);
    if (data_file == null) {
        log("'Save variables' cannot be applied to an output with no data file.");
        return false;
    }

    // Find last variable, which we will place the generated variables after
    var last_variable = getLastVariable(getVariables(selected_item.dependants(false)));

    // The size-limit controls are optional; when a control is missing the
    // generated R code uses "Inf" instead.
    var form_max_levels = selected_item.getInput("formMaxLevels");
    var max_levels_control_exists = form_max_levels !== null;
    if (!max_levels_control_exists) {
        form_max_levels = "Inf";
    }
    var form_max_mentions = selected_item.getInput("formMaxMentions");
    if (form_max_mentions === null) {
        form_max_mentions = "Inf";
    }

    // R-side name of the selected output, used in every generated expression.
    var r_reference = stringToRName(selected_item.referenceName);

    // True when the selected output has the given R class.
    function hasOutputClass(class_name) {
        return selected_item.outputClasses.indexOf(class_name) > -1;
    }

    // R code that stops with an explanatory error when the hash stored in
    // r_variable differs from the hash the output had when this script ran.
    function hashCheck(r_variable, hash) {
        return "\nif (" + r_variable + "$hash != '" + hash + "')\n" +
               "    stop('The Text Analysis output used to create these variables has changed " +
               "and as a result these variables are no longer valid. Please delete these variables and rerun the " +
               script_name + " script on the Text Analysis output.')\n";
    }

    // R code that binds the output to 'categorization' and stops when its text
    // label (ignoring single quotes and non-breaking spaces) has changed.
    function textLabelCheck() {
        return "categorization = " + r_reference +
               "\nif (gsub('\\u00a0', ' ', gsub('\\'', '', categorization$text.label, fixed = TRUE), fixed = TRUE) != '" + selected_item.data.get("text.label").toString().replace(/'/g, "") + "')\n" +
               "    stop('The input text variable used for the Text Analysis has changed " +
               "and as a result this variable is no longer valid. Please delete this variable and rerun the " +
               script_name + " script on the Text Analysis output.')\n";
    }

    // Logs why newRQuestion failed. tooLargeSuffix is an optional function
    // returning the text to append to "The output variable is too large"; it
    // is only evaluated when the error is the known size-limit error.
    function logCreationFailure(e, tooLargeSuffix) {
        if (tooLargeSuffix && /Can only convert tabular results to an R/.test(e)) {
            log(analysis_name + " could not be created from this item: The output variable is too large" + tooLargeSuffix());
        } else {
            log(analysis_name + " could not be created from this item: " + e);
        }
    }

    // In Q (not on the web), appends a table showing the new question and
    // optionally selects that table in the report.
    function showInTable(question, select_table) {
        if (web_mode) {
            return;
        }
        var t = selected_item.group.appendTable();
        t.primary = question;
        if (select_table) {
            project.report.setSelectedRaw([t]);
        }
    }

    // Shared implementation of the AutomaticCategorization and TextClassifier
    // branches: saves categorization$<r_element> as a Pick One / Pick Any question.
    function saveCategorization(r_element, pick_any_output, tooLargeSuffix) {
        var expression = textLabelCheck() + "categorization$" + r_element;
        var new_q_name = preventDuplicateQuestionName(data_file, analysis_name + " from " +
                                                          selected_item.data.get("text.label"));
        var temp_var_name = randomVariableName(16); // temporary name, random to (almost) guarantee uniqueness

        var new_r_question;
        try {
            new_r_question = data_file.newRQuestion(expression, new_q_name, temp_var_name, last_variable);
        } catch (e) {
            logCreationFailure(e, tooLargeSuffix);
            return false;
        }

        // Replace temporary variable names
        nameSequentialVariables(new_r_question.variables, variable_prefix);

        new_r_question.variables.forEach(function (v) { v.variableType = "Categorical"; });
        new_r_question.questionType = pick_any_output ? "Pick Any" : "Pick One";
        if (pick_any_output) {
            new_r_question.needsCheckValuesToCount = false;
        }

        showInTable(new_r_question, true);
        return true;
    }

    var expression, new_q_name, temp_var_name, new_r_question;

    // Save the variables depending on the R output class.
    if (hasOutputClass("categorizedlist"))
    {
        // One question is created per input variable of the list categorization.
        var variable_names = selected_item.data.get("variable.names");
        for (var j = 0; j < variable_names.length; j++)
        {
            expression = "categorizedlist = " + r_reference +
                         hashCheck("categorizedlist", selected_item.data.get("hash")) +
                         "# Look to replace this with single javascript code outside the loop\n" +
                         "form.max.levels <- " + form_max_levels + "\n" +
                         "form.max.mentions <- " + form_max_mentions + "\n" +
                         "flipTextAnalysis::SaveVariablesCategories(" + r_reference + ",\n" +
                         "                                          variable.name = categorizedlist$variable.names[" + (j + 1) + "],\n" +
                         "                                          form.max.levels = " + form_max_levels + ",\n" +
                         "                                          form.max.mentions = " + form_max_mentions + ")";
            new_q_name = preventDuplicateQuestionName(data_file, analysis_name + " from " + variable_names[j]);
            temp_var_name = randomVariableName(16); // temporary name, random to (almost) guarantee uniqueness

            try {
                new_r_question = data_file.newRQuestion(expression, new_q_name, temp_var_name, last_variable);
            } catch (e) {
                logCreationFailure(e, function () {
                    return ". The variable size can be reduced by increasing the required size of each category before saving (increasing the value in the 'Minimum category size' control)";
                });
                return false;
            }

            // Replace temporary variable names
            nameSequentialVariables(new_r_question.variables, variable_prefix);

            new_r_question.questionType = "Pick Any - Compact";

            // In Q, create a table showing the new question
            showInTable(new_r_question, false);

            // The next question goes after the variables that were just created.
            last_variable = new_r_question.variables[new_r_question.variables.length - 1];
        }
    }
    else if (hasOutputClass("AutomaticCategorization"))
    {
        // With existing categories the predicted values are saved (and may be
        // multi-response); otherwise the automatic categorization itself is saved.
        var categorization_is_predicted = selected_item.getInput("formExistingCat") !== null;
        // Loose equality is kept deliberately: the attribute may arrive wrapped in an array.
        var categorization_pick_any = categorization_is_predicted &&
                                      selected_item.data.getAttribute("predicted", "class") == "data.frame";
        return saveCategorization(categorization_is_predicted ? "predicted" : "categorization",
                                  categorization_pick_any,
                                  function () {
                                      var number_categories = selected_item.getInput("formCategories");
                                      var size_msg = number_categories > 1 ? " The variable size could possibly be reduced by decreasing the number of categories from " + number_categories + " to a smaller number." : "";
                                      return ". " + size_msg;
                                  });
    }
    else if (hasOutputClass("TextClassifier"))
    {
        var classifier_pick_any = selected_item.data.getAttribute("predicted", "class") == "data.frame";
        // No size-specific advice exists for this class, so every failure is logged verbatim.
        return saveCategorization("predicted", classifier_pick_any, null);
    }
    else if (hasOutputClass("EntityExtraction"))
    {
        var entity_variables = selected_item.data.get("entity.variables");

        // A single boolean in place of the variables means there is nothing to
        // save; explain why and stop.
        if (entity_variables.length === 1 && typeof entity_variables[0] === "boolean") {
            var empty_reason = selected_item.data.get("entity.variables.empty.reason");
            var reason;
            if (empty_reason == "output") {
                reason = "there are no extracted entities in the output.";
            } else if (empty_reason == "min") {
                var entity_counts = selected_item.data.get("entity.counts");
                var entity_names = selected_item.data.getAttribute("entity.counts", "names");
                var min_cases_to_save = selected_item.data.get("min.cases.to.save");
                var largest_count = entity_counts.reduce(function (x, y) {
                    return (x > y) ? x : y;
                });
                var largest_name = entity_names[entity_counts.indexOf(largest_count)];
                reason = "the entity type with the most number of extractions, '" + largest_name + "', contains " + largest_count + " named entities. However," +
                    " the minimum number of entities required to save is set at " + min_cases_to_save + ". Set 'Minimum number of cases to save' to " +
                    largest_count + " or lower to save the extracted named entities as variables.";
            } else if (empty_reason == "remove") {
                reason = "the only entities in the output have been removed with the user specified remove entities from extraction settings.";
            } else {
                // Nothing meaningful can be appended for an unrecognised reason,
                // so stop here instead of logging "... since undefined".
                log("Error: unknown reason for no variables to save");
                return false;
            }
            log("No entity variables have been saved since " + reason);
            return false;
        }

        var entity_R_type = selected_item.data.getAttribute("entity.variables", "data.type");
        var entity_type_names = selected_item.data.getAttribute("entity.variables", "names");
        // Iterate in reverse: every question is inserted directly after the same
        // last_variable, so the final order matches entity_type_names.
        for (var i = entity_variables.length - 1; i >= 0; i--) {
            var r_entity_variables = entity_type_names[i].replace(/ /g, ".").toLowerCase() + ".variables";
            expression = r_entity_variables + " <- " + r_reference + '$entity.variables[["' + entity_type_names[i] + '"]]\n' +
                         "n.levels <- attr(" + r_entity_variables + ", \"length.original.levels\")\n" +
                         "levels.exceeded <- !is.null(n.levels)\n" +
                         "form.max.levels <- " + form_max_levels + "\n" +
                         "if (levels.exceeded)\n" +
                         "{\n" +
                         "\twarning.msg <- paste0(\" has \", n.levels, \" categories (number of different \",\n" +
                         "\t                      \"extracted entities identified in the text). For performance reasons, the number of \",\n" +
                         "\t                      \"extracted entities has been reduced to the top \", form.max.levels, \" most \",\n" +
                         "\t                      \"popular entities with ties broken by alphabetical order if \",\n" +
                         "\t                      \"necessary. If you wish to relax this setting and save more \",\n" +
                         "\t                      \"entities of this type, increase the number to save by changing the value in \",\n" +
                         "\t                      \"the 'Maximum number of unique entity levels to save' control. However, \",\n" +
                         "\t                      \"performance might suffer.\")\n" +
                         "} else\n" +
                         "\twarning.msg <- NULL\n" +
                         "mentions.exceeded <- isTRUE(attr(" + r_entity_variables + ", \"mentions.truncated\"))\n" +
                         "n.mentions <- attr(" + r_entity_variables + ", \"original.max.mentions\")\n" +
                         "n.levels <- attr(" + r_entity_variables + ", \"mention.adjusted.levels\")\n" +
                         "if (mentions.exceeded && !is.null(n.mentions))\n" +
                         "{\n" +
                         "\toutput.msg <- paste0(\", the number of saved variables per case has been truncated to the first \",\n" +
                         "\t                     " + form_max_mentions + ", \" observed entities.\")\n" +
                         "\tif (!is.null(n.levels) && n.levels < form.max.levels)\n" +
                         "\toutput.msg <- paste0(output.msg, \" This truncation has reduced the number of extracted entities \",\n" +
                         "\t                     \"further from \", form.max.levels, \" categories to \", n.levels,\n" +
                         "\t                     \" categories.\")\n" +
                         "\toutput.msg <- paste0(output.msg, \" If you wish to relax this setting and save more entities per \",\n" +
                         "\t                     \"case, increase the value in the 'Maximum number of entities \",\n" +
                         "\t                     \"per case to save'.\")\n" +
                         "\tif (levels.exceeded)\n" +
                         "\t\twarning.msg <- paste0(warning.msg, \" Also, there is at least one case with \", n.mentions,\n" +
                         "\t\t                      \" identified entities. To keep the variable size manageable\", output.msg)\n" +
                         "\telse\n" +
                         "\t\twarning.msg <- paste0(\", has at least one case with \", n.mentions, \" identified entities. \",\n" +
                         "\t\t                      \"For performance reasons and to keep the variable \",\n" +
                         "\t\t                      \"size manageable\", output.msg)\n" +
                         "}\n" +
                         "if (levels.exceeded || mentions.exceeded)\n" +
                         "{\n" +
                         "\tpre.warning <- paste0(\"The " + entity_type_names[i] + " entities from the Entity Extraction output, " + selected_item.referenceName + "\")\n" +
                         "\twarning(pre.warning, warning.msg)\n" +
                         "}\n" +
                         r_entity_variables;
            new_q_name = preventDuplicateQuestionName(data_file, entity_type_names[i] + " Entities from " +
                                                      selected_item.data.get("text.label"));
            temp_var_name = randomVariableName(16); // temporary name, random to (almost) guarantee uniqueness

            try {
                new_r_question = data_file.newRQuestion(expression, new_q_name, temp_var_name, last_variable);
                if (entity_R_type[i] === "list") {
                    new_r_question.questionType = "Pick Any - Compact";
                }
            } catch (e) {
                logCreationFailure(e, function () {
                    // Always close the sentence; the advice only applies when
                    // the max-levels control exists on the output.
                    return "." + (max_levels_control_exists ? " The variable size can be reduced by decreasing the maximum number of entity levels." : "");
                });
                return false;
            }

            // Replace temporary variable names
            nameSequentialVariables(new_r_question.variables, variable_prefix);

            showInTable(new_r_question, true);
        }
    }
    else if (hasOutputClass("wordBag"))
    {
        var tokenized_text = selected_item.data.get("transformed.tokenized");
        var n_gram_max = selected_item.getInput("formNGramMax");
        var form_min_freq = selected_item.getInput("formminfreq");
        var flattened_tokenized = tokenized_text.flat();
        if (flattened_tokenized.length === 0)
        {
            var base_message = "Variables could not be created from this item, there are no terms in the text after transformation.";
            var extra_message = "";
            if (form_min_freq > 1)
            {
                extra_message = " Terms might be found if the minimum frequency control is reduced from " + form_min_freq + " to a smaller value.";
            }
            log(base_message + extra_message);
            return false;
        }

        // If wordBag is new, check the hash, otherwise compute the variables regardless for an older wordBag output.
        var valid_check;
        try {
            valid_check = hashCheck("wordBag", selected_item.data.get("hash"));
        } catch (e) {
            // Deliberate best effort: reading the hash failed, so save without the validity check.
            valid_check = "";
        }
        var freq_statement = "The output size can be reduced by increasing the Minimum Frequency value from " + form_min_freq + " to a larger value.";
        var n_gram_reduce_statement = n_gram_max > 1 ? " Also consider reducing the Maximum n for n-gram identification value from " + n_gram_max + " to a smaller value." : "";

        expression = "wordBag <- " + r_reference + "\n" +
                     valid_check +
                     "flipTextAnalysis::SaveVariablesCategories(" + r_reference + ",\n" +
                     "                                          form.max.levels = " + form_max_levels + ",\n" +
                     "                                          form.max.mentions = " + form_max_mentions + ")";
        new_q_name = preventDuplicateQuestionName(data_file, analysis_name + " from " +
                                                      selected_item.data.getAttribute("original.text", "label"));
        temp_var_name = randomVariableName(16); // temporary name, random to (almost) guarantee uniqueness

        try {
            new_r_question = data_file.newRQuestion(expression, new_q_name, temp_var_name, last_variable);
        } catch (e) {
            logCreationFailure(e, function () {
                return ". " + freq_statement + n_gram_reduce_statement;
            });
            return false;
        }

        // Replace temporary variable names
        nameSequentialVariables(new_r_question.variables, variable_prefix);

        new_r_question.questionType = "Pick Any - Compact";

        // In Q, create a table showing the new question
        showInTable(new_r_question, true);
    }
    return true;
}