Commit 40dadf5f authored by Robin Engler's avatar Robin Engler
Browse files

Add support for scored markers

parent bb2cd129
...@@ -18,11 +18,13 @@ AUTHORIZED_TISSUES <<- c('stroma', 'tumor', 'dermis', 'epidermis', 'melano ...@@ -18,11 +18,13 @@ AUTHORIZED_TISSUES <<- c('stroma', 'tumor', 'dermis', 'epidermis', 'melano
AUTHORIZED_COMPARTMENTS <<- c('nucleus', 'membrane', 'cytoplasm', 'entire_cell') AUTHORIZED_COMPARTMENTS <<- c('nucleus', 'membrane', 'cytoplasm', 'entire_cell')
AUTHORIZED_STROMA_VALUES <<- c('DAPI', 'stroma', 'other') AUTHORIZED_STROMA_VALUES <<- c('DAPI', 'stroma', 'other')
AUTHORIZED_TUMOR_VALUES <<- c('CK', 'tumor') AUTHORIZED_TUMOR_VALUES <<- c('CK', 'tumor')
AUTHORIZED_MARKERS <<- c('CAL', 'CD3', 'CD4', 'CD8', 'CD11C', 'CD15', 'CD20', 'CD56', 'CD68', AUTHORIZED_MARKERS <<- c('CAL', 'CD3', 'CD4', 'CD8', 'CD11c', 'CD15', 'CD20', 'CD56', 'CD68',
'CD103', 'CD163', 'CD206', 'FOXP3', 'GB', 'gH2AX', 'gH2AXN', 'IDO', 'CD103', 'CD163', 'CD206', 'FOXP3', 'GB', 'gH2AX', 'gH2AXN', 'IDO',
'IL10R', 'Keratin', 'KI67', 'PD1', 'PDL1', 'PERFORIN', 'SOX10', 'IL10R', 'Keratin', 'KI67', 'PD1', 'PDL1', 'PERFORIN', 'SOX10',
'WT1', 'CK', 'VISTA') 'WT1', 'CK', 'VISTA')
IGNORED_PHENOTYPES <<- c('DAPIp', 'MISSING') IGNORED_PHENOTYPES <<- c('DAPIp', 'MISSING')
NO_PHENOTYPE <<- 'MISSING'
NO_PHENOTYPE_SYNONYMS <<- c('ARTEFACT', 'ARTEFACTS', 'OTHER')
DATAREDUCE_SCRIPT <<- file.path(dirname(dirname(sys.frame(1)$ofile)), DATAREDUCE_SCRIPT <<- file.path(dirname(dirname(sys.frame(1)$ofile)),
'inst/bash/reduce_file_size.sh') 'inst/bash/reduce_file_size.sh')
......
...@@ -16,122 +16,112 @@ check_for_duplicated_rows <- function(data_frame){ ...@@ -16,122 +16,112 @@ check_for_duplicated_rows <- function(data_frame){
#################################################################################################### ####################################################################################################
reclass_cells_by_marker <- function(marker, #' Reclass input 'cell_values' into a binary vector indicating whether a cell is positive for a
cell_values, #' given marker (1) or not (0). This function is for phenotyped markers and expects 'cell_values'
thresholds = NULL){ #' to be a vector of strings.
# ******************************************************************************************** #'
# Computes a binary vector indicating whether a cell is positive for a given marker (1) or #' @param marker [string] Name of marker for which to reclass cells. This must be a single marker.
# not (0). The function accepts cell_values that are either phenotypes, or marker intensity #' @param cell_values [string vector] List of cell values (phenotypes) to reclassify.
# values. #'
# reclass_cells_by_marker_phenotyped <- function(marker, cell_values){
# Input arguments:
# marker : string. Name of marker for which to reclassify cells.
# cell_values: if the marker was phenotyped: must be a vector of strings.
# if the marker was scored: must be a vector of floats.
# thresholds : list of thresholds for the given marker. If the marker was phenotyped (as
# opposed to scored), then thresholds must be set to NULL. This is how the
# function knows that the maker is phenotyped.
# ********************************************************************************************
# Case 1: marker was phenotyped. # Verify input values.
# ***************************** stopifnot(is.character(cell_values))
if(length(thresholds) == 0){
# Test whether the elements of the cell_values vector contain the string "markerName + p". # For each phenotype value, test whether maker is part of the phenotype. E.g. test whether
# E.g. for CD11c, we test if the phenotype contains the string "CD11cp". # "CD8" is part of "KI67p_CD8p_CD11cp".
# "partial matches", means that if we are searching for say "CD11", then all cells marker = paste0(marker, 'p')
# whose phenotype contains "CD11p" (e.g. "CD11p", "CD11p_CD86p" or "CD8p_CD11p_CD86p") will be return(as.numeric(sapply(strsplit(cell_values, split='_'), FUN=function(x) marker %in% x)))
# reclassified as 1 (the marker is present in the cell). Note that if the marker apprears in the }
# phenotype with a "m", e.g. "CD8p_CD11m", then this is not counted as a match since the "m" ####################################################################################################
# stands for "minus", meaning the marker is not present in the cell.
marker = paste0(marker, 'p')
stopifnot(is.character(cell_values))
return(sapply(strsplit(cell_values, split='_'), FUN=function(x) ifelse(marker %in% x,1,0)))
}
# Case 2: marker was scored.
# **************************
# In this case we reclass the values depending on whether they are >= or < than the threshold.
# Values >= threshold get reclassified as 1, while values < threshold get reclassified as 0.
stopifnot(is.numeric(cell_values))
stop("########### NOT IMPLEMENTED YET !!!!") ####################################################################################################
# Create temporary vector of thresholds. This vector has the same length as the number of rows in #' Reclass input 'cell_values' into a binary vector indicating whether a cell is positive for a
# the cell table, and simply contains the threshold value coresponding to the tissue type for #' given marker (1) or not (0). This function is for scored markers and expects 'cell_values'
# each row of the cell table. #' to be a vector of numeric values, the marker intensity values for the given marker.
thresholdVector = as.numeric(sapply(intensity_table[,'tissue_category'], #'
FUN=function(x) thresholdList[1,x])) #' @param marker [string] Name of marker for which to reclass cells. This must be a single marker.
#' @param cell_values [numeric vector] List of cell values (intensity values) to reclassify.
#' @param tissue_type [string vector] Tissue type associated to each cell. This vector must have
#' the same length as [cell_values].
#' @param thresholds [data frame] Reclassification threshold table. Must contain a "tissue_type"
#' column and a column bearing the name of the marker.
reclass_cells_by_marker_scored <- function(marker, cell_values, tissue_type, thresholds){
# Verify input values.
stopifnot(is.numeric(cell_values))
stopifnot(length(cell_values) == length(tissue_type))
stopifnot(sort(unique(tissue_type[tissue_type != 'missing'])) == sort(thresholds$tissue_type))
# Test whether intensity value is >= threshold. # Reclass cell values depending on whether they are >= or < than the threshold.
return(as.numeric(intensity_table[,tmpColNb] > thresholdVector)) # Values >= threshold get reclassified as 1, while values < threshold get reclassified as 0.
# The threshold value can be different for each tissue type.
thresholds = setNames(thresholds[,marker], thresholds[,'tissue_type'])
reclassed_values = as.numeric(sapply(1:length(cell_values),
function(x) cell_values[x] >= thresholds[tissue_type[x]]))
# If the tissue_type vector contains any "missing" values, this introduces NA values in the
# reclassified values output. Here we set those values to 0.
reclassed_values[which(is.na(reclassed_values))] = 0
return(reclassed_values)
} }
#################################################################################################### ####################################################################################################
#################################################################################################### ####################################################################################################
generate_summary_table <- function(sample_name, #' Generate a table where each row contains values for a given sample, image ID, cell type and
image_ids, #' tissue type combination. The summary_table has the following columns:
cell_types, #'
tissue_list, #' SampleName ImageID TissueType CellType Threshold SurfacePIX SurfaceMM2 CellDensity CellCount IntMean IntMedian IntMin IntMax IntSD
markers_phenotyped, #' TMA_IF02 Total stroma CD8p -1.00 NA NA NA NA NA NA NA NA NA
markers_scored, #' TMA_IF02 Total stroma CD8p_total -1.00 NA NA NA NA NA NA NA NA NA
thresholds, #' TMA_IF02 Total stroma GBp -1.00 NA NA NA NA NA NA NA NA NA
tissue_table){ #' TMA_IF02 Total stroma GBp_total -1.00 NA NA NA NA NA NA NA NA NA
# ******************************************************************************************** #' TMA_IF02 Total stroma KI67p 19.99 NA NA NA NA NA NA NA NA NA
# Generate a table where each row contains values for a given sample, image ID, cell type and #' TMA_IF02 Total stroma KI67p_total 19.99 NA NA NA NA NA NA NA NA NA
# tissue type combination. The summary_table has the following columns: #' TMA_IF02 Total stroma CD8p_GBp NA NA NA NA NA NA NA NA NA NA
# - sample_name : name of sample. #' TMA_IF02 Total stroma CD8p_GBp_total NA NA NA NA NA NA NA NA NA NA
# - image_id : name of image subset within the sample. #' TMA_IF02 Total stroma CD8p_KI67p NA NA NA NA NA NA NA NA NA NA
# - tissue_type : name of tissue (e.g. "stroma", "tumor"). #' TMA_IF02 Total stroma CD8p_KI67p_total NA NA NA NA NA NA NA NA NA NA
# - cell_type : name of cell type. Either an individual marker or a combination (e.g. "CD4", "CD8p_FPXP3n"). #' TMA_IF02 Total stroma Total NA NA 13431416.5 NA NA NA NA NA NA NA
# - threshold : threshold value for mean marker intensity values. These are the threshold that are used to #' TMA_IF02 Total tumor CD8p -1.00 NA NA NA NA NA NA NA NA NA
# reclassify the marker intensity values into 0/1 values. Note that these values only exist #' TMA_IF02 Total tumor CD8p_total -1.00 NA NA NA NA NA NA NA NA NA
# for the rows corresponding to individual markers (combination cell types and Total rows #' TMA_IF02 Total tumor GBp -1.00 NA NA NA NA NA NA NA NA NA
# have no values in the column). #' TMA_IF02 Total tumor GBp_total -1.00 NA NA NA NA NA NA NA NA NA
# - SurfacePIX : surface of each cellType in pixel units. #' TMA_IF02 Total tumor KI67p 19.99 NA NA NA NA NA NA NA NA NA
# - SurfaceMM2 : surface of each cellType, in square mm. #' TMA_IF02 Total tumor KI67p_total 19.99 NA NA NA NA NA NA NA NA NA
# - CellDensity: density of cells per square mm. #' TMA_IF02 Total tumor CD8p_GBp NA NA NA NA NA NA NA NA NA NA
# - CellCount : cell count for the given cell type in the given tissue type. #' TMA_IF02 Total tumor CD8p_GBp_total NA NA NA NA NA NA NA NA NA NA
# - IntMean : mean value of marker intensity for the cell belonging to the row's cellType. #' TMA_IF02 Total tumor CD8p_KI67p NA NA NA NA NA NA NA NA NA NA
# - IntMedian : median value " " " #' TMA_IF02 Total tumor CD8p_KI67p_total NA NA NA NA NA NA NA NA NA NA
# - IntMin : minimum value " " " #' TMA_IF02 Total tumor Total NA NA 2321735.6 NA NA NA NA NA NA NA
# - IntMax : maximum value " " " #'
# - IntSD : standard deviation value " " " #' @param sample_name [string]
# #' @param image_ids [string vector]
# SampleName ImageID CellType TissueType Threshold SurfacePIX SurfaceMM2 #' @param cell_types [string vector]
# His2757_7 Total CD4 Stroma 0.5 NA NA #' @param tissue_list [string vector]
# His2757_7 Total CD4 Tumor 0.5 NA NA #' @param markers_phenotyped [string vector]
# His2757_7 Total FOXP3 Stroma 0.2 NA NA #' @param markers_scored [string vector]
# His2757_7 Total FOXP3 Tumor 0.2 NA NA #' @param thresholds [data frame]
# His2757_7 Total CD4p_FOXP3p Stroma NA NA NA #' @param tissue_table [data frame]
# His2757_7 Total CD4p_FOXP3p Tumor NA NA NA generate_summary_table <- function(sample_name, image_ids, cell_types, tissue_list,
# His2757_7 Total Total Stroma NA 299488 NA markers_phenotyped, markers_scored, thresholds, tissue_table){
# His2757_7 Total Total Tumor NA 2108192 NA
# His2757_7 39995_14773 CD4 Stroma 0.5 NA NA # Input check.
# His2757_7 39995_14773 CD4 Tumor 0.5 NA NA stopifnot(tissue_table[,'sample_name'] == sample_name)
# His2757_7 39995_14773 FOXP3 Stroma 0.2 NA NA
# His2757_7 39995_14773 FOXP3 Tumor 0.2 NA NA
# His2757_7 39995_14773 CD4p_FOXP3p Stroma NA NA NA
# His2757_7 39995_14773 CD4p_FOXP3p Tumor NA NA NA
# His2757_7 39995_14773 Total Stroma NA 149744 NA
# His2757_7 39995_14773 Total Tumor NA 1054096 NA
#
#
# Input arguments:
# sample_name: string. Name of sample.
#
# ********************************************************************************************
stopifnot(all(tissue_table[,'sample_name'] == sample_name))
# Create summary table backbone. # Create summary table backbone.
# ***************************** # *****************************
# Add "Total" values to the list of image IDs and cell types. These represent the total for # Add "Total" values to the list of image IDs and cell types. These represent the total for
# all image IDs, and the total for all cell types (i.e. all cell types together). # all image IDs, and the total for all cell types, i.e. all cell types together.
image_ids = c('Total', image_ids) image_ids = c('Total', image_ids)
cell_types = c(cell_types, 'Total') cell_types = c(cell_types, 'Total')
# Compute nuber of rows of the summary_table (i.e. image ID, cell type and tissue type combinations) # Compute number of rows of the summary_table, i.e. image ID, cell type and tissue type
# combinations.
row_nb = length(tissue_list) * length(cell_types) * length(image_ids) row_nb = length(tissue_list) * length(cell_types) * length(image_ids)
# Create table. # Create table.
...@@ -159,31 +149,31 @@ generate_summary_table <- function(sample_name, ...@@ -159,31 +149,31 @@ generate_summary_table <- function(sample_name,
summary_table[summary_table[,'CellType'] %in% summary_table[summary_table[,'CellType'] %in%
paste0(rep(markers_phenotyped, each=2), c('p','p_total')), 'Threshold'] = -1 paste0(rep(markers_phenotyped, each=2), c('p','p_total')), 'Threshold'] = -1
# Scored markers get the threshold value extracted from the thresholds table. # Scored markers get their threshold value extracted from the thresholds table.
for(marker in markers_scored){ for(marker in markers_scored){
for(tissue in tissue_list){ for(tissue in tissue_list){
stop("######## NOT IMPLEMENTED YET") col_index = which(summary_table[,'CellType'] %in% paste0(marker, c('p','p_total')) &
tmpColNb = which( summary_table[,'CellType'] %in% paste0(marker, c('p','p_total'), sep='') & summary_table[,'TissueType'] == tissue)
summary_table[,'TissueType'] == tissue ) summary_table[col_index, 'Threshold'] = thresholds[thresholds$tissue_type == tissue, marker]
summary_table[tmpColNb,'Threshold'] = threshold_table[sample_name,marker,tissue]
} }
} }
# Add surface values in MM2 to summary table. # Add surface values in mm2 to summary table.
# ****************************************** # ******************************************
# Add the surface values for each image ID and tissue type to the "Total" cell type row of the # Add the surface values for each image ID and tissue type to the "Total" cell type row of the
# summary table. # summary table.
for(image_id in image_ids){ for(image_id in image_ids){
for(tissue in tissue_list){ for(tissue in tissue_list){
row_nb = which(summary_table$ImageID == image_id & row_index = which(summary_table$ImageID == image_id &
summary_table$CellType == 'Total' & summary_table$CellType == 'Total' &
summary_table$TissueType == tissue) summary_table$TissueType == tissue)
surface_rows = switch(ifelse(image_id == 'Total', 1, 2), surface_rows = switch(ifelse(image_id == 'Total', 1, 2),
which(tissue_table[,'tissue_category']==tissue), which(tissue_table[,'tissue_category']==tissue),
which(tissue_table[,'tissue_category']==tissue & which(tissue_table[,'tissue_category']==tissue &
tissue_table[,'image_id']==image_id)) tissue_table[,'image_id']==image_id))
summary_table[row_nb, 'SurfaceMM2'] = sum(tissue_table[surface_rows, 'region_area_surface']) summary_table[row_index, 'SurfaceMM2'] = sum(tissue_table[surface_rows,
'region_area_surface'])
} }
} }
...@@ -195,46 +185,46 @@ generate_summary_table <- function(sample_name, ...@@ -195,46 +185,46 @@ generate_summary_table <- function(sample_name,
#################################################################################################### ####################################################################################################
#' Computes the following statistics for all cell types of a given image_id and tissue type:
#'
#' - cell count : number of cells with type "cell_type" in tissue "tissue_type".
#' - intensity mean : mean intensity value for cells matching "cell_type" and "tissue_type".
#' - intensity median: median ...
#' - intensity min : minimum ...
#' - intensity max : maximum ...
#' - intensity SD : standard deviation ...
#'
#' The function returns a data frame with the statistic values in the same order as listed above.
#'
#' @param image_id [string] image_id for which the statistics should be computed. If this value is
#' set to "Total" the statistics are computed on all images in cell_table.
#' @param tissue_type [string] Tissue type, e.g. "stroma", "tumor".
#' @param cell_types [string] cell type, e.g. "CD4", "CD8", "CD4_CD8".
#' @param cell_table [data frame] Data frame containing MEAN and RECLASSIFIED marker intensity
#' values.
#'
cell_type_statistics <- function(image_id, tissue_type, cell_types, cell_table){ cell_type_statistics <- function(image_id, tissue_type, cell_types, cell_table){
# *******************************************************************************************
# Computes the following statistics for all cell types of a given image_id and tissue type:
# - cell count : number of cells with type "cell_type" in tissue "tissue_type".
# - intensity mean : mean intensity value for cells maching "cell_type" and "tissue_type".
# - intensity median: median ...
# - intensity min : minimum ...
# - intensity max : maximum ...
# - intensity SD : standard deviation ...
#
# The function returns a data frame with the statistic values in the same order as listed above.
#
# Input arguments:
# cell_type : string describing cell type, e.g. "CD4", "CD8", "CD4_CD8".
# tissue_type: string for tissue type, e.g. "stroma", "tumor".
# cell_table : data frame containing MEAN and RECLASSIFIED marker intensity values,
# in the format as returned by the load_cell_data() function.
# image_id : string giving the image_id value of the subset image for which the
# statistics should be computed. If this value is set to "Total" (the
# default), then the statistics are computed on all images contained in
# the cell_table.
# *******************************************************************************************
# Create return data frame. # Create return data frame.
cell_types = c(cell_types, 'Total')
output_df = data.frame('ImageID' = image_id, output_df = data.frame('ImageID' = image_id,
'TissueType' = tissue_type, 'TissueType' = tissue_type,
'CellType' = cell_types, 'CellType' = c(cell_types, 'Total'),
'CellCount' = 0, 'CellCount' = 0,
row.names = cell_types, 'IntMean' = NA,
'IntMedian' = NA,
'IntMin' = NA,
'IntMax' = NA,
'IntSD' = NA,
row.names = c(cell_types, 'Total'),
stringsAsFactors = FALSE) stringsAsFactors = FALSE)
output_df[,c('IntMean', 'IntMedian', 'IntMin', 'IntMax', 'IntSD')] = NA
# Subset cell_table to keep only the rows that are matching the requested "tissue_category" # Subset cell_table to keep only the rows that are matching the requested tissue type and
# and "image_id" values. If the image ID is 'Total', then we only subset by tissue type # image_id value. If image_id is 'Total', only subset by tissue type to get the sum across all
# because it corresponds values for the sum accross all image_id values. # image_id values.
# Note: it's much faster to first subset by image_id and then by tissue_type than the reverse. # Note: it's much faster to first subset by image_id and then by tissue_type than the reverse.
# get("image_id", 1L) if(image_id != 'Total') cell_table = cell_table[cell_table$image_id == image_id, ]
if(image_id != 'Total') cell_table = cell_table[cell_table$image_id==image_id, ] cell_table = cell_table[cell_table$tissue_category == tissue_type, ]
cell_table = cell_table[cell_table$tissue_category==tissue_type, ]
# If the subset has 0 rows, the image ID has no cells for the given tissue type. This does # If the subset has 0 rows, the image ID has no cells for the given tissue type. This does
# sometimes occur. # sometimes occur.
if(nrow(cell_table) == 0) return(output_df) if(nrow(cell_table) == 0) return(output_df)
...@@ -252,7 +242,7 @@ cell_type_statistics <- function(image_id, tissue_type, cell_types, cell_table){ ...@@ -252,7 +242,7 @@ cell_type_statistics <- function(image_id, tissue_type, cell_types, cell_table){
# Loop through all cell types and compute statistics # Loop through all cell types and compute statistics
for(cell_type in cell_types[-length(cell_types)]){ for(cell_type in cell_types){
# Split the cell type into individual markers. # Split the cell type into individual markers.
# ******************************************* # *******************************************
...@@ -284,8 +274,8 @@ cell_type_statistics <- function(image_id, tissue_type, cell_types, cell_table){ ...@@ -284,8 +274,8 @@ cell_type_statistics <- function(image_id, tissue_type, cell_types, cell_table){
# to the target cell_type. # to the target cell_type.
# -> regular: keep only cells (rows of table) that are positive for the markers of the # -> regular: keep only cells (rows of table) that are positive for the markers of the
# target cell_type and negative for all other markers. # target cell_type and negative for all other markers.
row_sum_cell_type = rowSums(sub_table[, in_cell_type, drop=F]) row_sum_cell_type = rowSums(sub_table[, in_cell_type, drop=FALSE])
row_sum_total = rowSums(sub_table[, c(in_cell_type,not_in_cell_type), drop=F]) row_sum_total = rowSums(sub_table[, c(in_cell_type,not_in_cell_type), drop=FALSE])
if(endsWith(cell_type, '_total')){ if(endsWith(cell_type, '_total')){
sub_table = sub_table[row_sum_cell_type == length(in_cell_type),] sub_table = sub_table[row_sum_cell_type == length(in_cell_type),]
} else{ } else{
......
...@@ -131,19 +131,19 @@ merge_cell_data_files <- function(files_to_merge){ ...@@ -131,19 +131,19 @@ merge_cell_data_files <- function(files_to_merge){
if(as.numeric(value_frequency[1]) > as.numeric(value_frequency[2])){ if(as.numeric(value_frequency[1]) > as.numeric(value_frequency[2])){
tissue_cat_df[x,] = names(value_frequency)[1] tissue_cat_df[x,] = names(value_frequency)[1]
if(SHOW_TISSUE_CATEGORY_MISMATCH_WARNING) raise_error( if(SHOW_TISSUE_CATEGORY_MISMATCH_WARNING) raise_error(
msg = c(paste0('Tissue_category values differ across files. ', msg = c(paste0('[tissue_category] values differ across files. ',
'Values were reconciled based on majority ruling.'), 'Values were reconciled based on majority ruling.'),
paste0('Offending row: ', x)), paste0('Offending row: ', x)),
file=files_to_merge[1], file = files_to_merge[1],
type = 'warning') type = 'warning')
# Case 2: majority ruling is not possible # Case 2: majority ruling is not possible.
} else{ } else{
raise_error( raise_error(
msg = c('Could not merge individual marker files.', msg = c('Could not merge individual marker files.',
'Reason: tissue_category values differ across files with no majority.', 'Reason: tissue_category values differ across files with no majority.',
paste0('Offending row: ', x), paste0('Offending row: ', x),
paste0('Offending values:', paste(tissue_cat_df[x,], collapse=' '))), paste0('Offending values: ', paste(tissue_cat_df[x,], collapse=' '))),
file = files_to_merge[1]) file = files_to_merge[1])
} }
} }
......
...@@ -36,6 +36,8 @@ inputdir_check <- function(input_dir, output_dir){ ...@@ -36,6 +36,8 @@ inputdir_check <- function(input_dir, output_dir){
# Verify that input parameters and sample rename files are present, and if needed rename them. # Verify that input parameters and sample rename files are present, and if needed rename them.
rename_file_by_pattern(file_name=PARAMETERS_FILE, pattern='param', dir_name=input_dir, rename_file_by_pattern(file_name=PARAMETERS_FILE, pattern='param', dir_name=input_dir,
out_dir=output_dir, raise_error_if_absent=TRUE) out_dir=output_dir, raise_error_if_absent=TRUE)
rename_file_by_pattern(file_name=THRESHOLDS_FILE, pattern='threshold', dir_name=input_dir,
out_dir=output_dir, raise_error_if_absent=FALSE)
rename_file_by_pattern(file_name=SAMPLE_RENAME_FILE, pattern='rename', dir_name=input_dir, rename_file_by_pattern(file_name=SAMPLE_RENAME_FILE, pattern='rename', dir_name=input_dir,
out_dir=output_dir, raise_error_if_absent=FALSE) out_dir=output_dir, raise_error_if_absent=FALSE)
return(invisible(NULL)) return(invisible(NULL))
...@@ -104,7 +106,6 @@ standardize_and_split_cell_data <- function(input_file, ...@@ -104,7 +106,6 @@ standardize_and_split_cell_data <- function(input_file,
phenotype_confidence_threshold, phenotype_confidence_threshold,
delete_input_file = FALSE){ delete_input_file = FALSE){
# ******************************************************************************************** # ********************************************************************************************
#
# Differences between inForm versions: # Differences between inForm versions:
# version 2.2 # version 2.2
# - *_tissue_seg_data_summary.txt files contain a column named "Region Area (pixels)". # - *_tissue_seg_data_summary.txt files contain a column named "Region Area (pixels)".
...@@ -113,7 +114,7 @@ standardize_and_split_cell_data <- function(input_file, ...@@ -113,7 +114,7 @@ standardize_and_split_cell_data <- function(input_file,
# - a new column named "Annotation ID" is added to *_cell_seg_data.txt. # - a new column named "Annotation ID" is added to *_cell_seg_data.txt.
# - *_tissue_seg_data_summary.txt files contain column named "Region Area (square microns)" # - *_tissue_seg_data_summary.txt files contain column named "Region Area (square microns)"
# - in addition the "Sample ID" column of the *_cell_seg_data.txt files no longer contains # - in addition the "Sample ID" column of the *_cell_seg_data.txt files no longer contains
# the imageID values. These are now present in the "Annotation ID" column. # the image ID values. These are now present in the "Annotation ID" column.
# #
# #
# Input arguments: # Input arguments:
...@@ -125,7 +126,7 @@ standardize_and_split_cell_data <- function(input_file, ...@@ -125,7 +126,7 @@ standardize_and_split_cell_data <- function(input_file,
# delete_input_file: if TRUE, the input_file is deleted after it was split by samples. # delete_input_file: if TRUE, the input_file is deleted after it was split by samples.
# ******************************************************************************************** # ********************************************************************************************
# Load input table. Verifty it is not empty and standardize the column names. # Load input table. Verify it is not empty and standardize the column names.
input_table = read.table(input_file, sep='\t', as.is=T, header=T, input_table = read.table(input_file, sep='\t', as.is=T, header=T,
colClasses='character', check.names=T, strip.white=T) colClasses='character', check.names=T, strip.white=T)
if(nrow(input_table) == 0) raise_error('Input file has zero rows.', file = input_file) if(nrow(input_table) == 0) raise_error('Input file has zero rows.', file = input_file)
...@@ -148,7 +149,10 @@ standardize_and_split_cell_data <- function(input_file, ...@@ -148,7 +149,10 @@ standardize_and_split_cell_data <- function(input_file,
sample_names = extract_sample_name(input_table[,'sample_name'], input_file=input_file) sample_names = extract_sample_name(input_table[,'sample_name'], input_file=input_file)
input_table = input_table[sample_names %in% samples,] input_table = input_table[sample_names %in% samples,]
sample_names = sample_names[sample_names %in% samples] sample_names = sample_names[sample_names %in% samples]
if(nrow(input_table) == 0) raise_error('Input file has zero rows.', file = input_file) if(nrow(input_table) == 0) raise_error(
msg = 'No matching values found in [sample_name] column for any of the input samples',
file = input_file,
items_to_list = samples)
# Extract image ID values. If the "annotation_id" column is present (inForm 2.4), extract # Extract image ID values. If the "annotation_id" column is present (inForm 2.4), extract
...@@ -173,7 +177,7 @@ standardize_and_split_cell_data <- function(input_file, ...@@ -173,7 +177,7 @@ standardize_and_split_cell_data <- function(input_file,
file_values_are_from = input_file) file_values_are_from = input_file)
# Reclass 'phenotype_values' for rows where confidence < phenotype_confidence_threshold # Re-class 'phenotype_values' for rows where confidence < phenotype_confidence_threshold
# to the value of 'MISSING'. # to the value of 'MISSING'.
phenotype_values[which(confidence_values < phenotype_confidence_threshold)] = 'MISSING' phenotype_values[which(confidence_values < phenotype_confidence_threshold)] = 'MISSING'
...@@ -352,17 +356,17 @@ standardize_and_split_tissue_data <- function(input_file, ...@@ -352,17 +356,17 @@ standardize_and_split_tissue_data <- function(input_file,
#################################################################################################### ####################################################################################################
#' Standardize column names of input files.
#'
#' @param column_names [string vector] Names of columns to standardize.
#' @param input_file [string] Path and name of file from which the columns were taken. Only used to
#' display an error message.
#' @return Standardized column names.
standardize_column_names = function(column_names, input_file){ standardize_column_names = function(column_names, input_file){
# ********************************************************************************************
#
# Input arguments:
# - column_names: string vector. Names of columns to standardize.
# - input_file: file name from which the columns were taken. Only used to display an error
# message.
# ********************************************************************************************
# Replace any '.' in column names by an '_'. The '.' are generally introduced in column names # Replace any '.' in column names by an '_'. The '.' are generally introduced in column names
# by R as a replacement of a non-authorized character such as a blank space or a bracket. # by R as a replacement of a non-authorized character such as a blank space or a bracket.
# For readability, multiple '.' are replaced by a single '_'.
column_names = gsub(pattern='\\.+', replacement='_', x=column_names) column_names = gsub(pattern='\\.+', replacement='_', x=column_names)
column_names = gsub(pattern='_+$', replacement='', x=column_names) column_names = gsub(pattern='_+$', replacement='', x=column_names)
...@@ -382,10 +386,18 @@ standardize_column_names = function(column_names, input_file){ ...@@ -382,10 +386,18 @@ standardize_column_names = function(column_names, input_file){
for(i in grep(paste0(col_start_regexp, '.*_mean_.*'), x=column_names)){ for(i in grep(paste0(col_start_regexp, '.*_mean_.*'), x=column_names)){
marker_name = sub(col_start_regexp, '', column_names[i]) marker_name = sub(col_start_regexp, '', column_names[i])
marker_name = sub('_.*$', '', marker_name) marker_name = sub('_.*$', '', marker_name)
# If the marker is present in the AUTHORIZED_MARKERS list, correct its capitalization if
# needed.
x = which(toupper(marker_name) == toupper(AUTHORIZED_MARKERS))
stopifnot(length(x) <= 1)
marker_name = ifelse(length(x) == 0, marker_name, AUTHORIZED_MARKERS[x])
# Rename column.
column_names[i] = paste0(marker_name, '_mean') column_names[i] = paste0(marker_name, '_mean')
} }
# Verify there is no duplicate column. # Verify there are no duplicate columns.
duplicated_columns = which(duplicated(column_names)) duplicated_columns = which(duplicated(column_names))
if(length(duplicated_columns) > 0) raise_error( if(length(duplicated_columns) > 0) raise_error(
msg = 'Duplicated column names found in input file:', msg = 'Duplicated column names found in input file:',
...@@ -454,6 +466,8 @@ check_and_fix_phenotype_values <- function(phenotype_values, ...@@ -454,6 +466,8 @@ check_and_fix_phenotype_values <- function(phenotype_values,
type = 'warning') type = 'warning')
} }
# Replace
# Substitute '-' with '_' in Phenotype values. This is for the case where a '-' was used as