Commit 2aadaaa3 authored by Robin Engler's avatar Robin Engler
Browse files

Add support for auto-deletion of rows with tissue category mismatch up to a...

Add support for auto-deletion of rows with tissue category mismatch up to a total of TISSUE_CATEGORY_MISMATCH_THRESHOLD percent
parent ab421232
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
# declaration is made at the top level. # declaration is made at the top level.
# Set application version. # Set application version.
POSTINFORM_VERSION <<- '0.1.0' POSTINFORM_VERSION <<- '0.2.0'
# Define default parameter values. # Define default parameter values.
DEFAULT_CELL_COMPARTMENT = 'nucleus' DEFAULT_CELL_COMPARTMENT = 'nucleus'
...@@ -18,7 +18,7 @@ AUTHORIZED_TISSUES <<- c('stroma', 'tumor', 'dermis', 'epidermis', 'melano ...@@ -18,7 +18,7 @@ AUTHORIZED_TISSUES <<- c('stroma', 'tumor', 'dermis', 'epidermis', 'melano
AUTHORIZED_COMPARTMENTS <<- c('nucleus', 'membrane', 'cytoplasm', 'entire_cell') AUTHORIZED_COMPARTMENTS <<- c('nucleus', 'membrane', 'cytoplasm', 'entire_cell')
AUTHORIZED_STROMA_VALUES <<- c('DAPI', 'stroma', 'other') AUTHORIZED_STROMA_VALUES <<- c('DAPI', 'stroma', 'other')
AUTHORIZED_TUMOR_VALUES <<- c('CK', 'tumor') AUTHORIZED_TUMOR_VALUES <<- c('CK', 'tumor')
AUTHORIZED_MARKERS <<- c('CAL', 'CD3', 'CD4', 'CD8', 'CD11c', 'CD15', 'CD20', 'CD56', 'CD68', AUTHORIZED_MARKERS <<- c('CAL', 'CD3', 'CD4', 'CD8', 'CD11C', 'CD15', 'CD20', 'CD56', 'CD68',
'CD103', 'CD163', 'CD206', 'FOXP3', 'GB', 'gH2AX', 'gH2AXN', 'IDO', 'CD103', 'CD163', 'CD206', 'FOXP3', 'GB', 'gH2AX', 'gH2AXN', 'IDO',
'IL10R', 'Keratin', 'KI67', 'PD1', 'PDL1', 'PERFORIN', 'SOX10', 'IL10R', 'Keratin', 'KI67', 'PD1', 'PDL1', 'PERFORIN', 'SOX10',
'WT1', 'CK', 'VISTA') 'WT1', 'CK', 'VISTA')
...@@ -44,11 +44,18 @@ LOG_FILE_NAME <<- 'log.txt' ...@@ -44,11 +44,18 @@ LOG_FILE_NAME <<- 'log.txt'
# above which a warning is shown. # above which a warning is shown.
# * FILE_LOSS_PERCENTAGE_THRESHOLD: threshold (as a percentage) above which a warning is # * FILE_LOSS_PERCENTAGE_THRESHOLD: threshold (as a percentage) above which a warning is
# displayed during file merging. # displayed during file merging.
# * SHOW_TISSUE_CATEGORY_MISMATCH_WARNING: if TRUE, the # * SHOW_TISSUE_CATEGORY_MISMATCH_WARNING: if TRUE, the warning "Tissue_category values differ
# "Tissue_category values differ across files" is shown. Otherwise the warning is skipped. # across files" is shown. Otherwise the warning is skipped.
MARKER_INTENSITY_THRESHOLD <<- 0.01 # * TISSUE_CATEGORY_MISMATCH_THRESHOLD: when there are Tissue Category mismatches between
FILE_LOSS_PERCENTAGE_THRESHOLD <<- 5 # individual marker files that cannot be resolved by majority ruling, the offending lines are
# deleted and a warning is displayed to the user. If the percentage of cells (lines) deleted
# is > TISSUE_CATEGORY_MISMATCH_THRESHOLD then an error is generated. As long as the number of
# deleted cells remains <= the threshold, only a warning is show. The threshold value is
# a percentage (e.g. 5 = 5%).
MARKER_INTENSITY_THRESHOLD <<- 0.01
FILE_LOSS_PERCENTAGE_THRESHOLD <<- 5
SHOW_TISSUE_CATEGORY_MISMATCH_WARNING <<- TRUE SHOW_TISSUE_CATEGORY_MISMATCH_WARNING <<- TRUE
TISSUE_CATEGORY_MISMATCH_THRESHOLD <<- 5
# InForm version data formats supported. # InForm version data formats supported.
SUPPORTED_INFORM_VERSIONS <<- c(2.2, 2.4) SUPPORTED_INFORM_VERSIONS <<- c(2.2, 2.4)
......
...@@ -76,7 +76,7 @@ merge_cell_data_files <- function(files_to_merge){ ...@@ -76,7 +76,7 @@ merge_cell_data_files <- function(files_to_merge){
# Merge data. # Merge data.
# ********** # **********
# Merge data frame for the current marker with the global dataframe 'merged_df'. # Merge data frame for the current marker with the global data frame 'merged_df'.
if(is.null(merged_df)){ if(is.null(merged_df)){
merged_df = input_df merged_df = input_df
} else{ } else{
...@@ -119,36 +119,88 @@ merge_cell_data_files <- function(files_to_merge){ ...@@ -119,36 +119,88 @@ merge_cell_data_files <- function(files_to_merge){
# *********************************************** # ***********************************************
# In principle, the "Tissue Category" columns of all individual markers should contain the # In principle, the "Tissue Category" columns of all individual markers should contain the
# same value. Here we verify that this is the case and then keep only one copy of them. # same value. Here we verify that this is the case and then keep only one copy of them.
if(any(tissue_cat_df != tissue_cat_df[,1])){ diff_rows = sort(unique(unlist(lapply(2:ncol(tissue_cat_df),
diff_rows = sort(unique(unlist(lapply(2:ncol(tissue_cat_df), FUN=function(x) which(tissue_cat_df[x] != tissue_cat_df[,1])))))
FUN=function(x) which(tissue_cat_df[x] != tissue_cat_df[,1])))))
# If the tissue category data frame contains >= 3 columns, we can try to perform majority
# ruling to resolve some of the conflicting values.
if(ncol(tissue_cat_df) >= 3){
fixed_row = NULL
for(x in diff_rows){ for(x in diff_rows){
# If one of the tissue values has a majority within the row, majority ruling is
# possible and the most frequent value is used.
value_frequency = sort(table(as.character(tissue_cat_df[x,])), decreasing=T) value_frequency = sort(table(as.character(tissue_cat_df[x,])), decreasing=T)
stopifnot(length(value_frequency) >= 2) stopifnot(length(value_frequency) >= 2)
# Case 1: one of the tissue values has a majority within the row. Majority ruling is
# possible and the most frequent value is used.
if(as.numeric(value_frequency[1]) > as.numeric(value_frequency[2])){ if(as.numeric(value_frequency[1]) > as.numeric(value_frequency[2])){
tissue_cat_df[x,] = names(value_frequency)[1] tissue_cat_df[x,] = names(value_frequency)[1]
if(SHOW_TISSUE_CATEGORY_MISMATCH_WARNING) raise_error( fixed_row = c(fixed_row, x)
msg = c(paste0('[tissue_category] values differ across files. ',
'Values were reconciled based on majority ruling.'),
paste0('Offending row: ', x)),
file = files_to_merge[1],
type = 'warning')
# Case 2: majority ruling is not possible.
} else{
raise_error(
msg = c('Could not merge individual marker files.',
'Reason: tissue_category values differ across files with no majority.',
paste0('Offending row: ', x),
paste0('Offending values: ', paste(tissue_cat_df[x,], collapse=' '))),
file = files_to_merge[1])
} }
} }
stopifnot(all(tissue_cat_df == tissue_cat_df[,1]))
# Display warning message if any values were fixed through majority ruling.
if(length(fixed_row) > 0){
if(SHOW_TISSUE_CATEGORY_MISMATCH_WARNING) raise_error(
msg = c('[tissue_category] values differ across files.',
'The following mismatches were reconciled based on majority ruling.',
paste0('Number of offending rows: ',
length(fixed_row), ' [',
round(length(fixed_row)/nrow(tissue_cat_df)*100, 5),
'%]'),
'List of offending rows:'),
items_to_list = fixed_row,
file = files_to_merge[1],
type = 'warning')
# Re-evaluate the number of rows that differ between tissue category values.
diff_rows = sort(unique(unlist(lapply(2:ncol(tissue_cat_df),
FUN=function(x) which(tissue_cat_df[x] != tissue_cat_df[,1])))))
}
rm(fixed_row, x)
} }
# At this point any remaining mismatch cannot be reconciled with majority ruling.
# Therefore the only options we have is to:
# 1. Delete the offending lines and trigger a warning if the number of offending lines
# is smaller than a user-defined threshold.
# 2. If the number of offending lines is above the threshold, an error is generated.
if(length(diff_rows) > 0){
percentage_mismatch = length(diff_rows) / nrow(tissue_cat_df) * 100
# Case 1: the number of non-reconciled mismatches exceeds the user-defined limit.
if(percentage_mismatch > TISSUE_CATEGORY_MISMATCH_THRESHOLD) raise_error(
msg = c('Too many [tissue_category] values differ across files to merge and ',
'could not be reconcilded through majority ruling.',
paste0('Number of mismatches: ', length(diff_rows), " [",
round(percentage_mismatch, 5), "%]"),
paste0('Threshold above which this error is generated: ',
TISSUE_CATEGORY_MISMATCH_THRESHOLD, '%'),
paste0('The threshold value can be modified through the ',
'TISSUE_CATEGORY_MISMATCH_THRESHOLD parameter in the ',
'configuration file.'),
'The offending rows are the following:'),
items_to_list = sapply(diff_rows,
function(x) paste0(x, ' [', tissue_cat_df[x,1],
', ', tissue_cat_df[x,2], ']')),
file = files_to_merge[1])
# Case 2: the number of mismatches is <= the threshold. We only display a warning.
if(SHOW_TISSUE_CATEGORY_MISMATCH_WARNING) raise_error(
msg = c('A number of [tissue_category] values differ across files and could not be',
'reconcilded through majority ruling. Instead, the offending rows were deleted',
'from the input data.',
paste0('Number of mismatches: ', length(diff_rows), " [",
round(percentage_mismatch, 5), "%]"),
'The following lines had mismatches and were deleted:'),
items_to_list = sapply(diff_rows,
function(x) paste0(x, ' [', tissue_cat_df[x,1],
', ', tissue_cat_df[x,2], ']')),
file = files_to_merge[1],
type = 'warning')
# Set tissue category values that differ to NA.
tissue_cat_df[diff_rows, ] = NA
}
# Add tissue category values to the merged data frame. # Add tissue category values to the merged data frame.
merged_df[,'tissue_category'] = tissue_cat_df[,1] merged_df[,'tissue_category'] = tissue_cat_df[,1]
...@@ -185,18 +237,29 @@ merge_cell_data_files <- function(files_to_merge){ ...@@ -185,18 +237,29 @@ merge_cell_data_files <- function(files_to_merge){
if(length(differing_values) > 0){ if(length(differing_values) > 0){
differing_values = as.vector(as.matrix(differences))[differing_values] differing_values = as.vector(as.matrix(differences))[differing_values]
raise_error( raise_error(
msg=c(paste0('Values for column [', col_name, '] differ accross individual files'), msg=c(paste0('Values for column [', col_name, '] differ accross files to merge'),
paste0('to merge by more than ', MARKER_INTENSITY_THRESHOLD, ' at ', paste0('by more than ', MARKER_INTENSITY_THRESHOLD, ' at ',
length(differing_values), ' occurences [', length(differing_values), ' occurences [',
round(length(differing_values)/nrow(marker_int_df)*100, 3), '%].'), round(length(differing_values)/nrow(marker_int_df)*100, 5), '%].'),
'Values from the first file (alphabetically) will be used.'), 'Values from the first file (alphabetically) will be used.'),
file=dirname(files_to_merge[1]), file=dirname(files_to_merge[1]),
type = 'warning') type = 'warning')
} }
} }
# Add marker intensity values to merged dataframe. # Add marker intensity values to merged data frame.
merged_df = cbind(merged_df, marker_int_df[, 1:length(marker_intensity_cols)]) merged_df = cbind(merged_df, marker_int_df[, 1:length(marker_intensity_cols)])
# Delete rows that differed in tissue category. The stopifnot check is just to make sure the
# lines where deleted properly.
# TODO: at this point I'm using "which()" to identify the rows need to be removed, but this
# creates a problem when there are no such rows, since then diff_rows = integer(0), and
# [-integer(0),] is the same as [integer(0),] and thus removes all rows of the table!
# So for now I'm guarding this statement with an if() check, but maybe there is a better
# way of doing it.
if(length(diff_rows) > 0) merged_df = merged_df[-diff_rows,]
stopifnot(!any(is.na(merged_df)))
stopifnot(nrow(merged_df) > 0)
return(merged_df) return(merged_df)
} }
#################################################################################################### ####################################################################################################
...@@ -293,7 +356,7 @@ remove_duplicated_rows <- function(input_df, key_fields, file_name){ ...@@ -293,7 +356,7 @@ remove_duplicated_rows <- function(input_df, key_fields, file_name){
duplicated_rows = which(duplicated(input_df[,key_fields])) duplicated_rows = which(duplicated(input_df[,key_fields]))
if(length(duplicated_rows) > 0){ if(length(duplicated_rows) > 0){
input_df = input_df[-duplicated_rows,] input_df = input_df[-duplicated_rows,]
raise_error(msg = 'The following duplicated rows were deleted from input file:', raise_error(msg = 'The following duplicated rows were deleted:',
file = file_name, file = file_name,
items_to_list = duplicated_rows, items_to_list = duplicated_rows,
type = 'warning') type = 'warning')
......
...@@ -709,67 +709,50 @@ markers_in_file_name <- function(file_name){ ...@@ -709,67 +709,50 @@ markers_in_file_name <- function(file_name){
#################################################################################################### ####################################################################################################
#' Performs the following checks on the input data given a list of samples:
#' * Input samples are present in the raw data.
#' * Samples present in the raw data are also present in input parameter file [warning].
#' * Each sample sub-directory contains exactly one cell and one tissue segmentation file.
#' #'
#' @param samples_from_parameters Sample names passed as input parameter by the user. #' @param samples [str vector] sample names passed in input by the user.
#' @param session_root_dir [string] Root directory for the current session. #' @param root_dir [str] Root directory for the current session.
check_sample_names <- function(samples_from_parameters, session_root_dir){ #' @return [NULL]
check_sample_data <- function(samples, root_dir){
# Retrieve samples present in data from subdirectory names present in session_root_dir.
samples_from_data = sort(basename(list.dirs(session_root_dir, full.names=TRUE, recursive=F))) # Retrieve samples present in data based on sub-directory names present in session's root_dir.
samples_from_data = sort(basename(list.dirs(root_dir, full.names=TRUE, recursive=F)))
# If samples present in the raw data are missing from the input parameters, raise warning.
missing_samples = samples_from_data[!samples_from_data %in% samples_from_parameters] # Verify all samples passed in input by the user are present in the raw data.
if(length(missing_samples) > 0) raise_error( if(length(setdiff(samples, samples_from_data)) > 0) raise_error(
msg = 'One or more samples listed in the input parameters are missing from the raw data:',
items_to_list = setdiff(samples, samples_from_data))
# Verify all samples present in the raw data are also present in the input parameters. If not,
# raise a warning.
if(length(setdiff(samples_from_data, samples)) > 0) raise_error(
msg = 'One or more samples present in the data are not listed in input parameters:', msg = 'One or more samples present in the data are not listed in input parameters:',
items_to_list = missing_samples, items_to_list = setdiff(samples_from_data, samples),
type = 'warning') type = 'warning')
# All sample names passed as input by the user must be present in the data. # Verify each sample sub-directory contains exactly one cell and one tissue segmentation file.
missing_samples = samples_from_parameters[!samples_from_parameters %in% samples_from_data]
if(length(missing_samples) > 0) raise_error(
msg = 'One or more samples listed in the input paramters are missing from the data:',
items_to_list = missing_samples)
}
####################################################################################################
####################################################################################################
check_sample_directories <- function(sample_list, session_root_dir){
# ********************************************************************************************
#
# sample_list : samples to process in the current analysis.
# session_root_dir: root directory for the current session.
# ********************************************************************************************
error_msg = NULL error_msg = NULL
for(sample_name in sample_list){ for(sample_name in samples){
sample_dir = file.path(root_dir, sample_name)
# Verify there is a subdirectory for the sample. for(file_type in c(CELL_FILES_EXTENSION, TISSUE_FILES_EXTENSION)){
sample_dir = file.path(session_root_dir, sample_name) files = list.files(path=sample_dir, pattern=paste0('.*', file_type, '$'),
if(!dir.exists(sample_dir)){ all.files=F, full.names=F, recursive=F, ignore.case=F)
error_msg = c(error_msg, paste0(sample_name,': no subdirectory found for sample.'))
next if(length(files) == 0){
error_msg = c(error_msg, paste0('Sample [', sample_name, ']: no [',
file_type, '] file found.'))
} else if(length(files) > 1){
error_msg = c(error_msg, paste0('Sample [', sample_name,']: more than one [',
file_type, '] file in subdirectory.'))
}
} }
# Verify that each sample subdirectory contains exactly one cell and one tissue
# segmentation file.
cell_files = list.files(path=sample_dir, pattern=paste0('.*', CELL_FILES_EXTENSION, '$'),
all.files=F, full.names=F, recursive=F, ignore.case=F)
tissue_files = list.files(path=sample_dir, pattern=paste0('.*',TISSUE_FILES_EXTENSION,'$'),
all.files=F, full.names=F, recursive=F, ignore.case=F)
if(length(cell_files) == 0) error_msg = c(error_msg,
paste0(sample_name,': no cell segmentation data found.'))
if(length(tissue_files) == 0) error_msg = c(error_msg,
paste0(sample_name,': no tissue segmentation data found.'))
if(length(cell_files) > 1) error_msg = c(error_msg,
paste0(sample_name,': more than one cell segmentation file in subdirectory.'))
if(length(tissue_files) > 1) error_msg = c(error_msg,
paste0(sample_name,': more than one tissue segmentation file in subdirectory.'))
}
if(length(error_msg) > 0){
log_message(paste('ERROR:', error_msg), padding='')
raise_error('One or more errors in input data detected. See above.')
} }
if(length(error_msg) > 0) raise_error(msg = "One or more errors detected in input data:",
items_to_list = error_msg)
} }
#################################################################################################### ####################################################################################################
......
...@@ -164,10 +164,10 @@ postinform <- function(input_file_or_dir, ...@@ -164,10 +164,10 @@ postinform <- function(input_file_or_dir,
#' @return nothing. #' @return nothing.
#' #'
postinform_pipeline <- function(input_dir, postinform_pipeline <- function(input_dir,
output_dir, output_dir,
command, command,
immucan_output = FALSE, immucan_output = FALSE,
no_bash = FALSE){ no_bash = FALSE){
# Input data check # Input data check
# **************** # ****************
...@@ -176,12 +176,22 @@ postinform_pipeline <- function(input_dir, ...@@ -176,12 +176,22 @@ postinform_pipeline <- function(input_dir,
# - load and check marker thresholds file. # - load and check marker thresholds file.
# - search for cell and tissue segmentation files in sub-directories or the session root. # - search for cell and tissue segmentation files in sub-directories or the session root.
log_message('Input data check:') log_message('Input data check:')
log_message(paste('input directory:', input_dir), level=2) log_message(paste0('input directory [', input_dir, ']'), level=2)
inputdir_check(input_dir, output_dir) inputdir_check(input_dir, output_dir)
log_message('input dir check: OK', level=2)
# Load session parameters from "parameters.txt".
log_message('session parameters...', level=2)
input_parameters = load_session_parameters(output_dir) input_parameters = load_session_parameters(output_dir)
log_message('session parameters: OK', level=2)
# Optionally load "sample_rename.txt" values if the file is present.
if(file_test('-f', file.path(output_dir, SAMPLE_RENAME_FILE))){
log_message('sample rename file...', level=2)
sample_rename = load_sample_rename_file(file.path(output_dir, SAMPLE_RENAME_FILE))
input_parameters$sample_rename = sample_rename
check_sample_rename(sample_rename, original_samples=input_parameters$samples)
rm(sample_rename)
}
log_message('completed', level=2, add_empty_line=TRUE) log_message('completed', level=2, add_empty_line=TRUE)
if(command == 'check') return(invisible(NULL)) if(command == 'check') return(invisible(NULL))
...@@ -273,23 +283,20 @@ postinform_pipeline <- function(input_dir, ...@@ -273,23 +283,20 @@ postinform_pipeline <- function(input_dir,
# Verify data is present for each sample. # Verify data is present for each sample.
# ************************************** # **************************************
# Verify all samples passed as input parameters are also present in the actual data. # Verify all samples passed as input parameters are also present in the actual data.
check_sample_names(samples_from_parameters=input_parameters$samples, output_dir) check_sample_data(samples=input_parameters$samples, root_dir=output_dir)
check_sample_directories(sample_list=input_parameters$samples, output_dir)
# Input sample renaming. # Input sample renaming.
# ********************* # *********************
if(file.exists(file.path(output_dir, SAMPLE_RENAME_FILE))){ if("sample_rename" %in% names(input_parameters)){
log_message('Sample renaming:') log_message('Sample renaming:')
# Modify sample names in input files. # Modify sample names in input files.
rename_samples(samples=input_parameters$samples, output_dir) rename_samples(sample_rename=input_parameters$sample_rename, root_dir=output_dir)
# Reload input parameters to update sample names. # Update input_parameters with new sample names.
input_parameters = load_session_parameters(output_dir) input_parameters$samples = sort(unlist(input_parameters$sample_rename, use.names=FALSE))
check_sample_names(samples_from_parameters=input_parameters$samples, output_dir) check_sample_data(samples=input_parameters$samples, root_dir=output_dir)
check_sample_directories(sample_list=input_parameters$samples, output_dir)
log_message('completed', level=2, add_empty_line=TRUE) log_message('completed', level=2, add_empty_line=TRUE)
} }
......
#################################################################################################### ####################################################################################################
rename_samples <- function(samples, session_root_dir){ #' Rename samples in the input data and split them into two if needed.
# ******************************************************************************************** #'
# rename_samples <- function(sample_rename, root_dir){
# ********************************************************************************************
# Load file that contains the sample renaming info.
sample_renaming = load_sample_rename_file(
input_file = file.path(session_root_dir, SAMPLE_RENAME_FILE),
sample_names = samples)
# Rename samples in cell and tissue segmentation files. # Rename samples in cell and tissue segmentation files.
# **************************************************** # ****************************************************
for(old_name in samples){ for(old_name in names(sample_rename)){
new_name = sample_renaming[[old_name]] new_name = sample_rename[[old_name]]
old_dir = file.path(session_root_dir, old_name) old_dir = file.path(root_dir, old_name)
new_dir = file.path(session_root_dir, new_name) new_dir = file.path(root_dir, new_name)
stopifnot(dir.exists(old_dir)) stopifnot(dir.exists(old_dir))
stopifnot(!dir.exists(new_dir)) stopifnot(!dir.exists(new_dir))
for(x in new_dir) dir.create(x, recursive=FALSE) for(x in new_dir) dir.create(x, recursive=FALSE)
log_message(paste0('rename ', old_name, ' to ', paste(new_name, collapse=' / '), log_message(paste0('rename ', old_name, ' to ', paste(new_name, collapse=' / '),
ifelse(length(new_name)>1, ' [sample will be split]','')), ifelse(length(new_name)>1, ' [sample will be split]', '')),
level=2) level=2)
# Load cell and tissue segmentation files, change sample name and write to disk. # Load cell and tissue segmentation files, change sample name and write to disk.
...@@ -30,8 +23,6 @@ rename_samples <- function(samples, session_root_dir){ ...@@ -30,8 +23,6 @@ rename_samples <- function(samples, session_root_dir){
new_file = file.path(new_dir, paste0(new_name, file_extension)) new_file = file.path(new_dir, paste0(new_name, file_extension))
tmp = read.table(old_file, sep='\t', as.is=TRUE, header=TRUE, tmp = read.table(old_file, sep='\t', as.is=TRUE, header=TRUE,
check.names=TRUE, strip.white=TRUE, colClasses='character') check.names=TRUE, strip.white=TRUE, colClasses='character')
stopifnot('sample_name' %in% colnames(tmp))
stopifnot('image_id' %in% colnames(tmp))
# Case 1: the data does not need to be split. # Case 1: the data does not need to be split.
if(length(new_name) == 1){ if(length(new_name) == 1){
...@@ -42,18 +33,19 @@ rename_samples <- function(samples, session_root_dir){ ...@@ -42,18 +33,19 @@ rename_samples <- function(samples, session_root_dir){
} else if(length(new_name) == 2){ } else if(length(new_name) == 2){
# Get image ID values that corresponds to the 2 sub-samples in the original sample. # Get image ID values that corresponds to the 2 sub-samples in the original sample.
if(file_extension == CELL_FILES_EXTENSION){ if(file_extension == CELL_FILES_EXTENSION){
stopifnot('cell_y_position' %in% colnames(tmp))
image_id_split = split_by_coordinate(coordinates = tmp[,'cell_y_position'], image_id_split = split_by_coordinate(coordinates = tmp[,'cell_y_position'],
image_ids = tmp[,'image_id']) image_ids = tmp[,'image_id'])
#plot(tmp$cell_x_position[rand_sample], tmp$cell_y_position[rand_sample], col=as.factor(tmp$image_id[rand_sample]), pch=18)
} }
#stopifnot(all(unique(tmp[,'image_id']) %in% unlist(image_id_split)))
if(!all(unique(tmp[,'image_id']) %in% unlist(image_id_split))) raise_error( if(!all(unique(tmp[,'image_id']) %in% unlist(image_id_split))) raise_error(
msg = 'Tissue seg data has more image IDs than cell seg data.', msg = c("Tissue seg data has more image IDs than cell seg data.",
file = old_file, type = 'warning') "Image ID values no present in the cell seg data will be removed."),
file = old_file,
type = 'warning')
# Temporary check for the split. # Temporary check for the split.
if(FALSE){ if(FALSE){
jpeg(filename = file.path(session_root_dir, paste0(old_name, '_split.jpg')), jpeg(filename = file.path(root_dir, paste0(old_name, '_split.jpg')),
width = 700, height = 1000) width = 700, height = 1000)
rand_sample = sample(1:nrow(tmp), ifelse(nrow(tmp) > 5000, 5000, nrow(tmp))) rand_sample = sample(1:nrow(tmp), ifelse(nrow(tmp) > 5000, 5000, nrow(tmp)))
plot(tmp$cell_x_position[rand_sample], tmp$cell_y_position[rand_sample], plot(tmp$cell_x_position[rand_sample], tmp$cell_y_position[rand_sample],
...@@ -62,7 +54,7 @@ rename_samples <- function(samples, session_root_dir){ ...@@ -62,7 +54,7 @@ rename_samples <- function(samples, session_root_dir){
dev.off() dev.off()
} }
# Split original sample based on imageID values. # Split original sample based on image ID values.
tmp_1 = tmp[tmp[,'image_id'] %in% image_id_split[[1]], ] tmp_1 = tmp[tmp[,'image_id'] %in% image_id_split[[1]], ]
tmp_2 = tmp[tmp[,'image_id'] %in% image_id_split[[2]], ] tmp_2 = tmp[tmp[,'image_id'] %in% image_id_split[[2]], ]
if(file_extension == CELL_FILES_EXTENSION) stopifnot(nrow(tmp_1)+nrow(tmp_2) == nrow(tmp)) if(file_extension == CELL_FILES_EXTENSION) stopifnot(nrow(tmp_1)+nrow(tmp_2) == nrow(tmp))
...@@ -71,93 +63,22 @@ rename_samples <- function(samples, session_root_dir){ ...@@ -71,93 +63,22 @@ rename_samples <- function(samples, session_root_dir){
write.table(tmp_1, file=new_file[1], sep='\t', quote=FALSE, row.names=FALSE) write.table(tmp_1, file=new_file[1], sep='\t', quote=FALSE, row.names=FALSE)
write.table(tmp_2, file=new_file[2], sep='\t', quote=FALSE, row.names=FALSE) write.table(tmp_2, file=new_file[2], sep='\t', quote=FALSE, row.names=FALSE)
} else stop("ERROR: THIS CASE IS NOT IMPLEMENTED YET.") } else stop("cannot split sample in more than 2. THIS CASE IS NOT IMPLEMENTED YET.")
unlink(old_file) unlink(old_file)
} }
unlink(old_dir, recursive=T) unlink(old_dir, recursive=T)
} }
# Rename samples in PARAMETERS_FILE.
# *********************************
file_name = file.path(session_root_dir, PARAMETERS_FILE)
file_content = read_file_as_vector(file_name, ignore_comments=FALSE)
# Find start and end position of 'samples:' paramter in the parameter file.
start_position = grep('^samples:', file_content)
stopifnot(length(start_position) == 1)
parameter_position = grep(':', file_content)
stopifnot(length(parameter_position) >= 2)
if(start_position == tail(parameter_position, 1)){
end_position = length(file_content)
} else end_position = parameter_position[which(parameter_position == start_position) + 1] - 1