Commit 2aadaaa3 authored by Robin Engler's avatar Robin Engler
Browse files

Add support for auto-deletion of rows with tissue category mismatch up to a...

Add support for auto-deletion of rows with tissue category mismatch up to a total of TISSUE_CATEGORY_MISMATCH_THRESHOLD percent
parent ab421232
......@@ -3,7 +3,7 @@
# declaration is made at the top level.
# Set application version.
POSTINFORM_VERSION <<- '0.1.0'
POSTINFORM_VERSION <<- '0.2.0'
# Define default parameter values.
DEFAULT_CELL_COMPARTMENT = 'nucleus'
......@@ -18,7 +18,7 @@ AUTHORIZED_TISSUES <<- c('stroma', 'tumor', 'dermis', 'epidermis', 'melano
AUTHORIZED_COMPARTMENTS <<- c('nucleus', 'membrane', 'cytoplasm', 'entire_cell')
AUTHORIZED_STROMA_VALUES <<- c('DAPI', 'stroma', 'other')
AUTHORIZED_TUMOR_VALUES <<- c('CK', 'tumor')
AUTHORIZED_MARKERS <<- c('CAL', 'CD3', 'CD4', 'CD8', 'CD11c', 'CD15', 'CD20', 'CD56', 'CD68',
AUTHORIZED_MARKERS <<- c('CAL', 'CD3', 'CD4', 'CD8', 'CD11C', 'CD15', 'CD20', 'CD56', 'CD68',
'CD103', 'CD163', 'CD206', 'FOXP3', 'GB', 'gH2AX', 'gH2AXN', 'IDO',
'IL10R', 'Keratin', 'KI67', 'PD1', 'PDL1', 'PERFORIN', 'SOX10',
'WT1', 'CK', 'VISTA')
......@@ -44,11 +44,18 @@ LOG_FILE_NAME <<- 'log.txt'
# above which a warning is shown.
# * FILE_LOSS_PERCENTAGE_THRESHOLD: threshold (as a percentage) above which a warning is
# displayed during file merging.
# * SHOW_TISSUE_CATEGORY_MISMATCH_WARNING: if TRUE, the
# "Tissue_category values differ across files" is shown. Otherwise the warning is skipped.
MARKER_INTENSITY_THRESHOLD <<- 0.01
FILE_LOSS_PERCENTAGE_THRESHOLD <<- 5
# * SHOW_TISSUE_CATEGORY_MISMATCH_WARNING: if TRUE, the warning "Tissue_category values differ
# across files" is shown. Otherwise the warning is skipped.
# * TISSUE_CATEGORY_MISMATCH_THRESHOLD: when there are Tissue Category mismatches between
# individual marker files that cannot be resolved by majority ruling, the offending lines are
# deleted and a warning is displayed to the user. If the percentage of cells (lines) deleted
# is > TISSUE_CATEGORY_MISMATCH_THRESHOLD then an error is generated. As long as the number of
# deleted cells remains <= the threshold, only a warning is show. The threshold value is
# a percentage (e.g. 5 = 5%).
MARKER_INTENSITY_THRESHOLD <<- 0.01
FILE_LOSS_PERCENTAGE_THRESHOLD <<- 5
SHOW_TISSUE_CATEGORY_MISMATCH_WARNING <<- TRUE
TISSUE_CATEGORY_MISMATCH_THRESHOLD <<- 5
# InForm version data formats supported.
SUPPORTED_INFORM_VERSIONS <<- c(2.2, 2.4)
......
......@@ -76,7 +76,7 @@ merge_cell_data_files <- function(files_to_merge){
# Merge data.
# **********
# Merge data frame for the current marker with the global dataframe 'merged_df'.
# Merge data frame for the current marker with the global data frame 'merged_df'.
if(is.null(merged_df)){
merged_df = input_df
} else{
......@@ -119,36 +119,88 @@ merge_cell_data_files <- function(files_to_merge){
# ***********************************************
# In principle, the "Tissue Category" columns of all individual markers should contain the
# same value. Here we verify that this is the case and then keep only one copy of them.
if(any(tissue_cat_df != tissue_cat_df[,1])){
diff_rows = sort(unique(unlist(lapply(2:ncol(tissue_cat_df),
FUN=function(x) which(tissue_cat_df[x] != tissue_cat_df[,1])))))
diff_rows = sort(unique(unlist(lapply(2:ncol(tissue_cat_df),
FUN=function(x) which(tissue_cat_df[x] != tissue_cat_df[,1])))))
# If the tissue category data frame contains >= 3 columns, we can try to perform majority
# ruling to resolve some of the conflicting values.
if(ncol(tissue_cat_df) >= 3){
fixed_row = NULL
for(x in diff_rows){
# If one of the tissue values has a majority within the row, majority ruling is
# possible and the most frequent value is used.
value_frequency = sort(table(as.character(tissue_cat_df[x,])), decreasing=T)
stopifnot(length(value_frequency) >= 2)
# Case 1: one of the tissue values has a majority within the row. Majority ruling is
# possible and the most frequent value is used.
if(as.numeric(value_frequency[1]) > as.numeric(value_frequency[2])){
tissue_cat_df[x,] = names(value_frequency)[1]
if(SHOW_TISSUE_CATEGORY_MISMATCH_WARNING) raise_error(
msg = c(paste0('[tissue_category] values differ across files. ',
'Values were reconciled based on majority ruling.'),
paste0('Offending row: ', x)),
file = files_to_merge[1],
type = 'warning')
# Case 2: majority ruling is not possible.
} else{
raise_error(
msg = c('Could not merge individual marker files.',
'Reason: tissue_category values differ across files with no majority.',
paste0('Offending row: ', x),
paste0('Offending values: ', paste(tissue_cat_df[x,], collapse=' '))),
file = files_to_merge[1])
fixed_row = c(fixed_row, x)
}
}
stopifnot(all(tissue_cat_df == tissue_cat_df[,1]))
# Display warning message if any values were fixed through majority ruling.
if(length(fixed_row) > 0){
if(SHOW_TISSUE_CATEGORY_MISMATCH_WARNING) raise_error(
msg = c('[tissue_category] values differ across files.',
'The following mismatches were reconciled based on majority ruling.',
paste0('Number of offending rows: ',
length(fixed_row), ' [',
round(length(fixed_row)/nrow(tissue_cat_df)*100, 5),
'%]'),
'List of offending rows:'),
items_to_list = fixed_row,
file = files_to_merge[1],
type = 'warning')
# Re-evaluate the number of rows that differ between tissue category values.
diff_rows = sort(unique(unlist(lapply(2:ncol(tissue_cat_df),
FUN=function(x) which(tissue_cat_df[x] != tissue_cat_df[,1])))))
}
rm(fixed_row, x)
}
# At this point any remaining mismatch cannot be reconciled with majority ruling.
# Therefore the only options we have is to:
# 1. Delete the offending lines and trigger a warning if the number of offending lines
# is smaller than a user-defined threshold.
# 2. If the number of offending lines is above the threshold, an error is generated.
if(length(diff_rows) > 0){
percentage_mismatch = length(diff_rows) / nrow(tissue_cat_df) * 100
# Case 1: the number of non-reconciled mismatches exceeds the user-defined limit.
if(percentage_mismatch > TISSUE_CATEGORY_MISMATCH_THRESHOLD) raise_error(
msg = c('Too many [tissue_category] values differ across files to merge and ',
'could not be reconcilded through majority ruling.',
paste0('Number of mismatches: ', length(diff_rows), " [",
round(percentage_mismatch, 5), "%]"),
paste0('Threshold above which this error is generated: ',
TISSUE_CATEGORY_MISMATCH_THRESHOLD, '%'),
paste0('The threshold value can be modified through the ',
'TISSUE_CATEGORY_MISMATCH_THRESHOLD parameter in the ',
'configuration file.'),
'The offending rows are the following:'),
items_to_list = sapply(diff_rows,
function(x) paste0(x, ' [', tissue_cat_df[x,1],
', ', tissue_cat_df[x,2], ']')),
file = files_to_merge[1])
# Case 2: the number of mismatches is <= the threshold. We only display a warning.
if(SHOW_TISSUE_CATEGORY_MISMATCH_WARNING) raise_error(
msg = c('A number of [tissue_category] values differ across files and could not be',
'reconcilded through majority ruling. Instead, the offending rows were deleted',
'from the input data.',
paste0('Number of mismatches: ', length(diff_rows), " [",
round(percentage_mismatch, 5), "%]"),
'The following lines had mismatches and were deleted:'),
items_to_list = sapply(diff_rows,
function(x) paste0(x, ' [', tissue_cat_df[x,1],
', ', tissue_cat_df[x,2], ']')),
file = files_to_merge[1],
type = 'warning')
# Set tissue category values that differ to NA.
tissue_cat_df[diff_rows, ] = NA
}
# Add tissue category values to the merged data frame.
merged_df[,'tissue_category'] = tissue_cat_df[,1]
......@@ -185,18 +237,29 @@ merge_cell_data_files <- function(files_to_merge){
if(length(differing_values) > 0){
differing_values = as.vector(as.matrix(differences))[differing_values]
raise_error(
msg=c(paste0('Values for column [', col_name, '] differ accross individual files'),
paste0('to merge by more than ', MARKER_INTENSITY_THRESHOLD, ' at ',
msg=c(paste0('Values for column [', col_name, '] differ accross files to merge'),
paste0('by more than ', MARKER_INTENSITY_THRESHOLD, ' at ',
length(differing_values), ' occurences [',
round(length(differing_values)/nrow(marker_int_df)*100, 3), '%].'),
round(length(differing_values)/nrow(marker_int_df)*100, 5), '%].'),
'Values from the first file (alphabetically) will be used.'),
file=dirname(files_to_merge[1]),
type = 'warning')
}
}
# Add marker intensity values to merged dataframe.
# Add marker intensity values to merged data frame.
merged_df = cbind(merged_df, marker_int_df[, 1:length(marker_intensity_cols)])
# Delete rows that differed in tissue category. The stopifnot check is just to make sure the
# lines where deleted properly.
# TODO: at this point I'm using "which()" to identify the rows need to be removed, but this
# creates a problem when there are no such rows, since then diff_rows = integer(0), and
# [-integer(0),] is the same as [integer(0),] and thus removes all rows of the table!
# So for now I'm guarding this statement with an if() check, but maybe there is a better
# way of doing it.
if(length(diff_rows) > 0) merged_df = merged_df[-diff_rows,]
stopifnot(!any(is.na(merged_df)))
stopifnot(nrow(merged_df) > 0)
return(merged_df)
}
####################################################################################################
......@@ -293,7 +356,7 @@ remove_duplicated_rows <- function(input_df, key_fields, file_name){
duplicated_rows = which(duplicated(input_df[,key_fields]))
if(length(duplicated_rows) > 0){
input_df = input_df[-duplicated_rows,]
raise_error(msg = 'The following duplicated rows were deleted from input file:',
raise_error(msg = 'The following duplicated rows were deleted:',
file = file_name,
items_to_list = duplicated_rows,
type = 'warning')
......
......@@ -709,67 +709,50 @@ markers_in_file_name <- function(file_name){
####################################################################################################
#' Performs the following checks on the input data given a list of samples:
#' * Input samples are present in the raw data.
#' * Samples present in the raw data are also present in input parameter file [warning].
#' * Each sample sub-directory contains exactly one cell and one tissue segmentation file.
#'
#' @param samples_from_parameters Sample names passed as input parameter by the user.
#' @param session_root_dir [string] Root directory for the current session.
check_sample_names <- function(samples_from_parameters, session_root_dir){
# Retrieve samples present in data from subdirectory names present in session_root_dir.
samples_from_data = sort(basename(list.dirs(session_root_dir, full.names=TRUE, recursive=F)))
# If samples present in the raw data are missing from the input parameters, raise warning.
missing_samples = samples_from_data[!samples_from_data %in% samples_from_parameters]
if(length(missing_samples) > 0) raise_error(
#' @param samples [str vector] sample names passed in input by the user.
#' @param root_dir [str] Root directory for the current session.
#' @return [NULL]
check_sample_data <- function(samples, root_dir){
# Retrieve samples present in data based on sub-directory names present in session's root_dir.
samples_from_data = sort(basename(list.dirs(root_dir, full.names=TRUE, recursive=F)))
# Verify all samples passed in input by the user are present in the raw data.
if(length(setdiff(samples, samples_from_data)) > 0) raise_error(
msg = 'One or more samples listed in the input parameters are missing from the raw data:',
items_to_list = setdiff(samples, samples_from_data))
# Verify all samples present in the raw data are also present in the input parameters. If not,
# raise a warning.
if(length(setdiff(samples_from_data, samples)) > 0) raise_error(
msg = 'One or more samples present in the data are not listed in input parameters:',
items_to_list = missing_samples,
items_to_list = setdiff(samples_from_data, samples),
type = 'warning')
# All sample names passed as input by the user must be present in the data.
missing_samples = samples_from_parameters[!samples_from_parameters %in% samples_from_data]
if(length(missing_samples) > 0) raise_error(
msg = 'One or more samples listed in the input paramters are missing from the data:',
items_to_list = missing_samples)
}
####################################################################################################
####################################################################################################
check_sample_directories <- function(sample_list, session_root_dir){
# ********************************************************************************************
#
# sample_list : samples to process in the current analysis.
# session_root_dir: root directory for the current session.
# ********************************************************************************************
# Verify each sample sub-directory contains exactly one cell and one tissue segmentation file.
error_msg = NULL
for(sample_name in sample_list){
# Verify there is a subdirectory for the sample.
sample_dir = file.path(session_root_dir, sample_name)
if(!dir.exists(sample_dir)){
error_msg = c(error_msg, paste0(sample_name,': no subdirectory found for sample.'))
next
for(sample_name in samples){
sample_dir = file.path(root_dir, sample_name)
for(file_type in c(CELL_FILES_EXTENSION, TISSUE_FILES_EXTENSION)){
files = list.files(path=sample_dir, pattern=paste0('.*', file_type, '$'),
all.files=F, full.names=F, recursive=F, ignore.case=F)
if(length(files) == 0){
error_msg = c(error_msg, paste0('Sample [', sample_name, ']: no [',
file_type, '] file found.'))
} else if(length(files) > 1){
error_msg = c(error_msg, paste0('Sample [', sample_name,']: more than one [',
file_type, '] file in subdirectory.'))
}
}
# Verify that each sample subdirectory contains exactly one cell and one tissue
# segmentation file.
cell_files = list.files(path=sample_dir, pattern=paste0('.*', CELL_FILES_EXTENSION, '$'),
all.files=F, full.names=F, recursive=F, ignore.case=F)
tissue_files = list.files(path=sample_dir, pattern=paste0('.*',TISSUE_FILES_EXTENSION,'$'),
all.files=F, full.names=F, recursive=F, ignore.case=F)
if(length(cell_files) == 0) error_msg = c(error_msg,
paste0(sample_name,': no cell segmentation data found.'))
if(length(tissue_files) == 0) error_msg = c(error_msg,
paste0(sample_name,': no tissue segmentation data found.'))
if(length(cell_files) > 1) error_msg = c(error_msg,
paste0(sample_name,': more than one cell segmentation file in subdirectory.'))
if(length(tissue_files) > 1) error_msg = c(error_msg,
paste0(sample_name,': more than one tissue segmentation file in subdirectory.'))
}
if(length(error_msg) > 0){
log_message(paste('ERROR:', error_msg), padding='')
raise_error('One or more errors in input data detected. See above.')
}
if(length(error_msg) > 0) raise_error(msg = "One or more errors detected in input data:",
items_to_list = error_msg)
}
####################################################################################################
......
......@@ -164,10 +164,10 @@ postinform <- function(input_file_or_dir,
#' @return nothing.
#'
postinform_pipeline <- function(input_dir,
output_dir,
command,
immucan_output = FALSE,
no_bash = FALSE){
output_dir,
command,
immucan_output = FALSE,
no_bash = FALSE){
# Input data check
# ****************
......@@ -176,12 +176,22 @@ postinform_pipeline <- function(input_dir,
# - load and check marker thresholds file.
# - search for cell and tissue segmentation files in sub-directories or the session root.
log_message('Input data check:')
log_message(paste('input directory:', input_dir), level=2)
log_message(paste0('input directory [', input_dir, ']'), level=2)
inputdir_check(input_dir, output_dir)
log_message('input dir check: OK', level=2)
# Load session parameters from "parameters.txt".
log_message('session parameters...', level=2)
input_parameters = load_session_parameters(output_dir)
log_message('session parameters: OK', level=2)
# Optionally load "sample_rename.txt" values if the file is present.
if(file_test('-f', file.path(output_dir, SAMPLE_RENAME_FILE))){
log_message('sample rename file...', level=2)
sample_rename = load_sample_rename_file(file.path(output_dir, SAMPLE_RENAME_FILE))
input_parameters$sample_rename = sample_rename
check_sample_rename(sample_rename, original_samples=input_parameters$samples)
rm(sample_rename)
}
log_message('completed', level=2, add_empty_line=TRUE)
if(command == 'check') return(invisible(NULL))
......@@ -273,23 +283,20 @@ postinform_pipeline <- function(input_dir,
# Verify data is present for each sample.
# **************************************
# Verify all samples passed as input parameters are also present in the actual data.
check_sample_names(samples_from_parameters=input_parameters$samples, output_dir)
check_sample_directories(sample_list=input_parameters$samples, output_dir)
check_sample_data(samples=input_parameters$samples, root_dir=output_dir)
# Input sample renaming.
# *********************
if(file.exists(file.path(output_dir, SAMPLE_RENAME_FILE))){
if("sample_rename" %in% names(input_parameters)){
log_message('Sample renaming:')
# Modify sample names in input files.
rename_samples(samples=input_parameters$samples, output_dir)
rename_samples(sample_rename=input_parameters$sample_rename, root_dir=output_dir)
# Reload input parameters to update sample names.
input_parameters = load_session_parameters(output_dir)
check_sample_names(samples_from_parameters=input_parameters$samples, output_dir)
check_sample_directories(sample_list=input_parameters$samples, output_dir)
# Update input_parameters with new sample names.
input_parameters$samples = sort(unlist(input_parameters$sample_rename, use.names=FALSE))
check_sample_data(samples=input_parameters$samples, root_dir=output_dir)
log_message('completed', level=2, add_empty_line=TRUE)
}
......
####################################################################################################
rename_samples <- function(samples, session_root_dir){
# ********************************************************************************************
#
# ********************************************************************************************
# Load file that contains the sample renaming info.
sample_renaming = load_sample_rename_file(
input_file = file.path(session_root_dir, SAMPLE_RENAME_FILE),
sample_names = samples)
#' Rename samples in the input data and split them into two if needed.
#'
rename_samples <- function(sample_rename, root_dir){
# Rename samples in cell and tissue segmentation files.
# ****************************************************
for(old_name in samples){
new_name = sample_renaming[[old_name]]
old_dir = file.path(session_root_dir, old_name)
new_dir = file.path(session_root_dir, new_name)
for(old_name in names(sample_rename)){
new_name = sample_rename[[old_name]]
old_dir = file.path(root_dir, old_name)
new_dir = file.path(root_dir, new_name)
stopifnot(dir.exists(old_dir))
stopifnot(!dir.exists(new_dir))
for(x in new_dir) dir.create(x, recursive=FALSE)
log_message(paste0('rename ', old_name, ' to ', paste(new_name, collapse=' / '),
ifelse(length(new_name)>1, ' [sample will be split]','')),
ifelse(length(new_name)>1, ' [sample will be split]', '')),
level=2)
# Load cell and tissue segmentation files, change sample name and write to disk.
......@@ -30,8 +23,6 @@ rename_samples <- function(samples, session_root_dir){
new_file = file.path(new_dir, paste0(new_name, file_extension))
tmp = read.table(old_file, sep='\t', as.is=TRUE, header=TRUE,
check.names=TRUE, strip.white=TRUE, colClasses='character')
stopifnot('sample_name' %in% colnames(tmp))
stopifnot('image_id' %in% colnames(tmp))
# Case 1: the data does not need to be split.
if(length(new_name) == 1){
......@@ -42,18 +33,19 @@ rename_samples <- function(samples, session_root_dir){
} else if(length(new_name) == 2){
# Get image ID values that corresponds to the 2 sub-samples in the original sample.
if(file_extension == CELL_FILES_EXTENSION){
stopifnot('cell_y_position' %in% colnames(tmp))
image_id_split = split_by_coordinate(coordinates = tmp[,'cell_y_position'],
image_ids = tmp[,'image_id'])
#plot(tmp$cell_x_position[rand_sample], tmp$cell_y_position[rand_sample], col=as.factor(tmp$image_id[rand_sample]), pch=18)
}
#stopifnot(all(unique(tmp[,'image_id']) %in% unlist(image_id_split)))
if(!all(unique(tmp[,'image_id']) %in% unlist(image_id_split))) raise_error(
msg = 'Tissue seg data has more image IDs than cell seg data.',
file = old_file, type = 'warning')
msg = c("Tissue seg data has more image IDs than cell seg data.",
"Image ID values no present in the cell seg data will be removed."),
file = old_file,
type = 'warning')
# Temporary check for the split.
if(FALSE){
jpeg(filename = file.path(session_root_dir, paste0(old_name, '_split.jpg')),
jpeg(filename = file.path(root_dir, paste0(old_name, '_split.jpg')),
width = 700, height = 1000)
rand_sample = sample(1:nrow(tmp), ifelse(nrow(tmp) > 5000, 5000, nrow(tmp)))
plot(tmp$cell_x_position[rand_sample], tmp$cell_y_position[rand_sample],
......@@ -62,7 +54,7 @@ rename_samples <- function(samples, session_root_dir){
dev.off()
}
# Split original sample based on imageID values.
# Split original sample based on image ID values.
tmp_1 = tmp[tmp[,'image_id'] %in% image_id_split[[1]], ]
tmp_2 = tmp[tmp[,'image_id'] %in% image_id_split[[2]], ]
if(file_extension == CELL_FILES_EXTENSION) stopifnot(nrow(tmp_1)+nrow(tmp_2) == nrow(tmp))
......@@ -71,93 +63,22 @@ rename_samples <- function(samples, session_root_dir){
write.table(tmp_1, file=new_file[1], sep='\t', quote=FALSE, row.names=FALSE)
write.table(tmp_2, file=new_file[2], sep='\t', quote=FALSE, row.names=FALSE)
} else stop("ERROR: THIS CASE IS NOT IMPLEMENTED YET.")
} else stop("cannot split sample in more than 2. THIS CASE IS NOT IMPLEMENTED YET.")
unlink(old_file)
}
unlink(old_dir, recursive=T)
}
# Rename samples in PARAMETERS_FILE.
# *********************************
file_name = file.path(session_root_dir, PARAMETERS_FILE)
file_content = read_file_as_vector(file_name, ignore_comments=FALSE)
# Find start and end position of 'samples:' paramter in the parameter file.
start_position = grep('^samples:', file_content)
stopifnot(length(start_position) == 1)
parameter_position = grep(':', file_content)
stopifnot(length(parameter_position) >= 2)
if(start_position == tail(parameter_position, 1)){
end_position = length(file_content)
} else end_position = parameter_position[which(parameter_position == start_position) + 1] - 1
# Replace old sample values with new values.
if(start_position == 1){
file_content = c('samples:',
as.character(unlist(sample_renaming)),
file_content[(end_position + 1):length(file_content)])
} else if(end_position == length(file_content)){
file_content = c(file_content[1:(start_position - 1)],
'samples:',
as.character(unlist(sample_renaming)))
} else{
file_content = c(file_content[1:(start_position - 1)],
'samples:',
as.character(unlist(sample_renaming)),
file_content[(end_position + 1):length(file_content)])
}
# Overwrite paramter input file.
write(file_content, file=file_name, append=F, ncolumns=1)
# Rename samples in THRESHOLDS_FILE.
# *********************************
file_name = file.path(session_root_dir, THRESHOLDS_FILE)
if(file.exists(file_name)){
# Load thrsholds file.
tmp = read.table(file_name, sep='\t', as.is=TRUE, header=TRUE,
check.names=TRUE, strip.white=TRUE, colClasses='character')
# Legacy format support: change sample name column header.
if(colnames(tmp)[1] == 'sampleName') colnames(tmp)[1] = 'sample_name'
stopifnot('sample_name' %in% colnames(tmp))
sample_values = tmp[,'sample_name']
# Replace sample names.
for(old_name in samples){
new_name = sample_renaming[[old_name]]
if(length(new_name) == 1){
sample_values[which(sample_values == old_name)] = new_name
} else if(length(new_name) == 2){
row_nb = which(sample_values == old_name)
sample_values[row_nb] = new_name[1]
# Duplicate thresholds for second new name of sample.
sample_values = c(sample_values, rep(new_name[2], length(row_nb)))
tmp = rbind(tmp, tmp[row_nb,])
} else stop("ERROR: THIS CASE IS NOT IMPLEMENTED YET.")
}
tmp[,'sample_name'] = sample_values
tmp = tmp[order(tmp[,'sample_name']), ]
write.table(tmp, file=file_name, sep='\t', quote=FALSE, row.names=FALSE)
}
}
####################################################################################################
####################################################################################################
load_sample_rename_file <- function(input_file, sample_names){
# ********************************************************************************************
#
# ********************************************************************************************
#' Load a "sample rename" file from disk.
#'
#' @param input_file [string] path of the "sample rename" file to load.
#'
load_sample_rename_file <- function(input_file){
# Load file content by line. Lines starting with # are ignored.
file_content = read_file_as_vector(input_file)
......@@ -166,45 +87,72 @@ load_sample_rename_file <- function(input_file, sample_names){
file = input_file)
# For each line, extract the correspondence between the old and new sample names.
sample_renaming = list()
sample_rename = list()
for(line in file_content[-1]){
line_elements = string_to_vector(line)
if(!length(line_elements) %in% c(2,3)) raise_error(
msg = 'All rows of sample renaming files must have either 2 or 3 elements.',
msg = "All rows of 'sample_rename.txt' files must have either 2 or 3 elements.",
file = input_file)
sample_name_old = line_elements[1]
sample_name_new = line_elements[2:length(line_elements)]
old_name = line_elements[1]
new_name = line_elements[2:length(line_elements)]
# Verify the values we are adding are not duplicated
if(sample_name_old %in% names(sample_renaming)) raise_error(
msg = paste0('Duplicated sample name: ', sample_name_old),
file = input_file)
for(x in sample_name_new){
if(x %in% as.character(unlist(sample_renaming)) |
any(duplicated(sample_name_new))) raise_error(
msg = paste0('Duplicated new sample name: ', x),
file = input_file)
}
# Replace any ' character in the new sample name with "_"
sample_renaming[[sample_name_old]] = gsub("'", '-', sample_name_new)
# Add new old/new sample name pair to list.
# Replace any ' character in the new sample name with "_".
sample_rename[[old_name]] = gsub("'", '-', new_name)
}