Commit 23fd1ce7 authored by Robin Engler's avatar Robin Engler
Browse files

Add auto-removal of duplicated lines in input files

parent f634c728
......@@ -58,17 +58,13 @@ merge_cell_data_files <- function(files_to_merge){
merged_df = NULL
for(f in files_to_merge){
# Load data. If needed, remove perfectly duplicated rows.
input_df = read.table(f, sep='\t', header=TRUE,
as.is=TRUE, colClasses='character', strip.white=TRUE)
if(any(duplicated(input_df))) input_df = unique(input_df)
# Verify that the input data frame has unique rows for the key fields.
if(any(duplicated(input_df[,key_fields]))) raise_error(
msg = 'Duplicated rows detected in input file:',
file = f,
items_to_list = which(duplicated(input_df[,key_fields])))
# Load data and remove duplicated rows.
input_df = remove_duplicated_rows(
input_df = read.table(f, sep='\t', header=TRUE, as.is=TRUE,
colClasses='character', strip.white=TRUE),
key_fields = key_fields,
file_name = f)
# Verify all input files have the same columns. This must always be the case as we have
# already standardized the files earlier.
if(is.null(merged_df)){
......@@ -124,10 +120,36 @@ merge_cell_data_files <- function(files_to_merge){
# In principle, the "Tissue Category" columns of all individual markers should contain the
# same value. Here we verify that this is the case and then keep only one copy of them.
if(any(tissue_cat_df != tissue_cat_df[,1])){
raise_error(msg=c('Could not merge individual marker files.',
'Reason: tissue_category values differ across files.'),
file=dirname(files_to_merge[1]))
diff_rows = unlist(sapply(2:ncol(tissue_cat_df),
FUN=function(x) which(tissue_cat_df[x] != tissue_cat_df[,1])))
for(x in diff_rows){
value_frequency = sort(table(as.character(tissue_cat_df[x,])), decreasing=T)
stopifnot(length(value_frequency) >= 2)
# Case 1: one of the tissue values has a majority within the row. Majority ruling is
# possible and the most frequent value is used.
if(as.numeric(value_frequency[1]) > as.numeric(value_frequency[2])){
tissue_cat_df[x,] = names(value_frequency)[1]
raise_error(msg = c(paste0('tissue_category values differ accorss files. ',
'Values were reconciled based on majority ruling.'),
paste0('Offending row: ', x)),
file=files_to_merge[1],
type = 'warning')
# Case 2: majority ruling is not possible
} else{
raise_error(
msg = c('Could not merge individual marker files.',
'Reason: tissue_category values differ across files with no majority.',
paste0('Offending row: ', x),
paste0('Offending values:', paste(tissue_cat_df[x,], collapse=' '))),
file = files_to_merge[1])
}
}
stopifnot(all(tissue_cat_df == tissue_cat_df[,1]))
}
# Add tissue category values to merged dataframe.
merged_df[,'tissue_category'] = tissue_cat_df[,1]
......@@ -161,7 +183,7 @@ merge_cell_data_files <- function(files_to_merge){
differing_values = as.vector(as.matrix(differences))[differing_values]
raise_error(
msg=c(paste0('Values for column [', col_name, '] differ accross individual files'),
paste0('files to merge by more than ', tolerance_limit, ' at ',
paste0('to merge by more than ', tolerance_limit, ' at ',
length(differing_values), ' occurences [',
length(differing_values)/nrow(marker_int_df)*100, ' %].'),
'Values from the first file (alphabetically) will be used.'),
......@@ -200,20 +222,15 @@ merge_tissue_data_files <- function(files_to_merge){
merged_df = NULL
for(f in files_to_merge){
# Load data and verify that the column names are correct. At this point the files are
# standardized so they must all have the same column names.
input_df = read.table(f, sep='\t', header=TRUE, as.is=TRUE, strip.white=TRUE,
colClasses=c(rep('character',3), rep('numeric',2)))
# Load data and remove duplicated rows. Verify that the column names are correct - at this
# point the files are standardized so they must all have the same column names.
input_df = remove_duplicated_rows(
input_df = read.table(f, sep='\t', header=TRUE, as.is=TRUE, strip.white=T,
colClasses=c(rep('character',3), rep('numeric',2))),
key_fields = key_fields,
file_name = f)
stopifnot(all(colnames(input_df) == c(key_fields, non_key_fields)))
# If needed, remove perfectly duplicated rows. Then verify that the input data frame
# has unique rows for the key fields.
if(any(duplicated(input_df))) input_df = unique(input_df)
if(any(duplicated(input_df[,key_fields]))) raise_error(
msg = 'Duplicated rows detected in input file:',
file = f,
items_to_list = which(duplicated(input_df[,key_fields])))
# Merge data frame for the current marker with the global dataframe 'merged_df'.
if(is.null(merged_df)){
merged_df = input_df
......@@ -256,3 +273,30 @@ merge_tissue_data_files <- function(files_to_merge){
}
####################################################################################################
####################################################################################################
remove_duplicated_rows <- function(input_df, key_fields, file_name){
    # ********************************************************************************************
    # Remove duplicated rows from the input table (input_df) in two passes:
    #   1. Rows that are identical over all columns are dropped silently (first occurrence kept).
    #   2. Rows that are duplicated over the key_fields columns only are dropped (first occurrence
    #      kept) and reported through raise_error() with type='warning'.
    #
    # input_df  : data frame to clean up.
    # key_fields: names of the columns that must be unique across rows.
    # file_name : name of file from where input_df is loaded. Only used to display warning to user.
    #
    # Returns the input data frame without the duplicated rows.
    # ********************************************************************************************

    # Keep track of the position each row had in the table as it was passed in. Without this,
    # the indices reported in the warning would be shifted by the number of perfectly duplicated
    # rows removed in the first pass and would no longer match the rows of the input file.
    original_rows = seq_len(nrow(input_df))

    # Remove perfectly duplicated rows without displaying any warning.
    exact_duplicates = duplicated(input_df)
    if(any(exact_duplicates)){
        input_df = input_df[!exact_duplicates, , drop=FALSE]
        original_rows = original_rows[!exact_duplicates]
    }

    # Remove rows duplicated over the key_fields with a warning to the user.
    # drop=FALSE keeps a data frame even when a single key field is given.
    key_duplicates = duplicated(input_df[, key_fields, drop=FALSE])
    if(any(key_duplicates)){
        deleted_rows = original_rows[key_duplicates]
        input_df = input_df[!key_duplicates, , drop=FALSE]
        raise_error(msg = 'The following duplicated rows were deleted from input file:',
                    file = file_name,
                    items_to_list = deleted_rows,
                    type = 'warning')
    }
    return(input_df)
}
####################################################################################################
......@@ -30,7 +30,7 @@ AUTHORIZED_COMPARTMENTS <<- c('nucleus', 'membrane', 'cytoplasm', 'entire_cell'
AUTHORIZED_STROMA_VALUES <<- c('DAPI', 'stroma', 'other')
AUTHORIZED_TUMOR_VALUES <<- c('CK', 'tumor')
AUTHORIZED_MARKERS <<- c('CAL', 'CD3', 'CD4', 'CD8', 'CD11C', 'CD15', 'CD20', 'CD56', 'CD68',
'CD163', 'CD206', 'FOXP3', 'GB', 'gH2AX', 'gH2AXN', 'IDO',
'CD103', 'CD163', 'CD206', 'FOXP3', 'GB', 'gH2AX', 'gH2AXN', 'IDO',
'Keratin', 'KI67', 'PD1', 'PDL1', 'WT1', 'CK', 'VISTA')
IGNORED_PHENOTYPES <<- c('DAPIp', 'MISSING')
......
......@@ -21,12 +21,13 @@ autodesc_main(input_file_or_dir, delete_input=FALSE, immucan_output=TRUE)
# Example of non-IMMUCAN samples processing.
# *****************************************
input_file = 'Session_LUD2014_P2_2.zip'
input_file = 'Session_Batch6_1_Curry2.zip'
# Run autodesc from input .zip file.
input_file_or_dir = file.path(AUTODESC_ROOT, 'test_data', input_file)
autodesc_main(input_file_or_dir, delete_input=FALSE, immucan_output=FALSE)
# Run autodesc from directory (i.e. decompressed .zip file.)
autodesc_main(sub('.zip$', '', input_file_or_dir), delete_input=FALSE, immucan_output=F)
autodesc_main(sub('.zip$|.tar.gz$', '', input_file_or_dir), delete_input=FALSE, immucan_output=F, command='reduce')
autodesc_main(sub('.zip$|.tar.gz$', '', input_file_or_dir), delete_input=FALSE, immucan_output=F, command='all')
####################################################################################################
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment