Commit 76908cd4 authored by Robin Engler's avatar Robin Engler
Browse files

Modify behavior of tissue surface merging for individual file: use min instead of median

parent 1a25c8c7
......@@ -20,7 +20,8 @@ AUTHORIZED_STROMA_VALUES <<- c('DAPI', 'stroma', 'other')
AUTHORIZED_TUMOR_VALUES <<- c('CK', 'tumor')
AUTHORIZED_MARKERS <<- c('CAL', 'CD3', 'CD4', 'CD8', 'CD11C', 'CD15', 'CD20', 'CD56', 'CD68',
'CD103', 'CD163', 'CD206', 'FOXP3', 'GB', 'gH2AX', 'gH2AXN', 'IDO',
'Keratin', 'KI67', 'PD1', 'PDL1', 'PERFORIN', 'WT1', 'CK', 'VISTA')
'Keratin', 'KI67', 'PD1', 'PDL1', 'PERFORIN', 'SOX10', 'WT1', 'CK',
'VISTA')
IGNORED_PHENOTYPES <<- c('DAPIp', 'MISSING')
DATAREDUCE_SCRIPT <<- file.path(dirname(dirname(sys.frame(1)$ofile)),
......
......@@ -542,34 +542,44 @@ extract_imageid <- function(input_vector, input_file='file not specified'){
####################################################################################################
decompress_file <- function(input_file, dry_run=FALSE){
decompress_file <- function(input_file, allow_overwrite=FALSE, dry_run=FALSE){
# ********************************************************************************************
# Decompress .zip and .tar.gz input files and return the directory containing the uncompressed
# data.
#
# Input parameters:
# -> input_file: input .tar.gz or .zip file to uncompress.
# -> input_file: input .tar.gz or .zip file to decompress.
# ********************************************************************************************
root_dir = dirname(input_file)
# Input file is tarball.
# Determine archive type:
if(endsWith(input_file, '.tar.gz')){
data_dir = file.path(root_dir, unlist(strsplit(untar(input_file, list=TRUE)[1], '/'))[1])
if(dry_run) return(data_dir)
if(dir.exists(data_dir)) unlink(data_dir, recursive=TRUE)
untar(input_file, list=FALSE, exdir=root_dir)
# Input file is zip archive.
type = 'tar'
} else if(endsWith(input_file, '.zip')){
type = 'zip'
} else stop('Unsupported compression format. Only [.tar.gz] and [.zip] files are supported.')
# Determine output directory:
if(type == 'tar'){
data_dir = file.path(root_dir, unlist(strsplit(untar(input_file, list=TRUE)[1], '/'))[1])
} else if(type == 'zip'){
data_dir = file.path(root_dir,
unlist(strsplit(zip::zip_list(zipfile=input_file)[1,1], '/'))[1])
if(dry_run) return(data_dir)
if(dir.exists(data_dir)) unlink(data_dir, recursive=TRUE)
zip::unzip(input_file, exdir=root_dir, overwrite=TRUE)
}
if(dry_run) return(data_dir)
# Unsupported compression format.
} else stop('Unsupported compression format.')
# Test whether output already exists, and if yes delete it if allowed by user.
if(dir.exists(data_dir)){
if(!allow_overwrite) stop('File decompression failed: output already exists [',data_dir,']')
unlink(data_dir, recursive=TRUE)
}
# Decompress file.
if(type == 'tar'){
untar(input_file, list=FALSE, exdir=root_dir)
} else if(type == 'zip'){
zip::unzip(input_file, exdir=root_dir, overwrite=TRUE)
}
# Return dirname of extracted files.
stopifnot(file.info(data_dir)$isdir)
......
......@@ -229,7 +229,7 @@ merge_tissue_data_files <- function(files_to_merge){
file_name = f)
stopifnot(all(colnames(input_df) == c(key_fields, non_key_fields)))
# Merge data frame for the current marker with the global dataframe 'merged_df'.
# Merge data frame for the current marker with the global data frame 'merged_df'.
if(is.null(merged_df)){
merged_df = input_df
} else{
......@@ -240,34 +240,22 @@ merge_tissue_data_files <- function(files_to_merge){
}
}
# Search for mismatches among values of non-key fields. If some are detected, a warning
# is displayed. For rows with mismatches, if any, compute the median of surface values. In
# this way, if one of the input files has a different values it gets excluded (provided there
# are at least 3 files).
mismatches = NULL
for(col_name in non_key_fields){
col_index = grep(col_name, colnames(merged_df))
difference = abs(merged_df[,col_index] - merged_df[,col_index[1]])
mismatches = unique(c(mismatches, which(apply(difference, MARGIN=1, sum) > 0)))
}
if(length(mismatches) > 0){
percentage = round(length(mismatches)/nrow(merged_df) * 100, 2)
raise_error(msg = c('Mismatches in tissue surface among tissue seg files were found',
'Median value of tissue surface will be used for the following rows:',
paste0(paste(mismatches, collapse=', '), ' [', percentage, '%]')),
file = dirname(files_to_merge[1]),
type='warning')
# Compute median values to reconciliate mismatches.
for(col_name in non_key_fields){
col_index = grep(col_name, colnames(merged_df))
merged_df[mismatches, col_index[1]] = apply(merged_df[mismatches, col_index], 1, median)
}
# Merge tissue surface values (absolute value or percentage).
# The merge is made by keeping the smallest surface value from the individual files. This is
# because, when merging cells value, we keep the intersection of all individual files, which,
# in terms of surface, corresponds (roughly) to the smallest surface value in the tissue
#surface files.
for(i in 1:nrow(merged_df)){
# Identify column with the smallest surface value.
col_index = grep('region_area_surface', colnames(merged_df))
min_index = which(merged_df[i, col_index] == min(merged_df[i, col_index]))[1]
# Replace tissue surface values with the minimum value across all files.
merged_df[i, 'region_area_surface'] = merged_df[i, col_index[min_index]]
merged_df[i, 'region_area_percent'] = merged_df[i, col_index[min_index] + 1]
}
# Remove duplicated columns.
merged_df = merged_df[,1:5]
return(merged_df)
return(merged_df[,1:5])
}
####################################################################################################
......
......@@ -94,7 +94,7 @@ postinform <- function(input_file_or_dir,
# If the input is a compressed .zip or .tar.gz file, decompress it.
if(!file.info(input_file_or_dir)$isdir){
message("Decompressing ", input_file_or_dir, "...")
input_dir = decompress_file(input_file_or_dir)
input_dir = decompress_file(input_file_or_dir, allow_overwrite=allow_overwrite)
}
# Create output directory and log file.
......
......@@ -62,3 +62,47 @@ This command will produce an output file named "Test_session_random_suffix.zip".
postinform(input_file_or_dir="Test_session.zip", command='process', output_suffix="random_suffix",
compress_output=TRUE, immucan_output=TRUE, allow_overwrite=FALSE)
```
### Post-inForm input parameter file format.
The lists of samples, tissues, markers and marker combinations to process are passed to post-inForm
via a single plain text file that must be named `parameters.txt` and be located at the root of the
input directory.
**Important:** for **Windows users**, the `parameters.txt` file should be encoded in `UTF-8`.
The parameters input file must contain the following 5 sections. For each section, values can be
passed on the same line separated by a `,`, or on multiple lines (one value per line). Any line
starting with a `#` character is ignored (this allows comments to be added to the file).
For the `marker_combinations:` section, the special value `all` can be passed to process all
possible combinations of markers (to avoid having to enter all combinations manually).
```
samples:
tissues:
phenotyped_markers:
scored_markers:
marker_combinations:
```
A template file can be downloaded [here](tests/parameters.txt).
`parameters.txt` file example:
```
# List of samples to process.
samples:
SAMPLE_1
SAMPLE_2
SAMPLE_3
# List of tissues to process.
tissues: stroma, tumor
# List of markers.
phenotyped_markers: CD20, CD3, CD68
scored_markers:
# Marker combinations to test
marker_combinations: all
```
# Post-inForm input parameter template.
# List of samples to process.
samples:
# List of tissues to process.
tissues: stroma, tumor
# List of markers.
phenotyped_markers:
scored_markers:
# Marker combinations to test
marker_combinations: all
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment