Commit 24542faa authored by Robin Engler's avatar Robin Engler
Browse files

Feat: add auto-deletion of duplicated rows in input files

parent 36fe98b0
......@@ -57,6 +57,15 @@ FILE_LOSS_PERCENTAGE_THRESHOLD <<- 5
SHOW_TISSUE_CATEGORY_MISMATCH_WARNING <<- TRUE
TISSUE_CATEGORY_MISMATCH_THRESHOLD <<- 5
# Duplicated cell segmentation data threshold
# * CELL_SEG_DATA_DUPLICATE_THRESHOLD: threshold (as a percentage) above which a warning is
# displayed when deleting duplicated cell segmentation values.
# * TISSUE_SEG_DATA_DUPLICATE_THRESHOLD: threshold (as a percentage) above which an error is
# raised when deleting duplicated tissue segmentation values.
CELL_SEG_DATA_DUPLICATE_THRESHOLD <<- 1
TISSUE_SEG_DATA_DUPLICATE_THRESHOLD <<- 10
# InForm version data formats supported.
SUPPORTED_INFORM_VERSIONS <<- c(2.2, 2.4)
......
......@@ -4,7 +4,7 @@
#' @param input_dir
#' @param cell_compartment
#' @param output_dir
reduce_file_size <- function(input_dir, cell_compartment, output_dir = NULL){
reduce_file_size <- function(input_dir, output_dir, cell_compartment){
# Define columns to keep for cell and tissue segmentation files respectively.
cols_to_keep_cell = c("Sample Name", "Tissue Category", "Phenotype", "Cell ID",
......
......@@ -7,7 +7,7 @@
#' Raise an error if the input data contains duplicated rows.
#'
#' Rows flagged by duplicated() (i.e. each repeat of an earlier row) are rendered as a single
#' space-separated string and passed to raise_error() through its 'items_to_list' argument.
#' Nothing happens when no duplicated row is found.
#'
#' @param data_frame Data frame to check for duplicated rows.
check_for_duplicated_rows <- function(data_frame){
    duplicated_rows = which(duplicated(data_frame, MARGIN=1))
    # 'msg' must be passed exactly once: R rejects a call where the same formal argument is
    # matched by several actual arguments.
    if(length(duplicated_rows) > 0) raise_error(
        msg = "Duplicated rows found in input data tissue segmentation data:",
        # vapply() guarantees a character vector, whatever the number of duplicated rows.
        items_to_list = vapply(duplicated_rows,
                               FUN=function(x) paste(data_frame[x,], collapse=' '),
                               FUN.VALUE=character(1)))
}
......
This diff is collapsed.
......@@ -360,32 +360,51 @@ load_thresholds_file <- function(input_file,
####################################################################################################
scan_dir_for_seg_data_files <- function(input_directory){
# ********************************************************************************************
# Scan the input_directory for *_cell_seg_data and *_tissue_seg_data_summary files. Verify
# the files go in pairs, and return a list with two elements: 'cell_files' and 'tissue_files'.
#
# ********************************************************************************************
stopifnot(dir.exists(input_directory))
#' Scan the input directory for "_cell_seg_data" and "_tissue_seg_data_summary" files. Verify that
#' the files go in pairs, and return a list with two elements: 'cell_files' and 'tissue_files'.
#'
#' @param input_dir [str] Path of directory to scan.
scan_dir_for_seg_data_files <- function(input_dir){
stopifnot(dir.exists(input_dir))
# List of all *_cell_seg_data.txt and *_tissue_seg_data_summary.txt files in input directory.
cell_files = list.files(path=input_directory, pattern=paste0('.*', CELL_FILES_EXTENSION, '$'),
# Note: list.files() does not support regexp patterns with lookahead (it offers no perl=TRUE
# option), so the following does not work:
#   pattern=paste0('^((?!_reject).)*', CELL_FILES_EXTENSION, '$')
# The same kind of pattern does work with grep(..., perl=TRUE), as in:
# test = c("do_keep_this_foobar", "keep_this_foobar_too", "do_reject_that_foobar",
# "and this as well", "_foobar")
# grep("^((?!_reject).)*_foobar.*$", test, perl=TRUE)
cell_files = list.files(path=input_dir, pattern=paste0('.*', CELL_FILES_EXTENSION, '$'),
all.files=FALSE, full.names=FALSE, recursive=FALSE, ignore.case=FALSE)
tissue_files = list.files(path=input_directory, pattern=paste0('.*',TISSUE_FILES_EXTENSION,'$'),
tissue_files = list.files(path=input_dir, pattern=paste0('.*',TISSUE_FILES_EXTENSION,'$'),
all.files=FALSE, full.names=FALSE, recursive=FALSE, ignore.case=FALSE)
# Remove any "rejected" files from the lists
reject_pattern = "_rejected_"
cell_files = cell_files[!grepl(reject_pattern, cell_files, ignore.case=TRUE)]
tissue_files = tissue_files[!grepl(reject_pattern, tissue_files, ignore.case=TRUE)]
# If no files were found, return NULL.
if(length(cell_files) + length(tissue_files) == 0) return(NULL)
# If "merge" files are present in the directory, keep only these and discard any non-merge
# files.
merge_pattern = "_merge_"
merge_files = grep(merge_pattern, cell_files, ignore.case=TRUE)
if(length(merge_files) > 0) cell_files = cell_files[merge_files]
merge_files = grep(merge_pattern, tissue_files, ignore.case=TRUE)
if(length(merge_files) > 0) tissue_files = tissue_files[merge_files]
# Verify that the cell and tissue segmentation files go by pairs.
prefix_cell = gsub(pattern='_cell_seg_data.txt', replacement='', x=cell_files)
prefix_tissue = gsub(pattern='_tissue_seg_data_summary.txt', replacement='', x=tissue_files)
prefix_cell = sort(prefix_cell)
prefix_tissue = sort(prefix_tissue)
prefix_cell = sort(gsub(pattern=CELL_FILES_EXTENSION, replacement='', x=cell_files))
prefix_tissue = sort(gsub(pattern=TISSUE_FILES_EXTENSION, replacement='', x=tissue_files))
difference_list = c(setdiff(prefix_cell,prefix_tissue), setdiff(prefix_cell,prefix_tissue))
if(length(difference_list) > 0) raise_error(
msg = "The following cell and/or tissue segmentation files are not in pairs:",
file = input_directory,
file = input_dir,
items_to_list = difference_list)
# The function returns the prefix of all cell/tissue segmentation files.
......
......@@ -197,8 +197,8 @@ postinform_pipeline <- function(input_dir,
# Delete unnecessary columns in segmentation files
# ************************************************
# Copy cell and tissue segmentation files to output dir, delete unnecessary columns in files
# ******************************************************************************************
# The objective is to delete all columns in the input data that are not necessary for the
# analysis (only a small subset of columns is needed). This makes loading the files faster
# and reduces memory consumption.
......
......@@ -42,66 +42,48 @@ fi
[[ ! -d "$outputDir" ]] && mkdir -p "$outputDir"
# Reduce cell segmentation files (*_cell_seg_data.txt).
# Reduce cell segmentation (*_cell_seg_data.txt) and
# tissue surface (*_tissue_seg_data_summary.txt) files
# ****************************************************
[[ $verbose -eq 1 ]] && echo "### Reduce *_cell_seg_data.txt files:"
# Retrieve all cell segmentation files.
originalIFS="$IFS"
IFS=$'\n'
fileList=( $( find "$inputDir" -type f -name '*_cell_seg_data.txt' ) )
IFS="$originalIFS"
# For each input file, keep only the columns that are needed for the analysis.
for fileName in "${fileList[@]}"; do
[[ $verbose -eq 1 ]] && echo "### -> ${fileName}"
colsToKeep=$( head -n1 "$fileName" | tr '\t' '\n' | \
grep -i -n "^Sample Name$\|^Tissue Category$\|^Phenotype$\|^Cell ID$\|^Cell X Position$\|^Cell Y Position$\|^Annotation ID\|^Confidence\|^${cellCompartment} .* Mean " | \
cut -f1 --delimiter=':' )
[[ -z $colsToKeep ]] && echo "### ERROR: no columns to keep found in [$fileName]" && exit 1
# Note that in bash it's not possible to use a file as both input and output, so we create
# a temporary file, delete the original file and then rename the temporary file.
cut -f$( echo $colsToKeep | tr ' ' ',' ) "$fileName" > "$outputDir/$( basename $fileName )"
unset colsToKeep
for fileExtension in '_cell_seg_data.txt' '_tissue_seg_data_summary.txt'; do
[[ $verbose -eq 1 ]] && echo "### Reduce ${fileExtension} files:"
# Retrieve all cell/tissue files, except "rejected" files. If "merge" files are present, keep
# only the "merge" files.
if [[ $( find "$inputDir" -type f -name "*_merge${fileExtension}" | wc -l ) -gt 0 ]]; then
fileList=($(find "$inputDir" -type f -name "*_merge${fileExtension}" -a ! -name "*_rejected_*"))
else
fileList=($(find "$inputDir" -type f -name "*${fileExtension}" -a ! -name "*_rejected_*"))
fi
# For each input file, keep only the columns that are needed for the analysis.
for fileName in "${fileList[@]}"; do
[[ $verbose -eq 1 ]] && echo "### -> ${fileName}"
# Get list of columns to keep.
if [[ $fileExtension == '_cell_seg_data.txt' ]]; then
colsToKeep=$(head -n1 "$fileName" | tr '\t' '\n' | \
grep -i -n "^Sample Name$\|^Tissue Category$\|^Phenotype$\|^Cell ID$\|^Cell X Position$\|^Cell Y Position$\|^Annotation ID\|^Confidence\|^${cellCompartment} .* Mean " | \
cut -f1 --delimiter=':')
else
colsToKeep=$(head -n1 "$fileName" | tr '\t' '\n' | \
grep -n "^Sample Name$\|^Tissue Category$\|^Region Area\|^Annotation ID" | \
cut -f1 --delimiter=':')
fi
[[ -z $colsToKeep ]] && echo "### ERROR: no columns to keep found in [$fileName]" && exit 1
# Save copy of file with only the selected columns to output directory.
cut -f$( echo $colsToKeep | tr ' ' ',' ) "$fileName" > "$outputDir/$( basename $fileName )"
unset colsToKeep
done
[[ $verbose -eq 1 ]] && echo -e "### -> completed. \n### "
unset fileList fileName
done
[[ $verbose -eq 1 ]] && echo -e "### -> completed. \n### "
unset fileList fileName colsToKeep
# Reduce tissue surface files (*_tissue_seg_data_summary.txt).
# ***********************************************************
[[ $verbose -eq 1 ]] && echo "### Reduce *_tissue_seg_data_summary.txt files:"
# Retrieve all tissue surface files (*_tissue_seg_data_summary.txt).
IFS=$'\n'
fileList=( $( find "$inputDir" -type f -name '*_tissue_seg_data_summary.txt' ) )
IFS="$originalIFS"
# For each input file, keep only the columns that are needed for the analysis.
for fileName in "${fileList[@]}"; do
[[ $verbose -eq 1 ]] && echo "### -> ${fileName}"
colsToKeep=$( head -n1 "$fileName" | tr '\t' '\n' | \
grep -n "^Sample Name$\|^Tissue Category$\|^Region Area\|^Annotation ID" | \
cut -f1 --delimiter=':' )
[[ -z $colsToKeep ]] && echo "### ERROR: no columns to keep found in [$fileName]" && exit 1
cut -f$( echo $colsToKeep | tr ' ' ',' ) "$fileName" > "$outputDir/$(basename $fileName)"
unset colsToKeep
done
[[ $verbose -eq 1 ]] && echo -e "### -> completed. \n### "
# End of script message.
if [[ $verbose -eq 1 ]]; then
echo "### Data reduction completed."
echo "################################################################################"
fi
unset fileList fileName colsToKeep originalIFS
exit 0
####################################################################################################
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment