Commit a2f08b97 authored by Robin Engler's avatar Robin Engler
Browse files

Initial commit

parents
^.*\.Rproj$
^\.Rproj\.user$
.Rproj.user
.Rhistory
.RData
.Ruserdata
Package: postinform
Type: Package
Title: Post-process cell immunofluorescence data produced by the inForm software
Version: 0.1.0
Author: Robin Engler, Martial Sankar
Maintainer: Robin Engler <robin.engler@sib.swiss>
Description: Post-processes cell immunofluorescence segmentation data produced by
    the inForm software: checks and standardizes input files, reduces file size,
    merges individual-marker segmentation files and produces summary statistics.
License: Not released yet
Encoding: UTF-8
LazyData: true
Imports:
zip,
openxlsx,
checkmate
RoxygenNote: 7.1.1
# Generated by roxygen2: do not edit by hand
# Global variables and function sourcing.
# Note: in principle the assignment of global variables "<<-" is not really necessary since the
# declaration is made at the top level; "<<-" is kept throughout to make the global intent
# explicit and consistent (the defaults below originally used "=", everything else "<<-").

# Set application version.
POSTINFORM_VERSION <<- '0.1.0'

# Define default parameter values.
DEFAULT_CELL_COMPARTMENT <<- 'nucleus'
DEFAULT_PHENOTYPE_CONFIDENCE_THRESHOLD <<- 40

# Define authorized values in input data.
NOTISSUE <<- 'missing'
NOTISSUES_SYNONYMS <<- c('other', 'nothing', 'none', 'no tissue', '')
AUTHORIZED_TISSUES <<- c('stroma', 'tumor', 'dermis', 'epidermis', 'melanocyte', 'necrosis',
                         NOTISSUE)
AUTHORIZED_COMPARTMENTS <<- c('nucleus', 'membrane', 'cytoplasm', 'entire_cell')
AUTHORIZED_STROMA_VALUES <<- c('DAPI', 'stroma', 'other')
AUTHORIZED_TUMOR_VALUES <<- c('CK', 'tumor')
AUTHORIZED_MARKERS <<- c('CAL', 'CD3', 'CD4', 'CD8', 'CD11C', 'CD15', 'CD20', 'CD56', 'CD68',
                         'CD103', 'CD163', 'CD206', 'FOXP3', 'GB', 'gH2AX', 'gH2AXN', 'IDO',
                         'Keratin', 'KI67', 'PD1', 'PDL1', 'PERFORIN', 'WT1', 'CK', 'VISTA')
IGNORED_PHENOTYPES <<- c('DAPIp', 'MISSING')

# Location of the bash script used to reduce file sizes, resolved relative to this source file.
# NOTE(review): sys.frame(1)$ofile is only defined when this file is loaded via source() from
# the top level; running the file any other way will fail on this line -- confirm intended usage.
DATAREDUCE_SCRIPT <<- file.path(dirname(dirname(sys.frame(1)$ofile)),
                                'inst/bash/reduce_file_size.sh')

# File name suffixes and standard file/directory names used throughout the package.
CELL_FILES_EXTENSION <<- '_cell_seg_data.txt'
TISSUE_FILES_EXTENSION <<- '_tissue_seg_data_summary.txt'
PARAMETERS_FILE <<- 'parameters.txt'
THRESHOLDS_FILE <<- 'thresholds.txt'
SAMPLE_RENAME_FILE <<- 'sample_rename.txt'
SEGMENTATION_DATA_DIR <<- 'seg_data'
SUMMARY_OUTPUT_DIR <<- 'summary_statistics'
LOG_FILE_NAME <<- 'log.txt'

# InForm version data formats supported.
SUPPORTED_INFORM_VERSIONS <<- c(2.2, 2.4)

# Load source code. Paths are relative to the directory from which this file is sourced.
source('postinform.R')
source('input_check.R')
source('load_data.R')
source('data_reduction.R')
source('individual_markers.R')
source('rename_samples.R')
source('functions.R')
source('legacy_functions.R')
####################################################################################################
#' Reduce the size of all cell and tissue segmentation files found in input_dir by keeping only
#' the columns needed for downstream analysis.
#'
#' @param input_dir [string] Directory containing the *_cell_seg_data.txt and
#'     *_tissue_seg_data_summary.txt files to reduce.
#' @param cell_compartment [string] Cell compartment (e.g. 'nucleus') whose '... Mean ...'
#'     intensity columns are kept in the cell segmentation files.
#' @param output_dir [string] Directory where the reduced files are written. Defaults to
#'     input_dir, in which case the input files are overwritten in place.
reduce_file_size <- function(input_dir, cell_compartment, output_dir = NULL){
    # Default to in-place reduction. Without this, file.path(NULL, x) silently returns 'x', so
    # output files would land in the current working directory instead of input_dir.
    if(is.null(output_dir)) output_dir = input_dir

    # Define columns to keep for cell and tissue segmentation files respectively.
    cols_to_keep_cell = c("Sample Name", "Tissue Category", "Phenotype", "Cell ID",
                          "Cell X Position", "Cell Y Position", "Confidence", "Annotation ID",
                          paste0(cell_compartment, ".* Mean .*"))
    cols_to_keep_tissue = c("Sample Name", "Tissue Category", "Region Area .*", "Annotation ID")

    # Loop through all files present in the input directory and reduce their size.
    # TODO: parallelized loop with "foreach" and "doParallel" libraries.
    #   core_nb = parallel::detectCores() - 2
    #   if(core_nb <= 0) core_nb = 1
    #   if(core_nb > 4) core_nb = 4
    #   cl = parallel::makeCluster(core_nb)
    #   doParallel::registerDoParallel(cl)
    #   foreach(prefix = scan_dir_for_seg_data_files(input_dir)) %dopar% { ... }
    #   parallel::stopCluster(cl)
    for(prefix in scan_dir_for_seg_data_files(input_dir)){
        cell_seg_file = paste0(prefix, CELL_FILES_EXTENSION)
        tissue_seg_file = paste0(prefix, TISSUE_FILES_EXTENSION)

        # Reduce the cell segmentation file.
        log_message(file.path(basename(input_dir), cell_seg_file), level=2)
        delete_cols_in_file(input_file = file.path(input_dir, cell_seg_file),
                            colnames_to_keep = cols_to_keep_cell,
                            output_file = file.path(output_dir, cell_seg_file),
                            chunk_size = 200000,
                            sep = '\t')

        # Reduce the tissue segmentation file.
        log_message(file.path(basename(input_dir), tissue_seg_file), level=2)
        delete_cols_in_file(input_file = file.path(input_dir, tissue_seg_file),
                            colnames_to_keep = cols_to_keep_tissue,
                            output_file = file.path(output_dir, tissue_seg_file),
                            chunk_size = 200000,
                            sep = '\t')
    }
    return(invisible(NULL))
}
####################################################################################################
####################################################################################################
#' Reads the input_file table chunk_size lines at a time and keeps only columns listed in the
#' colnames_to_keep string vector.
#'
#' @param input_file [string] File to process.
#' @param colnames_to_keep [string vector] Names of columns from input_file to keep.
#'     Regexp notation is allowed.
#' @param output_file [string] File where to save the output data table with the reduced number
#'     of columns. By default the output file is set to input_file, meaning that the input file
#'     gets overwritten.
#' @param chunk_size [integer] Number of lines of the input file to read at a time. Larger
#'     values result in faster processing but larger memory consumption.
#' @param sep [string] Delimiter (separator) used in the input table file.
delete_cols_in_file <- function(input_file,
                                colnames_to_keep,
                                output_file = input_file,
                                chunk_size = 100000,
                                sep = '\t'){
    # Read file header and identify which columns of the file should be kept.
    stopifnot(file.exists(input_file))
    file_connection = file(input_file, open='r', encoding=guess_file_encoding(input_file))
    # Make sure the connection is released even if an error occurs while reading.
    on.exit(close(file_connection), add=TRUE)
    header = unlist(strsplit(readLines(con=file_connection, n=1), split=sep))
    # Anchor each requested name so 'Phenotype' does not also match 'Phenotype confidence', etc.
    cols_to_keep = grep(pattern=paste0('^', paste(colnames_to_keep, collapse='$|^'), '$'),
                        x=header, ignore.case=TRUE)

    # If there are no columns to remove, exit the function immediately to save time.
    if(length(cols_to_keep) == length(header)){
        if(output_file != input_file){
            if(!dir.exists(dirname(output_file))) dir.create(dirname(output_file), recursive=TRUE)
            file.copy(from=input_file, to=output_file, overwrite=FALSE)
        }
        return(invisible(NULL))
    }

    # Load content of file chunk_size lines at a time and keep only the subset of needed columns.
    # Chunks are accumulated in a list and bound once at the end: calling rbind() inside the loop
    # re-copies the accumulated data frame on every iteration (quadratic cost on large files).
    chunks = list()
    while(length(fchunk <- readLines(con=file_connection, n=chunk_size, warn=FALSE)) > 0){
        # drop=FALSE keeps the matrix structure even when a chunk contains a single line or when
        # a single column is kept; without it the subset collapses to a plain vector and the
        # resulting data frame would be transposed/corrupted.
        chunks[[length(chunks) + 1]] = as.data.frame(
            matrix(unlist(strsplit(fchunk, split=sep)),
                   nrow=length(fchunk), byrow=TRUE)[, cols_to_keep, drop=FALSE])
    }
    reduced_df = do.call(rbind, chunks)

    # Write the reduced data frame to output_file, creating the output directory if needed.
    colnames(reduced_df) = header[cols_to_keep]
    if(!dir.exists(dirname(output_file))) dir.create(dirname(output_file), recursive=TRUE)
    write.table(reduced_df, file=output_file, row.names=FALSE, quote=FALSE, sep=sep)
    return(invisible(NULL))
}
####################################################################################################
####################################################################################################
#' Delete all files in the session_root_dir input directory that are not needed for the analysis.
#'
#' Sub-directories that are empty, or whose files are all unnecessary, are deleted entirely.
#'
#' @param session_root_dir [string] Root directory of the session to clean up.
delete_unnecessary_files <- function(session_root_dir){
    # Verify the input directory exists.
    if(!file.exists(session_root_dir) || !file.info(session_root_dir)$isdir) raise_error(
        paste('Input sub-directory could not be found:', session_root_dir))

    # Delete unnecessary files from all sub-directories present in the session's root dir.
    for(subdir in list.dirs(session_root_dir, full.names=TRUE, recursive=FALSE)){
        # Get list of all files in the current sub-directory. Delete the sub-directory if empty.
        file_list = list.files(path=subdir, all.files=FALSE, full.names=FALSE, recursive=FALSE)
        if(length(file_list) == 0){
            unlink(subdir, recursive=TRUE)
            next
        }

        # Identify 'rejected', 'seg_data' and 'merged' files. 'seg_data' files correspond to
        # *_cell_seg_data.txt and *_tissue_seg_data_summary.txt files. The dots in the extensions
        # are escaped: the original unescaped '.' matched any character (e.g. 'seg_dataXtxt').
        is_rejected = grepl(pattern='_rejected_', x=file_list, ignore.case=FALSE)
        is_seg_data = grepl(pattern='_tissue_seg_data_summary\\.txt$|_cell_seg_data\\.txt$',
                            x=file_list, ignore.case=FALSE)
        is_merge = grepl(pattern='merge[_-]', x=file_list, ignore.case=TRUE)

        # Delete all files that are not needed for the analysis. This includes:
        #  -> any file that is not a *_cell_seg_data.txt or *_tissue_seg_data_summary.txt file.
        #  -> *_rejected_* files. In principle only present if 'merge' files are present.
        #  -> if 'merge' files are present, any file that is not a merge file (when merged files
        #     exist, they supersede the per-marker files they were built from).
        if(any(is_merge)){
            to_delete = is_rejected | !is_seg_data | !is_merge
        } else to_delete = is_rejected | !is_seg_data

        # Delete selected files, or the entire sub-directory if all files are to be deleted.
        # unlink() is vectorized, so no sapply() wrapper is needed.
        if(all(to_delete)){
            unlink(subdir, recursive=TRUE)
        } else unlink(file.path(subdir, file_list[to_delete]))
    }
    return(invisible(NULL))
}
####################################################################################################
####################################################################################################
#' Move all cell and tissue segmentation files to a single directory named SEGMENTATION_DATA_DIR
#' located directly under session_root_dir. The emptied sub-directories are then deleted.
#'
#' @param session_root_dir [string] Root directory of the session.
move_input_files_to_single_directory <- function(session_root_dir){
    # Create the new 'seg_data' directory if needed.
    unique_dir = file.path(session_root_dir, SEGMENTATION_DATA_DIR)
    if(!dir.exists(unique_dir)) dir.create(unique_dir)

    # Move files in each directory to the new directory and delete the now empty directory.
    for(subdir in list.dirs(session_root_dir, full.names=TRUE, recursive=FALSE)){
        if(subdir == unique_dir) next
        # A plain for loop is used (rather than lapply) since file.rename() is called purely for
        # its side effect and its return value is discarded.
        for(f in list.files(subdir, all.files=FALSE, full.names=TRUE, recursive=FALSE,
                            include.dirs=FALSE)){
            file.rename(f, file.path(unique_dir, basename(f)))
        }
        unlink(subdir, recursive=TRUE)
    }
    return(invisible(NULL))
}
####################################################################################################
This diff is collapsed.
####################################################################################################
#' Merge individual marker files located in input_dir into a single file. The merged result is
#' written to a new file and the original files are deleted from disk.
#'
#' @param input_dir [string] Sample directory containing only 'individualmarker_*' cell and
#'     tissue segmentation files. The directory's basename is used as the sample name.
merge_individual_marker_files <- function(input_dir){
    sample_name = basename(input_dir)
    for(extension in c(CELL_FILES_EXTENSION, TISSUE_FILES_EXTENSION)){
        # Identify files to merge. Make sure at least 2 files were found: the original test
        # ('== 1') let an empty directory slip through to a less clear failure downstream.
        files_to_merge = list.files(path=input_dir, pattern=paste0(extension, '$'),
                                    all.files=FALSE, full.names=TRUE, recursive=FALSE,
                                    ignore.case=FALSE)
        if(length(files_to_merge) < 2) raise_error(
            msg = paste('Individual marker merge error. Less than two files to merge for sample:',
                        sample_name),
            file = input_dir)
        if(any(!startsWith(basename(files_to_merge), 'individualmarker_'))) raise_error(
            msg = 'Individual marker merge error: mix of individual and non-individual files.',
            file = input_dir)

        # Merge *_cell_seg_data.txt or *_tissue_seg_data_summary.txt files.
        if(extension == CELL_FILES_EXTENSION) merged_df = merge_cell_data_files(files_to_merge)
        if(extension == TISSUE_FILES_EXTENSION) merged_df = merge_tissue_data_files(files_to_merge)

        # Write merged file to disk. Refuse to overwrite an existing file.
        file_name = file.path(input_dir, paste0(sample_name, extension))
        if(file.exists(file_name)) raise_error(
            msg = 'Individual marker merge error: file already exists.', file = file_name)
        write.table(merged_df, file=file_name, quote=FALSE, row.names=FALSE, sep='\t')

        # Delete original individual marker files.
        unlink(files_to_merge)
    }
}
####################################################################################################
####################################################################################################
merge_cell_data_files <- function(files_to_merge){
    # ********************************************************************************************
    # Load the provided *_cell_seg_data.txt files and merge their content into a single data
    # frame: one row per cell (inner join on the key fields), a single reconciled
    # 'tissue_category' column, one combined 'phenotype' column, and one intensity column per
    # marker.
    #
    # Input arguments:
    # -> files_to_merge : list[string], path + name of *_cell_seg_data.txt files to be merged.
    #
    # Returns: the merged data frame.
    # ********************************************************************************************
    stopifnot(length(files_to_merge) > 1)

    # Define columns that will be used as key fields during the merge.
    key_fields = c('sample_name', 'image_id', 'cell_id', 'cell_x_position', 'cell_y_position')

    # Loop through all files that match the prefix + suffix combination. Load and check the
    # content of each file, then add it to the merged data frame.
    merged_df = NULL
    for(f in files_to_merge){
        # Load data and remove duplicated rows. All values are read as character so that no
        # type coercion occurs before the columns are split by type further below.
        input_df = remove_duplicated_rows(
            input_df = read.table(f, sep='\t', header=TRUE, as.is=TRUE,
                                  colClasses='character', strip.white=TRUE),
            key_fields = key_fields,
            file_name = f)

        # Verify all input files have the same columns. This must always be the case as we have
        # already standardized the files earlier. 'col_names' is captured from the first file
        # and every subsequent file is compared against it.
        if(is.null(merged_df)){
            stopifnot(all(colnames(input_df)[1:length(key_fields)] == key_fields))
            col_names = colnames(input_df)
        }
        stopifnot(all(colnames(input_df) == col_names))

        # Merge data.
        # **********
        # Merge data frame for the current marker with the global dataframe 'merged_df'.
        if(is.null(merged_df)){
            merged_df = input_df
        } else{
            # Append a suffix (the file's position in files_to_merge) to the non-key fields, so
            # they have a unique name and get preserved during the merge.
            colnames(input_df)[(length(key_fields)+1):ncol(input_df)] = paste0(
                colnames(input_df)[(length(key_fields)+1):ncol(input_df)],
                '_', which(files_to_merge==f))
            row_count = nrow(merged_df)
            # Inner join: cells missing from any one file are dropped from the merged result.
            merged_df = merge(merged_df, input_df, by=key_fields, all=FALSE, sort=FALSE)
            # Compute percentage of data loss due to the merge operation. If it's more than 5% a
            # warning is issued (raise_error with type='warning' presumably returns -- the run
            # continues).
            row_loss_percentage = round((row_count - nrow(merged_df)) / row_count * 100, 2)
            if(row_loss_percentage > 5) raise_error(
                msg = c(paste0('A large number of records were lost while merging file [',
                               row_loss_percentage, '%].'),
                        'This could indicate a problem in the input data and should be investigated.'),
                file = f,
                type = 'warning')
        }
        rm(input_df)
    }

    # Extract tissue category, phenotype, and marker intensity values.
    # ***************************************************************
    # Create separate data frames for the non-key fields: one column per input file for tissue
    # category and phenotype, and the per-file marker intensity ('... mean ...') columns.
    tissue_cat_df = merged_df[, grep('tissue_category', colnames(merged_df))]
    phenotype_df = merged_df[, grep('phenotype', colnames(merged_df))]
    marker_int_df = merged_df[, grep('_mean$|_mean_', colnames(merged_df))]
    # Intensities were read as character above; convert them back to numeric column by column.
    for(x in 1:ncol(marker_int_df)) marker_int_df[,x] = as.numeric(marker_int_df[,x])

    # Remove the extracted columns from the merged data frame (keep only the key fields).
    merged_df = merged_df[,1:length(key_fields)]

    # Combine data from all "Tissue Category" columns.
    # ***********************************************
    # In principle, the "Tissue Category" columns of all individual markers should contain the
    # same value. Here we verify that this is the case and then keep only one copy of them.
    if(any(tissue_cat_df != tissue_cat_df[,1])){
        # Collect indices of rows where any column differs from the first column. Note that
        # inside the sapply, 'x' is a COLUMN index (tissue_cat_df[x] is the x-th column); in the
        # loop below, 'x' is reused as a ROW index.
        diff_rows = unlist(sapply(2:ncol(tissue_cat_df),
                           FUN=function(x) which(tissue_cat_df[x] != tissue_cat_df[,1])))
        for(x in diff_rows){
            # Tally the distinct tissue values present in the row, most frequent first.
            value_frequency = sort(table(as.character(tissue_cat_df[x,])), decreasing=T)
            stopifnot(length(value_frequency) >= 2)
            # Case 1: one of the tissue values has a majority within the row. Majority ruling is
            # possible and the most frequent value is used.
            if(as.numeric(value_frequency[1]) > as.numeric(value_frequency[2])){
                tissue_cat_df[x,] = names(value_frequency)[1]
                raise_error(msg = c(paste0('tissue_category values differ accorss files. ',
                                           'Values were reconciled based on majority ruling.'),
                                    paste0('Offending row: ', x)),
                            file=files_to_merge[1],
                            type = 'warning')
            # Case 2: majority ruling is not possible (tie between the top two values) -> abort.
            } else{
                raise_error(
                    msg = c('Could not merge individual marker files.',
                            'Reason: tissue_category values differ across files with no majority.',
                            paste0('Offending row: ', x),
                            paste0('Offending values:', paste(tissue_cat_df[x,], collapse=' '))),
                    file = files_to_merge[1])
            }
        }
        stopifnot(all(tissue_cat_df == tissue_cat_df[,1]))
    }
    # Add tissue category values to merged dataframe.
    merged_df[,'tissue_category'] = tissue_cat_df[,1]

    # Combine data from all 'Phenotype' columns.
    # *****************************************
    # Each 'Phenotype' column contains the phenotyping for one or more individual marker. Here we
    # want to merge all these columns into a single column that will contain all phenotyping
    # information. If more than one marker is found for a given row, they get combined with a '_'
    # separator. E.g. if 'CD8' and 'PD1' are found in same row, the combination becomes 'CD8_PD1'.
    # Phenotypes listed in IGNORED_PHENOTYPES are stripped from the combined string.
    regexp = paste(sapply(IGNORED_PHENOTYPES, FUN=function(x) paste0(x,'_|_',x)), collapse = '|')
    merged_df[,'phenotype'] = apply(phenotype_df, MARGIN=1,
                                    FUN=function(x) gsub(pattern = regexp,
                                                         replacement = '',
                                                         x = paste(unique(x), collapse='_')) )

    # Combine data from all marker intensity columns.
    # **********************************************
    # Verify that all marker intensity columns in the merge data frame have the same values and
    # keep only one copy of them. A difference in values <= 0.01 is accepted.
    marker_intensity_cols = grep('_mean$', col_names, value=TRUE)
    tolerance_limit = 0.01
    for(col_name in marker_intensity_cols){
        # col_index matches the original column plus its suffixed copies ('<name>_2', ...).
        col_index = grep(col_name, colnames(marker_int_df))
        differences = abs(marker_int_df[,col_index] - marker_int_df[,col_index[1]])
        differing_values = which(differences > tolerance_limit)
        # If there are values > tolerance_limit, a warning is displayed to the user; values from
        # the first file are kept regardless.
        if(length(differing_values) > 0){
            differing_values = as.vector(as.matrix(differences))[differing_values]
            raise_error(
                msg=c(paste0('Values for column [', col_name, '] differ accross individual files'),
                      paste0('to merge by more than ', tolerance_limit, ' at ',
                             length(differing_values), ' occurences [',
                             length(differing_values)/nrow(marker_int_df)*100, ' %].'),
                      'Values from the first file (alphabetically) will be used.'),
                file=dirname(files_to_merge[1]),
                type = 'warning')
        }
    }
    # Add marker intensity values to merged dataframe. Only the first copy of each intensity
    # column is kept (columns 1..n of marker_int_df correspond to the first file's values).
    merged_df = cbind(merged_df, marker_int_df[, 1:length(marker_intensity_cols)])
    return(merged_df)
}
####################################################################################################
####################################################################################################
merge_tissue_data_files <- function(files_to_merge){
    # ********************************************************************************************
    # "Merges" the content of all provided *_tissue_seg_data_summary.txt files into a single
    # data frame. Most of the time, all *_tissue_seg_data_summary.txt files are exactly the same,
    # but it can happen that some are missing a number of rows (i.e. some image subsets are
    # missing because they were excluded based on their poor quality).
    # So the "merge" operation done in this function consists of keeping only rows from the input
    # *_tissue_seg_data_summary.txt files that are found in all files (i.e. the intersection
    # of all files).
    #
    # Input arguments:
    # -> files_to_merge : list[string], path + name of the files to be merged (at least 2).
    #
    # Returns: data frame with the 3 key columns plus one reconciled copy of the 2 surface
    #   columns.
    # ********************************************************************************************
    stopifnot(length(files_to_merge) > 1)

    # Define columns that will be used as key fields to carry out the merge.
    key_fields = c('sample_name', 'image_id', 'tissue_category')
    non_key_fields = c('region_area_surface', 'region_area_percent')

    # Merge all files in list.
    merged_df = NULL
    for(f in files_to_merge){
        # Load data and remove duplicated rows. Verify that the column names are correct - at this
        # point the files are standardized so they must all have the same column names.
        input_df = remove_duplicated_rows(
            input_df = read.table(f, sep='\t', header=TRUE, as.is=TRUE, strip.white=T,
                                  colClasses=c(rep('character',3), rep('numeric',2))),
            key_fields = key_fields,
            file_name = f)
        stopifnot(all(colnames(input_df) == c(key_fields, non_key_fields)))

        # Merge data frame for the current marker with the global dataframe 'merged_df'.
        if(is.null(merged_df)){
            merged_df = input_df
        } else{
            # Append a suffix (the file's position in files_to_merge) to the non-key fields, so
            # they have a unique name and get preserved during the merge. Inner join: only rows
            # present in every file survive.
            colnames(input_df)[4:5] = paste0(non_key_fields, '_', which(files_to_merge %in% f))
            merged_df = merge(merged_df, input_df, by=key_fields, all=FALSE, sort=FALSE)
        }
    }

    # Search for mismatches among values of non-key fields. If some are detected, a warning
    # is displayed. For rows with mismatches, if any, compute the median of surface values. In
    # this way, if one of the input files has a different value it gets excluded (provided there
    # are at least 3 files).
    mismatches = NULL
    for(col_name in non_key_fields){
        # col_index matches the original column and all its suffixed copies ('<name>_2', ...).
        col_index = grep(col_name, colnames(merged_df))
        difference = abs(merged_df[,col_index] - merged_df[,col_index[1]])
        # A row mismatches if any of its per-file copies deviates from the first file's value.
        mismatches = unique(c(mismatches, which(apply(difference, MARGIN=1, sum) > 0)))
    }
    if(length(mismatches) > 0){
        percentage = round(length(mismatches)/nrow(merged_df) * 100, 2)
        raise_error(msg = c('Mismatches in tissue surface among tissue seg files were found',
                            'Median value of tissue surface will be used for the following rows:',
                            paste0(paste(mismatches, collapse=', '), ' [', percentage, '%]')),
                    file = dirname(files_to_merge[1]),
                    type='warning')
        # Compute median values to reconcile mismatches. The reconciled value is written into
        # the first copy of each column, which is the one retained below.
        for(col_name in non_key_fields){
            col_index = grep(col_name, colnames(merged_df))
            merged_df[mismatches, col_index[1]] = apply(merged_df[mismatches, col_index], 1, median)
        }
    }
    # Remove duplicated columns: keep the 3 key fields + the first copy of the 2 surface columns.
    merged_df = merged_df[,1:5]
    return(merged_df)
}
####################################################################################################
####################################################################################################
remove_duplicated_rows <- function(input_df, key_fields, file_name){
    # ********************************************************************************************
    # Checks whether the input table (input_df) contains duplicated rows over key_fields and, if
    # so, removes them and displays a warning.
    #
    # file_name: name of file from where input_df is loaded. Only used to display warning to user.
    # Returns: input_df with duplicated rows removed.
    # ********************************************************************************************
    # Rows that are exact copies of an earlier row are dropped silently, without any warning.
    if(any(duplicated(input_df))){
        input_df = unique(input_df)
    }
    # Rows that merely repeat an earlier row's key field values are dropped with a warning.
    key_duplicates = which(duplicated(input_df[, key_fields]))
    if(length(key_duplicates) > 0){
        input_df = input_df[-key_duplicates, ]
        raise_error(msg = 'The following duplicated rows were deleted from input file:',
                    file = file_name,
                    items_to_list = key_duplicates,
                    type = 'warning')
    }
    return(input_df)
}
####################################################################################################
This diff is collapsed.
This diff is collapsed.
####################################################################################################
load_session_parameters <- function(session_root_dir){
# ********************************************************************************************
# Load and verify parameter presence and their values.
#
# ********************************************************************************************
# Read parameters file. Verify all required arguments are present.
arg_values = read_parameters_file(file.path(session_root_dir, PARAMETERS_FILE))
# Sample list.
# ***********
# Check that at least one sample was provided.
if(! 'samples' %in% names(arg_values)) raise_error(msg="Parameter 'samples' is missing.",
file = PARAMETERS_FILE)
samples = sort(unique(arg_values[['samples']]))
if(length(samples) == 0) raise_error(msg="No 'samples' values in file.", file=PARAMETERS_FILE)
# Check that sample names do not contain '[' or ']'. These are not allowed in sample names.
if(any(grepl(pattern='\\[|\\]', x=samples))) raise_error(
"Some sample names contain the non-authorized characters '[' or ']'.", file=PARAMETERS_FILE)
# Verify that no sample name contains the string '_merge' or '-merge'. These are reserved for
# 'merged' files.
if(any(grepl(pattern='[_-]merge$', x=samples))) raise_error(
msg = "Sample names ending in '_merge' or '-merge' are not allowed.", file=PARAMETERS_FILE)