Commit a2f08b97 authored by Robin Engler's avatar Robin Engler
Browse files

Initial commit

parents
^.*\.Rproj$
^\.Rproj\.user$
.Rproj.user
.Rhistory
.RData
.Ruserdata
Package: postinform
Type: Package
Title: Post-process cell immunofluorescence data produced by the inForm software
Version: 0.1.0
Author: Robin Engler, Martial Sankar
Maintainer: Robin Engler <robin.engler@sib.swiss>
Description: More about what it does (maybe more than one line)
Use four spaces when indenting paragraphs within the Description.
License: Not released yet
Encoding: UTF-8
LazyData: true
Imports:
zip,
openxlsx,
checkmate
RoxygenNote: 7.1.1
# Generated by roxygen2: do not edit by hand
# Global variables and function sourcing.
# Note: in principle the assignment of global variables "<<-" is not really necessary since the
# declaration is made at the top level.
# Set application version.
POSTINFORM_VERSION <<- '0.1.0'
# Define default parameter values.
DEFAULT_CELL_COMPARTMENT = 'nucleus'
DEFAULT_PHENOTYPE_CONFIDENCE_THRESHOLD = 40
# Define authorized values in input data.
NOTISSUE <<- 'missing'
NOTISSUES_SYNONYMS <<- c('other', 'nothing', 'none', 'no tissue', '')
AUTHORIZED_TISSUES <<- c('stroma', 'tumor', 'dermis', 'epidermis', 'melanocyte', 'necrosis',
NOTISSUE)
AUTHORIZED_COMPARTMENTS <<- c('nucleus', 'membrane', 'cytoplasm', 'entire_cell')
AUTHORIZED_STROMA_VALUES <<- c('DAPI', 'stroma', 'other')
AUTHORIZED_TUMOR_VALUES <<- c('CK', 'tumor')
AUTHORIZED_MARKERS <<- c('CAL', 'CD3', 'CD4', 'CD8', 'CD11C', 'CD15', 'CD20', 'CD56', 'CD68',
'CD103', 'CD163', 'CD206', 'FOXP3', 'GB', 'gH2AX', 'gH2AXN', 'IDO',
'Keratin', 'KI67', 'PD1', 'PDL1', 'PERFORIN', 'WT1', 'CK', 'VISTA')
IGNORED_PHENOTYPES <<- c('DAPIp', 'MISSING')
DATAREDUCE_SCRIPT <<- file.path(dirname(dirname(sys.frame(1)$ofile)),
'inst/bash/reduce_file_size.sh')
CELL_FILES_EXTENSION <<- '_cell_seg_data.txt'
TISSUE_FILES_EXTENSION <<- '_tissue_seg_data_summary.txt'
PARAMETERS_FILE <<- 'parameters.txt'
THRESHOLDS_FILE <<- 'thresholds.txt'
SAMPLE_RENAME_FILE <<- 'sample_rename.txt'
SEGMENTATION_DATA_DIR <<- 'seg_data'
SUMMARY_OUTPUT_DIR <<- 'summary_statistics'
LOG_FILE_NAME <<- 'log.txt'
# InForm version data formats supported.
SUPPORTED_INFORM_VERSIONS <<- c(2.2, 2.4)
# Load source code.
source('postinform.R')
source('input_check.R')
source('load_data.R')
source('data_reduction.R')
source('individual_markers.R')
source('rename_samples.R')
source('functions.R')
source('legacy_functions.R')
####################################################################################################
#'
#'
#' @param input_dir
#' @param cell_compartment
#' @param output_dir
reduce_file_size <- function(input_dir, cell_compartment, output_dir = NULL){
# Define columns to keep for cell and tissue segmentation files respectively.
cols_to_keep_cell = c("Sample Name", "Tissue Category", "Phenotype", "Cell ID",
"Cell X Position", "Cell Y Position", "Confidence", "Annotation ID",
paste0(cell_compartment, ".* Mean .*"))
cols_to_keep_tissue = c("Sample Name", "Tissue Category", "Region Area .*", "Annotation ID")
# Loop through all files present in the input directory and reduce their size.
#TODO: parallelized loop with "foreach" and "doParallel" libraries.
# core_nb = parallel::detectCores() - 2
# if(core_nb <= 0) core_nb = 1
# if(core_nb > 4) core_nb = 4
# cl = parallel::makeCluster(core_nb)
# doParallel::registerDoParallel(cl)
# foreach(prefix = scan_dir_for_seg_data_files(input_dir)) %dopar% {
# ...
# }
# parallel::stopCluster(cl)
for(prefix in scan_dir_for_seg_data_files(input_dir)){
cell_seg_file = paste0(prefix, CELL_FILES_EXTENSION)
tissue_seg_file = paste0(prefix, TISSUE_FILES_EXTENSION)
log_message(file.path(basename(input_dir), cell_seg_file), level=2)
delete_cols_in_file(input_file = file.path(input_dir, cell_seg_file),
colnames_to_keep = cols_to_keep_cell,
output_file = file.path(output_dir, cell_seg_file),
chunk_size = 200000,
sep = '\t')
log_message(file.path(basename(input_dir), tissue_seg_file), level=2)
delete_cols_in_file(input_file = file.path(input_dir, tissue_seg_file),
colnames_to_keep = cols_to_keep_tissue,
output_file = file.path(output_dir, tissue_seg_file),
chunk_size = 200000,
sep = '\t')
}
return(invisible(NULL))
}
####################################################################################################
####################################################################################################
#' Reads the input_file table chunk_size lines at a time and keep only columns listed in the
#' colnames_to_keep string vector.
#'
#' @param input_file [string] File to process.
#' @param colnames_to_keep [string vector] Names of columns from input_file to keep.
#' Regexp notation is allowed.
#' @param output_file [string] File where to save the output data table with the reduced number of
#' columns. By default the output file is set to input_file, meaning that the input file gets
#' overwritten.
#' @param chunk_size [integer] integer. Number of lines of the input file to read at a time. Larger
#' values result in faster processing but larger memory consumption.
#' @param sep [string] Delimiter (separator) used in the input table file.
delete_cols_in_file <- function(input_file,
colnames_to_keep,
output_file = input_file,
chunk_size = 100000,
sep = '\t'){
# Read file header and identify which columns of the file should be kept.
stopifnot(file.exists(input_file))
file_connection = file(input_file, open='r', encoding=guess_file_encoding(input_file))
header = unlist(strsplit(readLines(con=file_connection, n=1), split=sep))
cols_to_keep = grep(pattern=paste0('^', paste(colnames_to_keep, collapse='$|^'), '$'),
x=header, ignore.case=TRUE)
# If there are no columns to remove, exit the function immediately to save time.
if(length(cols_to_keep) == length(header)){
close(file_connection)
if(output_file != input_file){
if(! dir.exists(dirname(output_file))) dir.create(dirname(output_file), recursive=T)
file.copy(from=input_file, to=output_file, overwrite=FALSE)
}
return(invisible(NULL))
}
# Load content of file chunk_size lines at a time and keep only the subset of needed columns.
reduced_df = NULL
while(length(fchunk <- readLines(con=file_connection, n=chunk_size, warn=FALSE)) > 0){
reduced_df = rbind(reduced_df,
as.data.frame(matrix(unlist(strsplit(fchunk, split=sep)),
nrow=length(fchunk), byrow=T)[, cols_to_keep]))
#message("read lines:", nrow(reduced_df))
}
close(file_connection)
# Write the reduced data frame to output_file.
colnames(reduced_df) = header[cols_to_keep]
if(! dir.exists(dirname(output_file))) dir.create(dirname(output_file), recursive=T)
write.table(reduced_df, file=output_file, row.names=FALSE, quote=FALSE, sep=sep)
return(invisible(NULL))
}
####################################################################################################
####################################################################################################
#' Delete all files in the session_root_dir input directory that are not needed for the analysis.
#'
delete_unnecessary_files <- function(session_root_dir){
# Verify the input directory exists.
if(!file.exists(session_root_dir) || !file.info(session_root_dir)$isdir) raise_error(
paste('Input sub-directory could not be found:', session_root_dir))
# Delete unnecessary files from all sub-directories present in the session's root dir.
for(subdir in list.dirs(session_root_dir, full.names=TRUE, recursive=FALSE)){
# Scan sub-directory for input files.
# **********************************
# Get list of all files in the current sub-directory. Delete the sub-directory if empty.
file_list = list.files(path=subdir, all.files=FALSE, full.names=FALSE, recursive=FALSE)
if(length(file_list) == 0){
unlink(subdir, recursive=TRUE)
next
}
# Identify 'rejected', 'seg_data' and 'merged' files. 'seg_data' files correspond to
# *_cell_seg_data.txt and *_tissue_seg_data_summary.txt files.
is_rejected = grepl(pattern='_rejected_', x=file_list, ignore.case=F)
is_seg_data = grepl(pattern='.*_tissue_seg_data_summary.txt$|.*_cell_seg_data.txt$',
x=file_list, ignore.case=F)
is_merge = grepl(pattern='merge[_-]', x=file_list, ignore.case=T)
# Delete un-necessary files.
# *************************
# Delete all files that are not needed for the analysis.
# This includes:
# -> any file that is not a *_cell_seg_data.txt or *_tissue_seg_data_summary.txt file.
# -> *_rejected_* files. In principle only present if 'merge' files are present.
# -> if 'merge' files are present, any file that is not a merge file.
# If 'merge' files are present, all 'non-merge' files get added to the list of files to
# delete.
if(any(is_merge)){
to_delete = is_rejected | ! is_seg_data | ! is_merge
} else to_delete = is_rejected | ! is_seg_data
# Delete selected files, or the entire sub-directory if all files are to be deleted.
if(all(to_delete)){
unlink(subdir, recursive=T)
} else sapply(file.path(subdir, file_list[which(to_delete)]), unlink)
}
}
####################################################################################################
####################################################################################################
move_input_files_to_single_directory <- function(session_root_dir){
# ********************************************************************************************
# Move all cell and tissue segmentation files to single directory named SEGMENTATION_DATA_DIR.
# ********************************************************************************************
# Create the new 'seg_data' directory if needed.
unique_dir = file.path(session_root_dir, SEGMENTATION_DATA_DIR)
if(!dir.exists(unique_dir)) dir.create(unique_dir)
# Move files in each directory to the new directory and delete the now empty directory.
for(subdir in list.dirs(session_root_dir, full.names=TRUE, recursive=FALSE)){
if(subdir==unique_dir) next
lapply(list.files(subdir, all.files=FALSE, full.names=TRUE, recursive=F, include.dirs=F),
FUN=function(x) file.rename(x, file.path(unique_dir, basename(x))) )
unlink(subdir, recursive=T)
}
}
####################################################################################################
####################################################################################################
check_for_duplicated_rows <- function(data_frame){
# ********************************************************************************************
# Checks for duplicated rows in the input data frame and raises and error if duplicates are
# found.
# ********************************************************************************************
duplicated_rows = which(duplicated(data_frame, MARGIN=1))
if(length(duplicated_rows) > 0) raise_error(
msg = "Duplicated rows found in input data:",
items_to_list = sapply(duplicated_rows,
FUN=function(x) paste(data_frame[x,], collapse=' ')))
}
####################################################################################################
####################################################################################################
reclass_cells_by_marker <- function(marker,
cell_values,
thresholds = NULL){
# ********************************************************************************************
# Computes a binary vector indicating whether a cell is positive for a given marker (1) or
# not (0). The function accepts cell_values that are either phenotypes, or marker intensity
# values.
#
# Input arguments:
# marker : string. Name of marker for which to reclassify cells.
# cell_values: if the marker was phenotyped: must be a vector of strings.
# if the marker was scored: must be a vector of floats.
# thresholds : list of thresholds for the given marker. If the marker was phenotyped (as
# opposed to scored), then thresholds must be set to NULL. This is how the
# function knows that the maker is phenotyped.
# ********************************************************************************************
# Case 1: marker was phenotyped.
# *****************************
if(length(thresholds) == 0){
# Test whether the elements of the cell_values vector contain the string "markerName + p".
# E.g. for CD11c, we test if the phenotype contains the string "CD11cp".
# "partial matches", means that if we are searching for say "CD11", then all cells
# whose phenotype contains "CD11p" (e.g. "CD11p", "CD11p_CD86p" or "CD8p_CD11p_CD86p") will be
# reclassified as 1 (the marker is present in the cell). Note that if the marker apprears in the
# phenotype with a "m", e.g. "CD8p_CD11m", then this is not counted as a match since the "m"
# stands for "minus", meaning the marker is not present in the cell.
marker = paste0(marker, 'p')
stopifnot(is.character(cell_values))
return(sapply(strsplit(cell_values, split='_'), FUN=function(x) ifelse(marker %in% x,1,0)))
}
# Case 2: marker was scored.
# **************************
# In this case we reclass the values depending on whether they are >= or < than the threshold.
# Values >= threshold get reclassified as 1, while values < threshold get reclassified as 0.
stopifnot(is.numeric(cell_values))
stop("########### NOT IMPLEMENTED YET !!!!")
# Create temporary vector of thresholds. This vector has the same length as the number of rows in
# the cell table, and simply contains the threshold value coresponding to the tissue type for
# each row of the cell table.
thresholdVector = as.numeric(sapply(intensity_table[,'tissue_category'],
FUN=function(x) thresholdList[1,x]))
# Test whether intensity value is >= threshold.
return(as.numeric(intensity_table[,tmpColNb] > thresholdVector))
}
####################################################################################################
####################################################################################################
generate_summary_table <- function(sample_name,
image_ids,
cell_types,
tissue_list,
markers_phenotyped,
markers_scored,
thresholds,
tissue_table){
# ********************************************************************************************
# Generate a table where each row contains values for a given sample, image ID, cell type and
# tissue type combination. The summary_table has the following columns:
# - sample_name : name of sample.
# - image_id : name of image subset within the sample.
# - tissue_type : name of tissue (e.g. "stroma", "tumor").
# - cell_type : name of cell type. Either an individual marker or a combination (e.g. "CD4", "CD8p_FPXP3n").
# - threshold : threshold value for mean marker intensity values. These are the threshold that are used to
# reclassify the marker intensity values into 0/1 values. Note that these values only exist
# for the rows corresponding to individual markers (combination cell types and Total rows
# have no values in the column).
# - SurfacePIX : surface of each cellType in pixel units.
# - SurfaceMM2 : surface of each cellType, in square mm.
# - CellDensity: density of cells per square mm.
# - CellCount : cell count for the given cell type in the given tissue type.
# - IntMean : mean value of marker intensity for the cell belonging to the row's cellType.
# - IntMedian : median value " " "
# - IntMin : minimum value " " "
# - IntMax : maximum value " " "
# - IntSD : standard deviation value " " "
#
# SampleName ImageID CellType TissueType Threshold SurfacePIX SurfaceMM2
# His2757_7 Total CD4 Stroma 0.5 NA NA
# His2757_7 Total CD4 Tumor 0.5 NA NA
# His2757_7 Total FOXP3 Stroma 0.2 NA NA
# His2757_7 Total FOXP3 Tumor 0.2 NA NA
# His2757_7 Total CD4p_FOXP3p Stroma NA NA NA
# His2757_7 Total CD4p_FOXP3p Tumor NA NA NA
# His2757_7 Total Total Stroma NA 299488 NA
# His2757_7 Total Total Tumor NA 2108192 NA
# His2757_7 39995_14773 CD4 Stroma 0.5 NA NA
# His2757_7 39995_14773 CD4 Tumor 0.5 NA NA
# His2757_7 39995_14773 FOXP3 Stroma 0.2 NA NA
# His2757_7 39995_14773 FOXP3 Tumor 0.2 NA NA
# His2757_7 39995_14773 CD4p_FOXP3p Stroma NA NA NA
# His2757_7 39995_14773 CD4p_FOXP3p Tumor NA NA NA
# His2757_7 39995_14773 Total Stroma NA 149744 NA
# His2757_7 39995_14773 Total Tumor NA 1054096 NA
#
#
# Input arguments:
# sample_name: string. Name of sample.
#
# ********************************************************************************************
stopifnot(all(tissue_table[,'sample_name'] == sample_name))
# Create summary table backbone.
# *****************************
# Add "Total" values to the list of image IDs and cell types. These represent the total for
# all image IDs, and the total for all cell types (i.e. all cell types together).
image_ids = c('Total', image_ids)
cell_types = c(cell_types, 'Total')
# Compute nuber of rows of the summary_table (i.e. image ID, cell type and tissue type combinations)
row_nb = length(tissue_list) * length(cell_types) * length(image_ids)
# Create table.
summary_table = data.frame(
SampleName = rep(sample_name, times=row_nb),
ImageID = rep(image_ids, each=length(cell_types) * length(tissue_list)),
TissueType = rep(sort(tissue_list), each=length(cell_types), times=length(image_ids)),
CellType = rep(cell_types, each=1, times=length(tissue_list) * length(image_ids)),
Threshold = NA,
SurfacePIX = NA,
SurfaceMM2 = NA,
CellDensity = NA,
CellCount = NA,
IntMean = NA,
IntMedian = NA,
IntMin = NA,
IntMax = NA,
IntSD = NA,
stringsAsFactors = FALSE)
# Add threshold values to summary_table.
# *************************************
# Threshold values for phenotyped markers are set to -1
summary_table[summary_table[,'CellType'] %in%
paste0(rep(markers_phenotyped, each=2), c('p','p_total')), 'Threshold'] = -1
# Scored markers get the threshold value extracted from the thresholds table.
for(marker in markers_scored){
for(tissue in tissue_list){
stop("######## NOT IMPLEMENTED YET")
tmpColNb = which( summary_table[,'CellType'] %in% paste0(marker, c('p','p_total'), sep='') &
summary_table[,'TissueType'] == tissue )
summary_table[tmpColNb,'Threshold'] = threshold_table[sample_name,marker,tissue]
}
}
# Add surface values in MM2 to summary table.
# ******************************************
# Add the surface values for each image ID and tissue type to the "Total" cell type row of the
# summary table.
for(image_id in image_ids){
for(tissue in tissue_list){
row_nb = which(summary_table$ImageID == image_id &
summary_table$CellType == 'Total' &
summary_table$TissueType == tissue)
surface_rows = switch(ifelse(image_id == 'Total', 1, 2),
which(tissue_table[,'tissue_category']==tissue),
which(tissue_table[,'tissue_category']==tissue &
tissue_table[,'image_id']==image_id))
summary_table[row_nb, 'SurfaceMM2'] = sum(tissue_table[surface_rows, 'region_area_surface'])
}
}
# The function returns the summary_table data frame.
return(summary_table)
}
####################################################################################################
####################################################################################################
cell_type_statistics <- function(image_id, tissue_type, cell_types, cell_table){
# *******************************************************************************************
# Computes the following statistics for all cell types of a given image_id and tissue type:
# - cell count : number of cells with type "cell_type" in tissue "tissue_type".
# - intensity mean : mean intensity value for cells maching "cell_type" and "tissue_type".
# - intensity median: median ...
# - intensity min : minimum ...
# - intensity max : maximum ...
# - intensity SD : standard deviation ...
#
# The function returns a data frame with the statistic values in the same order as listed above.
#
# Input arguments:
# cell_type : string describing cell type, e.g. "CD4", "CD8", "CD4_CD8".
# tissue_type: string for tissue type, e.g. "stroma", "tumor".
# cell_table : data frame containing MEAN and RECLASSIFIED marker intensity values,
# in the format as returned by the load_cell_data() function.
# image_id : string giving the image_id value of the subset image for which the
# statistics should be computed. If this value is set to "Total" (the
# default), then the statistics are computed on all images contained in
# the cell_table.
# *******************************************************************************************
# Create return data frame.
cell_types = c(cell_types, 'Total')
output_df = data.frame('ImageID' = image_id,
'TissueType' = tissue_type,
'CellType' = cell_types,
'CellCount' = 0,
row.names = cell_types,
stringsAsFactors = FALSE)
output_df[,c('IntMean', 'IntMedian', 'IntMin', 'IntMax', 'IntSD')] = NA
# Subset cell_table to keep only the rows that are matching the requested "tissue_category"
# and "image_id" values. If the image ID is 'Total', then we only subset by tissue type
# because it corresponds values for the sum accross all image_id values.
# Note: it's much faster to first subset by image_id and then by tissue_type than the reverse.
# get("image_id", 1L)
if(image_id != 'Total') cell_table = cell_table[cell_table$image_id==image_id, ]
cell_table = cell_table[cell_table$tissue_category==tissue_type, ]
# If the subset has 0 rows, the image ID has no cells for the given tissue type. This does
# sometimes occur.
if(nrow(cell_table) == 0) return(output_df)
output_df['Total', 'CellCount'] = nrow(cell_table)
# Tests with data.table.
#img_id = image_id
#if(img_id != 'Total') sub_table = cell_table[image_id == img_id]
#sub_table = sub_table[tissue_category == tissue_type]
#
#system.time(dt[image_id == tmp])
#system.time(cell_table[cell_table$image_id==image_id, ])
#system.time(dt[tissue_category == tissue_type])
#system.time(cell_table[cell_table$tissue_category==tissue_type, ])
# Loop through all cell types and compute statistics
for(cell_type in cell_types[-length(cell_types)]){
# Split the cell type into individual markers.
# *******************************************
# cell_type are strings that contain the names of individual markers, followed by either
# 'p' (positive) or 'm' (negative), separated by '_' characters, e.g: "CD68p_CD11cp"
tmp = unlist(strsplit(sub('_total$', '', cell_type), split='_'))
marker_names = as.character(sapply(tmp, FUN=function(x) substr(x, 1, nchar(x)-1)))
marker_statuses = as.character(sapply(tmp, FUN=function(x) substr(x, start=nchar(x),
stop=nchar(x))))
stopifnot(all(marker_statuses %in% c('p','m')))
# For any marker with negative status ("m"), inverse the values in the reclassified
# column of the sub_table: the 0 become 1 and 1 become 0.
sub_table = cell_table[]
for(x in marker_names[marker_statuses == 'm']) sub_table[, paste0(x,'_reclassified')] =
1 - sub_table[, paste0(x,'_reclassified')]
# Identify which reclassified columns are part of the cell type or not.
col_names = colnames(sub_table)
in_cell_type = which(col_names %in% paste0(marker_names, '_reclassified'))
not_in_cell_type = which(endsWith(col_names, '_reclassified') &
!col_names %in% col_names[in_cell_type] )
# Subset intensity values table to keep only the rows that are matching the cell_type.
# ***********************************************************************************
# The way to do this depends on whether the cell type is a "regular" or '_total' type:
# -> total : keep all cells (rows of table) that are positive for the markers associated
# to the target cell_type.
# -> regular: keep only cells (rows of table) that are positive for the markers of the
# target cell_type and negative for all other markers.
row_sum_cell_type = rowSums(sub_table[, in_cell_type, drop=F])
row_sum_total = rowSums(sub_table[, c(in_cell_type,not_in_cell_type), drop=F])
if(endsWith(cell_type, '_total')){
sub_table = sub_table[row_sum_cell_type == length(in_cell_type),]
} else{
sub_table = sub_table[row_sum_cell_type == length(in_cell_type) &
row_sum_cell_type == row_sum_total, ]
}
# Compute cell count and intensity value statistics.
# *************************************************
# Cell count.
output_df[cell_type, 'CellCount'] = nrow(sub_table)
# Intensity value statistics are only computed for single marker types. In addition,
# phenotyped markers don't necessarily have an associated intensity column.
col_nb = which(col_names %in% paste0(marker_names, '_mean'))
if(length(marker_names) == 1 & length(col_nb) == 1){
if(nrow(sub_table) > 0){
output_df[cell_type, 'IntMean'] = round(mean(sub_table[,col_nb]), 3)
output_df[cell_type, 'IntMedian'] = round(median(sub_table[,col_nb]), 3)
output_df[cell_type, 'IntMin'] = round(min(sub_table[,col_nb]), 3)
output_df[cell_type, 'IntMax'] = round(max(sub_table[,col_nb]), 3)
output_df[cell_type, 'IntSD'] = round(sd(sub_table[,col_nb]), 3)
} else output_df[cell_type, 5:9] = 0
}
}
return(output_df)
}
####################################################################################################
####################################################################################################
guess_host_os = function(){
# ********************************************************************************************
# Determine what is the operating system of the machine executing this function.
#
# ********************************************************************************************
# Get name of host operating system.
sysinf = Sys.info()
# Case 1: name of opperating system can be retrieved with "Sys.info()". This is the usual case.
if(!is.null(sysinf)){
os <- sysinf['sysname']
if(os == 'Darwin'){
os = 'osx'
} else if(os == 'Linux'){
os = 'linux'
} else if(os == 'Windows'){
os = 'windows'
} else os = 'unknown'
# Case 2: Exotic OS and Sys.info() did not work.
} else{
os = .Platform$OS.type
if(grepl('^darwin', R.version$os)){