Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Vital-IT
post-inForm
Commits
40dadf5f
Commit
40dadf5f
authored
Oct 08, 2020
by
Robin Engler
Browse files
Add support for scored markers
parent
bb2cd129
Changes
7
Show whitespace changes
Inline
Side-by-side
R/config.R
View file @
40dadf5f
...
...
@@ -18,11 +18,13 @@ AUTHORIZED_TISSUES <<- c('stroma', 'tumor', 'dermis', 'epidermis', 'melano
AUTHORIZED_COMPARTMENTS
<<-
c
(
'nucleus'
,
'membrane'
,
'cytoplasm'
,
'entire_cell'
)
AUTHORIZED_STROMA_VALUES
<<-
c
(
'DAPI'
,
'stroma'
,
'other'
)
AUTHORIZED_TUMOR_VALUES
<<-
c
(
'CK'
,
'tumor'
)
AUTHORIZED_MARKERS
<<-
c
(
'CAL'
,
'CD3'
,
'CD4'
,
'CD8'
,
'CD11
C
'
,
'CD15'
,
'CD20'
,
'CD56'
,
'CD68'
,
AUTHORIZED_MARKERS
<<-
c
(
'CAL'
,
'CD3'
,
'CD4'
,
'CD8'
,
'CD11
c
'
,
'CD15'
,
'CD20'
,
'CD56'
,
'CD68'
,
'CD103'
,
'CD163'
,
'CD206'
,
'FOXP3'
,
'GB'
,
'gH2AX'
,
'gH2AXN'
,
'IDO'
,
'IL10R'
,
'Keratin'
,
'KI67'
,
'PD1'
,
'PDL1'
,
'PERFORIN'
,
'SOX10'
,
'WT1'
,
'CK'
,
'VISTA'
)
IGNORED_PHENOTYPES
<<-
c
(
'DAPIp'
,
'MISSING'
)
NO_PHENOTYPE
<<-
'MISSING'
NO_PHENOTYPE_SYNONYMS
<<-
c
(
'ARTEFACT'
,
'ARTEFACTS'
,
'OTHER'
)
DATAREDUCE_SCRIPT
<<-
file.path
(
dirname
(
dirname
(
sys.frame
(
1
)
$
ofile
)),
'inst/bash/reduce_file_size.sh'
)
...
...
R/functions.R
View file @
40dadf5f
...
...
@@ -16,122 +16,112 @@ check_for_duplicated_rows <- function(data_frame){
####################################################################################################
reclass_cells_by_marker
<-
function
(
marker
,
cell_values
,
thresholds
=
NULL
){
# ********************************************************************************************
# Computes a binary vector indicating whether a cell is positive for a given marker (1) or
# not (0). The function accepts cell_values that are either phenotypes, or marker intensity
# values.
#
# Input arguments:
# marker : string. Name of marker for which to reclassify cells.
# cell_values: if the marker was phenotyped: must be a vector of strings.
# if the marker was scored: must be a vector of floats.
# thresholds : list of thresholds for the given marker. If the marker was phenotyped (as
# opposed to scored), then thresholds must be set to NULL. This is how the
# function knows that the maker is phenotyped.
# ********************************************************************************************
#' Reclass input 'cell_values' into a binary vector indicating whether a cell is positive for a
#' given marker (1) or not (0). This function is for phenotyped markers and expects 'cell_values'
#' to be a vector of strings.
#'
#' @param marker [string] Name of marker for which to reclass cells. This must be a single marker.
#' @param cell_values [string vector] List of cell values (phenotypes) to reclassify.
#'
reclass_cells_by_marker_phenotyped
<-
function
(
marker
,
cell_values
){
# Case 1: marker was phenotyped.
# *****************************
if
(
length
(
thresholds
)
==
0
){
# Test whether the elements of the cell_values vector contain the string "markerName + p".
# E.g. for CD11c, we test if the phenotype contains the string "CD11cp".
# "partial matches", means that if we are searching for say "CD11", then all cells
# whose phenotype contains "CD11p" (e.g. "CD11p", "CD11p_CD86p" or "CD8p_CD11p_CD86p") will be
# reclassified as 1 (the marker is present in the cell). Note that if the marker apprears in the
# phenotype with a "m", e.g. "CD8p_CD11m", then this is not counted as a match since the "m"
# stands for "minus", meaning the marker is not present in the cell.
marker
=
paste0
(
marker
,
'p'
)
# Verify input values.
stopifnot
(
is.character
(
cell_values
))
return
(
sapply
(
strsplit
(
cell_values
,
split
=
'_'
),
FUN
=
function
(
x
)
ifelse
(
marker
%in%
x
,
1
,
0
)))
}
# For each phenotype value, test whether maker is part of the phenotype. E.g. test whether
# "CD8" is part of "KI67p_CD8p_CD11cp".
marker
=
paste0
(
marker
,
'p'
)
return
(
as.numeric
(
sapply
(
strsplit
(
cell_values
,
split
=
'_'
),
FUN
=
function
(
x
)
marker
%in%
x
)))
}
####################################################################################################
# Case 2: marker was scored.
# **************************
# In this case we reclass the values depending on whether they are >= or < than the threshold.
# Values >= threshold get reclassified as 1, while values < threshold get reclassified as 0.
stopifnot
(
is.numeric
(
cell_values
))
stop
(
"########### NOT IMPLEMENTED YET !!!!"
)
# Create temporary vector of thresholds. This vector has the same length as the number of rows in
# the cell table, and simply contains the threshold value coresponding to the tissue type for
# each row of the cell table.
thresholdVector
=
as.numeric
(
sapply
(
intensity_table
[,
'tissue_category'
],
FUN
=
function
(
x
)
thresholdList
[
1
,
x
]))
# Test whether intensity value is >= threshold.
return
(
as.numeric
(
intensity_table
[,
tmpColNb
]
>
thresholdVector
))
####################################################################################################
#' Reclass input 'cell_values' into a binary vector indicating whether a cell is positive for a
#' given marker (1) or not (0). This function is for scored markers and expects 'cell_values'
#' to be a vector of numeric values, the marker intensity values for the given marker.
#'
#' @param marker [string] Name of marker for which to reclass cells. This must be a single marker.
#' @param cell_values [numeric vector] List of cell values (intensity values) to reclassify.
#' @param tissue_type [string vector] Tissue type associated to each cell. This vector must have
#' the same length as [cell_values].
#' @param thresholds [data frame] Reclassification threshold table. Must contain a "tissue_type"
#' column and a column bearing the name of the marker.
reclass_cells_by_marker_scored
<-
function
(
marker
,
cell_values
,
tissue_type
,
thresholds
){
# Verify input values.
stopifnot
(
is.numeric
(
cell_values
))
stopifnot
(
length
(
cell_values
)
==
length
(
tissue_type
))
stopifnot
(
sort
(
unique
(
tissue_type
[
tissue_type
!=
'missing'
]))
==
sort
(
thresholds
$
tissue_type
))
# Reclass cell values depending on whether they are >= or < than the threshold.
# Values >= threshold get reclassified as 1, while values < threshold get reclassified as 0.
# The threshold value can be different for each tissue type.
thresholds
=
setNames
(
thresholds
[,
marker
],
thresholds
[,
'tissue_type'
])
reclassed_values
=
as.numeric
(
sapply
(
1
:
length
(
cell_values
),
function
(
x
)
cell_values
[
x
]
>=
thresholds
[
tissue_type
[
x
]]))
# If the tissue_type vector contains any "missing" values, this introduces NA values in the
# reclassified values output. Here we set those values to 0.
reclassed_values
[
which
(
is.na
(
reclassed_values
))]
=
0
return
(
reclassed_values
)
}
####################################################################################################
####################################################################################################
generate_summary_table
<-
function
(
sample_name
,
image_ids
,
cell_types
,
tissue_list
,
markers_phenotyped
,
markers_scored
,
thresholds
,
tissue_table
){
# ********************************************************************************************
# Generate a table where each row contains values for a given sample, image ID, cell type and
# tissue type combination. The summary_table has the following columns:
# - sample_name : name of sample.
# - image_id : name of image subset within the sample.
# - tissue_type : name of tissue (e.g. "stroma", "tumor").
# - cell_type : name of cell type. Either an individual marker or a combination (e.g. "CD4", "CD8p_FPXP3n").
# - threshold : threshold value for mean marker intensity values. These are the threshold that are used to
# reclassify the marker intensity values into 0/1 values. Note that these values only exist
# for the rows corresponding to individual markers (combination cell types and Total rows
# have no values in the column).
# - SurfacePIX : surface of each cellType in pixel units.
# - SurfaceMM2 : surface of each cellType, in square mm.
# - CellDensity: density of cells per square mm.
# - CellCount : cell count for the given cell type in the given tissue type.
# - IntMean : mean value of marker intensity for the cell belonging to the row's cellType.
# - IntMedian : median value " " "
# - IntMin : minimum value " " "
# - IntMax : maximum value " " "
# - IntSD : standard deviation value " " "
#
# SampleName ImageID CellType TissueType Threshold SurfacePIX SurfaceMM2
# His2757_7 Total CD4 Stroma 0.5 NA NA
# His2757_7 Total CD4 Tumor 0.5 NA NA
# His2757_7 Total FOXP3 Stroma 0.2 NA NA
# His2757_7 Total FOXP3 Tumor 0.2 NA NA
# His2757_7 Total CD4p_FOXP3p Stroma NA NA NA
# His2757_7 Total CD4p_FOXP3p Tumor NA NA NA
# His2757_7 Total Total Stroma NA 299488 NA
# His2757_7 Total Total Tumor NA 2108192 NA
# His2757_7 39995_14773 CD4 Stroma 0.5 NA NA
# His2757_7 39995_14773 CD4 Tumor 0.5 NA NA
# His2757_7 39995_14773 FOXP3 Stroma 0.2 NA NA
# His2757_7 39995_14773 FOXP3 Tumor 0.2 NA NA
# His2757_7 39995_14773 CD4p_FOXP3p Stroma NA NA NA
# His2757_7 39995_14773 CD4p_FOXP3p Tumor NA NA NA
# His2757_7 39995_14773 Total Stroma NA 149744 NA
# His2757_7 39995_14773 Total Tumor NA 1054096 NA
#
#
# Input arguments:
# sample_name: string. Name of sample.
#
# ********************************************************************************************
stopifnot
(
all
(
tissue_table
[,
'sample_name'
]
==
sample_name
))
#' Generate a table where each row contains values for a given sample, image ID, cell type and
#' tissue type combination. The summary_table has the following columns:
#'
#' SampleName ImageID TissueType CellType Threshold SurfacePIX SurfaceMM2 CellDensity CellCount IntMean IntMedian IntMin IntMax IntSD
#' TMA_IF02 Total stroma CD8p -1.00 NA NA NA NA NA NA NA NA NA
#' TMA_IF02 Total stroma CD8p_total -1.00 NA NA NA NA NA NA NA NA NA
#' TMA_IF02 Total stroma GBp -1.00 NA NA NA NA NA NA NA NA NA
#' TMA_IF02 Total stroma GBp_total -1.00 NA NA NA NA NA NA NA NA NA
#' TMA_IF02 Total stroma KI67p 19.99 NA NA NA NA NA NA NA NA NA
#' TMA_IF02 Total stroma KI67p_total 19.99 NA NA NA NA NA NA NA NA NA
#' TMA_IF02 Total stroma CD8p_GBp NA NA NA NA NA NA NA NA NA NA
#' TMA_IF02 Total stroma CD8p_GBp_total NA NA NA NA NA NA NA NA NA NA
#' TMA_IF02 Total stroma CD8p_KI67p NA NA NA NA NA NA NA NA NA NA
#' TMA_IF02 Total stroma CD8p_KI67p_total NA NA NA NA NA NA NA NA NA NA
#' TMA_IF02 Total stroma Total NA NA 13431416.5 NA NA NA NA NA NA NA
#' TMA_IF02 Total tumor CD8p -1.00 NA NA NA NA NA NA NA NA NA
#' TMA_IF02 Total tumor CD8p_total -1.00 NA NA NA NA NA NA NA NA NA
#' TMA_IF02 Total tumor GBp -1.00 NA NA NA NA NA NA NA NA NA
#' TMA_IF02 Total tumor GBp_total -1.00 NA NA NA NA NA NA NA NA NA
#' TMA_IF02 Total tumor KI67p 19.99 NA NA NA NA NA NA NA NA NA
#' TMA_IF02 Total tumor KI67p_total 19.99 NA NA NA NA NA NA NA NA NA
#' TMA_IF02 Total tumor CD8p_GBp NA NA NA NA NA NA NA NA NA NA
#' TMA_IF02 Total tumor CD8p_GBp_total NA NA NA NA NA NA NA NA NA NA
#' TMA_IF02 Total tumor CD8p_KI67p NA NA NA NA NA NA NA NA NA NA
#' TMA_IF02 Total tumor CD8p_KI67p_total NA NA NA NA NA NA NA NA NA NA
#' TMA_IF02 Total tumor Total NA NA 2321735.6 NA NA NA NA NA NA NA
#'
#' @param sample_name [string]
#' @param image_ids [string vector]
#' @param cell_types [string vector]
#' @param tissue_list [string vector]
#' @param markers_phenotyped [string vector]
#' @param markers_scored [string vector]
#' @param thresholds [data frame]
#' @param tissue_table [data frame]
generate_summary_table
<-
function
(
sample_name
,
image_ids
,
cell_types
,
tissue_list
,
markers_phenotyped
,
markers_scored
,
thresholds
,
tissue_table
){
# Input check.
stopifnot
(
tissue_table
[,
'sample_name'
]
==
sample_name
)
# Create summary table backbone.
# *****************************
# Add "Total" values to the list of image IDs and cell types. These represent the total for
# all image IDs, and the total for all cell types
(
i.e. all cell types together
)
.
# all image IDs, and the total for all cell types
,
i.e. all cell types together.
image_ids
=
c
(
'Total'
,
image_ids
)
cell_types
=
c
(
cell_types
,
'Total'
)
# Compute nuber of rows of the summary_table (i.e. image ID, cell type and tissue type combinations)
# Compute number of rows of the summary_table, i.e. image ID, cell type and tissue type
# combinations.
row_nb
=
length
(
tissue_list
)
*
length
(
cell_types
)
*
length
(
image_ids
)
# Create table.
...
...
@@ -159,31 +149,31 @@ generate_summary_table <- function(sample_name,
summary_table
[
summary_table
[,
'CellType'
]
%in%
paste0
(
rep
(
markers_phenotyped
,
each
=
2
),
c
(
'p'
,
'p_total'
)),
'Threshold'
]
=
-1
# Scored markers get the threshold value extracted from the thresholds table.
# Scored markers get the
ir
threshold value extracted from the thresholds table.
for
(
marker
in
markers_scored
){
for
(
tissue
in
tissue_list
){
stop
(
"######## NOT IMPLEMENTED YET"
)
tmpColNb
=
which
(
summary_table
[,
'CellType'
]
%in%
paste0
(
marker
,
c
(
'p'
,
'p_total'
),
sep
=
''
)
&
summary_table
[,
'TissueType'
]
==
tissue
)
summary_table
[
tmpColNb
,
'Threshold'
]
=
threshold_table
[
sample_name
,
marker
,
tissue
]
col_index
=
which
(
summary_table
[,
'CellType'
]
%in%
paste0
(
marker
,
c
(
'p'
,
'p_total'
))
&
summary_table
[,
'TissueType'
]
==
tissue
)
summary_table
[
col_index
,
'Threshold'
]
=
thresholds
[
thresholds
$
tissue_type
==
tissue
,
marker
]
}
}
# Add surface values in
MM
2 to summary table.
# Add surface values in
mm
2 to summary table.
# ******************************************
# Add the surface values for each image ID and tissue type to the "Total" cell type row of the
# summary table.
for
(
image_id
in
image_ids
){
for
(
tissue
in
tissue_list
){
row_
nb
=
which
(
summary_table
$
ImageID
==
image_id
&
row_
index
=
which
(
summary_table
$
ImageID
==
image_id
&
summary_table
$
CellType
==
'Total'
&
summary_table
$
TissueType
==
tissue
)
surface_rows
=
switch
(
ifelse
(
image_id
==
'Total'
,
1
,
2
),
which
(
tissue_table
[,
'tissue_category'
]
==
tissue
),
which
(
tissue_table
[,
'tissue_category'
]
==
tissue
&
tissue_table
[,
'image_id'
]
==
image_id
))
summary_table
[
row_nb
,
'SurfaceMM2'
]
=
sum
(
tissue_table
[
surface_rows
,
'region_area_surface'
])
summary_table
[
row_index
,
'SurfaceMM2'
]
=
sum
(
tissue_table
[
surface_rows
,
'region_area_surface'
])
}
}
...
...
@@ -195,46 +185,46 @@ generate_summary_table <- function(sample_name,
####################################################################################################
#' Computes the following statistics for all cell types of a given image_id and tissue type:
#'
#' - cell count : number of cells with type "cell_type" in tissue "tissue_type".
#' - intensity mean : mean intensity value for cells matching "cell_type" and "tissue_type".
#' - intensity median: median ...
#' - intensity min : minimum ...
#' - intensity max : maximum ...
#' - intensity SD : standard deviation ...
#'
#' The function returns a data frame with the statistic values in the same order as listed above.
#'
#' @param image_id [string] image_id for which the statistics should be computed. If this value is
#' set to "Total" the statistics are computed on all images in cell_table.
#' @param tissue_type [string] Tissue type, e.g. "stroma", "tumor".
#' @param cell_types [string] cell type, e.g. "CD4", "CD8", "CD4_CD8".
#' @param cell_table [data frame] Data frame containing MEAN and RECLASSIFIED marker intensity
#' values.
#'
cell_type_statistics
<-
function
(
image_id
,
tissue_type
,
cell_types
,
cell_table
){
# *******************************************************************************************
# Computes the following statistics for all cell types of a given image_id and tissue type:
# - cell count : number of cells with type "cell_type" in tissue "tissue_type".
# - intensity mean : mean intensity value for cells maching "cell_type" and "tissue_type".
# - intensity median: median ...
# - intensity min : minimum ...
# - intensity max : maximum ...
# - intensity SD : standard deviation ...
#
# The function returns a data frame with the statistic values in the same order as listed above.
#
# Input arguments:
# cell_type : string describing cell type, e.g. "CD4", "CD8", "CD4_CD8".
# tissue_type: string for tissue type, e.g. "stroma", "tumor".
# cell_table : data frame containing MEAN and RECLASSIFIED marker intensity values,
# in the format as returned by the load_cell_data() function.
# image_id : string giving the image_id value of the subset image for which the
# statistics should be computed. If this value is set to "Total" (the
# default), then the statistics are computed on all images contained in
# the cell_table.
# *******************************************************************************************
# Create return data frame.
cell_types
=
c
(
cell_types
,
'Total'
)
output_df
=
data.frame
(
'ImageID'
=
image_id
,
'TissueType'
=
tissue_type
,
'CellType'
=
cell_types
,
'CellType'
=
c
(
cell_types
,
'Total'
),
'CellCount'
=
0
,
row.names
=
cell_types
,
'IntMean'
=
NA
,
'IntMedian'
=
NA
,
'IntMin'
=
NA
,
'IntMax'
=
NA
,
'IntSD'
=
NA
,
row.names
=
c
(
cell_types
,
'Total'
),
stringsAsFactors
=
FALSE
)
output_df
[,
c
(
'IntMean'
,
'IntMedian'
,
'IntMin'
,
'IntMax'
,
'IntSD'
)]
=
NA
# Subset cell_table to keep only the rows that are matching the requested
"
tissue
_category"
#
and "
image_id
"
value
s
. If
the
image
ID
is 'Total',
then we
only subset by tissue type
#
because it corresponds values for the sum accross all
image_id values.
# Subset cell_table to keep only the rows that are matching the requested tissue
type and
# image_id value. If image
_id
is 'Total', only subset by tissue type
to get the sum across all
# image_id values.
# Note: it's much faster to first subset by image_id and then by tissue_type than the reverse.
# get("image_id", 1L)
if
(
image_id
!=
'Total'
)
cell_table
=
cell_table
[
cell_table
$
image_id
==
image_id
,
]
cell_table
=
cell_table
[
cell_table
$
tissue_category
==
tissue_type
,
]
if
(
image_id
!=
'Total'
)
cell_table
=
cell_table
[
cell_table
$
image_id
==
image_id
,
]
cell_table
=
cell_table
[
cell_table
$
tissue_category
==
tissue_type
,
]
# If the subset has 0 rows, the image ID has no cells for the given tissue type. This does
# sometimes occur.
if
(
nrow
(
cell_table
)
==
0
)
return
(
output_df
)
...
...
@@ -252,7 +242,7 @@ cell_type_statistics <- function(image_id, tissue_type, cell_types, cell_table){
# Loop through all cell types and compute statistics
for
(
cell_type
in
cell_types
[
-
length
(
cell_types
)]
){
for
(
cell_type
in
cell_types
){
# Split the cell type into individual markers.
# *******************************************
...
...
@@ -284,8 +274,8 @@ cell_type_statistics <- function(image_id, tissue_type, cell_types, cell_table){
# to the target cell_type.
# -> regular: keep only cells (rows of table) that are positive for the markers of the
# target cell_type and negative for all other markers.
row_sum_cell_type
=
rowSums
(
sub_table
[,
in_cell_type
,
drop
=
F
])
row_sum_total
=
rowSums
(
sub_table
[,
c
(
in_cell_type
,
not_in_cell_type
),
drop
=
F
])
row_sum_cell_type
=
rowSums
(
sub_table
[,
in_cell_type
,
drop
=
F
ALSE
])
row_sum_total
=
rowSums
(
sub_table
[,
c
(
in_cell_type
,
not_in_cell_type
),
drop
=
F
ALSE
])
if
(
endsWith
(
cell_type
,
'_total'
)){
sub_table
=
sub_table
[
row_sum_cell_type
==
length
(
in_cell_type
),]
}
else
{
...
...
R/individual_markers.R
View file @
40dadf5f
...
...
@@ -131,19 +131,19 @@ merge_cell_data_files <- function(files_to_merge){
if
(
as.numeric
(
value_frequency
[
1
])
>
as.numeric
(
value_frequency
[
2
])){
tissue_cat_df
[
x
,]
=
names
(
value_frequency
)[
1
]
if
(
SHOW_TISSUE_CATEGORY_MISMATCH_WARNING
)
raise_error
(
msg
=
c
(
paste0
(
'
T
issue_category values differ across files. '
,
msg
=
c
(
paste0
(
'
[t
issue_category
]
values differ across files. '
,
'Values were reconciled based on majority ruling.'
),
paste0
(
'Offending row: '
,
x
)),
file
=
files_to_merge
[
1
],
file
=
files_to_merge
[
1
],
type
=
'warning'
)
# Case 2: majority ruling is not possible
# Case 2: majority ruling is not possible
.
}
else
{
raise_error
(
msg
=
c
(
'Could not merge individual marker files.'
,
'Reason: tissue_category values differ across files with no majority.'
,
paste0
(
'Offending row: '
,
x
),
paste0
(
'Offending values:'
,
paste
(
tissue_cat_df
[
x
,],
collapse
=
' '
))),
paste0
(
'Offending values:
'
,
paste
(
tissue_cat_df
[
x
,],
collapse
=
' '
))),
file
=
files_to_merge
[
1
])
}
}
...
...
R/input_check.R
View file @
40dadf5f
...
...
@@ -36,6 +36,8 @@ inputdir_check <- function(input_dir, output_dir){
# Verify that input parameters and sample rename files are present, and if needed rename them.
rename_file_by_pattern
(
file_name
=
PARAMETERS_FILE
,
pattern
=
'param'
,
dir_name
=
input_dir
,
out_dir
=
output_dir
,
raise_error_if_absent
=
TRUE
)
rename_file_by_pattern
(
file_name
=
THRESHOLDS_FILE
,
pattern
=
'threshold'
,
dir_name
=
input_dir
,
out_dir
=
output_dir
,
raise_error_if_absent
=
FALSE
)
rename_file_by_pattern
(
file_name
=
SAMPLE_RENAME_FILE
,
pattern
=
'rename'
,
dir_name
=
input_dir
,
out_dir
=
output_dir
,
raise_error_if_absent
=
FALSE
)
return
(
invisible
(
NULL
))
...
...
@@ -104,7 +106,6 @@ standardize_and_split_cell_data <- function(input_file,
phenotype_confidence_threshold
,
delete_input_file
=
FALSE
){
# ********************************************************************************************
#
# Differences between inForm versions:
# version 2.2
# - *_tissue_seg_data_summary.txt files contain a column named "Region Area (pixels)".
...
...
@@ -113,7 +114,7 @@ standardize_and_split_cell_data <- function(input_file,
# - a new column named "Annotation ID" is added to *_cell_seg_data.txt.
# - *_tissue_seg_data_summary.txt files contain column named "Region Area (square microns)"
# - in addition the "Sample ID" column of the *_cell_seg_data.txt files no longer contains
# the imageID values. These are now present in the "Annotation ID" column.
# the image
ID values. These are now present in the "Annotation ID" column.
#
#
# Input arguments:
...
...
@@ -125,7 +126,7 @@ standardize_and_split_cell_data <- function(input_file,
# delete_input_file: if TRUE, the input_file is deleted after it was split by samples.
# ********************************************************************************************
# Load input table. Verif
t
y it is not empty and standardize the column names.
# Load input table. Verify it is not empty and standardize the column names.
input_table
=
read.table
(
input_file
,
sep
=
'\t'
,
as.is
=
T
,
header
=
T
,
colClasses
=
'character'
,
check.names
=
T
,
strip.white
=
T
)
if
(
nrow
(
input_table
)
==
0
)
raise_error
(
'Input file has zero rows.'
,
file
=
input_file
)
...
...
@@ -148,7 +149,10 @@ standardize_and_split_cell_data <- function(input_file,
sample_names
=
extract_sample_name
(
input_table
[,
'sample_name'
],
input_file
=
input_file
)
input_table
=
input_table
[
sample_names
%in%
samples
,]
sample_names
=
sample_names
[
sample_names
%in%
samples
]
if
(
nrow
(
input_table
)
==
0
)
raise_error
(
'Input file has zero rows.'
,
file
=
input_file
)
if
(
nrow
(
input_table
)
==
0
)
raise_error
(
msg
=
'No matching values found in [sample_name] column for any of the input samples'
,
file
=
input_file
,
items_to_list
=
samples
)
# Extract image ID values. If the "annotation_id" column is present (inForm 2.4), extract
...
...
@@ -173,7 +177,7 @@ standardize_and_split_cell_data <- function(input_file,
file_values_are_from
=
input_file
)
# Reclass 'phenotype_values' for rows where confidence < phenotype_confidence_threshold
# Re
-
class 'phenotype_values' for rows where confidence < phenotype_confidence_threshold
# to the value of 'MISSING'.
phenotype_values
[
which
(
confidence_values
<
phenotype_confidence_threshold
)]
=
'MISSING'
...
...
@@ -352,17 +356,17 @@ standardize_and_split_tissue_data <- function(input_file,
####################################################################################################
#' Standardize column names of input files.
#'
#' @param column_names [string vector] Names of columns to standardize.
#' @param input_file [string] Path and name of file from which the columns were taken. Only used to
#' display an error message.
#' @return Standardized column names.
standardize_column_names
=
function
(
column_names
,
input_file
){
# ********************************************************************************************
#
# Input arguments:
# - column_names: string vector. Names of columns to standardize.
# - input_file: file name from which the columns were taken. Only used to display an error
# message.
# ********************************************************************************************
# Replace any '.' in column names by an '_'. The '.' are generally introduced in column names
# by R as a replacement of a non-authorized character such as a blank space or a bracket.
# For readability, multiple '.' are replaced by a single '_'.
column_names
=
gsub
(
pattern
=
'\\.+'
,
replacement
=
'_'
,
x
=
column_names
)
column_names
=
gsub
(
pattern
=
'_+$'
,
replacement
=
''
,
x
=
column_names
)
...
...
@@ -382,10 +386,18 @@ standardize_column_names = function(column_names, input_file){
for
(
i
in
grep
(
paste0
(
col_start_regexp
,
'.*_mean_.*'
),
x
=
column_names
)){
marker_name
=
sub
(
col_start_regexp
,
''
,
column_names
[
i
])
marker_name
=
sub
(
'_.*$'
,
''
,
marker_name
)
# If the marker is present in the AUTHORIZED_MARKERS list, correct its capitalization if
# needed.
x
=
which
(
toupper
(
marker_name
)
==
toupper
(
AUTHORIZED_MARKERS
))
stopifnot
(
length
(
x
)
<=
1
)
marker_name
=
ifelse
(
length
(
x
)
==
0
,
marker_name
,
AUTHORIZED_MARKERS
[
x
])
# Rename column.
column_names
[
i
]
=
paste0
(
marker_name
,
'_mean'
)
}
# Verify there
is
no duplicate column.
# Verify there
are
no duplicate column
s
.
duplicated_columns
=
which
(
duplicated
(
column_names
))
if
(
length
(
duplicated_columns
)
>
0
)
raise_error
(
msg
=
'Duplicated column names found in input file:'
,
...
...
@@ -454,6 +466,8 @@ check_and_fix_phenotype_values <- function(phenotype_values,
type
=
'warning'
)
}
# Replace
# Substitute '-' with '_' in Phenotype values. This is for the case where a '-' was used as
# separator value instead of a '_'.
phenotype_values
=
gsub
(
pattern
=
'-'
,
replacement
=
'_'
,
phenotype_values
)
...
...
@@ -473,11 +487,15 @@ check_and_fix_phenotype_values <- function(phenotype_values,
ignored
=
IGNORED_PHENOTYPES
))
==
0
)
return
(
phenotype_values
)
# Replace accepted NO_PHENOTYPE synonym values with NO_PHENOTYPE.
phenotype_values
[
which
(
toupper
(
phenotype_values
)
%in%
toupper
(
c
(
NO_PHENOTYPE
,
NO_PHENOTYPE_SYNONYMS
)))]
=
NO_PHENOTYPE
# Replace accepted stroma and tumor synonyms with 'DAPIp' and 'CKp' respectively.
for
(
x
in
1
:
2
)
phenotype_values
=
gsub
(
pattern
=
paste0
(
rep
(
switch
(
x
,
AUTHORIZED_STROMA_VALUES
,
AUTHORIZED_TUMOR_VALUES
),
each
=
2
),
c
(
''
,
'p'
),
collapse
=
'|'
),
replacement
=
switch
(
x
,
'DAPIp'
,
'CKp'
),
x
=
phenotype_values
,
ignore.case
=
TRUE
)
replacement
=
switch
(
x
,
'DAPIp'
,
'CKp'
),
x
=
phenotype_values
,
ignore.case
=
TRUE
)
# Correct capitalization and add a 'p' suffix (for 'positive') to any marker missing it.
...
...
@@ -678,15 +696,13 @@ check_marker_is_authorized <- function(marker_list, marker_type){
####################################################################################################
#' Test whether a file contains "individual marker" values. This is simply done by looking at the
#' file name. The convention is that individual marker files will contain at least one marker name
#' in their file name.
#' The function returns the list of markers in the file name, or character(0) if none is found.
markers_in_file_name
<-
function
(
file_name
){
# ********************************************************************************************
# Test whether a file contains "individual marker" values. This is simply done by looking at
# the file name. The convention is that individual marker files will contain at least one
# marker name in their file name.
# The function returns the list of markers in the file name, or character(0) if none is found.
# ********************************************************************************************
return
(
sort
(
as.character
(
names
(
unlist
(
sapply
(
AUTHORIZED_MARKERS
,
FUN
=
function
(
x
)
grep
(
x
,
file_name
,
ignore.case
=
T
)))))))
sapply
(
AUTHORIZED_MARKERS
,
FUN
=
function
(
x
)
grep
(
x
,
basename
(
file_name
)
,
ignore.case
=
T
)))))))
}
####################################################################################################
...
...
R/legacy_functions.R
View file @
40dadf5f
...
...
@@ -53,12 +53,16 @@ generate_parameter_file_from_legacy_input <- function(session_root_dir){
}
else
scored_markers
=
c
(
scored_markers
,
marker
)
}
# Rename thresholds file.
# Rename thresholds file, or delete it if there are no scored markers.
if
(
length
(
scored_markers
)
>
0
){
file.rename
(
threshold_file
,
file.path
(
session_root_dir
,
THRESHOLDS_FILE
))
}
else
{
file.remove
(
threshold_file
)
}
# Create new "parameters.txt" file.
file_connection
=
file
(
file.path
(
session_root_dir
,
"parameters.txt"
),
open
=
"w"
)
writeLines
(
paste0
(
"# Auto-generated parameter file for "
,
basename
(
session_root_dir
),
"."
),
file_connection
=
file
(
file.path
(
session_root_dir
,
"parameters.txt"
),
open
=
'w'
)
writeLines
(
paste0
(
"# Auto-generated parameter file for "
,
basename
(
session_root_dir
),
'.'
),
con
=
file_connection
)
writeLines
(
"samples:"
,
con
=
file_connection
)
for
(
x
in
samples
)
writeLines
(
x
,
con
=
file_connection
)
...
...
R/load_data.R
View file @
40dadf5f
...
...
@@ -31,7 +31,7 @@ load_session_parameters <- function(session_root_dir){
# ************
# Check that at least one tissue type was provided and that all tissues are part of the
# authorized list.
if
(
!
'tissues'
%in%
names
(
arg_values
)
)
raise_error
(
msg
=
"Parameter 'tissues' is missing."
,
if
(
!
'tissues'
%in%
names
(
arg_values
))
raise_error
(
msg
=
"Parameter 'tissues' is missing."
,
file
=
PARAMETERS_FILE
)
arg_values
[[
'tissues'
]]
=
tolower
(
arg_values
[[
'tissues'
]])
tissues
=
unique
(
arg_values
[[
'tissues'
]])
...
...
R/postinform.R
View file @
40dadf5f
...
...
@@ -26,7 +26,6 @@
#' @return nothing.
#' @examples
#' postinform(input_file_or_dir=input_file, delete_input=FALSE, immucan_output=FALSE)
#'
postinform
<-
function
(
input_file_or_dir
,
command
=
'process'
,
output_suffix
=
''
,
...
...
@@ -106,8 +105,8 @@ postinform <- function(input_file_or_dir,
# Run the
P
ost-inForm pipeline
# **************************
# Run the
p
ost-inForm pipeline
# **************************
**
log_message
(
paste
(
rep
(
'#'
,
80
),
collapse
=
''
),
padding
=
''
)
log_message
(
paste0
(
'Starting Post-inForm - version '
,
POSTINFORM_VERSION
))
log_message
(
'**********************************'
)
...
...
@@ -178,8 +177,6 @@ postinform_pipeline <- function(input_dir,
# - search for cell and tissue segmentation files in sub-directories or the session root.
log_message
(
'Input data check:'
)
log_message
(
paste
(
'input directory:'
,
input_dir
),
level
=
2
)
#delete_unnecessary_files(input_dir)
inputdir_check
(
input_dir
,
output_dir
)
log_message
(
'input dir check: OK'
,
level
=
2
)
...
...
@@ -406,9 +403,9 @@ run_sample <- function(sample_name, input_parameters){
thresholds
=
input_parameters
$
thresholds
# Keep only the thresholds for the current sample.
thresholds
=
thresholds
[
which
(
thresholds
[,
'sample_name'
]
==
sample_name
),
]
thresholds_subset
=
as.matrix
(
thresholds
[
sample_name
,,])
rownames
(
thresholds_subset
)
=
dimnames
(
thresholds
)[[
2
]]
colnames
(
thresholds_subset
)
=
dimnames
(
thresholds
)[[
3
]]
#
thresholds_subset = as.matrix(thresholds[sample_name,,])
#
rownames(thresholds_subset) = dimnames(thresholds)[[2]]
#
colnames(thresholds_subset) = dimnames(thresholds)[[3]]
}
else
thresholds
=
NULL
...
...
@@ -434,7 +431,7 @@ run_sample <- function(sample_name, input_parameters){
# Load cell segmentation data for current sample.
# **********************************************
# Load data and convert marker inten