individual_markers.R 15.2 KB
Newer Older
Robin Engler's avatar
Robin Engler committed
1
2
3
4
5
6
7
8
9
10

####################################################################################################
merge_individual_marker_files <- function(input_dir){
    # ********************************************************************************************
    # Merge individual marker files located in input_dir into a single file. The merged result is
    # written to a new file and the original files are deleted from disk.
    #
    # ********************************************************************************************
    sample_name = basename(input_dir)
    for(extension in c(CELL_FILES_EXTENSION, TISSUE_FILES_EXTENSION)){
11

Robin Engler's avatar
Robin Engler committed
12
        # Identify files to merge. Make sure at least 2 files were found.
13
        files_to_merge = list.files(path=input_dir, pattern=paste0(extension, '$'),
Robin Engler's avatar
Robin Engler committed
14
15
                                    all.files=F, full.names=T, recursive=F, ignore.case=F)
        if(length(files_to_merge) == 1) raise_error(
16
17
            msg = paste('Individual marker merge error. Only one file to merge for sample:',
                        sample_name),
Robin Engler's avatar
Robin Engler committed
18
19
            file = input_dir)
        if(any(!startsWith(basename(files_to_merge), 'individualmarker_'))) raise_error(
20
            msg = 'Individual marker merge error: mix of individual and non-individual files.',
Robin Engler's avatar
Robin Engler committed
21
            file = input_dir)
22

Robin Engler's avatar
Robin Engler committed
23
24
25
        # Merge *_cell_seg_data.txt or *_tissue_seg_data_summary.txt files.
        if(extension==CELL_FILES_EXTENSION)   merged_df = merge_cell_data_files(files_to_merge)
        if(extension==TISSUE_FILES_EXTENSION) merged_df = merge_tissue_data_files(files_to_merge)
26

Robin Engler's avatar
Robin Engler committed
27
28
29
30
31
        # Write merged file to disk.
        file_name = file.path(input_dir, paste0(sample_name, extension))
        if(file.exists(file_name)) raise_error(
            msg = 'Individual marker merge error: file already exists.', file = file_name)
        write.table(merged_df, file=file_name, quote=FALSE, row.names=FALSE, sep='\t')
32

Robin Engler's avatar
Robin Engler committed
33
34
35
36
37
38
39
40
41
42
43
        # Delete original individual marker files.
        unlink(files_to_merge)
    }
}
####################################################################################################



####################################################################################################
merge_cell_data_files <- function(files_to_merge){
    # ********************************************************************************************
44
    # Load the provided *_cell_seg_data.txt files and merge their content into a single data
Robin Engler's avatar
Robin Engler committed
45
46
47
48
49
50
51
    # frame.
    #
    # Input arguments:
    #  -> files_to_merge : list[string], path + name of *_cell_seg_data.txt files to be merged.
    #
    # ********************************************************************************************
    stopifnot(length(files_to_merge) > 1)
52

Robin Engler's avatar
Robin Engler committed
53
    # Define columns that will be used as key fields during the merge.
54
55
    key_fields = c('sample_name', 'image_id', 'cell_x_position', 'cell_y_position')

Robin Engler's avatar
Robin Engler committed
56
57
58
59
    # Loop through all files that match the prefix + suffix combination. Load and check the
    # content of each file, then add it to the merged data frame.
    merged_df = NULL
    for(f in files_to_merge){
60

Robin Engler's avatar
Robin Engler committed
61
62
        # Load data and remove duplicated rows.
        input_df = remove_duplicated_rows(
63
64
65
                        input_df = read.table(f, sep='\t', header=TRUE, as.is=TRUE,
                                              colClasses='character', strip.white=TRUE),
                        key_fields = key_fields,
Robin Engler's avatar
Robin Engler committed
66
                        file_name = f)
67

Robin Engler's avatar
Robin Engler committed
68
69
70
71
72
        # Verify all input files have the same columns. This must always be the case as we have
        # already standardized the files earlier.
        if(is.null(merged_df)){
            stopifnot(all(colnames(input_df)[1:length(key_fields)] == key_fields))
            col_names = colnames(input_df)
73
        }
Robin Engler's avatar
Robin Engler committed
74
        stopifnot(all(colnames(input_df) == col_names))
75
76


Robin Engler's avatar
Robin Engler committed
77
78
79
80
81
82
83
84
85
        # Merge data.
        # **********
        # Merge data frame for the current marker with the global dataframe 'merged_df'.
        if(is.null(merged_df)){
            merged_df = input_df
        } else{
            # Append a suffix to the non-key fields, so they have a unique name and get preserved
            # during the merge.
            colnames(input_df)[(length(key_fields)+1):ncol(input_df)] = paste0(
86
                colnames(input_df)[(length(key_fields)+1):ncol(input_df)],
Robin Engler's avatar
Robin Engler committed
87
                '_', which(files_to_merge==f))
88

Robin Engler's avatar
Robin Engler committed
89
90
            row_count = nrow(merged_df)
            merged_df = merge(merged_df, input_df, by=key_fields, all=FALSE, sort=FALSE)
91
92

            # Compute percentage of data loss due to merge operation. If it's more than 2% a
Robin Engler's avatar
Robin Engler committed
93
94
95
            # warning is issued.
            row_loss_percentage = round((row_count - nrow(merged_df)) / row_count * 100, 2)
            if(row_loss_percentage > 5) raise_error(
96
97
98
99
                msg = c(paste0('A large number of records were lost while merging file [',
                               row_loss_percentage, '%].'),
                     'This could indicate a problem in the input data and should be investigated.'),
                file = f,
Robin Engler's avatar
Robin Engler committed
100
101
102
103
                type = 'warning')
        }
        rm(input_df)
    }
104
105


Robin Engler's avatar
Robin Engler committed
106
107
108
109
110
111
112
    # Extract tissue category, phenotype, and marker intensity values.
    # ***************************************************************
    # Create separate data frames for the non-key fields.
    tissue_cat_df = merged_df[, grep('tissue_category', colnames(merged_df))]
    phenotype_df  = merged_df[, grep('phenotype', colnames(merged_df))]
    marker_int_df = merged_df[, grep('_mean$|_mean_', colnames(merged_df))]
    for(x in 1:ncol(marker_int_df)) marker_int_df[,x] = as.numeric(marker_int_df[,x])
113

Robin Engler's avatar
Robin Engler committed
114
    # Remove the extracted columns from the merged data frame.
115
116
117
    merged_df = merged_df[,which(colnames(merged_df) %in% c(key_fields, 'cell_id'))]


Robin Engler's avatar
Robin Engler committed
118
119
120
121
122
    # Combine data from all "Tissue Category" columns.
    # ***********************************************
    # In principle, the "Tissue Category" columns of all individual markers should contain the
    # same value. Here we verify that this is the case and then keep only one copy of them.
    if(any(tissue_cat_df != tissue_cat_df[,1])){
123
124
        diff_rows = sort(unique(unlist(sapply(2:ncol(tissue_cat_df),
                                FUN=function(x) which(tissue_cat_df[x] != tissue_cat_df[,1])))))
Robin Engler's avatar
Robin Engler committed
125
126
127
        for(x in diff_rows){
            value_frequency = sort(table(as.character(tissue_cat_df[x,])), decreasing=T)
            stopifnot(length(value_frequency) >= 2)
128
129

            # Case 1: one of the tissue values has a majority within the row. Majority ruling is
Robin Engler's avatar
Robin Engler committed
130
131
132
            # possible and the most frequent value is used.
            if(as.numeric(value_frequency[1]) > as.numeric(value_frequency[2])){
                tissue_cat_df[x,] = names(value_frequency)[1]
133
                raise_error(msg = c(paste0('tissue_category values differ accorss files. ',
Robin Engler's avatar
Robin Engler committed
134
                                           'Values were reconciled based on majority ruling.'),
135
136
                                    paste0('Offending row: ', x)),
                            file=files_to_merge[1],
Robin Engler's avatar
Robin Engler committed
137
                            type = 'warning')
138
139

            # Case 2: majority ruling is not possible
Robin Engler's avatar
Robin Engler committed
140
141
142
            } else{
                raise_error(
                    msg = c('Could not merge individual marker files.',
143
144
145
                            'Reason: tissue_category values differ across files with no majority.',
                            paste0('Offending row: ', x),
                            paste0('Offending values:', paste(tissue_cat_df[x,], collapse=' '))),
Robin Engler's avatar
Robin Engler committed
146
                    file = files_to_merge[1])
147
            }
Robin Engler's avatar
Robin Engler committed
148
149
150
        }
        stopifnot(all(tissue_cat_df == tissue_cat_df[,1]))
    }
151
    # Add tissue category values to the merged data frame.
Robin Engler's avatar
Robin Engler committed
152
153
    merged_df[,'tissue_category'] = tissue_cat_df[,1]

154

Robin Engler's avatar
Robin Engler committed
155
156
157
    # Combine data from all 'Phenotype' columns.
    # *****************************************
    # Each 'Phenotype' column contains the phenotyping for one or more individual marker. Here we
158
159
    # want to merge all these columns into a single column that will contain all phenotyping
    # information. If more than one marker is found for a given row, they get combined with a '_'
Robin Engler's avatar
Robin Engler committed
160
161
    # separator. E.g. if 'CD8' and 'PD1' are found in same row, the combination becomes 'CD8_PD1'.
    regexp = paste(sapply(IGNORED_PHENOTYPES, FUN=function(x) paste0(x,'_|_',x)), collapse = '|')
162
163
164
165
166
167
    merged_df[,'phenotype'] = apply(phenotype_df, MARGIN=1,
                                    FUN=function(x) gsub(pattern = regexp,
                                                         replacement = '',
                                                         x = paste(unique(x), collapse='_')))


Robin Engler's avatar
Robin Engler committed
168
169
170
171
172
173
174
175
176
177
    # Combine data from all marker intensity columns.
    # **********************************************
    # Verify that all marker intensity columns in the merge data frame have the same values and
    # keep only one copy of them. A difference in values <= 0.01 is accepted.
    marker_intensity_cols = grep('_mean$', col_names, value=TRUE)
    tolerance_limit = 0.01
    for(col_name in marker_intensity_cols){
        col_index = grep(col_name, colnames(marker_int_df))
        differences = abs(marker_int_df[,col_index] - marker_int_df[,col_index[1]])
        differing_values = which(differences > tolerance_limit)
178

Robin Engler's avatar
Robin Engler committed
179
180
181
182
        # If there are values > tolerance_limit, an warning is displayed to the user.
        if(length(differing_values) > 0){
            differing_values = as.vector(as.matrix(differences))[differing_values]
            raise_error(
183
184
185
186
                msg=c(paste0('Values for column [', col_name, '] differ accross individual files'),
                      paste0('to merge by more than ', tolerance_limit, ' at ',
                             length(differing_values), ' occurences [',
                             round(length(differing_values)/nrow(marker_int_df)*100, 3), '%].'),
Robin Engler's avatar
Robin Engler committed
187
                      'Values from the first file (alphabetically) will be used.'),
188
                file=dirname(files_to_merge[1]),
Robin Engler's avatar
Robin Engler committed
189
190
191
                type = 'warning')
        }
    }
192

Robin Engler's avatar
Robin Engler committed
193
194
195
196
197
198
199
200
201
202
203
    # Add marker intensity values to merged dataframe.
    merged_df = cbind(merged_df, marker_int_df[, 1:length(marker_intensity_cols)])
    return(merged_df)
}
####################################################################################################



####################################################################################################
merge_tissue_data_files <- function(files_to_merge){
    # ********************************************************************************************
204
205
206
    # "merges" the content of all provided *_tissue_seg_data_summary.txt files into a single
    # data frame. Most of the time, all *_tissue_seg_data_summary.txt files are exactly the same,
    # but it can happen that some are missing a number of rows (i.e. some image subsets are
Robin Engler's avatar
Robin Engler committed
207
208
209
    # missing because there were excluded based on their poor quality).
    # So the "merge" operation done in this functions consists keeping only rows from the input
    # *_tissue_seg_data_summary.txt files that are found in all files (i.e. do the intersection
210
    # of all files.).
Robin Engler's avatar
Robin Engler committed
211
212
213
    #
    # ********************************************************************************************
    stopifnot(length(files_to_merge) > 1)
214

Robin Engler's avatar
Robin Engler committed
215
216
217
    # Define columns that will be used as key fields to carry out the merge.
    key_fields = c('sample_name', 'image_id', 'tissue_category')
    non_key_fields = c('region_area_surface', 'region_area_percent')
218

Robin Engler's avatar
Robin Engler committed
219
220
221
    # Merge all files in list.
    merged_df = NULL
    for(f in files_to_merge){
222
223

        # Load data and remove duplicated rows. Verify that the column names are correct - at this
Robin Engler's avatar
Robin Engler committed
224
225
        # point the files are standardized so they must all have the same column names.
        input_df = remove_duplicated_rows(
226
227
228
                        input_df = read.table(f, sep='\t', header=TRUE, as.is=TRUE, strip.white=T,
                                              colClasses=c(rep('character',3), rep('numeric',2))),
                        key_fields = key_fields,
Robin Engler's avatar
Robin Engler committed
229
230
                        file_name = f)
        stopifnot(all(colnames(input_df) == c(key_fields, non_key_fields)))
231

Robin Engler's avatar
Robin Engler committed
232
233
234
235
        # Merge data frame for the current marker with the global dataframe 'merged_df'.
        if(is.null(merged_df)){
            merged_df = input_df
        } else{
236
            # Append suffix to non-key fields, so they have a unique name and get preserved
Robin Engler's avatar
Robin Engler committed
237
238
239
240
241
            # during the merge.
            colnames(input_df)[4:5] = paste0(non_key_fields, '_', which(files_to_merge %in% f))
            merged_df = merge(merged_df, input_df, by=key_fields, all=FALSE, sort=FALSE)
        }
    }
242

Robin Engler's avatar
Robin Engler committed
243
    # Search for mismatches among values of non-key fields. If some are detected, a warning
244
245
    # is displayed. For rows with mismatches, if any, compute the median of surface values. In
    # this way, if one of the input files has a different values it gets excluded (provided there
Robin Engler's avatar
Robin Engler committed
246
247
248
249
250
251
252
253
254
    # are at least 3 files).
    mismatches = NULL
    for(col_name in non_key_fields){
        col_index = grep(col_name, colnames(merged_df))
        difference = abs(merged_df[,col_index] - merged_df[,col_index[1]])
        mismatches = unique(c(mismatches, which(apply(difference, MARGIN=1, sum) > 0)))
    }
    if(length(mismatches) > 0){
        percentage = round(length(mismatches)/nrow(merged_df) * 100, 2)
255
256
257
258
        raise_error(msg = c('Mismatches in tissue surface among tissue seg files were found',
                            'Median value of tissue surface will be used for the following rows:',
                            paste0(paste(mismatches, collapse=', '), ' [', percentage, '%]')),
                    file = dirname(files_to_merge[1]),
Robin Engler's avatar
Robin Engler committed
259
                    type='warning')
260

Robin Engler's avatar
Robin Engler committed
261
262
263
264
265
266
        # Compute median values to reconciliate mismatches.
        for(col_name in non_key_fields){
            col_index = grep(col_name, colnames(merged_df))
            merged_df[mismatches, col_index[1]] = apply(merged_df[mismatches, col_index], 1, median)
        }
    }
267

Robin Engler's avatar
Robin Engler committed
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
    # Remove duplicated columns.
    merged_df = merged_df[,1:5]
    return(merged_df)
}
####################################################################################################



####################################################################################################
remove_duplicated_rows <- function(input_df, key_fields, file_name){
    # ********************************************************************************************
    # Check whether there are duplicated rows for key_fields in the input table (input_df), and if
    # so, removes them and displays a warning.
    #
    # file_name: name of file from where input_df is loaded. Only used to display warning to user.
    # ********************************************************************************************
284

Robin Engler's avatar
Robin Engler committed
285
286
    # Remove perfectly duplicated rows without displaying any warning-
    if(any(duplicated(input_df))) input_df = unique(input_df)
287

Robin Engler's avatar
Robin Engler committed
288
289
290
291
    # Remove rows duplicated over the key_fields with a warning to the user.
    duplicated_rows = which(duplicated(input_df[,key_fields]))
    if(length(duplicated_rows) > 0){
        input_df = input_df[-duplicated_rows,]
292
293
294
        raise_error(msg  = 'The following duplicated rows were deleted from input file:',
                    file = file_name,
                    items_to_list = duplicated_rows,
Robin Engler's avatar
Robin Engler committed
295
296
297
298
299
300
                    type = 'warning')
    }
    return(input_df)
}
####################################################################################################