rename_samples.R 9.56 KB
Newer Older
Robin Engler's avatar
Robin Engler committed
1
2

####################################################################################################
3
4
5
#' Rename samples in the input data and split them into two if needed.
#'
rename_samples <- function(sample_rename, root_dir){
Robin Engler's avatar
Robin Engler committed
6
7
8

    # Rename samples in cell and tissue segmentation files.
    # ****************************************************
9
10
11
12
    for(old_name in names(sample_rename)){
        new_name = sample_rename[[old_name]]
        old_dir = file.path(root_dir, old_name)
        new_dir = file.path(root_dir, new_name)
Robin Engler's avatar
Robin Engler committed
13
14
15
16
        stopifnot(dir.exists(old_dir))
        stopifnot(!dir.exists(new_dir))
        for(x in new_dir) dir.create(x, recursive=FALSE)
        log_message(paste0('rename ', old_name, ' to ', paste(new_name, collapse=' / '),
17
                           ifelse(length(new_name)>1, ' [sample will be split]', '')),
Robin Engler's avatar
Robin Engler committed
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
                    level=2)

        # Load cell and tissue segmentation files, change sample name and write to disk.
        for(file_extension in c(CELL_FILES_EXTENSION, TISSUE_FILES_EXTENSION)){
            old_file = file.path(old_dir, paste0(old_name, file_extension))
            new_file = file.path(new_dir, paste0(new_name, file_extension))
            tmp = read.table(old_file, sep='\t', as.is=TRUE, header=TRUE,
                             check.names=TRUE, strip.white=TRUE, colClasses='character')

            # Case 1: the data does not need to be split.
            if(length(new_name) == 1){
                tmp[,'sample_name'] = new_name
                write.table(tmp, file=new_file, sep='\t', quote=FALSE, row.names=FALSE)

            # Case 2: the data must be split into 2.
            } else if(length(new_name) == 2){
34
                # Get image ID values that corresponds to the 2 sub-samples in the original sample.
Robin Engler's avatar
Robin Engler committed
35
36
37
                if(file_extension == CELL_FILES_EXTENSION){
                    image_id_split = split_by_coordinate(coordinates = tmp[,'cell_y_position'],
                                                         image_ids   = tmp[,'image_id'])
38
                    #plot(tmp$cell_x_position[rand_sample], tmp$cell_y_position[rand_sample], col=as.factor(tmp$image_id[rand_sample]), pch=18)
Robin Engler's avatar
Robin Engler committed
39
40
                }
                if(!all(unique(tmp[,'image_id']) %in% unlist(image_id_split))) raise_error(
41
42
43
44
                    msg = c("Tissue seg data has more image IDs than cell seg data.",
                            "Image ID values no present in the cell seg data will be removed."),
                    file = old_file,
                    type = 'warning')
Robin Engler's avatar
Robin Engler committed
45
46
47

                # Temporary check for the split.
                if(FALSE){
48
                    jpeg(filename = file.path(root_dir, paste0(old_name, '_split.jpg')),
Robin Engler's avatar
Robin Engler committed
49
50
51
52
53
54
55
56
                         width = 700, height = 1000)
                        rand_sample = sample(1:nrow(tmp), ifelse(nrow(tmp) > 5000, 5000, nrow(tmp)))
                        plot(tmp$cell_x_position[rand_sample], tmp$cell_y_position[rand_sample],
                             col=ifelse(tmp[rand_sample, 'image_id'] %in% image_id_split[[1]],
                                        'darkgreen', 'darkred'))
                    dev.off()
                }

57
                # Split original sample based on image ID values.
Robin Engler's avatar
Robin Engler committed
58
59
60
61
62
63
64
65
                tmp_1 = tmp[tmp[,'image_id'] %in% image_id_split[[1]], ]
                tmp_2 = tmp[tmp[,'image_id'] %in% image_id_split[[2]], ]
                if(file_extension == CELL_FILES_EXTENSION) stopifnot(nrow(tmp_1)+nrow(tmp_2) == nrow(tmp))
                tmp_1[,'sample_name'] = new_name[1]
                tmp_2[,'sample_name'] = new_name[2]
                write.table(tmp_1, file=new_file[1], sep='\t', quote=FALSE, row.names=FALSE)
                write.table(tmp_2, file=new_file[2], sep='\t', quote=FALSE, row.names=FALSE)

66
            } else stop("cannot split sample in more than 2. THIS CASE IS NOT IMPLEMENTED YET.")
Robin Engler's avatar
Robin Engler committed
67
68
69
70
71
72
73
74
75
76
            unlink(old_file)
        }
        unlink(old_dir, recursive=T)
    }
}
####################################################################################################



####################################################################################################
77
78
79
80
81
#' Load a "sample rename" file from disk.
#'
#' @param input_file [string] path of the "sample rename" file to load.
#'
load_sample_rename_file <- function(input_file){
Robin Engler's avatar
Robin Engler committed
82
83

    # Load file content by line. Lines starting with # are ignored.
84
    file_content = read_file_as_vector(input_file, ignore_comments=TRUE, ignore_empty_line=TRUE)
Robin Engler's avatar
Robin Engler committed
85
86
87
88
89
    if(length(file_content) < 2) raise_error(
        msg  = 'Sample renaming files must contain at least 2 lines: header + one sample.',
        file = input_file)

    # For each line, extract the correspondence between the old and new sample names.
90
    sample_rename = list()
Robin Engler's avatar
Robin Engler committed
91
92
93
    for(line in file_content[-1]){
        line_elements = string_to_vector(line)
        if(!length(line_elements) %in% c(2,3)) raise_error(
94
            msg  = "All rows of 'sample_rename.txt' files must have either 2 or 3 elements.",
Robin Engler's avatar
Robin Engler committed
95
            file = input_file)
96
97
        old_name = line_elements[1]
        new_name = line_elements[2:length(line_elements)]
Robin Engler's avatar
Robin Engler committed
98

99
100
101
        # Add new old/new sample name pair to list.
        # Replace any ' character in the new sample name with "_".
        sample_rename[[old_name]] = gsub("'", '-', new_name)
Robin Engler's avatar
Robin Engler committed
102
103
    }

104
105
106
    return(sample_rename)
}
####################################################################################################
Robin Engler's avatar
Robin Engler committed
107

108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133

####################################################################################################
#'
#' @param sample_rename [list]
#'
check_sample_rename <- function(sample_rename, original_samples){

    # Verify that no old/new sample names are duplicated.
    check_for_duplicates(names(sample_rename),
                         error_message = 'Duplicated sample names found in file:',
                         error_file = SAMPLE_RENAME_FILE)
    check_for_duplicates(unlist(sample_rename, use.names=F),
                         error_message = 'Duplicated new sample names found in file:',
                         error_file = SAMPLE_RENAME_FILE)
    check_for_duplicates(c(names(sample_rename), unlist(sample_rename, use.names=F)),
                         error_message = 'New sample name has same name as old sample:',
                         error_file = SAMPLE_RENAME_FILE)

    # Verify that there is a 1:1 match between the sample_rename values and the original
    # sample names as passed in the input parameter file.
    check_for_difference(original_samples, names(sample_rename),
        error_message = "One or more samples have no matching value in the 'sample rename' file:",
        error_file = SAMPLE_RENAME_FILE)
    check_for_difference(names(sample_rename), original_samples,
        error_message = "One or more values in 'sample rename' file do not match any actual sample:",
        error_file = SAMPLE_RENAME_FILE)
Robin Engler's avatar
Robin Engler committed
134
135
136
137
}
####################################################################################################


138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
####################################################################################################
check_for_duplicates <- function(values_to_check, error_message, error_file){
    duplicates = values_to_check[duplicated(values_to_check)]
    if(length(duplicates) > 0) raise_error(msg = error_message,
                                          items_to_list = duplicates,
                                          file = error_file)
}
####################################################################################################


####################################################################################################
check_for_difference <- function(set_1, set_2, error_message, error_file){
    if(length(setdiff(set_1, set_2)) > 0) raise_error(msg = error_message,
                                                      items_to_list = setdiff(set_1, set_2),
                                                      file = error_file)
}
####################################################################################################

Robin Engler's avatar
Robin Engler committed
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187

####################################################################################################
split_by_coordinate <- function(coordinates, image_ids){
    # ********************************************************************************************
    #
    # ********************************************************************************************
    coordinates = as.numeric(coordinates)
    stopifnot(all(!is.na(coordinates)))

    # Find position of largest gap in coordinates: this is where the split must be done.
    sorted_coordinates = sort(coordinates)
    gap_size = diff(sorted_coordinates)
    gap_max_position = which(gap_size == max(gap_size))
    stopifnot(length(gap_max_position) == 1)
    coordinate_split_position = sorted_coordinates[gap_max_position] + max(gap_size)/2

    # Split image ID values at the position of the coordinate split.
    id_list_1 = unique(image_ids[which(coordinates <= coordinate_split_position)])
    id_list_2 = unique(image_ids[which(coordinates > coordinate_split_position)])
    stopifnot(length(id_list_1) > 0)
    stopifnot(length(id_list_2) > 0)
    if(length(intersect(id_list_1, id_list_2)) > 0) raise_error(
        msg = 'Split by coordinate failed: some image ID values overalp the split:',
        items_to_list = intersect(id_list_1, id_list_2))
    if(any(sort(unique(image_ids)) != sort(c(id_list_1, id_list_2)))) raise_error(
        msg = 'Split by coordinate failed: some image ID are missing after split.')

    # Return the two lists of image ID values.
    return( list('id_list_1'=id_list_1, 'id_list_2'=id_list_2) )
}
####################################################################################################