csi_images.csi_events

Contains the Event class, which represents a single event in a scan, and the EventArray class, which combines many events' data into DataFrames for analysis. Events optionally hold metadata and features.

The Event class holds the position of the event in the frame, which can be converted to positions in the scanner or slide coordinate systems. See the csi_images.csi_scans documentation page for more information on the coordinate systems.
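A minimal usage sketch, assuming the imports below; `Scan.make_placeholder` and the `Tile` constructor are called the same way this module calls them internally, but all argument values here are illustrative:

```python
from csi_images.csi_scans import Scan
from csi_images.csi_tiles import Tile
from csi_images.csi_events import Event, EventArray

# Placeholder scan metadata; in practice, load a real scan instead
scan = Scan.make_placeholder("EXAMPLE-SLIDE", 0, 0)
tile = Tile(scan, 0, 0)  # tile n=0 in ROI 0

# Two events at pixel positions within the tile
events = [Event(scan, tile, x=100, y=200), Event(scan, tile, x=300, y=50)]

# Combine into an EventArray for bulk analysis
event_array = EventArray.from_events(events)
print(len(event_array))  # 2
print(event_array.info)  # columns: slide_id, tile, roi, x, y, size
```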

  1"""
  2Contains the Event class, which represents a single event in a scan, and the
  3EventArray class, which combines many events' data into DataFrames for analysis.
  4Events optionally hold metadata and features.
  5
  6The Event class holds the position of the event in the frame, which can be
  7converted to positions in the scanner or slide coordinate systems. See the
  8csi_images.csi_scans documentation page for more information on the coordinate systems.
  9"""
 10
 11import os
 12import math
 13import typing
 14
 15import numpy as np
 16import pandas as pd
 17
 18from .csi_scans import Scan
 19from .csi_tiles import Tile
 20from .csi_frames import Frame
 21
 22# Optional dependencies; will raise errors in particular functions if not installed
 23try:
 24    import pyreadr
 25except ImportError:
 26    pyreadr = None
 27
 28
 29class Event:
 30    """
 31    A class that represents a single event in a scan, making it easy to evaluate
 32    singular events. Required metadata is exposed as attributes, and optional
 33    metadata and features are stored as DataFrames.
 34    """
 35
 36    SCAN_TO_SLIDE_TRANSFORM = {
 37        # Axioscan zero is in the top-right corner instead of top-left
 38        Scan.Type.AXIOSCAN7: np.array(
 39            [
 40                [1, 0, 75000],
 41                [0, 1, 0],
 42                [0, 0, 1],
 43            ]
 44        ),
 45        # BZScanner coordinates are a special kind of messed up:
 46        # - The slide is upside-down.
 47        # - The slide is oriented vertically, with the barcode at the bottom.
 48        # - Tiles are numbered from the top-right
 49        Scan.Type.BZSCANNER: np.array(
 50            [
 51                [0, -1, 75000],
 52                [-1, 0, 25000],
 53                [0, 0, 1],
 54            ]
 55        ),
 56    }
 57    """
 58    Homogeneous transformation matrices for converting between scanner and slide
 59    coordinates. The matrices are 3x3, with the final column representing the
 60    translation in micrometers (um). For more information, see 
 61    [affine transformations](https://en.wikipedia.org/wiki/Transformation_matrix#Affine_transformations).
 62    
 63    Transformations are nominal, and accuracy is not guaranteed; this is due to 
 64    imperfections in slides and alignment in the scanners. Units are in micrometers.
 65    """
 66
 67    def __init__(
 68        self,
 69        scan: Scan,
 70        tile: Tile,
 71        x: int,
 72        y: int,
 73        size: int = 12,  # End-to-end size in pixels
 74        metadata: pd.Series = None,
 75        features: pd.Series = None,
 76    ):
 77        self.scan = scan
 78        self.tile = tile
 79        self.x = x
 80        self.y = y
 81        self.size = size
 82        self.metadata = metadata
 83        self.features = features
 84
 85    def __repr__(self) -> str:
 86        return f"{self.scan.slide_id}-{self.tile.n}-{self.x}-{self.y}"
 87
 88    def __eq__(self, other) -> bool:
 89        return self.__repr__() == other.__repr__()
 90
 91    def __lt__(self, other):
 92        return self.__repr__() < other.__repr__()
 93
 94    def get_scan_position(self) -> tuple[float, float]:
 95        """
 96        Get the position of the event in the scanner's coordinate frame.
 97        :return: the scan position of the event in micrometers (um).
 98        """
 99        # Get overall pixel position
100        pixel_x = self.x + (self.scan.tile_width_px * self.tile.x)
101        pixel_y = self.y + (self.scan.tile_height_px * self.tile.y)
102        # Convert to micrometers
103        x_um = pixel_x * self.scan.pixel_size_um
104        y_um = pixel_y * self.scan.pixel_size_um
105        # Add the scan's origin in the scanner frame
106        x_um += self.scan.roi[self.tile.n_roi].origin_x_um
107        y_um += self.scan.roi[self.tile.n_roi].origin_y_um
108        return x_um, y_um
109
110    def get_slide_position(self) -> tuple[float, float]:
111        """
112        Get the slide position of the event in micrometers (um).
113        :return: the slide position of the event.
114        """
115        # Turn scan_position into a 3x1 vector
116        scan_position = self.get_scan_position()
117        scan_position = np.array([[scan_position[0]], [scan_position[1]], [1]])
118
119        # Multiply by the appropriate homogeneous matrix
120        if self.scan.scanner_id.startswith(self.scan.Type.AXIOSCAN7.value):
121            transform = self.SCAN_TO_SLIDE_TRANSFORM[self.scan.Type.AXIOSCAN7]
122        elif self.scan.scanner_id.startswith(self.scan.Type.BZSCANNER.value):
123            transform = self.SCAN_TO_SLIDE_TRANSFORM[self.scan.Type.BZSCANNER]
124        else:
125            raise ValueError(f"Scanner type {self.scan.scanner_id} not supported.")
126        slide_position = np.matmul(transform, scan_position)
127        return float(slide_position[0][0]), float(slide_position[1][0])
128
129    def crop_images(
130        self, images: list[np.ndarray], crop_size: int = 100, in_pixels: bool = True
131    ) -> list[np.ndarray]:
132        """
133        Crop the event out of pre-loaded frame images. Called "crop" because it
134        does not need to read anything from file; it is very quick for extracting
135        multiple events from the same tile.
136        Use this if you're interested in many events.
137        :param images: the frame images.
138        :param crop_size: the square size of the image crop to get for this event.
139        :param in_pixels: whether the crop size is in pixels or micrometers. Defaults to pixels.
140        :return: crop_size x crop_size crops of the event in the provided frames. If
141        the event is too close to the edge, the crop is zero-padded (black) and the event will be off-center.
142        """
143        # Convert a crop size in micrometers to pixels
144        if not in_pixels:
145            crop_size = round(crop_size / self.scan.pixel_size_um)
146        # Find the crop bounds
147        bounds = [
148            self.x - crop_size // 2,
149            self.y - crop_size // 2,
150            self.x + math.ceil(crop_size / 2),
151            self.y + math.ceil(crop_size / 2),
152        ]
153        # Determine how much the bounds violate the image size
154        displacements = [
155            max(0, -bounds[0]),
156            max(0, -bounds[1]),
157            max(0, bounds[2] - images[0].shape[1]),
158            max(0, bounds[3] - images[0].shape[0]),
159        ]
160        # Cap off the bounds
161        bounds = [
162            max(0, bounds[0]),
163            max(0, bounds[1]),
164            min(images[0].shape[1], bounds[2]),
165            min(images[0].shape[0], bounds[3]),
166        ]
167
168        # Crop the images
169        cropped_images = []
170        for image in images:
171            # Create a blank image of the right size
172            cropped_image = np.zeros((crop_size, crop_size), dtype=image.dtype)
173
174            # Insert the cropped image into the blank image, leaving a black buffer
175            # around the edges if the crop would go beyond the original image bounds
176            cropped_image[
177                displacements[1] : crop_size - displacements[3],
178                displacements[0] : crop_size - displacements[2],
179            ] = image[bounds[1] : bounds[3], bounds[0] : bounds[2]]
180            cropped_images.append(cropped_image)
181        return cropped_images
182
183    def extract_images(
184        self, crop_size: int = 100, in_pixels: bool = True
185    ) -> list[np.ndarray]:
186        """
187        Extract the images from the scan and tile, reading from the file. Called
188        "extract" because it must read and extract the images from file, which is slow.
189        Use this if you're interested in only a few events, as it is inefficient when
190        reading multiple events from the same tile.
191        :param crop_size: the square size of the image crop to get for this event.
192        :param in_pixels: whether the crop size is in pixels or micrometers. Defaults to pixels.
193        :return: a list of cropped images from the scan in the order of the channels.
194        """
195        frames = Frame.get_frames(self.tile)
196        images = [frame.get_image() for frame in frames]
197        return self.crop_images(images, crop_size, in_pixels)
198
199    @classmethod
200    def extract_images_for_list(
201        cls,
202        events: list[typing.Self],
203        crop_size: int | list[int] = None,
204        in_pixels: bool = True,
205    ) -> list[list[np.ndarray]]:
206        """
207        Get the images for a list of events, ensuring that there is no wasteful reading
208        of the same tile multiple times. This function is more efficient than calling
209        extract_images for each event.
210        TODO: test this function
211        :param events: the events to extract images for.
212        :param crop_size: the square size of the image crop to get for this event.
213                          Defaults to four times the size of the event.
214        :param in_pixels: whether the crop size is in pixels or micrometers.
215                          Defaults to pixels, and is ignored if crop_size is None.
216        :return: a list of lists of cropped images for each event.
217        """
218        if len(events) == 0:
219            return []
220
221        # Populate a crop size if none provided
222        if crop_size is None:
223            crop_size = [4 * event.size for event in events]
224            in_pixels = True
225        # Propagate a constant crop size
226        elif isinstance(crop_size, int):
227            crop_size = [crop_size] * len(events)
228
229        # Sort indices by event repr (groups events by tile); the original list is untouched
230        order, _ = zip(*sorted(enumerate(events), key=lambda x: x[1].__repr__()))
231
232        # Allocate the list to size
233        images = [None] * len(events)
234        last_tile = None
235        frame_images = None  # Holds large numpy arrays, so expensive to compare
236        # Iterate through in sorted order
237        for i in order:
238            if last_tile != events[i].tile:
239                # Gather the frame images, preserving them for the next event
240                frames = Frame.get_frames(events[i].tile)
241                frame_images = [frame.get_image() for frame in frames]
242
243                last_tile = events[i].tile
244            # Use the frame images to crop the event images
245            # Store at index i so the results keep the original event order
246            images[i] = events[i].crop_images(frame_images, crop_size[i], in_pixels)
247        return images
248
249
250class EventArray:
251    """
252    A class that holds a large number of events' data, making it easy to analyze and
253    manipulate many events at once. A more separated version of the Event class.
254    """
255
256    INFO_COLUMNS = ["slide_id", "tile", "roi", "x", "y", "size"]
257
258    def __init__(
259        self,
260        info: pd.DataFrame = None,
261        metadata: pd.DataFrame = None,
262        features: pd.DataFrame = None,
263    ):
264        # Info must be a DataFrame with columns "slide_id", "tile", "roi", "x", "y", "size"
265        if info is not None and (
266            not all(
267                col in info.columns
268                for col in self.INFO_COLUMNS
269            )
270            or len(info.columns) != len(self.INFO_COLUMNS)
271        ):
272            raise ValueError(
273                "EventArray.info must have columns 'slide_id', 'tile', 'roi', 'x', 'y', 'size'"
274            )
275        # All DataFrames must all have the same number of rows
276        if metadata is not None and (info is None or len(info) != len(metadata)):
277            raise ValueError(
278                "If EventArray.metadata is not None, it should match rows with .info"
279            )
280        if features is not None and (info is None or len(info) != len(features)):
281            raise ValueError(
282                "If EventArray.features is not None, it should match rows with .info"
283            )
284        self.info = info
285        self.metadata = metadata
286        self.features = features
287
288    def __len__(self) -> int:
289        # Convenience method to get the number of events
290        if self.info is None:
291            return 0
292        else:
293            return len(self.info)
294
295    def __eq__(self, other):
296        is_equal = True
297        # Parse all possibilities for info
298        if isinstance(self.info, pd.DataFrame):
299            if isinstance(other.info, pd.DataFrame):
300                is_equal = self.info.equals(other.info)
301                if not is_equal:
302                    return False
303            else:
304                return False
305        elif self.info is None:
306            if other.info is not None:
307                return False
308
309        # Parse all possibilities for metadata
310        if isinstance(self.metadata, pd.DataFrame):
311            if isinstance(other.metadata, pd.DataFrame):
312                is_equal = self.metadata.equals(other.metadata)
313                if not is_equal:
314                    return False
315            else:
316                return False
317        elif self.metadata is None:
318            if other.metadata is not None:
319                return False
320
321        # Parse all possibilities for features
322        if isinstance(self.features, pd.DataFrame):
323            if isinstance(other.features, pd.DataFrame):
324                is_equal = self.features.equals(other.features)
325                if not is_equal:
326                    return False
327            else:
328                return False
329        elif self.features is None:
330            if other.features is not None:
331                return False
332
333        return is_equal
334
335    def get_sort_order(self, by: str | list[str], ascending: bool | list[bool] = True):
336        """
337        Get the sort order for the EventArray by column(s) in the info, metadata, or features DataFrames.
338        :param by: name of the column(s) to sort by.
339        :param ascending: whether to sort in ascending order; can be a list matching by.
340        :return: the order of the indices to sort by.
341        """
342        columns = self.get(by)
343        return columns.sort_values(by=by, ascending=ascending).index
344
345    def sort(
346        self, by: str | list[str], ascending: bool | list[bool] = True
347    ) -> typing.Self:
348        """
349        Sort the EventArray by column(s) in the info, metadata, or features DataFrames.
350        :param by: name of the column(s) to sort by.
351        :param ascending: whether to sort in ascending order; can be a list matching by.
352        :return: a new, sorted EventArray.
353        """
354        order = self.get_sort_order(by, ascending)
355        info = self.info.loc[order].reset_index(drop=True)
356        if self.metadata is not None:
357            metadata = self.metadata.loc[order].reset_index(drop=True)
358        else:
359            metadata = None
360        if self.features is not None:
361            features = self.features.loc[order].reset_index(drop=True)
362        else:
363            features = None
364        return EventArray(info, metadata, features)
365
366    def get(self, column_names: int | str | list[int] | list[str]) -> pd.DataFrame:
367        """
368        Get a DataFrame with the specified columns from the EventArray, by value.
369        :param column_names: the names of the columns to get.
370        :return: a DataFrame with the specified columns.
371        """
372        if isinstance(column_names, int) or isinstance(column_names, str):
373            column_names = [column_names]
374        columns = []
375        for column_name in column_names:
376            if column_name in self.info.columns:
377                columns.append(self.info[column_name])
378            elif self.metadata is not None and column_name in self.metadata.columns:
379                columns.append(self.metadata[column_name])
380            elif self.features is not None and column_name in self.features.columns:
381                columns.append(self.features[column_name])
382            else:
383                raise ValueError(f"Column {column_name} not found in EventArray")
384        return pd.concat(columns, axis=1)
385
386    def rows(self, rows) -> typing.Self:
387        """
388        Get a subset of the EventArray rows based on a boolean or integer index, by value.
389        :param rows: the indices to get as a 1D boolean/integer list/array/series
390        :return: a new EventArray with the subset of events.
391        """
392        info = self.info.loc[rows].reset_index(drop=True)
393        if self.metadata is not None:
394            metadata = self.metadata.loc[rows].reset_index(drop=True)
395        else:
396            metadata = None
397        if self.features is not None:
398            features = self.features.loc[rows].reset_index(drop=True)
399        else:
400            features = None
401        return EventArray(info, metadata, features)
402
403    def copy(self) -> typing.Self:
404        """
405        Create a deep copy of the EventArray.
406        :return: a deep copy of the EventArray.
407        """
408        return EventArray(
409            info=self.info.copy(),
410            metadata=None if self.metadata is None else self.metadata.copy(),
411            features=None if self.features is None else self.features.copy(),
412        )
413
414    def add_metadata(self, new_metadata: pd.Series | pd.DataFrame) -> None:
415        """
416        Add metadata to the EventArray. Removes the need to check if metadata is None.
417        Overwrites any existing metadata with the same column names as the new metadata.
418        :param new_metadata: the metadata to add.
419        """
420        if len(self) != len(new_metadata):
421            raise ValueError("New metadata must match length of existing info")
422
423        if self.metadata is None:
424            self.metadata = pd.DataFrame(new_metadata)  # coerce a Series to a DataFrame
425        else:
426            if isinstance(new_metadata, pd.Series):
427                self.metadata[new_metadata.name] = new_metadata
428            else:
429                # It's a DataFrame
430                self.metadata[new_metadata.columns] = new_metadata
431
432    def add_features(self, new_features: pd.Series | pd.DataFrame) -> None:
433        """
434        Add features to the EventArray. Removes the need to check if features is None.
435        Overwrites any existing features with the same column names as the new features.
436        :param new_features: the features to add.
437        """
438        if len(self) != len(new_features):
439            raise ValueError("New features must match length of existing info")
440
441        if self.features is None:
442            self.features = pd.DataFrame(new_features)  # coerce a Series to a DataFrame
443        else:
444            if isinstance(new_features, pd.Series):
445                self.features[new_features.name] = new_features
446            else:
447                # It's a DataFrame
448                self.features[new_features.columns] = new_features
449
450    @classmethod
451    def merge(cls, events: list[typing.Self]) -> typing.Self:
452        """
453        Combine a list of EventArrays into a single EventArray.
454        :param events: the list of EventArrays to merge.
455        """
456        all_info = []
457        all_metadata = []
458        all_features = []
459        for event_array in events:
460            # Skip empty EventArrays
461            if event_array.info is not None:
462                all_info.append(event_array.info)
463            if event_array.metadata is not None:
464                all_metadata.append(event_array.metadata)
465            if event_array.features is not None:
466                all_features.append(event_array.features)
467        if len(all_info) == 0:
468            return EventArray()
469        else:
470            all_info = pd.concat(all_info, ignore_index=True)
471        if len(all_metadata) == 0:
472            all_metadata = None
473        else:
474            all_metadata = pd.concat(all_metadata, ignore_index=True)
475        if len(all_features) == 0:
476            all_features = None
477        else:
478            all_features = pd.concat(all_features, ignore_index=True)
479
480        return EventArray(all_info, all_metadata, all_features)
481
482    @classmethod
483    def from_events(cls, events: list[Event]) -> typing.Self:
484        Create an EventArray from a list of Event objects.
485        :param events: the list of events to convert.
486        :param events: the new list of events.
487        """
488        # Return an empty array if we were passed nothing
489        if events is None or len(events) == 0:
490            return EventArray()
491        # Otherwise, grab the info
492        info = pd.DataFrame(
493            {
494                "slide_id": [event.scan.slide_id for event in events],
495                "tile": [event.tile.n for event in events],
496                "roi": [event.tile.n_roi for event in events],
497                "x": [event.x for event in events],
498                "y": [event.y for event in events],
499                "size": [event.size for event in events],
500            }
501        )
502        metadata_list = [event.metadata for event in events]
503        # Iterate through and ensure that all metadata is the same shape
504        for metadata in metadata_list:
505            if type(metadata) != type(metadata_list[0]):
506                raise ValueError("All metadata must be the same type.")
507            if metadata is not None and metadata.shape != metadata_list[0].shape:
508                raise ValueError("All metadata must be the same shape.")
509        if metadata_list[0] is None:
510            metadata = None
511        else:
512            metadata = pd.DataFrame(metadata_list)
513        features_list = [event.features for event in events]
514        # Iterate through and ensure that all features are the same shape
515        for features in features_list:
516            if type(features) != type(features_list[0]):
517                raise ValueError("All features must be the same type.")
518            if features is not None and features.shape != features_list[0].shape:
519                raise ValueError("All features must be the same shape.")
520        if features_list[0] is None:
521            features = None
522        else:
523            features = pd.DataFrame(features_list)
524        return EventArray(info=info, metadata=metadata, features=features)
525
526    def to_events(
527        self,
528        scans: list[Scan],
529        ignore_missing_scans=True,
530        ignore_metadata=False,
531        ignore_features=False,
532    ) -> list[Event]:
533        """
534        Get the events in the EventArray as a list of events.
535        :param scans: the scans that the events belong to. Pass an empty list if you
536                      don't care about scan metadata.
537        :param ignore_missing_scans: whether to create placeholder scans for events without scans; otherwise, raise an error.
538        :param ignore_metadata: whether to ignore (drop) metadata.
539        :param ignore_features: whether to ignore (drop) features.
540        :return: a list of Event objects.
541        """
542        events = []
543        for i in range(len(self.info)):
544            # Determine the associated scan
545            scan = None
546            for s in scans:
547                if s.slide_id == self.info["slide_id"][i]:
548                    scan = s
549                    break
550            if scan is None:
551                if ignore_missing_scans:
552                    # Create a placeholder scan if the scan is missing
553                    scan = Scan.make_placeholder(
554                        self.info["slide_id"][i],
555                        self.info["tile"][i],
556                        self.info["roi"][i],
557                    )
558                else:
559                    raise ValueError(
560                        f"Scan {self.info['slide_id'][i]} not found for event {i}."
561                    )
562            # Add to the list
563            events.append(
564                Event(
565                    scan,
566                    Tile(scan, self.info["tile"][i], self.info["roi"][i]),
567                    self.info["x"][i],
568                    self.info["y"][i],
569                    size=self.info["size"][i],
570                    metadata=None if ignore_metadata or self.metadata is None else self.metadata.loc[i],
571                    features=None if ignore_features or self.features is None else self.features.loc[i],
572                )
573            )
574        return events
575
576    def to_dataframe(self) -> pd.DataFrame:
577        """
578        Convert all the data in the EventArray to a single DataFrame.
579        :return: a DataFrame with all the data in the EventArray.
580        """
581        # Make a copy of the info DataFrame and prepend "info_" to the column names
582        output = self.info.copy()
583        output.columns = [f"info_{col}" for col in output.columns]
584        # Combine with the metadata and prepend "metadata_" to the column names
585        if self.metadata is not None:
586            metadata = self.metadata.copy()
587            metadata.columns = [f"metadata_{col}" for col in metadata.columns]
588            output = pd.concat([output, metadata], axis=1)
589        # Combine with the features and prepend "features_" to the column names
590        if self.features is not None:
591            features = self.features.copy()
592            features.columns = [f"features_{col}" for col in features.columns]
593            output = pd.concat([output, features], axis=1)
594        return output
595
596    @classmethod
597    def from_dataframe(cls, df) -> typing.Self:
598        """
599        From a single DataFrame with prefixed columns (see to_dataframe), create an EventArray.
600        :return: an EventArray reconstructed from the DataFrame.
601        """
602        # Split the columns into info, metadata, and features and strip prefix
603        info = df[[col for col in df.columns if col.startswith("info_")]].copy()
604        info.columns = [col.replace("info_", "") for col in info.columns]
605        if info.size == 0:
606            info = None
607        metadata = df[[col for col in df.columns if col.startswith("metadata_")]].copy()
608        metadata.columns = [col.replace("metadata_", "") for col in metadata.columns]
609        if metadata.size == 0:
610            metadata = None
611        features = df[[col for col in df.columns if col.startswith("features_")]].copy()
612        features.columns = [col.replace("features_", "") for col in features.columns]
613        if features.size == 0:
614            features = None
615        return cls(info=info, metadata=metadata, features=features)
616
617    def save_csv(self, output_path: str) -> bool:
618        """
619        Save the events to a CSV file, including metadata and features.
620        :param output_path: the path of the CSV file to write.
621        :return: True if the file exists after saving.
622        """
623        self.to_dataframe().to_csv(output_path, index=False)
624        return os.path.exists(output_path)
625
626    @classmethod
627    def load_csv(cls, input_path: str) -> typing.Self:
628        """
629        Load the events from a CSV file, including metadata and features.
630        :param input_path: the path of the CSV file to read.
631        :return: an EventArray with the loaded events.
632        """
633        # Load the CSV file
634        df = pd.read_csv(input_path)
635        return cls.from_dataframe(df)
636
637    def save_hdf5(self, output_path: str) -> bool:
638        """
639        Save the events to an HDF5 file, including metadata and features.
640        Uses the pandas-provided HDF5 functions for ease, and external compatibility,
641        though these files are slightly harder to view in HDFView or similar.
642        :param output_path: the path of the HDF5 file to write.
643        :return: True if the file exists after saving.
644        """
645        # Open the output_path as an HDF5 file
646        with pd.HDFStore(output_path) as store:
647            # Store the dataframes in the HDF5 file
648            if self.info is not None:
649                store.put("info", self.info, index=False)
650            if self.metadata is not None:
651                store.put("metadata", self.metadata, index=False)
652            if self.features is not None:
653                store.put("features", self.features, index=False)
654        return os.path.exists(output_path)
655
656    @classmethod
657    def load_hdf5(cls, input_path: str) -> typing.Self:
658        """
659        Load the events from an HDF5 file, including metadata and features.
660        :param input_path: the path of the HDF5 file to read.
661        :return: an EventArray with the loaded events.
662        """
663        # Open the input_path as an HDF5 file
664        with pd.HDFStore(input_path) as store:
665            # Load the dataframes from the HDF5 file
666            info = store.get("info") if "info" in store else None
667            metadata = store.get("metadata") if "metadata" in store else None
668            features = store.get("features") if "features" in store else None
669        return cls(info=info, metadata=metadata, features=features)
670
671    @classmethod
672    def load_ocular(
673        cls,
674        input_path: str,
675        event_type="cells",
676        cell_data_files=(
677            "rc-final1.rds",
678            "rc-final2.rds",
679            "rc-final3.rds",
680            "rc-final4.rds",
681            "ocular_interesting.rds",
682        ),
683        others_data_files=(
684            "others-final1.rds",
685            "others-final2.rds",
686            "others-final3.rds",
687            "others-final4.rds",
688        ),
689        atlas_data_files=(
690            "ocular_interesting.rds",
691            "ocular_not_interesting.rds",
692        ),
693        drop_common_events=True,
694        log=None,
695    ) -> typing.Self:
696        """
697        Load OCULAR .rds output files into an EventArray.
698        :param input_path: the OCULAR output directory, or a single .rds file.
699        :param event_type: "cells" or "others".
700        :param cell_data_files: file names to load when event_type is "cells".
701        :param others_data_files: file names to load when event_type is "others".
702        :param atlas_data_files: files that may contain atlas-classified common cells.
703        :param drop_common_events: whether to drop atlas-classified common cells.
704        :param log: optional logger for progress and warning messages.
705        :return: an EventArray with the loaded events.
706        """
707        if pyreadr is None:
708            raise ModuleNotFoundError(
709                "pyreadr not installed. Install pyreadr directly "
710                "or install csi-images with [rds] option to resolve."
711            )
712        # Check if the input path is a directory or a file
713        if os.path.isfile(input_path):
714            data_files = [os.path.basename(input_path)]
715            input_path = os.path.dirname(input_path)
716        elif event_type == "cells":
717            data_files = cell_data_files
718        elif event_type == "others":
719            data_files = others_data_files
720        else:
721            raise ValueError("Invalid event type.")
722
723        # Load the data from the OCULAR files
724        file_data = {}
725        for file in data_files:
726            file_path = os.path.join(input_path, file)
727            if not os.path.isfile(file_path):
728                if log is not None:
729                    log.warning(f"{file} not found in {input_path}")
730                continue
731            file_data[file] = pyreadr.read_r(file_path)
732            # Get the DataFrame associated with None (pyreadr dict quirk)
733            file_data[file] = file_data[file][None]
734            if len(file_data[file]) == 0:
735                # File gets dropped from the dict
736                file_data.pop(file)
737                if log is not None:
738                    log.warning(f"{file} has no cells")
739                continue
740
741            if log is not None:
742                log.debug(f"{file} has {len(file_data[file])} cells")
743
744            # Drop common cells if requested and in this file
745            if file in atlas_data_files and drop_common_events:
746                common_cell_indices = (
747                    file_data[file]["catalogue_classification"] == "common_cell"
748                )
749                if log is not None:
750                    log.debug(
751                        f"Dropping {int(common_cell_indices.sum())} "
752                        f"common cells from {file}"
753                    )
754                file_data[file] = file_data[file][~common_cell_indices]
755
756            if len(file_data[file]) == 0:
757                # File gets dropped from the dict
758                file_data.pop(file)
759                if log is not None:
760                    log.warning(f"{file} has no cells after dropping common cells")
761                continue
762
763            # Extract frame_id and cell_id
764            # DAPI- events already have frame_id cell_id outside rowname
765            if event_type == "cells":
766                file_data[file]["rowname"] = file_data[file]["rowname"].astype("str")
767                # get frame_id cell_id from rownames column and split into two columns
768                split_res = file_data[file]["rowname"].str.split(" ", n=1, expand=True)
769                if len(split_res.columns) != 2 and log is not None:
770                    log.warning(
771                        f'Expected "frame_id cell_id" but got {file_data[file]["rowname"]}'
772                    )
773                # then assign it back to the dataframe
774                file_data[file][["frame_id", "cell_id"]] = split_res.astype("int")
775            # reset indexes since they can cause NaN values in concat
776            file_data[file] = file_data[file].reset_index(drop=True)
777
778        # Merge the data from all files
779        if len(file_data) == 0:
780            return EventArray()
781        elif len(file_data) == 1:
782            data = next(iter(file_data.values()))
783        else:
784            data = pd.concat(file_data.values())
785
786        if log is not None:
787            log.debug(f"Gathered a total of {len(data)} events")
788
789        # Others is missing the "slide_id". Insert it right before "frame_id" column
790        if event_type == "others" and "slide_id" not in data.columns:
791            if os.path.basename(input_path) == "ocular":
792                slide_id = os.path.basename(os.path.dirname(input_path))
793            else:
794                slide_id = "UNKNOWN"
795            data.insert(data.columns.get_loc("frame_id"), "slide_id", slide_id)
796
797        # Sort ascending by cell_id so that drop_duplicates keeps the original entry
798        data = data.sort_values(by=["cell_id"], ascending=True)
799        # Filter out duplicates by x & y
800        data = data.assign(
801            unique_id=data["slide_id"]
802            + "_"
803            + data["frame_id"].astype(str)
804            + "_"
805            + data["cellx"].astype(int).astype(str)
806            + "_"
807            + data["celly"].astype(int).astype(str)
808        )
809        data = data.drop_duplicates(subset=["unique_id"], keep="first")
810        # Normal unique_id is with cell_id
811        data = data.assign(
812            unique_id=data["slide_id"]
813            + "_"
814            + data["frame_id"].astype(str)
815            + "_"
816            + data["cell_id"].astype(str)
817        )
818        data = data.reset_index(drop=True)
819        # All columns up to "slide_id" are features; drop the "slide_id"
820        features = data.loc[:, :"slide_id"].iloc[:, :-1]
821        data = data.loc[:, "slide_id":]
822        # Grab the info columns
823        info = data[["slide_id", "frame_id", "cellx", "celly"]]
824        info.columns = ["slide_id", "tile", "x", "y"]
825        info = info.assign(
826            roi=0,  # OCULAR only works on 1 ROI, as far as known
827            size=25,  # Static, for later montaging
828        )
829        info = info[["slide_id", "tile", "roi", "x", "y", "size"]]
830        # Metadata has duplicate columns for later convenience
831        metadata = data
832        # Certain columns tend to be problematic with mixed data formats...
833        for col in ["TRITC", "CY5", "FITC"]:
834            if col in metadata:
835                labels = {
836                    "False": False,
837                    "True": True,
838                    "FALSE": False,
839                    "TRUE": True,
840                }
841                metadata[col] = metadata[col].map(labels).astype(bool)
842        for col in ["catalogue_id", "catalogue_distance", "clust", "hcpc"]:
843            if col in metadata:
844                metadata[col] = metadata[col].fillna(-1).astype(int)
845        return EventArray(info, metadata, features)
846
847    def save_ocular(self, output_path: str, event_type: str = "cells"):
848        """
849        Save the events to an OCULAR file. Relies on the dataframe originating
850        from an OCULAR file (same columns; duplicate metadata/info).
851        :param output_path: the directory to write the OCULAR files to.
852        :param event_type: "cells" or "others".
853        :return:
854        """
855        if event_type == "cells":
856            file_stub = "rc-final"
857        elif event_type == "others":
858            file_stub = "others-final"
859        else:
860            raise ValueError("Invalid event type. Must be cells or others.")
861
862        # Check for the "ocular_interesting" column
863        if event_type == "cells":
864            if "ocular_interesting" in self.metadata.columns:
865                interesting_rows = self.metadata["ocular_interesting"].to_numpy(
866                    dtype=bool
867                )
868            elif "hcpc" in self.metadata.columns:
869                # Interesting cells don't get an hcpc designation, leaving them as -1
870                interesting_rows = (
871                    self.metadata["hcpc"].to_numpy() == -1
872                )  # interesting cells
873            else:
874                interesting_rows = []
875            if sum(interesting_rows) > 0:
876                # Split the metadata into interesting and regular
877                interesting_events = self.rows(interesting_rows)
878                interesting_df = pd.concat(
879                    [interesting_events.features, interesting_events.metadata], axis=1
880                )
881                data_events = self.rows(~interesting_rows)
882                data_df = pd.concat(
883                    [data_events.features, data_events.metadata], axis=1
884                )
885                data_df = data_df.drop(columns=["ocular_interesting"], errors="ignore")
886
887                # Drop particular columns for "interesting"
888                interesting_df = interesting_df.drop(
889                    [
890                        "clust",
891                        "hcpc",
892                        "frame_id",
893                        "cell_id",
894                        "unique_id",
895                        "ocular_interesting",
896                    ],
897                    axis=1,
898                    errors="ignore",
899                )
900                # Save both .csv and .rds
901                interesting_df.to_csv(
902                    os.path.join(output_path, "ocular_interesting.csv"), index=False
903                )
904                pyreadr.write_rds(
905                    os.path.join(output_path, "ocular_interesting.rds"), interesting_df
906                )
907            else:
908                data_df = pd.concat([self.features, self.metadata], axis=1)
909        else:
910            # Get all data and reset_index (will copy it)
911            data_df = pd.concat([self.features, self.metadata], axis=1)
912
913        # Split based on cluster number to conform to *-final[1-4].rds
914        n_clusters = max(data_df["clust"]) + 1
915        split_idx = [round(i * n_clusters / 4) for i in range(5)]
916        for i in range(4):
917            subset = (split_idx[i] <= data_df["clust"]) & (
918                data_df["clust"] < split_idx[i + 1]
919            )
920            data_df.loc[subset, "hcpc"] = i + 1
921            subset = data_df[subset].reset_index(drop=True)
922            pyreadr.write_rds(
923                os.path.join(output_path, f"{file_stub}{i+1}.rds"), subset
924            )
925
926        # Create new example cell strings
927        data_df["example_cell_id"] = (
928            data_df["slide_id"]
929            + " "
930            + data_df["frame_id"].astype(str)
931            + " "
932            + data_df["cell_id"].astype(str)
933            + " "
934            + data_df["cellx"].astype(int).astype(str)
935            + " "
936            + data_df["celly"].astype(int).astype(str)
937        )
938        # Find averagable data columns
939        if "cellcluster_id" in data_df.columns:
940            end_idx = data_df.columns.get_loc("cellcluster_id")
941        else:
942            end_idx = data_df.columns.get_loc("slide_id")
943        avg_cols = data_df.columns[:end_idx].tolist()
944        # Group by cluster and average
945        data_df = data_df.groupby("clust").agg(
946            **{col: (col, "mean") for col in avg_cols},
947            count=("clust", "size"),  # count rows in each cluster
948            example_cells=("example_cell_id", lambda x: ",".join(x)),
949            hcpc=("hcpc", lambda x: x.iloc[0]),
950        )
951        data_df = data_df.reset_index()  # Do NOT drop, index is "clust"
952        # Create new columns
953        metadata = pd.DataFrame(
954            {
955                "count": data_df["count"],
956                "example_cells": data_df["example_cells"],
957                "clust": data_df["clust"].astype(int),
958                "hcpc": data_df["hcpc"].astype(int),
959                "id": data_df["clust"].astype(int).astype(str),
960                "cccluster": "0",  # Dummy value
961                "ccdistance": 0.0,  # Dummy value
962                "rownum": list(range(len(data_df))),
963                "framegroup": 0,  # Dummy value
964            }
965        )
966        data_df = pd.concat([data_df[avg_cols], metadata], axis=1)
967        # Save the cluster data
968        data_df.to_csv(os.path.join(output_path, f"{file_stub}.csv"), index=False)
969        pyreadr.write_rds(os.path.join(output_path, f"{file_stub}.rds"), data_df)
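As a sketch of the EventArray API above: construction from a plain info DataFrame, sorting, column access, row subsetting, and a CSV round trip. File paths are illustrative, and the HDF5 round trip additionally requires the optional `tables` dependency:

```python
import pandas as pd
from csi_images.csi_events import EventArray

# Build an EventArray directly from an info DataFrame
info = pd.DataFrame(
    {
        "slide_id": ["S1", "S1", "S2"],
        "tile": [3, 1, 2],
        "roi": [0, 0, 0],
        "x": [10, 20, 30],
        "y": [5, 15, 25],
        "size": [12, 12, 12],
    }
)
events = EventArray(info=info)
events.add_metadata(pd.DataFrame({"label": ["a", "b", "c"]}))

# Sort by tile, pull columns by name, and subset rows
events = events.sort("tile")
print(events.get(["slide_id", "x"]))
subset = events.rows([True, False, True])

# Round trip through CSV; columns gain info_/metadata_/features_ prefixes
events.save_csv("events.csv")
restored = EventArray.load_csv("events.csv")
print(restored == events)  # True, provided dtypes survive the round trip

# HDF5 works the same way via save_hdf5/load_hdf5 (needs the tables package)
```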
class Event:

A class that represents a single event in a scan, making it easy to evaluate singular events. Required metadata is exposed as attributes, and optional metadata and features are stored as DataFrames.

Event( scan: csi_images.csi_scans.Scan, tile: csi_images.csi_tiles.Tile, x: int, y: int, size: int = 12, metadata: pandas.core.series.Series = None, features: pandas.core.series.Series = None)
SCAN_TO_SLIDE_TRANSFORM = {<Type.AXIOSCAN7: 'axioscan7'>: array([[ 1, 0, 75000], [ 0, 1, 0], [ 0, 0, 1]]), <Type.BZSCANNER: 'bzscanner'>: array([[ 0, -1, 75000], [ -1, 0, 25000], [ 0, 0, 1]])}

Homogeneous transformation matrices for converting between scanner and slide coordinates. The matrices are 3x3, with the final column representing the translation in micrometers (um). For more information, see [affine transformations](https://en.wikipedia.org/wiki/Transformation_matrix#Affine_transformations).

Transformations are nominal, and accuracy is not guaranteed; this is due to imperfections in slides and alignment in the scanners. Units are in micrometers.
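To make the convention concrete, here is a small sketch (with illustrative coordinates) applying the BZSCANNER matrix to a scan-frame position, mirroring what get_slide_position does:

```python
import numpy as np

# BZScanner scan -> slide transform, copied from SCAN_TO_SLIDE_TRANSFORM
transform = np.array(
    [
        [0, -1, 75000],
        [-1, 0, 25000],
        [0, 0, 1],
    ]
)

# A scan position in micrometers, as a homogeneous column vector
x_um, y_um = 10000.0, 4000.0
scan_position = np.array([[x_um], [y_um], [1]])

slide_position = transform @ scan_position
print(float(slide_position[0][0]), float(slide_position[1][0]))
# 71000.0 15000.0 -- axes swapped and flipped, then translated
```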

scan
tile
x
y
size
metadata
features
def get_scan_position(self) -> tuple[float, float]:

Get the position of the event in the scanner's coordinate frame.

Returns

the scan position of the event in micrometers (um).
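A worked example of the arithmetic, with illustrative numbers (1000 x 1000 px tiles, 0.5 um/px pixel size, ROI origin at (2000, 3000) um):

```python
# Event at pixel (100, 200) in the tile at grid position (tile.x=2, tile.y=1)
pixel_x = 100 + 1000 * 2  # 2100 px across the whole scan
pixel_y = 200 + 1000 * 1  # 1200 px

# Convert to micrometers, then add the ROI origin in the scanner frame
x_um = pixel_x * 0.5 + 2000  # 3050.0 um
y_um = pixel_y * 0.5 + 3000  # 3600.0 um
```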

def get_slide_position(self) -> tuple[float, float]:

Get the slide position of the event in micrometers (um).

Returns

the slide position of the event.
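Usage mirrors get_scan_position; both return micrometer coordinates, just in different frames. Here `event` is assumed to be an Event from a supported scanner:

```python
x_scan, y_scan = event.get_scan_position()  # scanner coordinate frame, um
x_slide, y_slide = event.get_slide_position()  # slide coordinate frame, um
```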

def crop_images( self, images: list[numpy.ndarray], crop_size: int = 100, in_pixels: bool = True) -> list[numpy.ndarray]:
130    def crop_images(
131        self, images: list[np.ndarray], crop_size: int = 100, in_pixels: bool = True
132    ) -> list[np.ndarray]:
133        """
134        Crop the event from frame images that are already loaded. Does not need to
135        read anything from file, so it is very quick when cropping multiple events
136        from the same tile.
137        Use this if you're interested in many events.
138        :param images: the frame images.
139        :param crop_size: the square size of the image crop to get for this event.
140        :param in_pixels: whether the crop size is in pixels or micrometers. Defaults to pixels.
141        :return: crop_size x crop_size crops of the event in the provided frames. If
142        the event is too close to the edge, the crop is zero-padded (black) and the event is off-center.
143        """
144        # Convert a crop size in micrometers to pixels
145        if not in_pixels:
146            crop_size = round(crop_size / self.scan.pixel_size_um)
147        # Find the crop bounds
148        bounds = [
149            self.x - crop_size // 2,
150            self.y - crop_size // 2,
151            self.x + math.ceil(crop_size / 2),
152            self.y + math.ceil(crop_size / 2),
153        ]
154        # Determine how much the bounds violate the image size
155        displacements = [
156            max(0, -bounds[0]),
157            max(0, -bounds[1]),
158            max(0, bounds[2] - images[0].shape[1]),
159            max(0, bounds[3] - images[0].shape[0]),
160        ]
161        # Cap off the bounds
162        bounds = [
163            max(0, bounds[0]),
164            max(0, bounds[1]),
165            min(images[0].shape[1], bounds[2]),
166            min(images[0].shape[0], bounds[3]),
167        ]
168
169        # Crop the images
170        cropped_images = []
171        for image in images:
172            # Create a blank image of the right size
173            cropped_image = np.zeros((crop_size, crop_size), dtype=image.dtype)
174
175            # Insert the cropped image into the blank image, leaving a black buffer
176            # around the edges if the crop would go beyond the original image bounds
177            cropped_image[
178                displacements[1] : crop_size - displacements[3],
179                displacements[0] : crop_size - displacements[2],
180            ] = image[bounds[1] : bounds[3], bounds[0] : bounds[2]]
181            cropped_images.append(cropped_image)
182        return cropped_images

Crop the event from frame images that are already loaded. Does not need to read anything from file, so it is very quick when cropping multiple events from the same tile. Use this if you're interested in many events.

Parameters
  • images: the frame images.
  • crop_size: the square size of the image crop to get for this event.
  • in_pixels: whether the crop size is in pixels or micrometers. Defaults to pixels.
Returns

crop_size x crop_size crops of the event in the provided frames. If the event is too close to the edge, the crop is zero-padded (black) and the event is off-center.
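
For example, to crop many events from one tile after loading its frames once (a sketch; `tile` and `events_in_tile` are placeholder objects):

    from csi_images.csi_frames import Frame

    # Read the tile's frame images once...
    frames = Frame.get_frames(tile)
    images = [frame.get_image() for frame in frames]
    # ...then reuse them for every event in this tile
    crops = [event.crop_images(images, crop_size=50) for event in events_in_tile]
    # crops[i] is one 50x50 array per channel for events_in_tile[i]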

def extract_images( self, crop_size: int = 100, in_pixels: bool = True) -> list[numpy.ndarray]:
184    def extract_images(
185        self, crop_size: int = 100, in_pixels: bool = True
186    ) -> list[np.ndarray]:
187        """
188        Extract the images from the scan and tile, reading from the file. Called
189        "extract" because it must read and extract the images from file, which is slow.
190        Use this if you're interested in only a few events, as it is inefficient when
191        reading multiple events from the same tile.
192        :param crop_size: the square size of the image crop to get for this event.
193        :param in_pixels: whether the crop size is in pixels or micrometers. Defaults to pixels.
194        :return: a list of cropped images from the scan in the order of the channels.
195        """
196        frames = Frame.get_frames(self.tile)
197        images = [frame.get_image() for frame in frames]
198        return self.crop_images(images, crop_size, in_pixels)

Extract the images from the scan and tile, reading from the file. Called "extract" because it must read and extract the images from file, which is slow. Use this if you're interested in only a few events, as it is inefficient when reading multiple events from the same tile.

Parameters
  • crop_size: the square size of the image crop to get for this event.
  • in_pixels: whether the crop size is in pixels or micrometers. Defaults to pixels.
Returns

a list of cropped images from the scan in the order of the channels.
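
For a single event, a one-off call is enough (a sketch; `event` is a placeholder):

    # Reads the tile's frames from file, then crops; slow if repeated per tile
    channel_crops = event.extract_images(crop_size=100)
    # The crop size may also be given in micrometers:
    channel_crops_um = event.extract_images(crop_size=30, in_pixels=False)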

@classmethod
def extract_images_for_list( cls, events: list[typing.Self], crop_size: int | list[int] = None, in_pixels: bool = True) -> list[list[numpy.ndarray]]:
200    @classmethod
201    def extract_images_for_list(
202        cls,
203        events: list[typing.Self],
204        crop_size: int | list[int] = None,
205        in_pixels: bool = True,
206    ) -> list[list[np.ndarray]]:
207        """
208        Get the images for a list of events, ensuring that there is no wasteful reading
209        of the same tile multiple times. This function is more efficient than calling
210        extract_images for each event.
211        TODO: test this function
212        :param events: the events to extract images for.
213        :param crop_size: the square size of the image crop to get for this event.
214                          Defaults to four times the size of the event.
215        :param in_pixels: whether the crop size is in pixels or micrometers.
216                          Defaults to pixels, and is ignored if crop_size is None.
217        :return: a list of lists of cropped images for each event.
218        """
219        if len(events) == 0:
220            return []
221
222        # Populate a crop size if none provided
223        if crop_size is None:
224            crop_size = [4 * event.size for event in events]
225            in_pixels = True
226        # Propagate a constant crop size
227        elif isinstance(crop_size, int):
228            crop_size = [crop_size] * len(events)
229
230        # Sort the events by tile; use a shallow copy to avoid modifying the original
231        order, _ = zip(*sorted(enumerate(events), key=lambda x: x[1].__repr__()))
232
233        # Allocate the list to size
234        images = [None] * len(events)
235        last_tile = None
236        frame_images = None  # Holds large numpy arrays, so expensive to compare
237        # Iterate through in sorted order
238        for i in order:
239            if last_tile != events[i].tile:
240                # Gather the frame images, preserving them for the next event
241                frames = Frame.get_frames(events[i].tile)
242                frame_images = [frame.get_image() for frame in frames]
243
244                last_tile = events[i].tile
245            # Use the frame images to crop the event images
246            # Assigning at images[i] preserves the original event order
247            images[i] = events[i].crop_images(frame_images, crop_size[i], in_pixels)
248        return images

Get the images for a list of events, ensuring that there is no wasteful reading of the same tile multiple times. This function is more efficient than calling extract_images for each event. TODO: test this function

Parameters
  • events: the events to extract images for.
  • crop_size: the square size of the image crop to get for this event. Defaults to four times the size of the event.
  • in_pixels: whether the crop size is in pixels or micrometers. Defaults to pixels, and is ignored if crop_size is None.
Returns

a list of lists of cropped images for each event.
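
Typical batch usage (a sketch; `events` is a placeholder list of Event objects):

    # Events are processed grouped by tile, so each tile is read only once
    all_crops = Event.extract_images_for_list(events, crop_size=100)
    # With crop_size=None, each crop defaults to 4x the event's own size
    default_crops = Event.extract_images_for_list(events)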

class EventArray:
251class EventArray:
252    """
253    A class that holds a large number of events' data, making it easy to analyze and
254    manipulate many events at once. A more separated version of the Event class.
255    """
256
257    INFO_COLUMNS = ["slide_id", "tile", "roi", "x", "y", "size"]
258
259    def __init__(
260        self,
261        info: pd.DataFrame = None,
262        metadata: pd.DataFrame = None,
263        features: pd.DataFrame = None,
264    ):
265        # Info must be a DataFrame with columns "slide_id", "tile", "roi", "x", "y", "size"
266        if info is not None and (
267            not all(
268                col in info.columns
269                for col in ["slide_id", "tile", "roi", "x", "y", "size"]
270            )
271            or len(info.columns) != 6
272        ):
273            raise ValueError(
274                "EventArray.info must have columns 'slide_id', 'tile', 'roi', 'x', 'y', 'size'"
275            )
276        # All DataFrames must have the same number of rows
277        if metadata is not None and (info is None or len(info) != len(metadata)):
278            raise ValueError(
279                "If EventArray.metadata is not None, it should match rows with .info"
280            )
281        if features is not None and (info is None or len(info) != len(features)):
282            raise ValueError(
283                "If EventArray.features is not None, it should match rows with .info"
284            )
285        self.info = info
286        self.metadata = metadata
287        self.features = features
288
289    def __len__(self) -> int:
290        # Convenience method to get the number of events
291        if self.info is None:
292            return 0
293        else:
294            return len(self.info)
295
296    def __eq__(self, other):
297        is_equal = True
298        # Parse all possibilities for info
299        if isinstance(self.info, pd.DataFrame):
300            if isinstance(other.info, pd.DataFrame):
301                is_equal = self.info.equals(other.info)
302                if not is_equal:
303                    return False
304            else:
305                return False
306        elif self.info is None:
307            if other.info is not None:
308                return False
309
310        # Parse all possibilities for metadata
311        if isinstance(self.metadata, pd.DataFrame):
312            if isinstance(other.metadata, pd.DataFrame):
313                is_equal = self.metadata.equals(other.metadata)
314                if not is_equal:
315                    return False
316            else:
317                return False
318        elif self.metadata is None:
319            if other.metadata is not None:
320                return False
321
322        # Parse all possibilities for features
323        if isinstance(self.features, pd.DataFrame):
324            if isinstance(other.features, pd.DataFrame):
325                is_equal = self.features.equals(other.features)
326                if not is_equal:
327                    return False
328            else:
329                return False
330        elif self.features is None:
331            if other.features is not None:
332                return False
333
334        return is_equal
335
336    def get_sort_order(self, by: str | list[str], ascending: bool | list[bool] = True):
337        """
338        Get the sort order for the EventArray by a column in the info, metadata, or features DataFrames.
339        :param by: name of the column(s) to sort by.
340        :param ascending: whether to sort in ascending order; can be a list to match by
341        :return: the order of the indices to sort by.
342        """
343        columns = self.get(by)
344        return columns.sort_values(by=by, ascending=ascending).index
345
346    def sort(
347        self, by: str | list[str], ascending: bool | list[bool] = True
348    ) -> typing.Self:
349        """
350        Sort the EventArray by column(s) in the info, metadata, or features DataFrames.
351        :param by: name of the column(s) to sort by.
352        :param ascending: whether to sort in ascending order; can be a list to match by
353        :return: a new, sorted EventArray.
354        """
355        order = self.get_sort_order(by, ascending)
356        info = self.info.loc[order].reset_index(drop=True)
357        if self.metadata is not None:
358            metadata = self.metadata.loc[order].reset_index(drop=True)
359        else:
360            metadata = None
361        if self.features is not None:
362            features = self.features.loc[order].reset_index(drop=True)
363        else:
364            features = None
365        return EventArray(info, metadata, features)
366
367    def get(self, column_names: int | str | list[int] | list[str]) -> pd.DataFrame:
368        """
369        Get a DataFrame with the specified columns from the EventArray, by value.
370        :param column_names: the names of the columns to get.
371        :return: a DataFrame with the specified columns.
372        """
373        if isinstance(column_names, int) or isinstance(column_names, str):
374            column_names = [column_names]
375        columns = []
376        for column_name in column_names:
377            if column_name in self.info.columns:
378                columns.append(self.info[column_name])
379            elif self.metadata is not None and column_name in self.metadata.columns:
380                columns.append(self.metadata[column_name])
381            elif self.features is not None and column_name in self.features.columns:
382                columns.append(self.features[column_name])
383            else:
384                raise ValueError(f"Column {column_name} not found in EventArray")
385        return pd.concat(columns, axis=1)
386
387    def rows(self, rows) -> typing.Self:
388        """
389        Get a subset of the EventArray rows based on a boolean or integer index, by value.
390        :param rows: the indices to get as a 1D boolean/integer list/array/series
391        :return: a new EventArray with the subset of events.
392        """
393        info = self.info.loc[rows].reset_index(drop=True)
394        if self.metadata is not None:
395            metadata = self.metadata.loc[rows].reset_index(drop=True)
396        else:
397            metadata = None
398        if self.features is not None:
399            features = self.features.loc[rows].reset_index(drop=True)
400        else:
401            features = None
402        return EventArray(info, metadata, features)
403
404    def copy(self) -> typing.Self:
405        """
406        Create a deep copy of the EventArray.
407        :return: a deep copy of the EventArray.
408        """
409        return EventArray(
410            info=self.info.copy(),
411            metadata=None if self.metadata is None else self.metadata.copy(),
412            features=None if self.features is None else self.features.copy(),
413        )
414
415    def add_metadata(self, new_metadata: pd.Series | pd.DataFrame) -> None:
416        """
417        Add metadata to the EventArray. Removes the need to check if metadata is None.
418        Overwrites any existing metadata with the same column names as the new metadata.
419        :param new_metadata: the metadata to add.
420        """
421        if len(self) != len(new_metadata):
422            raise ValueError("New metadata must match length of existing info")
423
424        if self.metadata is None:
425            self.metadata = pd.DataFrame(new_metadata)  # normalize Series to DataFrame
426        else:
427            if isinstance(new_metadata, pd.Series):
428                self.metadata[new_metadata.name] = new_metadata
429            else:
430                # It's a DataFrame
431                self.metadata[new_metadata.columns] = new_metadata
432
433    def add_features(self, new_features: pd.Series | pd.DataFrame) -> None:
434        """
435        Add features to the EventArray. Removes the need to check if features is None.
436        Overwrites any existing features with the same column names as the new features.
437        :param new_features: the features to add.
438        """
439        if len(self) != len(new_features):
440            raise ValueError("New features must match length of existing info")
441
442        if self.features is None:
443            self.features = pd.DataFrame(new_features)  # normalize Series to DataFrame
444        else:
445            if isinstance(new_features, pd.Series):
446                self.features[new_features.name] = new_features
447            else:
448                # It's a DataFrame
449                self.features[new_features.columns] = new_features
450
451    @classmethod
452    def merge(cls, events: list[typing.Self]) -> typing.Self:
453        """
454        Combine EventArrays in a list into a single EventArray.
455        :param events: the list of EventArrays to combine.
456        """
457        all_info = []
458        all_metadata = []
459        all_features = []
460        for event_array in events:
461            # Skip empty EventArrays
462            if event_array.info is not None:
463                all_info.append(event_array.info)
464            if event_array.metadata is not None:
465                all_metadata.append(event_array.metadata)
466            if event_array.features is not None:
467                all_features.append(event_array.features)
468        if len(all_info) == 0:
469            return EventArray()
470        else:
471            all_info = pd.concat(all_info, ignore_index=True)
472        if len(all_metadata) == 0:
473            all_metadata = None
474        else:
475            all_metadata = pd.concat(all_metadata, ignore_index=True)
476        if len(all_features) == 0:
477            all_features = None
478        else:
479            all_features = pd.concat(all_features, ignore_index=True)
480
481        return EventArray(all_info, all_metadata, all_features)
482
483    @classmethod
484    def from_events(cls, events: list[Event]) -> typing.Self:
485        """
486        Create an EventArray from a list of events.
487        :param events: the list of events to convert.
488        """
489        # Return an empty array if we were passed nothing
490        if events is None or len(events) == 0:
491            return EventArray()
492        # Otherwise, grab the info
493        info = pd.DataFrame(
494            {
495                "slide_id": [event.scan.slide_id for event in events],
496                "tile": [event.tile.n for event in events],
497                "roi": [event.tile.n_roi for event in events],
498                "x": [event.x for event in events],
499                "y": [event.y for event in events],
500                "size": [event.size for event in events],
501            }
502        )
503        metadata_list = [event.metadata for event in events]
504        # Iterate through and ensure that all metadata is the same shape
505        for metadata in metadata_list:
506            if type(metadata) != type(metadata_list[0]):
507                raise ValueError("All metadata must be the same type.")
508            if metadata is not None and metadata.shape != metadata_list[0].shape:
509                raise ValueError("All metadata must be the same shape.")
510        if metadata_list[0] is None:
511            metadata = None
512        else:
513            metadata = pd.DataFrame(metadata_list)
514        features_list = [event.features for event in events]
515        # Iterate through and ensure that all features are the same shape
516        for features in features_list:
517            if type(features) != type(features_list[0]):
518                raise ValueError("All features must be the same type.")
519            if features is not None and features.shape != features_list[0].shape:
520                raise ValueError("All features must be the same shape.")
521        if features_list[0] is None:
522            features = None
523        else:
524            features = pd.DataFrame(features_list)
525        return EventArray(info=info, metadata=metadata, features=features)
526
527    def to_events(
528        self,
529        scans: list[Scan],
530        ignore_missing_scans=True,
531        ignore_metadata=False,
532        ignore_features=False,
533    ) -> list[Event]:
534        """
535        Get the events in the EventArray as a list of events.
536        :param scans: the scans that the events belong to. Pass an empty list if you
537                      don't care about scan metadata.
538        :param ignore_missing_scans: whether to create blank scans for events without scans.
539        :param ignore_metadata: whether to ignore metadata or not
540        :param ignore_features: whether to ignore features or not
541        :return: a list of Event objects.
542        """
543        events = []
544        for i in range(len(self.info)):
545            # Determine the associated scan
546            scan = None
547            for s in scans:
548                if s.slide_id == self.info["slide_id"][i]:
549                    scan = s
550                    break
551            if scan is None:
552                if ignore_missing_scans:
553                    # Create a placeholder scan if the scan is missing
554                    scan = Scan.make_placeholder(
555                        self.info["slide_id"][i],
556                        self.info["tile"][i],
557                        self.info["roi"][i],
558                    )
559                else:
560                    raise ValueError(
561                        f"Scan {self.info['slide_id'][i]} not found for event {i}."
562                    )
563            # Add to the list
564            events.append(
565                Event(
566                    scan,
567                    Tile(scan, self.info["tile"][i], self.info["roi"][i]),
568                    self.info["x"][i],
569                    self.info["y"][i],
570                    size=self.info["size"][i],
571                    metadata=None if (ignore_metadata or self.metadata is None) else self.metadata.loc[i],
572                    features=None if (ignore_features or self.features is None) else self.features.loc[i],
573                )
574            )
575        return events
576
577    def to_dataframe(self) -> pd.DataFrame:
578        """
579        Convert all the data in the EventArray to a single DataFrame.
580        :return: a DataFrame with all the data in the EventArray.
581        """
582        # Make a copy of the info DataFrame and prepend "info_" to the column names
583        output = self.info.copy()
584        output.columns = [f"info_{col}" for col in output.columns]
585        # Combine with the metadata and prepend "metadata_" to the column names
586        if self.metadata is not None:
587            metadata = self.metadata.copy()
588            metadata.columns = [f"metadata_{col}" for col in metadata.columns]
589            output = pd.concat([output, metadata], axis=1)
590        # Combine with the features and prepend "features_" to the column names
591        if self.features is not None:
592            features = self.features.copy()
593            features.columns = [f"features_{col}" for col in features.columns]
594            output = pd.concat([output, features], axis=1)
595        return output
596
597    @classmethod
598    def from_dataframe(cls, df) -> typing.Self:
599        """
600        Create an EventArray from a DataFrame in the to_dataframe() format.
601        :return: an EventArray reconstructed from the DataFrame.
602        """
603        # Split the columns into info, metadata, and features and strip prefix
604        info = df[[col for col in df.columns if col.startswith("info_")]].copy()
605        info.columns = [col.replace("info_", "") for col in info.columns]
606        if info.size == 0:
607            info = None
608        metadata = df[[col for col in df.columns if col.startswith("metadata_")]].copy()
609        metadata.columns = [col.replace("metadata_", "") for col in metadata.columns]
610        if metadata.size == 0:
611            metadata = None
612        features = df[[col for col in df.columns if col.startswith("features_")]].copy()
613        features.columns = [col.replace("features_", "") for col in features.columns]
614        if features.size == 0:
615            features = None
616        return cls(info=info, metadata=metadata, features=features)
617
618    def save_csv(self, output_path: str) -> bool:
619        """
620        Save the events to a CSV file, including metadata and features.
621        :param output_path: the file path to save to.
622        :return: whether the file exists after saving.
623        """
624        self.to_dataframe().to_csv(output_path, index=False)
625        return os.path.exists(output_path)
626
627    @classmethod
628    def load_csv(cls, input_path: str) -> typing.Self:
629        """
630        Load the events from a CSV file, including metadata and features.
631        :param input_path: the file path to load from.
632        :return: an EventArray with the loaded data.
633        """
634        # Load the CSV file
635        df = pd.read_csv(input_path)
636        return cls.from_dataframe(df)
637
638    def save_hdf5(self, output_path: str) -> bool:
639        """
640        Save the events to an HDF5 file, including metadata and features.
641        Uses the pandas-provided HDF5 functions for ease, and external compatibility,
642        though these files are slightly harder to view in HDFView or similar.
643        :param output_path: the file path to save to.
644        :return: whether the file exists after saving.
645        """
646        # Open the output_path as an HDF5 file
647        with pd.HDFStore(output_path) as store:
648            # Store the dataframes in the HDF5 file
649            if self.info is not None:
650                store.put("info", self.info, index=False)
651            if self.metadata is not None:
652                store.put("metadata", self.metadata, index=False)
653            if self.features is not None:
654                store.put("features", self.features, index=False)
655        return os.path.exists(output_path)
656
657    @classmethod
658    def load_hdf5(cls, input_path: str) -> typing.Self:
659        """
660        Load the events from an HDF5 file, including metadata and features.
661        :param input_path: the file path to load from.
662        :return: an EventArray with the loaded data.
663        """
664        # Open the input_path as an HDF5 file
665        with pd.HDFStore(input_path) as store:
666            # Load the dataframes from the HDF5 file
667            info = store.get("info") if "info" in store else None
668            metadata = store.get("metadata") if "metadata" in store else None
669            features = store.get("features") if "features" in store else None
670        return cls(info=info, metadata=metadata, features=features)
671
672    @classmethod
673    def load_ocular(
674        cls,
675        input_path: str,
676        event_type="cells",
677        cell_data_files=(
678            "rc-final1.rds",
679            "rc-final2.rds",
680            "rc-final3.rds",
681            "rc-final4.rds",
682            "ocular_interesting.rds",
683        ),
684        others_data_files=(
685            "others-final1.rds",
686            "others-final2.rds",
687            "others-final3.rds",
688            "others-final4.rds",
689        ),
690        atlas_data_files=(
691            "ocular_interesting.rds",
692            "ocular_not_interesting.rds",
693        ),
694        drop_common_events=True,
695        log=None,
696    ) -> typing.Self:
697        """
698        Load events from OCULAR output files (.rds) into an EventArray.
699        :param input_path: path to the OCULAR output directory or a single .rds file.
700        :param event_type: the kind of events to load; "cells" or "others".
701        :param cell_data_files: file names to gather when event_type is "cells".
702        :param others_data_files: file names to gather when event_type is "others".
703        :param atlas_data_files: file names holding atlas-classified events.
704        :param drop_common_events: whether to drop events classified as common cells.
705        :param log: optional logger for progress messages and warnings.
706        :return: an EventArray with the loaded events.
707        """
708        if pyreadr is None:
709            raise ModuleNotFoundError(
710                "pyreadr not installed. Install pyreadr directly "
711                "or install csi-images with [rds] option to resolve."
712            )
713        # Check if the input path is a directory or a file
714        if os.path.isfile(input_path):
715            data_files = [os.path.basename(input_path)]
716            input_path = os.path.dirname(input_path)
717        elif event_type == "cells":
718            data_files = cell_data_files
719        elif event_type == "others":
720            data_files = others_data_files
721        else:
722            raise ValueError("Invalid event type.")
723
724        # Load the data from the OCULAR files
725        file_data = {}
726        for file in data_files:
727            file_path = os.path.join(input_path, file)
728            if not os.path.isfile(file_path):
729                if log is not None:
730                    log.warning(f"{file} not found for in {input_path}")
731                continue
732            file_data[file] = pyreadr.read_r(file_path)
733            # Get the DataFrame associated with None (pyreadr dict quirk)
734            file_data[file] = file_data[file][None]
735            if len(file_data[file]) == 0:
736                # File gets dropped from the dict
737                file_data.pop(file)
738                if log is not None:
739                    log.warning(f"{file} has no cells")
740                continue
741
742            if log is not None:
743                log.debug(f"{file} has {len(file_data[file])} cells")
744
745            # Drop common cells if requested and in this file
746            if file in atlas_data_files and drop_common_events:
747                common_cell_indices = (
748                    file_data[file]["catalogue_classification"] == "common_cell"
749                )
750                if log is not None:
751                    log.debug(
752                        f"Dropping {int(pd.Series.sum(common_cell_indices))}"
753                        f"common cells from {file}"
754                    )
755                file_data[file] = file_data[file][~common_cell_indices]
756
757            if len(file_data[file]) == 0:
758                # File gets dropped from the dict
759                file_data.pop(file)
760                if log is not None:
761                    log.warning(f"{file} has no cells after dropping common cells")
762                continue
763
764            # Extract frame_id and cell_id
765            # DAPI- events already have frame_id cell_id outside rowname
766            if event_type == "cells":
767                file_data[file]["rowname"] = file_data[file]["rowname"].astype("str")
768                # get frame_id cell_id from rownames column and split into two columns
769                split_res = file_data[file]["rowname"].str.split(" ", n=1, expand=True)
770                if len(split_res.columns) != 2 and log is not None:
771                    log.warning(
772                        f'Expected "frame_id cell_id" but got {file_data[file]["rowname"]}'
773                    )
774                # then assign it back to the dataframe
775                file_data[file][["frame_id", "cell_id"]] = split_res.astype("int")
776            # reset indexes since they can cause NaN values in concat
777            file_data[file] = file_data[file].reset_index(drop=True)
778
779        # Merge the data from all files
780        if len(file_data) == 0:
781            return EventArray()
782        elif len(file_data) == 1:
783            data = next(iter(file_data.values()))
784        else:
785            data = pd.concat(file_data.values())
786
787        if log is not None:
788            log.debug(f"Gathered a total of {len(data)} events")
789
790        # Others is missing the "slide_id". Insert it right before "frame_id" column
791        if event_type == "others" and "slide_id" not in data.columns:
792            if os.path.basename(input_path) == "ocular":
793                slide_id = os.path.basename(os.path.dirname(input_path))
794            else:
795                slide_id = "UNKNOWN"
796            data.insert(data.columns.get_loc("frame_id"), "slide_id", slide_id)
797
798        # Sort according to ascending cell_id to keep the original, which is in manual_df
799        data = data.sort_values(by=["cell_id"], ascending=True)
800        # Filter out duplicates by x & y
801        data = data.assign(
802            unique_id=data["slide_id"]
803            + "_"
804            + data["frame_id"].astype(str)
805            + "_"
806            + data["cellx"].astype(int).astype(str)
807            + "_"
808            + data["celly"].astype(int).astype(str)
809        )
810        data = data.drop_duplicates(subset=["unique_id"], keep="first")
811        # Normal unique_id is with cell_id
812        data = data.assign(
813            unique_id=data["slide_id"]
814            + "_"
815            + data["frame_id"].astype(str)
816            + "_"
817            + data["cell_id"].astype(str)
818        )
819        data = data.reset_index(drop=True)
820        # All columns up to "slide_id" are features; drop the "slide_id"
821        features = data.loc[:, :"slide_id"].iloc[:, :-1]
822        data = data.loc[:, "slide_id":]
823        # Grab the info columns
824        info = data[["slide_id", "frame_id", "cellx", "celly"]]
825        info.columns = ["slide_id", "tile", "x", "y"]
826        info = info.assign(
827            roi=0,  # OCULAR only works on 1 ROI, as far as known
828            size=25,  # Static, for later montaging
829        )
830        info = info[["slide_id", "tile", "roi", "x", "y", "size"]]
831        # Metadata has duplicate columns for later convenience
832        metadata = data
833        # Certain columns tend to be problematic with mixed data formats...
834        for col in ["TRITC", "CY5", "FITC"]:
835            if col in metadata:
836                labels = {
837                    "False": False,
838                    "True": True,
839                    "FALSE": False,
840                    "TRUE": True,
841                }
842                metadata[col] = metadata[col].map(labels).astype(bool)
843        for col in ["catalogue_id", "catalogue_distance", "clust", "hcpc"]:
844            if col in metadata:
845                metadata[col] = metadata[col].fillna(-1).astype(int)
846        return EventArray(info, metadata, features)
847
848    def save_ocular(self, output_path: str, event_type: str = "cells"):
849        """
850        Save the events to an OCULAR file. Relies on the dataframe originating
851        from an OCULAR file (same columns; duplicate metadata/info).
852        :param output_path: the directory to write the OCULAR files into.
853        :param event_type: the kind of events to save; "cells" or "others".
854        :return:
855        """
856        if event_type == "cells":
857            file_stub = "rc-final"
858        elif event_type == "others":
859            file_stub = "others-final"
860        else:
861            raise ValueError("Invalid event type. Must be cells or others.")
862
863        # Check for the "ocular_interesting" column
864        if event_type == "cells":
865            if "ocular_interesting" in self.metadata.columns:
866                interesting_rows = self.metadata["ocular_interesting"].to_numpy(
867                    dtype=bool
868                )
869            elif "hcpc" in self.metadata.columns:
870                # Interesting cells don't get an hcpc designation, leaving them as -1
871                interesting_rows = (
872                    self.metadata["hcpc"].to_numpy() == -1
873                )  # interesting cells
874            else:
875                interesting_rows = []
876            if sum(interesting_rows) > 0:
877                # Split the metadata into interesting and regular
878                interesting_events = self.rows(interesting_rows)
879                interesting_df = pd.concat(
880                    [interesting_events.features, interesting_events.metadata], axis=1
881                )
882                data_events = self.rows(~interesting_rows)
883                data_df = pd.concat(
884                    [data_events.features, data_events.metadata], axis=1
885                )
886                data_df = data_df.drop(columns=["ocular_interesting"], errors="ignore")
887
888                # Drop particular columns for "interesting"
889                interesting_df = interesting_df.drop(
890                    [
891                        "clust",
892                        "hcpc",
893                        "frame_id",
894                        "cell_id",
895                        "unique_id",
896                        "ocular_interesting",
897                    ],
898                    axis=1,
899                    errors="ignore",
900                )
901                # Save both .csv and .rds
902                interesting_df.to_csv(
903                    os.path.join(output_path, "ocular_interesting.csv"), index=False
904                )
905                pyreadr.write_rds(
906                    os.path.join(output_path, "ocular_interesting.rds"), interesting_df
907                )
908            else:
909                data_df = pd.concat([self.features, self.metadata], axis=1)
910        else:
911            # Get all data and reset_index (will copy it)
912            data_df = pd.concat([self.features, self.metadata], axis=1)
913
914        # Split based on cluster number to conform to *-final[1-4].rds
915        n_clusters = max(data_df["clust"]) + 1
916        split_idx = [round(i * n_clusters / 4) for i in range(5)]
917        for i in range(4):
918            subset = (split_idx[i] <= data_df["clust"]) & (
919                data_df["clust"] < split_idx[i + 1]
920            )
921            data_df.loc[subset, "hcpc"] = i + 1
922            subset = data_df[subset].reset_index(drop=True)
923            pyreadr.write_rds(
924                os.path.join(output_path, f"{file_stub}{i+1}.rds"), subset
925            )
926
927        # Create new example cell strings
928        data_df["example_cell_id"] = (
929            data_df["slide_id"]
930            + " "
931            + data_df["frame_id"].astype(str)
932            + " "
933            + data_df["cell_id"].astype(str)
934            + " "
935            + data_df["cellx"].astype(int).astype(str)
936            + " "
937            + data_df["celly"].astype(int).astype(str)
938        )
939        # Find averagable data columns
940        if "cellcluster_id" in data_df.columns:
941            end_idx = data_df.columns.get_loc("cellcluster_id")
942        else:
943            end_idx = data_df.columns.get_loc("slide_id")
944        avg_cols = data_df.columns[:end_idx].tolist()
945        # Group by cluster and average
946        data_df = data_df.groupby("clust").agg(
947            **{col: (col, "mean") for col in avg_cols},
948            count=("clust", "size"),  # count rows in each cluster
949            example_cells=("example_cell_id", lambda x: ",".join(x)),
950            hcpc=("hcpc", lambda x: x.iloc[0]),
951        )
952        data_df = data_df.reset_index()  # Do NOT drop, index is "clust"
953        # Create new columns
954        metadata = pd.DataFrame(
955            {
956                "count": data_df["count"],
957                "example_cells": data_df["example_cells"],
958                "clust": data_df["clust"].astype(int),
959                "hcpc": data_df["hcpc"].astype(int),
960                "id": data_df["clust"].astype(int).astype(str),
961                "cccluster": "0",  # Dummy value
962                "ccdistance": 0.0,  # Dummy value
963                "rownum": list(range(len(data_df))),
964                "framegroup": 0,  # Dummy value
965            }
966        )
967        data_df = pd.concat([data_df[avg_cols], metadata], axis=1)
968        # Save the cluster data
969        data_df.to_csv(os.path.join(output_path, f"{file_stub}.csv"), index=False)
970        pyreadr.write_rds(os.path.join(output_path, f"{file_stub}.rds"), data_df)

A class that holds a large number of events' data, making it easy to analyze and manipulate many events at once. A more separated version of the Event class.
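
A minimal construction sketch (values are illustrative):

    import pandas as pd
    from csi_images.csi_events import EventArray

    info = pd.DataFrame(
        {
            "slide_id": ["SLIDE1", "SLIDE1"],
            "tile": [0, 1],
            "roi": [0, 0],
            "x": [100, 250],
            "y": [200, 300],
            "size": [12, 12],
        }
    )
    events = EventArray(info=info)  # metadata/features may be added later
    assert len(events) == 2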

EventArray( info: pandas.core.frame.DataFrame = None, metadata: pandas.core.frame.DataFrame = None, features: pandas.core.frame.DataFrame = None)
259    def __init__(
260        self,
261        info: pd.DataFrame = None,
262        metadata: pd.DataFrame = None,
263        features: pd.DataFrame = None,
264    ):
265        # Info must be a DataFrame with columns "slide_id", "tile", "roi", "x", "y", "size"
266        if info is not None and (
267            not all(
268                col in info.columns
269                for col in ["slide_id", "tile", "roi", "x", "y", "size"]
270            )
271            or len(info.columns) != 6
272        ):
273            raise ValueError(
274                "EventArray.info must have columns 'slide_id', 'tile', 'roi', 'x', 'y', 'size'"
275            )
276        # All DataFrames must have the same number of rows
277        if metadata is not None and (info is None or len(info) != len(metadata)):
278            raise ValueError(
279                "If EventArray.metadata is not None, it should match rows with .info"
280            )
281        if features is not None and (info is None or len(info) != len(features)):
282            raise ValueError(
283                "If EventArray.features is not None, it should match rows with .info"
284            )
285        self.info = info
286        self.metadata = metadata
287        self.features = features
INFO_COLUMNS = ['slide_id', 'tile', 'roi', 'x', 'y', 'size']
info
metadata
features
def get_sort_order(self, by: str | list[str], ascending: bool | list[bool] = True):
336    def get_sort_order(self, by: str | list[str], ascending: bool | list[bool] = True):
337        """
338        Get the sort order for the EventArray by a column in the info, metadata, or features DataFrames.
339        :param by: name of the column(s) to sort by.
340        :param ascending: whether to sort in ascending order; can be a list to match by
341        :return: the order of the indices to sort by.
342        """
343        columns = self.get(by)
344        return columns.sort_values(by=by, ascending=ascending).index

Get the sort order for the EventArray by a column in the info, metadata, or features DataFrames.

Parameters
  • by: name of the column(s) to sort by.
  • ascending: whether to sort in ascending order; can be a list to match by
Returns

the order of the indices to sort by.

def sort(self, by: str | list[str], ascending: bool | list[bool] = True) -> Self:
346    def sort(
347        self, by: str | list[str], ascending: bool | list[bool] = True
348    ) -> typing.Self:
349        """
350        Sort the EventArray by column(s) in the info, metadata, or features DataFrames.
351        :param by: name of the column(s) to sort by.
352        :param ascending: whether to sort in ascending order; can be a list to match by
353        :return: a new, sorted EventArray.
354        """
355        order = self.get_sort_order(by, ascending)
356        info = self.info.loc[order].reset_index(drop=True)
357        if self.metadata is not None:
358            metadata = self.metadata.loc[order].reset_index(drop=True)
359        else:
360            metadata = None
361        if self.features is not None:
362            features = self.features.loc[order].reset_index(drop=True)
363        else:
364            features = None
365        return EventArray(info, metadata, features)

Sort the EventArray by column(s) in the info, metadata, or features DataFrames.

Parameters
  • by: name of the column(s) to sort by.
  • ascending: whether to sort in ascending order; can be a list to match by
Returns

a new, sorted EventArray.
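
For example (assuming `events` is an EventArray; "dapi_mean" is a hypothetical feature column):

    # Single key, descending
    by_y = events.sort("y", ascending=False)
    # Multiple keys with per-key directions; keys may live in info,
    # metadata, or features
    by_tile_then_feature = events.sort(["tile", "dapi_mean"], ascending=[True, False])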

def get( self, column_names: int | str | list[int] | list[str]) -> pandas.core.frame.DataFrame:
367    def get(self, column_names: int | str | list[int] | list[str]) -> pd.DataFrame:
368        """
369        Get a DataFrame with the specified columns from the EventArray, by value.
370        :param column_names: the names of the columns to get.
371        :return: a DataFrame with the specified columns.
372        """
373        if isinstance(column_names, int) or isinstance(column_names, str):
374            column_names = [column_names]
375        columns = []
376        for column_name in column_names:
377            if column_name in self.info.columns:
378                columns.append(self.info[column_name])
379            elif self.metadata is not None and column_name in self.metadata.columns:
380                columns.append(self.metadata[column_name])
381            elif self.features is not None and column_name in self.features.columns:
382                columns.append(self.features[column_name])
383            else:
384                raise ValueError(f"Column {column_name} not found in EventArray")
385        return pd.concat(columns, axis=1)

Get a DataFrame with the specified columns from the EventArray, by value.

Parameters
  • column_names: the names of the columns to get.
Returns

a DataFrame with the specified columns.
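
For example (a sketch; "dapi_mean" is a hypothetical feature column):

    # Columns are looked up in info, then metadata, then features
    positions = events.get(["x", "y"])
    mixed = events.get(["tile", "dapi_mean"])  # info + features in one DataFrame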

def rows(self, rows) -> Self:
387    def rows(self, rows) -> typing.Self:
388        """
389        Get a subset of the EventArray rows based on a boolean or integer index, by value.
390        :param rows: the indices to get as a 1D boolean/integer list/array/series
391        :return: a new EventArray with the subset of events.
392        """
393        info = self.info.loc[rows].reset_index(drop=True)
394        if self.metadata is not None:
395            metadata = self.metadata.loc[rows].reset_index(drop=True)
396        else:
397            metadata = None
398        if self.features is not None:
399            features = self.features.loc[rows].reset_index(drop=True)
400        else:
401            features = None
402        return EventArray(info, metadata, features)

Get a subset of the EventArray rows based on a boolean or integer index, by value.

Parameters
  • rows: the indices to get as a 1D boolean/integer list/array/series
Returns

a new EventArray with the subset of events.
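
For example:

    # Boolean mask: keep events larger than 10 px
    large = events.rows(events.get("size")["size"] > 10)
    # Integer indices work as well
    first_two = events.rows([0, 1])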

def copy(self) -> Self:
404    def copy(self) -> typing.Self:
405        """
406        Create a deep copy of the EventArray.
407        :return: a deep copy of the EventArray.
408        """
409        return EventArray(
410            info=self.info.copy(),
411            metadata=None if self.metadata is None else self.metadata.copy(),
412            features=None if self.features is None else self.features.copy(),
413        )

Create a deep copy of the EventArray.

Returns

a deep copy of the EventArray.

def add_metadata( self, new_metadata: pandas.core.series.Series | pandas.core.frame.DataFrame) -> None:
415    def add_metadata(self, new_metadata: pd.Series | pd.DataFrame) -> None:
416        """
417        Add metadata to the EventArray. Removes the need to check if metadata is None.
418        Overwrites any existing metadata with the same column names as the new metadata.
419        :param new_metadata: the metadata to add.
420        """
421        if len(self) != len(new_metadata):
422            raise ValueError("New metadata must match length of existing info")
423
424        if self.metadata is None:
425            self.metadata = pd.DataFrame(new_metadata)  # normalize Series to DataFrame
426        else:
427            if isinstance(new_metadata, pd.Series):
428                self.metadata[new_metadata.name] = new_metadata
429            else:
430                # It's a DataFrame
431                self.metadata[new_metadata.columns] = new_metadata

Add metadata to the EventArray. Removes the need to check if metadata is None. Overwrites any existing metadata with the same column names as the new metadata.

Parameters
  • new_metadata: the metadata to add.
def add_features( self, new_features: pandas.core.series.Series | pandas.core.frame.DataFrame) -> None:
433    def add_features(self, new_features: pd.Series | pd.DataFrame) -> None:
434        """
435        Add features to the EventArray. Removes the need to check if features is None.
436        Overwrites any existing features with the same column names as the new features.
437        :param new_features: the features to add.
438        """
439        if len(self) != len(new_features):
440            raise ValueError("New features must match length of existing info")
441
442        if self.features is None:
443            self.features = pd.DataFrame(new_features)  # normalize Series to DataFrame
444        else:
445            if isinstance(new_features, pd.Series):
446                self.features[new_features.name] = new_features
447            else:
448                # It's a DataFrame
449                self.features[new_features.columns] = new_features

Add features to the EventArray. Removes the need to check if features is None. Overwrites any existing features with the same column names as the new features.

Parameters
  • new_features: the features to add.
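
A usage sketch for both methods (column names are illustrative):

    import pandas as pd

    # Lengths must match len(events); columns with matching names are overwritten
    events.add_metadata(pd.DataFrame({"qc_flag": ["ok"] * len(events)}))
    events.add_features(pd.DataFrame({"area": [120.5] * len(events)}))
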
@classmethod
def merge(cls, events: list[typing.Self]) -> Self:
451    @classmethod
452    def merge(cls, events: list[typing.Self]) -> typing.Self:
453        """
454        Combine EventArrays in a list into a single EventArray.
455        :param events: the list of EventArrays to combine.
456        """
457        all_info = []
458        all_metadata = []
459        all_features = []
460        for event_array in events:
461            # Skip empty EventArrays
462            if event_array.info is not None:
463                all_info.append(event_array.info)
464            if event_array.metadata is not None:
465                all_metadata.append(event_array.metadata)
466            if event_array.features is not None:
467                all_features.append(event_array.features)
468        if len(all_info) == 0:
469            return EventArray()
470        else:
471            all_info = pd.concat(all_info, ignore_index=True)
472        if len(all_metadata) == 0:
473            all_metadata = None
474        else:
475            all_metadata = pd.concat(all_metadata, ignore_index=True)
476        if len(all_features) == 0:
477            all_features = None
478        else:
479            all_features = pd.concat(all_features, ignore_index=True)
480
481        return EventArray(all_info, all_metadata, all_features)

Combine EventArrays in a list into a single EventArray.

Parameters
  • events: the list of EventArrays to combine.
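
For example (a sketch; `events_a` and `events_b` are placeholder EventArrays):

    from csi_images.csi_events import EventArray

    # Row-wise concatenation; empty EventArrays in the list are skipped
    combined = EventArray.merge([events_a, events_b])
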
@classmethod
def from_events(cls, events: list[Event]) -> Self:
483    @classmethod
484    def from_events(cls, events: list[Event]) -> typing.Self:
485        """
486        Create an EventArray from a list of events.
487        :param events: the list of events to convert.
488        """
489        # Return an empty array if we were passed nothing
490        if events is None or len(events) == 0:
491            return EventArray()
492        # Otherwise, grab the info
493        info = pd.DataFrame(
494            {
495                "slide_id": [event.scan.slide_id for event in events],
496                "tile": [event.tile.n for event in events],
497                "roi": [event.tile.n_roi for event in events],
498                "x": [event.x for event in events],
499                "y": [event.y for event in events],
500                "size": [event.size for event in events],
501            }
502        )
503        metadata_list = [event.metadata for event in events]
504        # Iterate through and ensure that all metadata is the same shape
505        for metadata in metadata_list:
506            if type(metadata) != type(metadata_list[0]):
507                raise ValueError("All metadata must be the same type.")
508            if metadata is not None and metadata.shape != metadata_list[0].shape:
509                raise ValueError("All metadata must be the same shape.")
510        if metadata_list[0] is None:
511            metadata = None
512        else:
513            metadata = pd.DataFrame(metadata_list)
514        features_list = [event.features for event in events]
515        # Iterate through and ensure that all features are the same shape
516        for features in features_list:
517            if type(features) != type(features_list[0]):
518                raise ValueError("All features must be the same type.")
519            if features is not None and features.shape != features_list[0].shape:
520                raise ValueError("All features must be the same shape.")
521        if features_list[0] is None:
522            features = None
523        else:
524            features = pd.DataFrame(features_list)
525        return EventArray(info=info, metadata=metadata, features=features)

Create an EventArray from a list of events.

Parameters
  • events: the list of events to convert.
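
A round-trip sketch (`events` is a placeholder list of Event objects, `scan` a placeholder Scan):

    from csi_images.csi_events import EventArray

    # Gather per-event info, metadata, and features into DataFrames...
    event_array = EventArray.from_events(events)
    # ...and back again, matching events to their scans by slide_id
    events_again = event_array.to_events(scans=[scan])
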
def to_events( self, scans: list[csi_images.csi_scans.Scan], ignore_missing_scans=True, ignore_metadata=False, ignore_features=False) -> list[Event]:
527    def to_events(
528        self,
529        scans: list[Scan],
530        ignore_missing_scans=True,
531        ignore_metadata=False,
532        ignore_features=False,
533    ) -> list[Event]:
534        """
535        Get the events in the EventArray as a list of events.
536        :param scans: the scans that the events belong to. Pass an empty list if you
537                      don't care about scan metadata.
538        :param ignore_missing_scans: whether to create blank scans for events without scans.
539        :param ignore_metadata: whether to ignore metadata or not
540        :param ignore_features: whether to ignore features or not
541        :return: a list of Event objects.
542        """
543        events = []
544        for i in range(len(self.info)):
545            # Determine the associated scan
546            scan = None
547            for s in scans:
548                if s.slide_id == self.info["slide_id"][i]:
549                    scan = s
550                    break
551            if scan is None:
552                if ignore_missing_scans:
553                    # Create a placeholder scan if the scan is missing
554                    scan = Scan.make_placeholder(
555                        self.info["slide_id"][i],
556                        self.info["tile"][i],
557                        self.info["roi"][i],
558                    )
559                else:
560                    raise ValueError(
561                        f"Scan {self.info['slide_id'][i]} not found for event {i}."
562                    )
563            # Add to the list
564            events.append(
565                Event(
566                    scan,
567                    Tile(scan, self.info["tile"][i], self.info["roi"][i]),
568                    self.info["x"][i],
569                    self.info["y"][i],
570                    size=self.info["size"][i],
571                    metadata=None if (ignore_metadata or self.metadata is None) else self.metadata.loc[i],
572                    features=None if (ignore_features or self.features is None) else self.features.loc[i],
573                )
574            )
575        return events

Get the events in the EventArray as a list of events.

Parameters
  • scans: the scans that the events belong to. Pass an empty list if you don't care about scan metadata.
  • ignore_missing_scans: whether to create blank scans for events without scans.
  • ignore_metadata: whether to ignore metadata or not
  • ignore_features: whether to ignore features or not
Returns

a list of Event objects.

def to_dataframe(self) -> pandas.core.frame.DataFrame:
577    def to_dataframe(self) -> pd.DataFrame:
578        """
579        Convert all the data in the EventArray to a single DataFrame.
580        :return: a DataFrame with all the data in the EventArray.
581        """
582        # Make a copy of the info DataFrame and prepend "info_" to the column names
583        output = self.info.copy()
584        output.columns = [f"info_{col}" for col in output.columns]
585        # Combine with the metadata and prepend "metadata_" to the column names
586        if self.metadata is not None:
587            metadata = self.metadata.copy()
588            metadata.columns = [f"metadata_{col}" for col in metadata.columns]
589            output = pd.concat([output, metadata], axis=1)
590        # Combine with the features and prepend "features_" to the column names
591        if self.features is not None:
592            features = self.features.copy()
593            features.columns = [f"features_{col}" for col in features.columns]
594            output = pd.concat([output, features], axis=1)
595        return output

Convert all the data in the EventArray to a single DataFrame.

Returns

a DataFrame with all the data in the EventArray.

@classmethod
def from_dataframe(cls, df) -> Self:
597    @classmethod
598    def from_dataframe(cls, df) -> typing.Self:
599        """
600        Create an EventArray from a DataFrame in the to_dataframe() format.
601        :return: an EventArray reconstructed from the DataFrame.
602        """
603        # Split the columns into info, metadata, and features and strip prefix
604        info = df[[col for col in df.columns if col.startswith("info_")]].copy()
605        info.columns = [col.replace("info_", "") for col in info.columns]
606        if info.size == 0:
607            info = None
608        metadata = df[[col for col in df.columns if col.startswith("metadata_")]].copy()
609        metadata.columns = [col.replace("metadata_", "") for col in metadata.columns]
610        if metadata.size == 0:
611            metadata = None
612        features = df[[col for col in df.columns if col.startswith("features_")]].copy()
613        features.columns = [col.replace("features_", "") for col in features.columns]
614        if features.size == 0:
615            features = None
616        return cls(info=info, metadata=metadata, features=features)

Create an EventArray from a DataFrame formatted like to_dataframe() output, i.e. with "info_", "metadata_", and "features_" column prefixes.

Returns

an EventArray with the data from the DataFrame.
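
For instance, the two methods round-trip (a sketch; `array` is any EventArray):

    from csi_images.csi_events import EventArray

    df = array.to_dataframe()
    restored = EventArray.from_dataframe(df)
    assert restored.info.equals(array.info)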

def save_csv(self, output_path: str) -> bool:
618    def save_csv(self, output_path: str) -> bool:
619        """
620        Save the events to a CSV file, including metadata and features.
621        :param output_path: path of the CSV file to write.
622        :return: whether the file exists after saving.
623        """
624        self.to_dataframe().to_csv(output_path, index=False)
625        return os.path.exists(output_path)

Save the events to a CSV file, including metadata and features.

Parameters
  • output_path: path of the CSV file to write.
Returns

whether the file exists after saving.
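
A one-line sketch (the path is hypothetical):

    array.save_csv("events.csv")  # True if the file exists afterward
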
@classmethod
def load_csv(cls, input_path: str) -> Self:
627    @classmethod
628    def load_csv(cls, input_path: str) -> typing.Self:
629        """
630        Load the events from a CSV file, including metadata and features.
631        :param input_path: path of the CSV file to read.
632        :return: an EventArray with the loaded events.
633        """
634        # Load the CSV file
635        df = pd.read_csv(input_path)
636        return cls.from_dataframe(df)

Load the events from a CSV file, including metadata and features.

Parameters
  • input_path: path of the CSV file to read.
Returns

an EventArray with the loaded events.
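
A sketch (the path is hypothetical); note that CSV does not preserve dtypes, so pd.read_csv re-infers them on load:

    restored = EventArray.load_csv("events.csv")
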
def save_hdf5(self, output_path: str) -> bool:
638    def save_hdf5(self, output_path: str) -> bool:
639        """
640        Save the events to an HDF5 file, including metadata and features.
641        Uses the pandas-provided HDF5 functions for ease and external compatibility,
642        though these files are slightly harder to view in HDFView or similar.
643        :param output_path: path of the HDF5 file to write.
644        :return: whether the file exists after saving.
645        """
646        # Open the output_path as an HDF5 file
647        with pd.HDFStore(output_path) as store:
648            # Store the dataframes in the HDF5 file
649            if self.info is not None:
650                store.put("info", self.info, index=False)
651            if self.metadata is not None:
652                store.put("metadata", self.metadata, index=False)
653            if self.features is not None:
654                store.put("features", self.features, index=False)
655        return os.path.exists(output_path)

Save the events to an HDF5 file, including metadata and features. Uses the pandas-provided HDF5 functions for ease and external compatibility, though these files are slightly harder to view in HDFView or similar.

Parameters
  • output_path: path of the HDF5 file to write.
Returns

whether the file exists after saving.
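
A sketch (the path is hypothetical); pandas' HDFStore requires the optional PyTables (`tables`) package:

    array.save_hdf5("events.h5")  # stores "info", "metadata", and "features" keys
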
@classmethod
def load_hdf5(cls, input_path: str) -> Self:
657    @classmethod
658    def load_hdf5(cls, input_path: str) -> typing.Self:
659        """
660        Load the events from an HDF5 file, including metadata and features.
661        :param input_path: path of the HDF5 file to read.
662        :return: an EventArray with the loaded events.
663        """
664        # Open the input_path as an HDF5 file
665        with pd.HDFStore(input_path) as store:
666            # Load the dataframes from the HDF5 file
667            info = store.get("info") if "info" in store else None
668            metadata = store.get("metadata") if "metadata" in store else None
669            features = store.get("features") if "features" in store else None
670        return cls(info=info, metadata=metadata, features=features)

Load the events from an HDF5 file, including metadata and features.

Parameters
  • input_path: path of the HDF5 file to read.
Returns

an EventArray with the loaded events.
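
A sketch (the path is hypothetical); keys absent from the file simply load as None:

    restored = EventArray.load_hdf5("events.h5")
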
@classmethod
def load_ocular(cls, input_path: str, event_type='cells', cell_data_files=('rc-final1.rds', 'rc-final2.rds', 'rc-final3.rds', 'rc-final4.rds', 'ocular_interesting.rds'), others_data_files=('others-final1.rds', 'others-final2.rds', 'others-final3.rds', 'others-final4.rds'), atlas_data_files=('ocular_interesting.rds', 'ocular_not_interesting.rds'), drop_common_events=True, log=None) -> Self:
672    @classmethod
673    def load_ocular(
674        cls,
675        input_path: str,
676        event_type="cells",
677        cell_data_files=(
678            "rc-final1.rds",
679            "rc-final2.rds",
680            "rc-final3.rds",
681            "rc-final4.rds",
682            "ocular_interesting.rds",
683        ),
684        others_data_files=(
685            "others-final1.rds",
686            "others-final2.rds",
687            "others-final3.rds",
688            "others-final4.rds",
689        ),
690        atlas_data_files=(
691            "ocular_interesting.rds",
692            "ocular_not_interesting.rds",
693        ),
694        drop_common_events=True,
695        log=None,
696    ) -> typing.Self:
697        """
698        Load events from OCULAR .rds output files into an EventArray.
699        :param input_path: path to an OCULAR output directory or a single .rds file.
700        :param event_type: "cells" or "others".
701        :param cell_data_files: .rds files to load when event_type is "cells".
702        :param others_data_files: .rds files to load when event_type is "others".
703        :param atlas_data_files: files with atlas classifications; common events may be dropped from these.
704        :param drop_common_events: whether to drop events classified as common cells.
705        :param log: optional logger for debug and warning messages.
706        :return: an EventArray with the loaded events.
707        """
708        if pyreadr is None:
709            raise ModuleNotFoundError(
710                "pyreadr not installed. Install pyreadr directly "
711                "or install csi-images with [rds] option to resolve."
712            )
713        # Check if the input path is a directory or a file
714        if os.path.isfile(input_path):
715            data_files = [os.path.basename(input_path)]
716            input_path = os.path.dirname(input_path)
717        elif event_type == "cells":
718            data_files = cell_data_files
719        elif event_type == "others":
720            data_files = others_data_files
721        else:
722            raise ValueError("Invalid event type.")
723
724        # Load the data from the OCULAR files
725        file_data = {}
726        for file in data_files:
727            file_path = os.path.join(input_path, file)
728            if not os.path.isfile(file_path):
729                if log is not None:
730                    log.warning(f"{file} not found for in {input_path}")
731                continue
732            file_data[file] = pyreadr.read_r(file_path)
733            # Get the DataFrame associated with None (pyreadr dict quirk)
734            file_data[file] = file_data[file][None]
735            if len(file_data[file]) == 0:
736                # File gets dropped from the dict
737                file_data.pop(file)
738                if log is not None:
739                    log.warning(f"{file} has no cells")
740                continue
741
742            if log is not None:
743                log.debug(f"{file} has {len(file_data[file])} cells")
744
745            # Drop common cells if requested and in this file
746            if file in atlas_data_files and drop_common_events:
747                common_cell_indices = (
748                    file_data[file]["catalogue_classification"] == "common_cell"
749                )
750                if log is not None:
751                    log.debug(
752                        f"Dropping {int(common_cell_indices.sum())} "
753                        f"common cells from {file}"
754                    )
755                file_data[file] = file_data[file][~common_cell_indices]
756
757            if len(file_data[file]) == 0:
758                # File gets dropped from the dict
759                file_data.pop(file)
760                if log is not None:
761                    log.warning(f"{file} has no cells after dropping common cells")
762                continue
763
764            # Extract frame_id and cell_id
765            # DAPI- events already have frame_id cell_id outside rowname
766            if event_type == "cells":
767                file_data[file]["rowname"] = file_data[file]["rowname"].astype("str")
768                # get frame_id cell_id from rownames column and split into two columns
769                split_res = file_data[file]["rowname"].str.split(" ", n=1, expand=True)
770                if len(split_res.columns) != 2 and log is not None:
771                    log.warning(
772                        f'Expected "frame_id cell_id" but got {file_data[file]["rowname"]}'
773                    )
774                # then assign it back to the dataframe
775                file_data[file][["frame_id", "cell_id"]] = split_res.astype("int")
776            # reset indexes since they can cause NaN values in concat
777            file_data[file] = file_data[file].reset_index(drop=True)
778
779        # Merge the data from all files
780        if len(file_data) == 0:
781            return EventArray()
782        elif len(file_data) == 1:
783            data = next(iter(file_data.values()))
784        else:
785            data = pd.concat(file_data.values())
786
787        if log is not None:
788            log.debug(f"Gathered a total of {len(data)} events")
789
790        # Others is missing the "slide_id". Insert it right before "frame_id" column
791        if event_type == "others" and "slide_id" not in data.columns:
792            if os.path.basename(input_path) == "ocular":
793                slide_id = os.path.basename(os.path.dirname(input_path))
794            else:
795                slide_id = "UNKNOWN"
796            data.insert(data.columns.get_loc("frame_id"), "slide_id", slide_id)
797
798        # Sort according to ascending cell_id to keep the original, which is in manual_df
799        data = data.sort_values(by=["cell_id"], ascending=True)
800        # Filter out duplicates by x & y
801        data = data.assign(
802            unique_id=data["slide_id"]
803            + "_"
804            + data["frame_id"].astype(str)
805            + "_"
806            + data["cellx"].astype(int).astype(str)
807            + "_"
808            + data["celly"].astype(int).astype(str)
809        )
810        data = data.drop_duplicates(subset=["unique_id"], keep="first")
811        # Normal unique_id is with cell_id
812        data = data.assign(
813            unique_id=data["slide_id"]
814            + "_"
815            + data["frame_id"].astype(str)
816            + "_"
817            + data["cell_id"].astype(str)
818        )
819        data = data.reset_index(drop=True)
820        # All columns up to "slide_id" are features; drop the "slide_id"
821        features = data.loc[:, :"slide_id"].iloc[:, :-1]
822        data = data.loc[:, "slide_id":]
823        # Grab the info columns
824        info = data[["slide_id", "frame_id", "cellx", "celly"]]
825        info.columns = ["slide_id", "tile", "x", "y"]
826        info = info.assign(
827            roi=0,  # OCULAR only works on 1 ROI, as far as we know
828            size=25,  # Static, for later montaging
829        )
830        info = info[["slide_id", "tile", "roi", "x", "y", "size"]]
831        # Metadata has duplicate columns for later convenience
832        metadata = data
833        # Certain columns tend to be problematic with mixed data formats...
834        for col in ["TRITC", "CY5", "FITC"]:
835            if col in metadata:
836                labels = {
837                    "False": False,
838                    "True": True,
839                    "FALSE": False,
840                    "TRUE": True,
841                }
842                metadata[col] = metadata[col].map(labels).astype(bool)
843        for col in ["catalogue_id", "catalogue_distance", "clust", "hcpc"]:
844            if col in metadata:
845                metadata[col] = metadata[col].fillna(-1).astype(int)
846        return EventArray(info, metadata, features)
Load events from OCULAR .rds output files into an EventArray.

Parameters
  • input_path: path to an OCULAR output directory or a single .rds file.
  • event_type: "cells" or "others".
  • cell_data_files: .rds files to load when event_type is "cells".
  • others_data_files: .rds files to load when event_type is "others".
  • atlas_data_files: files with atlas classifications; common events may be dropped from these.
  • drop_common_events: whether to drop events classified as common cells.
  • log: optional logger for debug and warning messages.
Returns

an EventArray with the loaded events.
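
A sketch, assuming pyreadr is installed and the (hypothetical) path points to an OCULAR output directory:

    import logging

    from csi_images.csi_events import EventArray

    log = logging.getLogger("ocular")
    array = EventArray.load_ocular("/path/to/slide/ocular", event_type="cells", log=log)
    print(len(array.info))  # number of events loaded
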
def save_ocular(self, output_path: str, event_type: str = 'cells'):
848    def save_ocular(self, output_path: str, event_type: str = "cells"):
849        """
850        Save the events to OCULAR files. Relies on the DataFrames originating
851        from OCULAR files (same columns; duplicate metadata/info).
852        :param output_path: directory to write the OCULAR .csv and .rds files into.
853        :param event_type: "cells" or "others".
854        :return: None
855        """
856        if event_type == "cells":
857            file_stub = "rc-final"
858        elif event_type == "others":
859            file_stub = "others-final"
860        else:
861            raise ValueError("Invalid event type. Must be cells or others.")
862
863        # Check for the "ocular_interesting" column
864        if event_type == "cells":
865            if "ocular_interesting" in self.metadata.columns:
866                interesting_rows = self.metadata["ocular_interesting"].to_numpy(
867                    dtype=bool
868                )
869            elif "hcpc" in self.metadata.columns:
870                # Interesting cells don't get an hcpc designation, leaving them as -1
871                interesting_rows = (
872                    self.metadata["hcpc"].to_numpy() == -1
873                )  # interesting cells
874            else:
875                interesting_rows = []
876            if sum(interesting_rows) > 0:
877                # Split the metadata into interesting and regular
878                interesting_events = self.rows(interesting_rows)
879                interesting_df = pd.concat(
880                    [interesting_events.features, interesting_events.metadata], axis=1
881                )
882                data_events = self.rows(~interesting_rows)
883                data_df = pd.concat(
884                    [data_events.features, data_events.metadata], axis=1
885                )
886                data_df = data_df.drop(columns=["ocular_interesting"], errors="ignore")
887
888                # Drop particular columns for "interesting"
889                interesting_df = interesting_df.drop(
890                    [
891                        "clust",
892                        "hcpc",
893                        "frame_id",
894                        "cell_id",
895                        "unique_id",
896                        "ocular_interesting",
897                    ],
898                    axis=1,
899                    errors="ignore",
900                )
901                # Save both .csv and .rds
902                interesting_df.to_csv(
903                    os.path.join(output_path, "ocular_interesting.csv"), index=False
904                )
905                pyreadr.write_rds(
906                    os.path.join(output_path, "ocular_interesting.rds"), interesting_df
907                )
908            else:
909                data_df = pd.concat([self.features, self.metadata], axis=1)
910        else:
911            # Get all data and reset_index (will copy it)
912            data_df = pd.concat([self.features, self.metadata], axis=1)
913
914        # Split based on cluster number to conform to *-final[1-4].rds
915        n_clusters = max(data_df["clust"]) + 1
916        split_idx = [round(i * n_clusters / 4) for i in range(5)]
917        for i in range(4):
918            subset = (split_idx[i] <= data_df["clust"]) & (
919                data_df["clust"] < split_idx[i + 1]
920            )
921            data_df.loc[subset, "hcpc"] = i + 1
922            subset = data_df[subset].reset_index(drop=True)
923            pyreadr.write_rds(
924                os.path.join(output_path, f"{file_stub}{i+1}.rds"), subset
925            )
926
927        # Create new example cell strings
928        data_df["example_cell_id"] = (
929            data_df["slide_id"]
930            + " "
931            + data_df["frame_id"].astype(str)
932            + " "
933            + data_df["cell_id"].astype(str)
934            + " "
935            + data_df["cellx"].astype(int).astype(str)
936            + " "
937            + data_df["celly"].astype(int).astype(str)
938        )
939        # Find averagable data columns
940        if "cellcluster_id" in data_df.columns:
941            end_idx = data_df.columns.get_loc("cellcluster_id")
942        else:
943            end_idx = data_df.columns.get_loc("slide_id")
944        avg_cols = data_df.columns[:end_idx].tolist()
945        # Group by cluster and average
946        data_df = data_df.groupby("clust").agg(
947            **{col: (col, "mean") for col in avg_cols},
948            count=("clust", "size"),  # count rows in each cluster
949            example_cells=("example_cell_id", lambda x: ",".join(x)),
950            hcpc=("hcpc", lambda x: x.iloc[0]),
951        )
952        data_df = data_df.reset_index()  # Do NOT drop, index is "clust"
953        # Create new columns
954        metadata = pd.DataFrame(
955            {
956                "count": data_df["count"],
957                "example_cells": data_df["example_cells"],
958                "clust": data_df["clust"].astype(int),
959                "hcpc": data_df["hcpc"].astype(int),
960                "id": data_df["clust"].astype(int).astype(str),
961                "cccluster": "0",  # Dummy value
962                "ccdistance": 0.0,  # Dummy value
963                "rownum": list(range(len(data_df))),
964                "framegroup": 0,  # Dummy value
965            }
966        )
967        data_df = pd.concat([data_df[avg_cols], metadata], axis=1)
968        # Save the cluster data
969        data_df.to_csv(os.path.join(output_path, f"{file_stub}.csv"), index=False)
970        pyreadr.write_rds(os.path.join(output_path, f"{file_stub}.rds"), data_df)

Save the events to OCULAR files. Relies on the DataFrames originating from OCULAR files (same columns; duplicate metadata/info).

Parameters
  • output_path: directory to write the OCULAR .csv and .rds files into.
  • event_type: "cells" or "others".
Returns

None; files are written to output_path.
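
A sketch; the paths are hypothetical, and output_path must be an existing directory, since neither pandas nor pyreadr creates it:

    array = EventArray.load_ocular("/path/to/slide/ocular")
    array.save_ocular("/path/to/output", event_type="cells")
    # Writes rc-final1.rds through rc-final4.rds, rc-final.csv, rc-final.rds,
    # and ocular_interesting.csv/.rds when interesting cells are present.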