Note
Click here to download the full example code
spatial organization of the somatosensory cortex revealed by cyclic smfish¶
simone codeluppi, lars e. borm, amit zeisel, gioele la manno, josina a. van lunteren, camilla i. svensson, sten linnarsson
this publication can be found at https://www.nature.com/articles/s41592-018-0175-z and the data referenced below can be downloaded from http://linnarssonlab.org/osmfish/
checklist: - [x] point locations - [x] cell locations - [x] cell x gene expression matrix
load the data¶
import pickle
import re
import requests
from itertools import repeat
from io import BytesIO
import tempfile
import dask.array as da
import h5py
import loompy
import numpy as np
import pandas as pd
import starspace
from starspace.constants import *
response = requests.get(
"https://d24h2xsgaj29mf.cloudfront.net/raw/"
"osmfish_codeluppi_2018_nat-methods_somatosensory-cortex/"
"mRNA_coords_raw_counting.hdf5"
)
spots_data = h5py.File(BytesIO(response.content), "r")
response = requests.get(
"https://d24h2xsgaj29mf.cloudfront.net/raw/"
"osmfish_codeluppi_2018_nat-methods_somatosensory-cortex/"
"polyT_seg.pkl"
)
region_data = pickle.load(BytesIO(response.content))
# load spot info
gene = []
x = []
y = []
imaging_round = []
pattern = r"^(.*?)_Hybridization(\d*?)$"
for k in spots_data.keys():
gene_, round_ = re.match(pattern, k).groups()
gene_data = spots_data[k]
x.extend(gene_data[:, 0])
y.extend(gene_data[:, 1])
gene.extend(repeat(gene_, gene_data.shape[0]))
imaging_round.extend(repeat(int(round_), gene_data.shape[0]))
spots_data.close()
# build the spot information
spot_data = pd.DataFrame({
SPOTS_REQUIRED_VARIABLES.GENE_NAME: gene,
SPOTS_REQUIRED_VARIABLES.X_SPOT: x,
SPOTS_REQUIRED_VARIABLES.Y_SPOT: y,
SPOTS_OPTIONAL_VARIABLES.ROUND: imaging_round,
})
# construct the attributes
attributes = {
REQUIRED_ATTRIBUTES.AUTHORS: (
"Simone Codeluppi", "Lars E. Borm", "Amit Zeisel", "Gioele La Manno",
"Josina A. van Lunteren", "Camilla I. Svensson", "Sten Linnarsson"
),
REQUIRED_ATTRIBUTES.YEAR: 2018,
REQUIRED_ATTRIBUTES.SAMPLE_TYPE: "somatosensory cortex",
REQUIRED_ATTRIBUTES.ORGANISM: "mouse",
REQUIRED_ATTRIBUTES.ASSAY: ASSAYS.OSMFISH.value,
OPTIONAL_ATTRIBUTES.PUBLICATION_NAME: (
"Spatial organization of the somatosensory cortex revealed by cyclic smFISH"
),
OPTIONAL_ATTRIBUTES.PUBLICATION_URL: "https://www.nature.com/articles/s41592-018-0175-z"
}
spots = starspace.Spots.from_spot_data(spot_data, attrs=attributes)
s3_url = (
"s3://starfish.data.output-warehouse/osmfish-codeluppi-2018-nat-methods-somatosensory-cortex/"
)
url = "osmfish-codeluppi-2018-nat-methods-somatosensory-cortex/"
spots.save_zarr(url=url)
load the region information; we’re gonna be lazy and just create a label image. Makes for simple lookups. It’s only 6 gb, and we can put it in dask, so w/e find the extent of the images from the spots and the region data
x_min, x_max = np.percentile(spot_data[SPOTS_REQUIRED_VARIABLES.X_SPOT], [0, 100])
y_min, y_max = np.percentile(spot_data[SPOTS_REQUIRED_VARIABLES.Y_SPOT], [0, 100])
label = np.empty((int(x_max) + 1, int(y_max) + 1), dtype=np.int16)
for region_id, array in region_data.items():
region_id = int(region_id)
x = array[:, 0]
y = array[:, 1]
label[x, y] = region_id
dims = tuple(REGIONS_AXES)
regions = starspace.Regions.from_label_image(label, dims=dims, attrs=attributes)
regions.save_zarr(url=url)
load up the count matrix
response = requests.get(
"https://d24h2xsgaj29mf.cloudfront.net/raw/"
"osmfish_codeluppi_2018_nat-methods_somatosensory-cortex/"
"osmFISH_SScortex_mouse_all_cells.loom"
)
with tempfile.TemporaryDirectory() as tmpdirname:
with open(os.path.join(tmpdirname, "temp.loom"), 'wb') as f:
f.write(response.content)
conn = loompy.connect(os.path.join(tmpdirname, "temp.loom"), mode="r")
row_attrs = dict(conn.row_attrs)
col_attrs = dict(conn.col_attrs)
data = da.from_array(conn[:, :].T, chunks=MATRIX_CHUNK_SIZE)
# region id should be int dtype
col_attrs["CellID"] = col_attrs["CellID"].astype(int)
dims = (MATRIX_AXES.REGIONS.value, MATRIX_AXES.FEATURES.value)
coords = {
MATRIX_REQUIRED_REGIONS.REGION_ID: (MATRIX_AXES.REGIONS, col_attrs["CellID"]),
MATRIX_REQUIRED_REGIONS.X_REGION: (MATRIX_AXES.REGIONS, col_attrs["X"]),
MATRIX_REQUIRED_REGIONS.Y_REGION: (MATRIX_AXES.REGIONS, col_attrs["Y"]),
MATRIX_OPTIONAL_REGIONS.GROUP_ID: (MATRIX_AXES.REGIONS, col_attrs["ClusterID"]),
MATRIX_OPTIONAL_REGIONS.TYPE_ANNOTATION: (MATRIX_AXES.REGIONS, col_attrs["ClusterName"]),
MATRIX_OPTIONAL_REGIONS.PHYS_ANNOTATION: (MATRIX_AXES.REGIONS, col_attrs["Region"]),
MATRIX_OPTIONAL_REGIONS.AREA_PIXELS: (MATRIX_AXES.REGIONS, col_attrs["size_pix"]),
MATRIX_OPTIONAL_REGIONS.AREA_UM2: (MATRIX_AXES.REGIONS, col_attrs["size_um2"]),
"valid": (MATRIX_AXES.REGIONS, col_attrs["Valid"]),
"tsne_1": (MATRIX_AXES.REGIONS, col_attrs["_tSNE_1"]),
"tsne_2": (MATRIX_AXES.REGIONS, col_attrs["_tSNE_2"]),
MATRIX_OPTIONAL_FEATURES.CHANNEL: (MATRIX_AXES.FEATURES, row_attrs["Fluorophore"]),
MATRIX_REQUIRED_FEATURES.GENE_NAME: (MATRIX_AXES.FEATURES, row_attrs["Gene"]),
MATRIX_OPTIONAL_FEATURES.ROUND.value: (MATRIX_AXES.FEATURES, row_attrs["Hybridization"]),
}
matrix = starspace.Matrix.from_expression_data(
data=data, coords=coords, dims=dims, name="matrix", attrs=attributes
)
matrix.save_zarr(url=url)
Total running time of the script: ( 0 minutes 0.000 seconds)