{
  "cells": [
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "%matplotlib inline"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "\nSpatially resolved, highly multiplexed RNA profiling in single cells\n============================================================================================\n\nKok Hao Chen, Alistair N. Boettiger, Jeffrey R. Moffitt, Siyuan Wang, Xiaowei Zhuang\n\nThis publication can be found at https://science.sciencemag.org/content/348/6233/aaa6090 and the\ndata referenced below can be downloaded from\nhttps://d24h2xsgaj29mf.cloudfront.net/raw/merfish_chen_2015_science_imr90/140genesData.xlsx\n\nChecklist:\n- [x] point locations\n- [ ] cell locations\n- [x] cell x gene expression matrix (derivable)\n\nLoad the data\n-------------\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "import requests\n",
        "from io import BytesIO\n",
        "\n",
        "import pandas as pd\n",
        "\n",
        "import starspace\n",
        "from starspace.constants import *\n",
        "\n",
        "# Download the spreadsheet and parse it into a data frame.\n",
        "response = requests.get(\n",
        "    \"https://d24h2xsgaj29mf.cloudfront.net/raw/merfish_chen_2015_science_imr90/\"\n",
        "    \"140genesData.xlsx\"\n",
        ")\n",
        "# Stop on an HTTP error status rather than passing an error body to read_excel.\n",
        "response.raise_for_status()\n",
        "data = pd.read_excel(BytesIO(response.content))\n",
        "\n",
        "name = \"merfish chen 2015 science imr90\""
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "This data file is a table of RNA spots (one row per spot) that carries per-cell metadata as\nadditional columns. Map the column names to the schema, then build region ids that uniquely\nidentify each cell across the experiment.\n\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "# map column names to schema\n",
        "\n",
        "column_map = {\n",
        "    \"RNACentroidX\": SPOTS_REQUIRED_VARIABLES.X_SPOT,\n",
        "    \"RNACentroidY\": SPOTS_REQUIRED_VARIABLES.Y_SPOT,\n",
        "    \"cellID\": \"per_slice_cell_id\",  # this is not unique experiment-wide\n",
        "    \"CellPositionX\": SPOTS_OPTIONAL_VARIABLES.X_REGION,\n",
        "    \"CellPositionY\": SPOTS_OPTIONAL_VARIABLES.Y_REGION,\n",
        "    \"geneName\": SPOTS_REQUIRED_VARIABLES.GENE_NAME,\n",
        "    \"experiment\": \"experiment\",\n",
        "    \"library\": \"library\",\n",
        "    \"intCodeword\": \"int_codeword\",\n",
        "    \"isCorrectedMatch\": \"is_corrected_match\",\n",
        "    \"isExactMatch\": \"is_exact_match\"\n",
        "}\n",
        "# indexing the map raises KeyError for any input column that is not listed above\n",
        "columns = [column_map[c] for c in data.columns]\n",
        "data.columns = columns\n",
        "\n",
        "# demonstrate that cellID is not unique:\n",
        "# (groupby keys are lists: pandas interprets a tuple `by` as one single key)\n",
        "group_columns = [\n",
        "    \"per_slice_cell_id\",\n",
        "    SPOTS_OPTIONAL_VARIABLES.Y_REGION,\n",
        "    SPOTS_OPTIONAL_VARIABLES.X_REGION,\n",
        "]\n",
        "\n",
        "# group by the columns, use size to run a no-op aggregation routine, then drop the size column\n",
        "# (labeled zero)\n",
        "not_unique = data.groupby(group_columns).size().reset_index().drop(0, axis=1)\n",
        "\n",
        "# the same per-slice id appears with more than one cell position, so the shapes differ\n",
        "assert_cols = [\"per_slice_cell_id\"]\n",
        "assert not_unique[assert_cols].drop_duplicates().shape != not_unique[assert_cols].shape\n",
        "\n",
        "# fix region ids so that they uniquely identify cells across the experiment.\n",
        "group_columns = [\n",
        "    \"experiment\", \"library\", \"per_slice_cell_id\",\n",
        "    SPOTS_OPTIONAL_VARIABLES.Y_REGION, SPOTS_OPTIONAL_VARIABLES.X_REGION\n",
        "]\n",
        "region_ids_map = data.groupby(group_columns).size().reset_index().drop(0, axis=1)\n",
        "\n",
        "# (per_slice_cell_id, library, experiment) must identify exactly one cell position\n",
        "assert_cols = [\"per_slice_cell_id\", \"library\", \"experiment\"]\n",
        "assert region_ids_map[assert_cols].drop_duplicates().shape == region_ids_map[assert_cols].shape\n",
        "\n",
        "# map each region to a unique identifier and add it to the data frame\n",
        "region_ids_map = region_ids_map.drop(\n",
        "    [SPOTS_OPTIONAL_VARIABLES.Y_REGION, SPOTS_OPTIONAL_VARIABLES.X_REGION], axis=1\n",
        ")\n",
        "# reset_index exposes the row number as an \"index\" column, which serves as the region id;\n",
        "# after set_index it is the only remaining column\n",
        "region_ids_map = region_ids_map.reset_index().set_index(assert_cols)\n",
        "\n",
        "# look up the region id for every spot row, in row order\n",
        "region_ids = region_ids_map.loc[pd.MultiIndex.from_frame(data[assert_cols])]\n",
        "# region_ids is a one-column frame, so flatten its values to 1-D before assigning the column\n",
        "data[SPOTS_OPTIONAL_VARIABLES.REGION_ID] = region_ids.values.ravel()"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "Write down some important metadata from the publication.\n\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "# Publication metadata, keyed by the attribute constants and filled in one entry at a time.\n",
        "attrs = {}\n",
        "attrs[REQUIRED_ATTRIBUTES.ASSAY] = ASSAYS.MERFISH\n",
        "attrs[REQUIRED_ATTRIBUTES.SAMPLE_TYPE] = \"IMR90 lung fibroblast cell line\"\n",
        "attrs[REQUIRED_ATTRIBUTES.AUTHORS] = (\n",
        "    \"Kok Hao Chen\",\n",
        "    \"Alistair N. Boettiger\",\n",
        "    \"Jeffrey R. Moffitt\",\n",
        "    \"Siyuan Wang\",\n",
        "    \"Xiaowei Zhuang\",\n",
        ")\n",
        "attrs[REQUIRED_ATTRIBUTES.YEAR] = 2015\n",
        "attrs[REQUIRED_ATTRIBUTES.ORGANISM] = \"human\"\n",
        "# the note is a single string; the parentheses only wrap the long line\n",
        "attrs[OPTIONAL_ATTRIBUTES.NOTES] = (\n",
        "    \"cellID field from author data renamed per_slice_cell_id to reflect stored data\"\n",
        ")"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "Convert the dataframe into an xarray dataset.\n\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "# Build the Spots object from the renamed spot table (data) and the metadata dict (attrs).\nspots = starspace.Spots.from_spot_data(data, attrs)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "Write the data to zarr. The S3 destination is recorded in `s3_url`, but `save_zarr` is called with\nthe relative path `url`.\n\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "# NOTE(review): s3_url is assigned but never used; save_zarr receives the relative path in url.\n# Confirm which destination is intended.\ns3_url = \"s3://starfish.data.output-warehouse/merfish-chen-2015-science-imr90/\"\nurl = \"merfish-chen-2015-science-imr90/\"\nspots.save_zarr(url)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "Convert the xarray dataset to a spatial matrix and save it to zarr.\n\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "# Derive the spatial matrix from the spots and save it.\n# NOTE(review): this passes the same url that was given to spots.save_zarr -- confirm the two\n# outputs do not overwrite each other.\nmatrix = spots.to_spatial_matrix()\nmatrix.save_zarr(url)"
      ]
    }
  ],
  "metadata": {
    "kernelspec": {
      "display_name": "Python 3",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.7.3"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}