{
  "cells": [
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "%matplotlib inline"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "\nMolecular, spatial, and functional single-cell profiling of the hypothalamic preoptic region\n============================================================================================\n\nJeffrey R. Moffitt, Dhananjay Bambah-Mukku, Stephen W. Eichhorn, Eric Vaughn, Karthik Shekhar,\nJulio D. Perez, Nimrod D. Rubinstein, Junjie Hao, Aviv Regev, Catherine Dulac, Xiaowei Zhuang\n\nThis publication can be found at https://science.sciencemag.org/content/362/6416/eaau5324 and the\ndata referenced below can be downloaded from https://datadryad.org/handle/10255/dryad.192644\n\nChecklist:\n- [ ] point locations\n- [ ] cell locations\n- [x] cell x gene expression matrix\n\nLoad the data\n-------------\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "import os\nimport requests\nfrom io import BytesIO\n\nimport dask.array as da\nimport numpy as np\nimport pandas as pd\n\nimport starspace\n# The star import presumably supplies the constants referenced by later cells\n# (REQUIRED_ATTRIBUTES, OPTIONAL_ATTRIBUTES, ASSAYS, MATRIX_*); confirm against starspace.\nfrom starspace.constants import *\n\n# Download the published cell x gene table.\nresponse = requests.get(\n    \"https://d24h2xsgaj29mf.cloudfront.net/raw/merfish_moffit_2018_science_hypothalamic-preoptic/\"\n    \"Moffitt_and_Bambah-Mukku_et_al_merfish_all_cells.csv\"\n)\n# Fail loudly on an HTTP error rather than handing an error body to the CSV parser.\nresponse.raise_for_status()\ndata = pd.read_csv(BytesIO(response.content), header=0)\n\nname = \"merfish moffit 2018 science hypothalamic preoptic\""
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "This data file is a cell x gene expression matrix that contains additional metadata as columns\nof the matrix. Extract those extra columns and clean up the data file.\n\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "# Per-cell metadata columns, pulled out of the combined table.\nx = data[\"Centroid_X\"]\ny = data[\"Centroid_Y\"]\nregion_id = np.array(data[\"Cell_ID\"], dtype=\"U\")\ngroup_id = np.array(data[\"Neuron_cluster_ID\"], dtype=\"U\")\nannotation = np.array(data[\"Cell_class\"], dtype=\"U\")\n\n# Fields kept aside in their own frame as unstructured metadata.\nunstructured_field_names = [\"Animal_ID\", \"Animal_sex\", \"Behavior\", \"Bregma\"]\nunstructured_metadata = data[unstructured_field_names]\n\n# Every remaining column is treated as a gene expression column.\nnon_expression_fields = unstructured_field_names + [\n    \"Cell_class\", \"Neuron_cluster_ID\", \"Centroid_X\", \"Centroid_Y\", \"Cell_ID\",\n]\nexpression_data = data.drop(columns=non_expression_fields)\ngene_name = [column.lower() for column in expression_data.columns]"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "Write down some important metadata from the publication.\n\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "# Publication-level metadata, keyed by the REQUIRED_ATTRIBUTES / OPTIONAL_ATTRIBUTES constants.\nattrs = {\n    REQUIRED_ATTRIBUTES.ASSAY: ASSAYS.MERFISH,\n    REQUIRED_ATTRIBUTES.SAMPLE_TYPE: \"hypothalamic pre-optic nucleus\",\n    REQUIRED_ATTRIBUTES.AUTHORS: [\n        \"Jeffrey R. Moffitt\",\n        \"Dhananjay Bambah-Mukku\",\n        \"Stephen W. Eichhorn\",\n        \"Eric Vaughn\",\n        \"Karthik Shekhar\",\n        \"Julio D. Perez\",\n        \"Nimrod D. Rubinstein\",\n        \"Junjie Hao\",\n        \"Aviv Regev\",\n        \"Catherine Dulac\",\n        \"Xiaowei Zhuang\",\n    ],\n    REQUIRED_ATTRIBUTES.YEAR: 2018,\n    REQUIRED_ATTRIBUTES.ORGANISM: \"mouse\",\n    OPTIONAL_ATTRIBUTES.PUBLICATION_NAME: (\n        \"Molecular, spatial, and functional single-cell profiling \"\n        \"of the hypothalamic preoptic region\"\n    ),\n    OPTIONAL_ATTRIBUTES.PUBLICATION_URL: \"https://science.sciencemag.org/content/362/6416/eaau5324\",\n}"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "Convert the expression values into a chunked dask array.\n\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "# Hand the expression values to dask, chunked according to MATRIX_CHUNK_SIZE.\nchunk_data = da.from_array(\n    expression_data.values,\n    chunks=MATRIX_CHUNK_SIZE,\n)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "Wrap the dask array in an xarray, adding the metadata fields as \"coordinates\".\n\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "# Axis names shared by the coordinate definitions and the dims tuple.\nregions_axis = MATRIX_AXES.REGIONS.value\nfeatures_axis = MATRIX_AXES.FEATURES.value\n\n# Metadata extracted from the table, attached as coordinates along the matching axis.\ncoords = {\n    MATRIX_REQUIRED_FEATURES.GENE_NAME: (features_axis, gene_name),\n    MATRIX_REQUIRED_REGIONS.X_REGION: (regions_axis, x),\n    MATRIX_REQUIRED_REGIONS.Y_REGION: (regions_axis, y),\n    MATRIX_REQUIRED_REGIONS.REGION_ID: (regions_axis, region_id),\n    MATRIX_OPTIONAL_REGIONS.GROUP_ID: (regions_axis, group_id),\n    MATRIX_OPTIONAL_REGIONS.TYPE_ANNOTATION: (regions_axis, annotation),\n}\ndims = (regions_axis, features_axis)\nmatrix = starspace.Matrix.from_expression_data(\n    data=chunk_data,\n    coords=coords,\n    dims=dims,\n    name=name,\n    attrs=attrs,\n)\n\n# s3_url is not used below; only the relative `url` store is written.\ns3_url = \"s3://starfish.data.output-warehouse/merfish-moffit-2018-science-hypothalamic-preoptic\"\nurl = \"merfish-moffit-2018-science-hypothalamic-preoptic\"\nmatrix.save_zarr(url=url)"
      ]
    }
  ],
  "metadata": {
    "kernelspec": {
      "display_name": "Python 3",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.7.3"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}