{
  "cells": [
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "%matplotlib inline"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "\nModeling Spatial Correlation of Transcripts with Application to Developing Pancreas\n===================================================================================\n\nRuishan Liu, Marco Mignardi, Robert Jones, Martin Enge, Seung K. Kim, Stephen R. Quake & James Zou\n\nThis publication can be found at https://www.nature.com/articles/s41598-019-41951-2 and the data\ncan be downloaded from https://cirm.ucsc.edu/projects\n\nChecklist:\n- [x] point locations\n- [~] cell locations  (centroids only)\n- [x] cell x gene expression matrix (derivable)\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "from io import BytesIO\nfrom pathlib import Path\n\nimport pandas as pd\nimport requests\n\nimport starspace\nfrom starspace.constants import (\n    ASSAYS,\n    OPTIONAL_ATTRIBUTES,\n    REQUIRED_ATTRIBUTES,\n    SPOTS_OPTIONAL_VARIABLES,\n    SPOTS_REQUIRED_VARIABLES,\n)\n\n# Location of the raw files for this dataset.\nRAW_DATA_URL = (\n    \"https://d24h2xsgaj29mf.cloudfront.net/raw/iss_liu_2019_nat-sci-reports_pancreas-dev/\"\n)\n\n\ndef fetch_table(filename):\n    \"\"\"Download `filename` from RAW_DATA_URL and parse it with `pd.read_csv`.\n\n    Raises `requests.HTTPError` on a 4xx/5xx response, so an error page is never parsed as\n    if it were data. The timeout keeps a stalled connection from hanging the notebook.\n    \"\"\"\n    response = requests.get(RAW_DATA_URL + filename, timeout=60)\n    response.raise_for_status()\n    return pd.read_csv(BytesIO(response.content))\n\n\nregion_data = fetch_table(\"Nuc_TOT_2p2.txt\")\nrna_data = fetch_table(\"RNA_TOT_2p2.txt\")\ngene_map = fetch_table(\"Conversion_Pool2.txt\")"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "Build the spot table\n\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "# Annotate each spot with its gene by looking up \"Seq_num\" against \"Barcode_Num\".\n# Some of these spots don't map to real genes. Interesting. Definitely retain \"Seq_num\"\n# (the spot-side copy of \"Barcode_Num\") and \"Barcode_Letter\".\n# The lookups get their own names so the downloaded tables stay untouched and this cell\n# can be re-run without restarting the kernel.\ngene_lookup = gene_map.set_index(\"Barcode_Num\")\ngene_info = gene_lookup.loc[rna_data[\"Seq_num\"], :]\ngene_info.index = rna_data.index\n\n# \"ObjectNumber\" has no entry in column_map below, so it is dropped here.\nspot_table = pd.concat([rna_data, gene_info], axis=1).drop(\"ObjectNumber\", axis=1)\n\n# merge in cell centroids; \"ImageNumber\" is dropped because it is duplicated in rna_data\nregion_lookup = region_data.set_index(\"ObjectNumber\").drop(\"ImageNumber\", axis=1)\nregion_info = region_lookup.loc[spot_table[\"Parent_Cells\"], :]\nregion_info.index = spot_table.index\n\nspot_table = pd.concat([spot_table, region_info], axis=1)\n\n# NOTE(review): `notes` is built but never used in this notebook -- presumably it was meant to\n# be stored alongside the attributes; confirm where it belongs.\nnotes = list()\nnotes.append(\"'seq_num' contains channel information for the in-situ sequencing code of each gene\")\nnotes.append(\"'barcode_letter' contains the nucleotides read out using ISS\")\n\n\ncolumn_map = {\n    \"ImageNumber\": SPOTS_OPTIONAL_VARIABLES.FIELD_OF_VIEW,\n    \"Blob_X\": SPOTS_REQUIRED_VARIABLES.X_SPOT,\n    \"Blob_Y\": SPOTS_REQUIRED_VARIABLES.Y_SPOT,\n    \"Parent_Cells\": SPOTS_OPTIONAL_VARIABLES.REGION_ID,\n    \"Location_Center_X\": SPOTS_OPTIONAL_VARIABLES.X_REGION,\n    \"Location_Center_Y\": SPOTS_OPTIONAL_VARIABLES.Y_REGION,\n    \"Gene_Name\": SPOTS_REQUIRED_VARIABLES.GENE_NAME,\n    \"Seq_qual\": SPOTS_OPTIONAL_VARIABLES.QUALITY,\n    \"Seq_num\": \"seq_num\",\n    \"Barcode_Letter\": \"barcode_letter\",\n}\n\n# A column without an entry in column_map raises KeyError here rather than slipping through\n# under its raw name.\nspot_table.columns = [column_map[c] for c in spot_table.columns]\n\nattributes = {\n    REQUIRED_ATTRIBUTES.ASSAY: ASSAYS.ISS.value,\n    REQUIRED_ATTRIBUTES.SAMPLE_TYPE: \"fetal pancreas\",\n    # One string per author; every entry needs its own comma, otherwise Python silently\n    # concatenates adjacent string literals into a single name.\n    REQUIRED_ATTRIBUTES.AUTHORS: [\n        \"Ruishan Liu\",\n        \"Marco Mignardi\",\n        \"Robert Jones\",\n        \"Martin Enge\",\n        \"Seung K. Kim\",\n        \"Stephen R. Quake\",\n        \"James Zou\",\n    ],\n    REQUIRED_ATTRIBUTES.YEAR: 2019,\n    REQUIRED_ATTRIBUTES.ORGANISM: \"human\",\n    OPTIONAL_ATTRIBUTES.PUBLICATION_NAME: (\n        \"Modeling Spatial Correlation of Transcripts with Application to Developing Pancreas\"\n    ),\n    OPTIONAL_ATTRIBUTES.PUBLICATION_URL: \"https://www.nature.com/articles/s41598-019-41951-2\"\n}\n\nspots = starspace.Spots.from_spot_data(spot_table, attributes)\n\n\n# s3_url = \"s3://starfish.data.output-warehouse/iss_liu_2019_nat-sci-reports_pancreas-dev/\"\nurl = \"iss_liu_2019_nat-sci-reports_pancreas-dev/\"\nspots.save_zarr(url=url)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "We also have the information needed to pivot the spots into a matrix.\n\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "# Pivot the spot table into a matrix, then write it out under the same `url` that was\n# passed to spots.save_zarr.\n# NOTE(review): both objects are saved to the same `url` -- confirm that save_zarr keeps the\n# spots and matrix stores separate rather than overwriting one with the other.\nmatrix = spots.to_spatial_matrix()\nmatrix.save_zarr(url=url)"
      ]
    }
  ],
  "metadata": {
    "kernelspec": {
      "display_name": "Python 3",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.7.3"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}