{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Parallel bootstrap calculations\n", "\n", "**[Dataset download](https://s3.amazonaws.com/bebi103.caltech.edu/data/singer_transcript_counts.csv)**\n", "\n", "
" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "
\n", " \n", " Loading BokehJS ...\n", "
" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/javascript": [ "\n", "(function(root) {\n", " function now() {\n", " return new Date();\n", " }\n", "\n", " var force = true;\n", "\n", " if (typeof root._bokeh_onload_callbacks === \"undefined\" || force === true) {\n", " root._bokeh_onload_callbacks = [];\n", " root._bokeh_is_loading = undefined;\n", " }\n", "\n", " var JS_MIME_TYPE = 'application/javascript';\n", " var HTML_MIME_TYPE = 'text/html';\n", " var EXEC_MIME_TYPE = 'application/vnd.bokehjs_exec.v0+json';\n", " var CLASS_NAME = 'output_bokeh rendered_html';\n", "\n", " /**\n", " * Render data to the DOM node\n", " */\n", " function render(props, node) {\n", " var script = document.createElement(\"script\");\n", " node.appendChild(script);\n", " }\n", "\n", " /**\n", " * Handle when an output is cleared or removed\n", " */\n", " function handleClearOutput(event, handle) {\n", " var cell = handle.cell;\n", "\n", " var id = cell.output_area._bokeh_element_id;\n", " var server_id = cell.output_area._bokeh_server_id;\n", " // Clean up Bokeh references\n", " if (id != null && id in Bokeh.index) {\n", " Bokeh.index[id].model.document.clear();\n", " delete Bokeh.index[id];\n", " }\n", "\n", " if (server_id !== undefined) {\n", " // Clean up Bokeh references\n", " var cmd = \"from bokeh.io.state import curstate; print(curstate().uuid_to_server['\" + server_id + \"'].get_sessions()[0].document.roots[0]._id)\";\n", " cell.notebook.kernel.execute(cmd, {\n", " iopub: {\n", " output: function(msg) {\n", " var id = msg.content.text.trim();\n", " if (id in Bokeh.index) {\n", " Bokeh.index[id].model.document.clear();\n", " delete Bokeh.index[id];\n", " }\n", " }\n", " }\n", " });\n", " // Destroy server and session\n", " var cmd = \"import bokeh.io.notebook as ion; ion.destroy_server('\" + server_id + \"')\";\n", " cell.notebook.kernel.execute(cmd);\n", " }\n", " }\n", "\n", " /**\n", " * Handle when a new output is added\n", " */\n", " function handleAddOutput(event, handle) {\n", " var output_area = handle.output_area;\n", " var output = handle.output;\n", "\n", " // limit handleAddOutput to display_data with EXEC_MIME_TYPE content only\n", " if ((output.output_type != \"display_data\") || (!output.data.hasOwnProperty(EXEC_MIME_TYPE))) {\n", " return\n", " }\n", "\n", " var toinsert = output_area.element.find(\".\" + CLASS_NAME.split(' ')[0]);\n", "\n", " if (output.metadata[EXEC_MIME_TYPE][\"id\"] !== undefined) {\n", " toinsert[toinsert.length - 1].firstChild.textContent = output.data[JS_MIME_TYPE];\n", " // store reference to embed id on output_area\n", " output_area._bokeh_element_id = output.metadata[EXEC_MIME_TYPE][\"id\"];\n", " }\n", " if (output.metadata[EXEC_MIME_TYPE][\"server_id\"] !== undefined) {\n", " var bk_div = document.createElement(\"div\");\n", " bk_div.innerHTML = output.data[HTML_MIME_TYPE];\n", " var script_attrs = bk_div.children[0].attributes;\n", " for (var i = 0; i < script_attrs.length; i++) {\n", " toinsert[toinsert.length - 1].firstChild.setAttribute(script_attrs[i].name, script_attrs[i].value);\n", " }\n", " // store reference to server id on output_area\n", " output_area._bokeh_server_id = output.metadata[EXEC_MIME_TYPE][\"server_id\"];\n", " }\n", " }\n", "\n", " function register_renderer(events, OutputArea) {\n", "\n", " function append_mime(data, metadata, element) {\n", " // create a DOM node to render to\n", " var toinsert = this.create_output_subarea(\n", " metadata,\n", " CLASS_NAME,\n", " EXEC_MIME_TYPE\n", " );\n", " this.keyboard_manager.register_events(toinsert);\n", " // Render to node\n", " var props = {data: data, metadata: metadata[EXEC_MIME_TYPE]};\n", " render(props, toinsert[toinsert.length - 1]);\n", " element.append(toinsert);\n", " return toinsert\n", " }\n", "\n", " /* Handle when an output is cleared or removed */\n", " events.on('clear_output.CodeCell', handleClearOutput);\n", " events.on('delete.Cell', handleClearOutput);\n", "\n", " /* Handle when a new output is added */\n", " events.on('output_added.OutputArea', handleAddOutput);\n", "\n", " /**\n", " * Register the mime type and append_mime function with output_area\n", " */\n", " OutputArea.prototype.register_mime_type(EXEC_MIME_TYPE, append_mime, {\n", " /* Is output safe? */\n", " safe: true,\n", " /* Index of renderer in `output_area.display_order` */\n", " index: 0\n", " });\n", " }\n", "\n", " // register the mime type if in Jupyter Notebook environment and previously unregistered\n", " if (root.Jupyter !== undefined) {\n", " var events = require('base/js/events');\n", " var OutputArea = require('notebook/js/outputarea').OutputArea;\n", "\n", " if (OutputArea.prototype.mime_types().indexOf(EXEC_MIME_TYPE) == -1) {\n", " register_renderer(events, OutputArea);\n", " }\n", " }\n", "\n", " \n", " if (typeof (root._bokeh_timeout) === \"undefined\" || force === true) {\n", " root._bokeh_timeout = Date.now() + 5000;\n", " root._bokeh_failed_load = false;\n", " }\n", "\n", " var NB_LOAD_WARNING = {'data': {'text/html':\n", " \"
\\n\"+\n", " \"

\\n\"+\n", " \"BokehJS does not appear to have successfully loaded. If loading BokehJS from CDN, this \\n\"+\n", " \"may be due to a slow or bad network connection. Possible fixes:\\n\"+\n", " \"

\\n\"+\n", " \"\\n\"+\n", " \"\\n\"+\n", " \"from bokeh.resources import INLINE\\n\"+\n", " \"output_notebook(resources=INLINE)\\n\"+\n", " \"\\n\"+\n", " \"
\"}};\n", "\n", " function display_loaded() {\n", " var el = document.getElementById(\"1001\");\n", " if (el != null) {\n", " el.textContent = \"BokehJS is loading...\";\n", " }\n", " if (root.Bokeh !== undefined) {\n", " if (el != null) {\n", " el.textContent = \"BokehJS \" + root.Bokeh.version + \" successfully loaded.\";\n", " }\n", " } else if (Date.now() < root._bokeh_timeout) {\n", " setTimeout(display_loaded, 100)\n", " }\n", " }\n", "\n", "\n", " function run_callbacks() {\n", " try {\n", " root._bokeh_onload_callbacks.forEach(function(callback) {\n", " if (callback != null)\n", " callback();\n", " });\n", " } finally {\n", " delete root._bokeh_onload_callbacks\n", " }\n", " console.debug(\"Bokeh: all callbacks have finished\");\n", " }\n", "\n", " function load_libs(css_urls, js_urls, callback) {\n", " if (css_urls == null) css_urls = [];\n", " if (js_urls == null) js_urls = [];\n", "\n", " root._bokeh_onload_callbacks.push(callback);\n", " if (root._bokeh_is_loading > 0) {\n", " console.debug(\"Bokeh: BokehJS is being loaded, scheduling callback at\", now());\n", " return null;\n", " }\n", " if (js_urls == null || js_urls.length === 0) {\n", " run_callbacks();\n", " return null;\n", " }\n", " console.debug(\"Bokeh: BokehJS not loaded, scheduling load and callback at\", now());\n", " root._bokeh_is_loading = css_urls.length + js_urls.length;\n", "\n", " function on_load() {\n", " root._bokeh_is_loading--;\n", " if (root._bokeh_is_loading === 0) {\n", " console.debug(\"Bokeh: all BokehJS libraries/stylesheets loaded\");\n", " run_callbacks()\n", " }\n", " }\n", "\n", " function on_error() {\n", " console.error(\"failed to load \" + url);\n", " }\n", "\n", " for (var i = 0; i < css_urls.length; i++) {\n", " var url = css_urls[i];\n", " const element = document.createElement(\"link\");\n", " element.onload = on_load;\n", " element.onerror = on_error;\n", " element.rel = \"stylesheet\";\n", " element.type = \"text/css\";\n", " element.href = url;\n", " console.debug(\"Bokeh: injecting link tag for BokehJS stylesheet: \", url);\n", " document.body.appendChild(element);\n", " }\n", "\n", " for (var i = 0; i < js_urls.length; i++) {\n", " var url = js_urls[i];\n", " var element = document.createElement('script');\n", " element.onload = on_load;\n", " element.onerror = on_error;\n", " element.async = false;\n", " element.src = url;\n", " console.debug(\"Bokeh: injecting script tag for BokehJS library: \", url);\n", " document.head.appendChild(element);\n", " }\n", " };var element = document.getElementById(\"1001\");\n", " if (element == null) {\n", " console.error(\"Bokeh: ERROR: autoload.js configured with elementid '1001' but no matching script tag was found. \")\n", " return false;\n", " }\n", "\n", " function inject_raw_css(css) {\n", " const element = document.createElement(\"style\");\n", " element.appendChild(document.createTextNode(css));\n", " document.body.appendChild(element);\n", " }\n", "\n", " \n", " var js_urls = [\"https://cdn.pydata.org/bokeh/release/bokeh-1.4.0.min.js\", \"https://cdn.pydata.org/bokeh/release/bokeh-widgets-1.4.0.min.js\", \"https://cdn.pydata.org/bokeh/release/bokeh-tables-1.4.0.min.js\", \"https://cdn.pydata.org/bokeh/release/bokeh-gl-1.4.0.min.js\"];\n", " var css_urls = [];\n", " \n", "\n", " var inline_js = [\n", " function(Bokeh) {\n", " Bokeh.set_log_level(\"info\");\n", " },\n", " function(Bokeh) {\n", " \n", " \n", " }\n", " ];\n", "\n", " function run_inline_js() {\n", " \n", " if (root.Bokeh !== undefined || force === true) {\n", " \n", " for (var i = 0; i < inline_js.length; i++) {\n", " inline_js[i].call(root, root.Bokeh);\n", " }\n", " if (force === true) {\n", " display_loaded();\n", " }} else if (Date.now() < root._bokeh_timeout) {\n", " setTimeout(run_inline_js, 100);\n", " } else if (!root._bokeh_failed_load) {\n", " console.log(\"Bokeh: BokehJS failed to load within specified timeout.\");\n", " root._bokeh_failed_load = true;\n", " } else if (force !== true) {\n", " var cell = $(document.getElementById(\"1001\")).parents('.cell').data().cell;\n", " cell.output_area.append_execute_result(NB_LOAD_WARNING)\n", " }\n", "\n", " }\n", "\n", " if (root._bokeh_is_loading === 0) {\n", " console.debug(\"Bokeh: BokehJS loaded, going straight to plotting\");\n", " run_inline_js();\n", " } else {\n", " load_libs(css_urls, js_urls, function() {\n", " console.debug(\"Bokeh: BokehJS plotting callback run at\", now());\n", " run_inline_js();\n", " });\n", " }\n", "}(window));" ], "application/vnd.bokehjs_load.v0+json": "\n(function(root) {\n function now() {\n return new Date();\n }\n\n var force = true;\n\n if (typeof root._bokeh_onload_callbacks === \"undefined\" || force === true) {\n root._bokeh_onload_callbacks = [];\n root._bokeh_is_loading = undefined;\n }\n\n \n\n \n if (typeof (root._bokeh_timeout) === \"undefined\" || force === true) {\n root._bokeh_timeout = Date.now() + 5000;\n root._bokeh_failed_load = false;\n }\n\n var NB_LOAD_WARNING = {'data': {'text/html':\n \"
\\n\"+\n \"

\\n\"+\n \"BokehJS does not appear to have successfully loaded. If loading BokehJS from CDN, this \\n\"+\n \"may be due to a slow or bad network connection. Possible fixes:\\n\"+\n \"

\\n\"+\n \"\\n\"+\n \"\\n\"+\n \"from bokeh.resources import INLINE\\n\"+\n \"output_notebook(resources=INLINE)\\n\"+\n \"\\n\"+\n \"
\"}};\n\n function display_loaded() {\n var el = document.getElementById(\"1001\");\n if (el != null) {\n el.textContent = \"BokehJS is loading...\";\n }\n if (root.Bokeh !== undefined) {\n if (el != null) {\n el.textContent = \"BokehJS \" + root.Bokeh.version + \" successfully loaded.\";\n }\n } else if (Date.now() < root._bokeh_timeout) {\n setTimeout(display_loaded, 100)\n }\n }\n\n\n function run_callbacks() {\n try {\n root._bokeh_onload_callbacks.forEach(function(callback) {\n if (callback != null)\n callback();\n });\n } finally {\n delete root._bokeh_onload_callbacks\n }\n console.debug(\"Bokeh: all callbacks have finished\");\n }\n\n function load_libs(css_urls, js_urls, callback) {\n if (css_urls == null) css_urls = [];\n if (js_urls == null) js_urls = [];\n\n root._bokeh_onload_callbacks.push(callback);\n if (root._bokeh_is_loading > 0) {\n console.debug(\"Bokeh: BokehJS is being loaded, scheduling callback at\", now());\n return null;\n }\n if (js_urls == null || js_urls.length === 0) {\n run_callbacks();\n return null;\n }\n console.debug(\"Bokeh: BokehJS not loaded, scheduling load and callback at\", now());\n root._bokeh_is_loading = css_urls.length + js_urls.length;\n\n function on_load() {\n root._bokeh_is_loading--;\n if (root._bokeh_is_loading === 0) {\n console.debug(\"Bokeh: all BokehJS libraries/stylesheets loaded\");\n run_callbacks()\n }\n }\n\n function on_error() {\n console.error(\"failed to load \" + url);\n }\n\n for (var i = 0; i < css_urls.length; i++) {\n var url = css_urls[i];\n const element = document.createElement(\"link\");\n element.onload = on_load;\n element.onerror = on_error;\n element.rel = \"stylesheet\";\n element.type = \"text/css\";\n element.href = url;\n console.debug(\"Bokeh: injecting link tag for BokehJS stylesheet: \", url);\n document.body.appendChild(element);\n }\n\n for (var i = 0; i < js_urls.length; i++) {\n var url = js_urls[i];\n var element = document.createElement('script');\n element.onload = on_load;\n element.onerror = on_error;\n element.async = false;\n element.src = url;\n console.debug(\"Bokeh: injecting script tag for BokehJS library: \", url);\n document.head.appendChild(element);\n }\n };var element = document.getElementById(\"1001\");\n if (element == null) {\n console.error(\"Bokeh: ERROR: autoload.js configured with elementid '1001' but no matching script tag was found. \")\n return false;\n }\n\n function inject_raw_css(css) {\n const element = document.createElement(\"style\");\n element.appendChild(document.createTextNode(css));\n document.body.appendChild(element);\n }\n\n \n var js_urls = [\"https://cdn.pydata.org/bokeh/release/bokeh-1.4.0.min.js\", \"https://cdn.pydata.org/bokeh/release/bokeh-widgets-1.4.0.min.js\", \"https://cdn.pydata.org/bokeh/release/bokeh-tables-1.4.0.min.js\", \"https://cdn.pydata.org/bokeh/release/bokeh-gl-1.4.0.min.js\"];\n var css_urls = [];\n \n\n var inline_js = [\n function(Bokeh) {\n Bokeh.set_log_level(\"info\");\n },\n function(Bokeh) {\n \n \n }\n ];\n\n function run_inline_js() {\n \n if (root.Bokeh !== undefined || force === true) {\n \n for (var i = 0; i < inline_js.length; i++) {\n inline_js[i].call(root, root.Bokeh);\n }\n if (force === true) {\n display_loaded();\n }} else if (Date.now() < root._bokeh_timeout) {\n setTimeout(run_inline_js, 100);\n } else if (!root._bokeh_failed_load) {\n console.log(\"Bokeh: BokehJS failed to load within specified timeout.\");\n root._bokeh_failed_load = true;\n } else if (force !== true) {\n var cell = $(document.getElementById(\"1001\")).parents('.cell').data().cell;\n cell.output_area.append_execute_result(NB_LOAD_WARNING)\n }\n\n }\n\n if (root._bokeh_is_loading === 0) {\n console.debug(\"Bokeh: BokehJS loaded, going straight to plotting\");\n run_inline_js();\n } else {\n load_libs(css_urls, js_urls, function() {\n console.debug(\"Bokeh: BokehJS plotting callback run at\", now());\n run_inline_js();\n });\n }\n}(window));" }, "metadata": {}, "output_type": "display_data" } ], "source": [ "import multiprocessing\n", "import warnings\n", "\n", "import numpy as np\n", "import pandas as pd\n", "import scipy.optimize\n", "import scipy.stats as st\n", "\n", "import tqdm\n", "\n", "import bebi103\n", "\n", "import bokeh.io\n", "bokeh.io.output_notebook()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "
\n", "\n", "In the last part of this lesson, we saw that computing the MLE confidence intervals can take some time (several minutes). To speed up calculations, we can compute the bootstrap replicates in parallel. That is, we split the sampling job up into parts, and different processing cores work on each part, and then combine the results together at the end. \n", "\n", "To begin, we load in the data." ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "df = pd.read_csv('../data/singer_transcript_counts.csv', comment='#')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "And we also need the functions we used for performing the MLE calculation from earlier in the lesson." ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "def log_like_iid_nbinom(params, n):\n", " \"\"\"Log likelihood for i.i.d. NBinom measurements, parametrized\n", " by alpha, b=1/beta.\"\"\"\n", " alpha, b = params\n", " \n", " if alpha <= 0 or b <= 0:\n", " return -np.inf\n", "\n", " return np.sum(st.nbinom.logpmf(n, alpha, 1/(1+b)))\n", "\n", "\n", "def mle_iid_nbinom(n):\n", " \"\"\"Perform maximum likelihood estimates for parameters for i.i.d. \n", " NBinom measurements, parametrized by alpha, b=1/beta\"\"\"\n", "\n", " with warnings.catch_warnings():\n", " warnings.simplefilter(\"ignore\")\n", " res = scipy.optimize.minimize(\n", " fun=lambda params, n: -log_like_iid_nbinom(params, n),\n", " x0=np.array([3, 3]),\n", " args=(n,),\n", " method='Powell'\n", " )\n", "\n", " if res.success:\n", " return res.x\n", " else:\n", " raise RuntimeError('Convergence failed with message', res.message)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Explicit RNGs\n", "\n", "When we parallelize the calculation, we have to be very careful about the random number generators we use. If we define a random number generator globally with `rg = np.random.default_rng()`, then each parallel process will use the same sequence of pseudorandom numbers. We therefore need to make a new RNG for each of the parallel processes. As a result, we need to explicitly supply a random number generator to the function we use to generate samples. This changes the call signature to include the RNG, but requires no other changes." ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "def gen_nbinom(alpha, b, size, rg):\n", " return rg.negative_binomial(alpha, 1 / (1 + b), size=size)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Next, we need to adjust the `draw_bs_reps_mle()` function to also have explicit specification of a RNG. If we do not supply one, a new one should be created within the function.\n", "\n", "Finally, since the parallel version will supersede the one `draw_bs_reps_mle()` function we have already written, we will prepend its name with an underscore, designating it as a private function (essentially a function an end user will not call), since we will not really call it directly again except for demonstration purposes." ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "def _draw_parametric_bs_reps_mle(\n", " mle_fun, gen_fun, data, args=(), size=1, progress_bar=False, rg=None,\n", "):\n", " \"\"\"Draw parametric bootstrap replicates of maximum likelihood estimator.\n", " \n", " Parameters\n", " ----------\n", " mle_fun : function\n", " Function with call signature mle_fun(data, *args) that computes\n", " a MLE for the parameters\n", " gen_fun : function\n", " Function to randomly draw a new data set out of the model\n", " distribution parametrized by the MLE. Must have call\n", " signature `gen_fun(*params, size, *args, rg)`.\n", " data : one-dimemsional Numpy array\n", " Array of measurements\n", " args : tuple, default ()\n", " Arguments to be passed to `mle_fun()`.\n", " size : int, default 1\n", " Number of bootstrap replicates to draw.\n", " progress_bar : bool, default False\n", " Whether or not to display progress bar.\n", " rg : numpy.random.Generator instance, default None\n", " RNG to be used in bootstrapping. If None, the default\n", " Numpy RNG is used with a fresh seed based on the clock.\n", " \n", " Returns\n", " -------\n", " output : numpy array\n", " Bootstrap replicates of MLEs.\n", " \"\"\"\n", " if rg is None:\n", " rg = np.random.default_rng()\n", " \n", " params = mle_fun(data, *args)\n", "\n", " if progress_bar:\n", " iterator = tqdm.tqdm(range(size))\n", " else:\n", " iterator = range(size)\n", "\n", " return np.array(\n", " [mle_fun(gen_fun(*params, size=len(data), *args, rg=rg)) for _ in iterator]\n", " )" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "These functions still work as advertised. We can call `_draw_parametric_bs_reps_mle()` exactly as we have earlier in the lesson. To get 100 bootstrap replicates (only 100 so we don't have to wait too long), we can do the following for Nanog." ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 100/100 [00:03<00:00, 25.84it/s]\n" ] } ], "source": [ "bs_reps = _draw_parametric_bs_reps_mle(\n", " mle_iid_nbinom, gen_nbinom, df[\"Nanog\"].values, args=(), size=100, progress_bar=True\n", ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Enabling parallelization\n", "\n", "To enable multiple cores to draw bootstrap replicates, we will use Python's built-in [multiprocessing module](https://docs.python.org/3.7/library/multiprocessing.html). The syntax for using this module is as follows.\n", "\n", "```python\n", "with multiprocessing.Pool(n_jobs) as pool:\n", " result = pool.starmap(fun, arg_iterable)\n", "```\n", "\n", "Here, `fun` is a function and `arg_iterable` is an iterable that produces arguments to the `fun` as tuples that will be splatted when passed to `fun`. This is best seen with a simple example before we get into the more complicated example of bootstrapping. We will choose a function that adds four arguments. Then, `arg_iterable` will be an iterable (in this case a list) with tuples, each of which has four elements." ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[(0, 1, 2, 3),\n", " (1, 2, 3, 4),\n", " (2, 3, 4, 5),\n", " (3, 4, 5, 6),\n", " (4, 5, 6, 7),\n", " (5, 6, 7, 8),\n", " (6, 7, 8, 9),\n", " (7, 8, 9, 10)]" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "def fun(a, b, c, d):\n", " return a + b + c + d\n", "\n", "arg_iterable = [(i, i+1, i+2, i+3) for i in range(8)]\n", "\n", "# Display arg_iterable\n", "arg_iterable" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "If we call `fun` with one of the arguments from the `arg_iterable` using a splat operator, we get the appropriate sum." ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "6" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "fun(*arg_iterable[0])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "To iterate through the arguments and evaluate them in `fun` each time, we can do it in parallel, for example using two cores." ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[6, 10, 14, 18, 22, 26, 30, 34]" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "with multiprocessing.Pool(2) as pool:\n", " result = pool.starmap(fun, arg_iterable)\n", " \n", "# Take a look\n", "result" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "This gives the same result as if we did the following." ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[6, 10, 14, 18, 22, 26, 30, 34]" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "[fun(*args) for args in arg_iterable]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We can see the equivalence then.\n", "\n", "```python\n", "with multiprocessing.Pool(2) as pool:\n", " result = pool.starmap(fun, arg_iterable)\n", "```\n", "\n", "is the same as\n", "\n", "```python\n", "[fun(*args) for args in arg_iterable]\n", "```\n", "\n", "We can use this same structure for our bootstrap replicates. Here, `fun` is `_draw_bs_reps_mle`, and we need to make a list of arguments to pass to that function. This means splitting up `size` into chunks of size `size // n_jobs`," ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "def draw_parametric_bs_reps_mle(\n", " mle_fun, gen_fun, data, args=(), size=1, n_jobs=1, progress_bar=False\n", "):\n", " \"\"\"Draw nonparametric bootstrap replicates of maximum likelihood estimator.\n", " \n", " Parameters\n", " ----------\n", " mle_fun : function\n", " Function with call signature mle_fun(data, *args) that computes\n", " a MLE for the parameters\n", " gen_fun : function\n", " Function to randomly draw a new data set out of the model\n", " distribution parametrized by the MLE. Must have call\n", " signature `gen_fun(*params, size, *args, rg)`.\n", " data : one-dimemsional Numpy array\n", " Array of measurements\n", " args : tuple, default ()\n", " Arguments to be passed to `mle_fun()`.\n", " size : int, default 1\n", " Number of bootstrap replicates to draw.\n", " n_jobs : int, default 1\n", " Number of cores to use in drawing bootstrap replicates.\n", " progress_bar : bool, default False\n", " Whether or not to display progress bar.\n", " \n", " Returns\n", " -------\n", " output : numpy array\n", " Bootstrap replicates of MLEs.\n", " \"\"\"\n", " # Just call the original function if n_jobs is 1 (no parallelization)\n", " if n_jobs == 1:\n", " return _draw_parametric_bs_reps_mle(\n", " mle_fun, gen_fun, data, args=args, size=size, progress_bar=progress_bar\n", " )\n", "\n", " # Set up sizes of bootstrap replicates for each core, making sure we\n", " # get all of them, even if sizes % n_jobs != 0\n", " sizes = [size // n_jobs for _ in range(n_jobs)]\n", " sizes[-1] += size - sum(sizes)\n", "\n", " # Build arguments\n", " arg_iterable = [(mle_fun, gen_fun, data, args, s, progress_bar, None) for s in sizes]\n", "\n", " with multiprocessing.Pool(n_jobs) as pool:\n", " result = pool.starmap(_draw_parametric_bs_reps_mle, arg_iterable)\n", "\n", " return np.concatenate(result)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Now, we can specify `n_jobs` to be greater than one to take advantage of multiple cores. It is important to know how many cores you have on your machine. I usually keep two cores open for other processes on my computer so it doesn't die on me. You can check how many cores you have using the `multiprocessing.cpu_count()` function." ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "4" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "multiprocessing.cpu_count()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "My computer has four cores, I will therefore use three cores. And now that I am unleashing three cores on the problem, I will take 5000 replicates again, and this time it will only take about 1/3 as long as it did the first time I did the calculation." ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 1666/1666 [01:07<00:00, 24.52it/s]\n", "100%|██████████| 1668/1668 [01:08<00:00, 24.52it/s]\n", "100%|██████████| 1666/1666 [01:08<00:00, 24.47it/s]\n" ] } ], "source": [ "bs_reps = draw_parametric_bs_reps_mle(\n", " mle_iid_nbinom, gen_nbinom, df[\"Nanog\"].values, args=(), size=5000, n_jobs=3, progress_bar=True\n", ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "From these replicates, we can compute the confidence intervals for $\\alpha$ and $b$ for Nanog." ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[ 1.08917998, 56.65831501],\n", " [ 1.50354856, 82.49318134]])" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "np.percentile(bs_reps, [2.5, 97.5], axis=0)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Parameter estimation for all genes\n", "\n", "We are not empowered to compute the MLE and confidence intervals for all four genes. We will do so with parametric bootstrap. We can ignore the warnings, as the occur when the solver attempts to try illegal (negative) parameter values. Powell's method is immune to this issue." ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 4/4 [04:31<00:00, 67.78s/it]\n" ] } ], "source": [ "genes = [\"Nanog\", \"Prdm14\", \"Rest\", \"Rex1\"]\n", "\n", "# Data frame to store results\n", "df_mle = pd.DataFrame(\n", " columns=[\"gene\", \"parameter\", \"mle\", \"conf_int_low\", \"conf_int_high\"]\n", ")\n", "\n", "# Perform MLE for each gene\n", "for gene in tqdm.tqdm(genes):\n", " mle = mle_iid_nbinom(df[gene].values)\n", "\n", " bs_reps = draw_parametric_bs_reps_mle(\n", " mle_iid_nbinom,\n", " gen_nbinom,\n", " df[gene].values,\n", " args=(),\n", " size=5000,\n", " n_jobs=3,\n", " progress_bar=False,\n", " )\n", " conf_int = np.percentile(bs_reps, [2.5, 97.5], axis=0)\n", "\n", " sub_df = pd.DataFrame(\n", " {\n", " \"gene\": [gene] * 2,\n", " \"parameter\": [\"alpha\", \"b\"],\n", " \"mle\": mle,\n", " \"conf_int_low\": conf_int[0, :],\n", " \"conf_int_high\": conf_int[1, :],\n", " }\n", " )\n", " df_mle = df_mle.append(sub_df)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We now have a data frame with the results from our statistical inference. Let's take a look." ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
geneparametermleconf_int_lowconf_int_high
0Nanogalpha1.2630971.0876151.490602
1Nanogb69.34784256.89880482.831787
0Prdm14alpha0.5528860.4519170.687147
1Prdm14b8.2006366.23048610.624190
0Restalpha4.5303353.8732235.448273
1Restb16.54305413.60370419.284410
0Rex1alpha1.6345621.4045121.927031
1Rex1b84.68091569.676482100.342542
\n", "
" ], "text/plain": [ " gene parameter mle conf_int_low conf_int_high\n", "0 Nanog alpha 1.263097 1.087615 1.490602\n", "1 Nanog b 69.347842 56.898804 82.831787\n", "0 Prdm14 alpha 0.552886 0.451917 0.687147\n", "1 Prdm14 b 8.200636 6.230486 10.624190\n", "0 Rest alpha 4.530335 3.873223 5.448273\n", "1 Rest b 16.543054 13.603704 19.284410\n", "0 Rex1 alpha 1.634562 1.404512 1.927031\n", "1 Rex1 b 84.680915 69.676482 100.342542" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_mle" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Finally, we can make a plot of the MLEs of the parameter values with confidence interbals." ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", "\n", "\n", "\n", "
\n" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/javascript": [ "(function(root) {\n", " function embed_document(root) {\n", " \n", " var docs_json = {\"b6107144-3c49-41f0-9af8-64f9950dff44\":{\"roots\":{\"references\":[{\"attributes\":{\"below\":[{\"id\":\"1011\",\"type\":\"LinearAxis\"}],\"center\":[{\"id\":\"1015\",\"type\":\"Grid\"},{\"id\":\"1019\",\"type\":\"Grid\"}],\"frame_height\":75,\"frame_width\":450,\"left\":[{\"id\":\"1016\",\"type\":\"CategoricalAxis\"}],\"renderers\":[{\"id\":\"1036\",\"type\":\"GlyphRenderer\"},{\"id\":\"1041\",\"type\":\"GlyphRenderer\"},{\"id\":\"1046\",\"type\":\"GlyphRenderer\"},{\"id\":\"1051\",\"type\":\"GlyphRenderer\"},{\"id\":\"1056\",\"type\":\"GlyphRenderer\"}],\"title\":{\"id\":\"1059\",\"type\":\"Title\"},\"toolbar\":{\"id\":\"1026\",\"type\":\"Toolbar\"},\"x_range\":{\"id\":\"1003\",\"type\":\"DataRange1d\"},\"x_scale\":{\"id\":\"1007\",\"type\":\"LinearScale\"},\"y_range\":{\"id\":\"1005\",\"type\":\"FactorRange\"},\"y_scale\":{\"id\":\"1009\",\"type\":\"CategoricalScale\"}},\"id\":\"1002\",\"subtype\":\"Figure\",\"type\":\"Plot\"},{\"attributes\":{},\"id\":\"1009\",\"type\":\"CategoricalScale\"},{\"attributes\":{},\"id\":\"1024\",\"type\":\"ResetTool\"},{\"attributes\":{\"axis_label\":\"\\u03b1*\",\"formatter\":{\"id\":\"1063\",\"type\":\"BasicTickFormatter\"},\"ticker\":{\"id\":\"1012\",\"type\":\"BasicTicker\"}},\"id\":\"1011\",\"type\":\"LinearAxis\"},{\"attributes\":{\"source\":{\"id\":\"1033\",\"type\":\"ColumnDataSource\"}},\"id\":\"1037\",\"type\":\"CDSView\"},{\"attributes\":{\"line_color\":\"#1f77b4\",\"line_width\":2,\"x\":{\"field\":\"x\"},\"y\":{\"field\":\"y\"}},\"id\":\"1049\",\"type\":\"Line\"},{\"attributes\":{\"callback\":null,\"data\":{\"x\":{\"__ndarray__\":\"AIAkFDXs3D+gczO+G/3lPw==\",\"dtype\":\"float64\",\"shape\":[2]},\"y\":[\"Prdm14\",\"Prdm14\"]},\"selected\":{\"id\":\"1069\",\"type\":\"Selection\"},\"selection_policy\":{\"id\":\"1068\",\"type\":\"UnionRenderers\"}},\"id\":\"1043\",\"type\":\"ColumnDataSource\"},{\"attributes\":{},\"id\":\"1063\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{},\"id\":\"1012\",\"type\":\"BasicTicker\"},{\"attributes\":{},\"id\":\"1061\",\"type\":\"CategoricalTickFormatter\"},{\"attributes\":{\"bottom_units\":\"screen\",\"fill_alpha\":{\"value\":0.5},\"fill_color\":{\"value\":\"lightgrey\"},\"left_units\":\"screen\",\"level\":\"overlay\",\"line_alpha\":{\"value\":1.0},\"line_color\":{\"value\":\"black\"},\"line_dash\":[4,4],\"line_width\":{\"value\":2},\"render_mode\":\"css\",\"right_units\":\"screen\",\"top_units\":\"screen\"},\"id\":\"1074\",\"type\":\"BoxAnnotation\"},{\"attributes\":{\"ticker\":{\"id\":\"1012\",\"type\":\"BasicTicker\"}},\"id\":\"1015\",\"type\":\"Grid\"},{\"attributes\":{\"line_alpha\":0.1,\"line_color\":\"#1f77b4\",\"line_width\":2,\"x\":{\"field\":\"x\"},\"y\":{\"field\":\"y\"}},\"id\":\"1045\",\"type\":\"Line\"},{\"attributes\":{\"fill_alpha\":{\"value\":0.1},\"fill_color\":{\"value\":\"#1f77b4\"},\"line_alpha\":{\"value\":0.1},\"line_color\":{\"value\":\"#1f77b4\"},\"x\":{\"field\":\"x\"},\"y\":{\"field\":\"y\"}},\"id\":\"1035\",\"type\":\"Circle\"},{\"attributes\":{\"overlay\":{\"id\":\"1074\",\"type\":\"BoxAnnotation\"}},\"id\":\"1022\",\"type\":\"BoxZoomTool\"},{\"attributes\":{\"data_source\":{\"id\":\"1043\",\"type\":\"ColumnDataSource\"},\"glyph\":{\"id\":\"1044\",\"type\":\"Line\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"1045\",\"type\":\"Line\"},\"selection_glyph\":null,\"view\":{\"id\":\"1047\",\"type\":\"CDSView\"}},\"id\":\"1046\",\"type\":\"GlyphRenderer\"},{\"attributes\":{\"data_source\":{\"id\":\"1033\",\"type\":\"ColumnDataSource\"},\"glyph\":{\"id\":\"1034\",\"type\":\"Circle\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"1035\",\"type\":\"Circle\"},\"selection_glyph\":null,\"view\":{\"id\":\"1037\",\"type\":\"CDSView\"}},\"id\":\"1036\",\"type\":\"GlyphRenderer\"},{\"attributes\":{},\"id\":\"1065\",\"type\":\"Selection\"},{\"attributes\":{},\"id\":\"1023\",\"type\":\"SaveTool\"},{\"attributes\":{\"line_alpha\":0.1,\"line_color\":\"#1f77b4\",\"line_width\":2,\"x\":{\"field\":\"x\"},\"y\":{\"field\":\"y\"}},\"id\":\"1040\",\"type\":\"Line\"},{\"attributes\":{},\"id\":\"1066\",\"type\":\"UnionRenderers\"},{\"attributes\":{\"source\":{\"id\":\"1043\",\"type\":\"ColumnDataSource\"}},\"id\":\"1047\",\"type\":\"CDSView\"},{\"attributes\":{},\"id\":\"1067\",\"type\":\"Selection\"},{\"attributes\":{\"callback\":null,\"data\":{\"x\":{\"__ndarray__\":\"ba9ia1z8DkDdDsIHCMsVQA==\",\"dtype\":\"float64\",\"shape\":[2]},\"y\":[\"Rest\",\"Rest\"]},\"selected\":{\"id\":\"1071\",\"type\":\"Selection\"},\"selection_policy\":{\"id\":\"1070\",\"type\":\"UnionRenderers\"}},\"id\":\"1048\",\"type\":\"ColumnDataSource\"},{\"attributes\":{},\"id\":\"1017\",\"type\":\"CategoricalTicker\"},{\"attributes\":{},\"id\":\"1025\",\"type\":\"HelpTool\"},{\"attributes\":{\"callback\":null,\"data\":{\"x\":{\"__ndarray__\":\"epMal6U19D/ARyfoPbHhP4HCpxsQHxJAUHRb7yon+j8=\",\"dtype\":\"float64\",\"shape\":[4]},\"y\":[\"Nanog\",\"Prdm14\",\"Rest\",\"Rex1\"]},\"selected\":{\"id\":\"1065\",\"type\":\"Selection\"},\"selection_policy\":{\"id\":\"1064\",\"type\":\"UnionRenderers\"}},\"id\":\"1033\",\"type\":\"ColumnDataSource\"},{\"attributes\":{\"line_color\":\"#1f77b4\",\"line_width\":2,\"x\":{\"field\":\"x\"},\"y\":{\"field\":\"y\"}},\"id\":\"1054\",\"type\":\"Line\"},{\"attributes\":{},\"id\":\"1068\",\"type\":\"UnionRenderers\"},{\"attributes\":{\"active_drag\":\"auto\",\"active_inspect\":\"auto\",\"active_multi\":null,\"active_scroll\":\"auto\",\"active_tap\":\"auto\",\"tools\":[{\"id\":\"1020\",\"type\":\"PanTool\"},{\"id\":\"1021\",\"type\":\"WheelZoomTool\"},{\"id\":\"1022\",\"type\":\"BoxZoomTool\"},{\"id\":\"1023\",\"type\":\"SaveTool\"},{\"id\":\"1024\",\"type\":\"ResetTool\"},{\"id\":\"1025\",\"type\":\"HelpTool\"}]},\"id\":\"1026\",\"type\":\"Toolbar\"},{\"attributes\":{\"source\":{\"id\":\"1053\",\"type\":\"ColumnDataSource\"}},\"id\":\"1057\",\"type\":\"CDSView\"},{\"attributes\":{\"line_alpha\":0.1,\"line_color\":\"#1f77b4\",\"line_width\":2,\"x\":{\"field\":\"x\"},\"y\":{\"field\":\"y\"}},\"id\":\"1050\",\"type\":\"Line\"},{\"attributes\":{},\"id\":\"1069\",\"type\":\"Selection\"},{\"attributes\":{\"data_source\":{\"id\":\"1048\",\"type\":\"ColumnDataSource\"},\"glyph\":{\"id\":\"1049\",\"type\":\"Line\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"1050\",\"type\":\"Line\"},\"selection_glyph\":null,\"view\":{\"id\":\"1052\",\"type\":\"CDSView\"}},\"id\":\"1051\",\"type\":\"GlyphRenderer\"},{\"attributes\":{},\"id\":\"1070\",\"type\":\"UnionRenderers\"},{\"attributes\":{\"dimension\":1,\"ticker\":{\"id\":\"1017\",\"type\":\"CategoricalTicker\"}},\"id\":\"1019\",\"type\":\"Grid\"},{\"attributes\":{\"callback\":null},\"id\":\"1003\",\"type\":\"DataRange1d\"},{\"attributes\":{},\"id\":\"1071\",\"type\":\"Selection\"},{\"attributes\":{\"formatter\":{\"id\":\"1061\",\"type\":\"CategoricalTickFormatter\"},\"ticker\":{\"id\":\"1017\",\"type\":\"CategoricalTicker\"}},\"id\":\"1016\",\"type\":\"CategoricalAxis\"},{\"attributes\":{\"line_color\":\"#1f77b4\",\"line_width\":2,\"x\":{\"field\":\"x\"},\"y\":{\"field\":\"y\"}},\"id\":\"1039\",\"type\":\"Line\"},{\"attributes\":{\"source\":{\"id\":\"1048\",\"type\":\"ColumnDataSource\"}},\"id\":\"1052\",\"type\":\"CDSView\"},{\"attributes\":{},\"id\":\"1021\",\"type\":\"WheelZoomTool\"},{\"attributes\":{\"callback\":null,\"data\":{\"x\":{\"__ndarray__\":\"jXVXxN5m8T/i8CcLgdn3Pw==\",\"dtype\":\"float64\",\"shape\":[2]},\"y\":[\"Nanog\",\"Nanog\"]},\"selected\":{\"id\":\"1067\",\"type\":\"Selection\"},\"selection_policy\":{\"id\":\"1066\",\"type\":\"UnionRenderers\"}},\"id\":\"1038\",\"type\":\"ColumnDataSource\"},{\"attributes\":{},\"id\":\"1072\",\"type\":\"UnionRenderers\"},{\"attributes\":{\"callback\":null,\"factors\":[\"Rex1\",\"Rest\",\"Prdm14\",\"Nanog\"]},\"id\":\"1005\",\"type\":\"FactorRange\"},{\"attributes\":{\"callback\":null,\"data\":{\"x\":{\"__ndarray__\":\"NFJFMeF49j+5SYPkHtX+Pw==\",\"dtype\":\"float64\",\"shape\":[2]},\"y\":[\"Rex1\",\"Rex1\"]},\"selected\":{\"id\":\"1073\",\"type\":\"Selection\"},\"selection_policy\":{\"id\":\"1072\",\"type\":\"UnionRenderers\"}},\"id\":\"1053\",\"type\":\"ColumnDataSource\"},{\"attributes\":{},\"id\":\"1020\",\"type\":\"PanTool\"},{\"attributes\":{},\"id\":\"1064\",\"type\":\"UnionRenderers\"},{\"attributes\":{},\"id\":\"1073\",\"type\":\"Selection\"},{\"attributes\":{},\"id\":\"1007\",\"type\":\"LinearScale\"},{\"attributes\":{\"line_alpha\":0.1,\"line_color\":\"#1f77b4\",\"line_width\":2,\"x\":{\"field\":\"x\"},\"y\":{\"field\":\"y\"}},\"id\":\"1055\",\"type\":\"Line\"},{\"attributes\":{\"data_source\":{\"id\":\"1038\",\"type\":\"ColumnDataSource\"},\"glyph\":{\"id\":\"1039\",\"type\":\"Line\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"1040\",\"type\":\"Line\"},\"selection_glyph\":null,\"view\":{\"id\":\"1042\",\"type\":\"CDSView\"}},\"id\":\"1041\",\"type\":\"GlyphRenderer\"},{\"attributes\":{\"text\":\"\"},\"id\":\"1059\",\"type\":\"Title\"},{\"attributes\":{\"data_source\":{\"id\":\"1053\",\"type\":\"ColumnDataSource\"},\"glyph\":{\"id\":\"1054\",\"type\":\"Line\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"1055\",\"type\":\"Line\"},\"selection_glyph\":null,\"view\":{\"id\":\"1057\",\"type\":\"CDSView\"}},\"id\":\"1056\",\"type\":\"GlyphRenderer\"},{\"attributes\":{\"fill_color\":{\"value\":\"#1f77b4\"},\"line_color\":{\"value\":\"#1f77b4\"},\"x\":{\"field\":\"x\"},\"y\":{\"field\":\"y\"}},\"id\":\"1034\",\"type\":\"Circle\"},{\"attributes\":{\"line_color\":\"#1f77b4\",\"line_width\":2,\"x\":{\"field\":\"x\"},\"y\":{\"field\":\"y\"}},\"id\":\"1044\",\"type\":\"Line\"},{\"attributes\":{\"source\":{\"id\":\"1038\",\"type\":\"ColumnDataSource\"}},\"id\":\"1042\",\"type\":\"CDSView\"}],\"root_ids\":[\"1002\"]},\"title\":\"Bokeh Application\",\"version\":\"1.4.0\"}};\n", " var render_items = [{\"docid\":\"b6107144-3c49-41f0-9af8-64f9950dff44\",\"roots\":{\"1002\":\"5af81298-d064-409d-bbb2-27227fdc7442\"}}];\n", " root.Bokeh.embed.embed_items_notebook(docs_json, render_items);\n", "\n", " }\n", " if (root.Bokeh !== undefined) {\n", " embed_document(root);\n", " } else {\n", " var attempts = 0;\n", " var timer = setInterval(function(root) {\n", " if (root.Bokeh !== undefined) {\n", " clearInterval(timer);\n", " embed_document(root);\n", " } else {\n", " attempts++;\n", " if (attempts > 100) {\n", " clearInterval(timer);\n", " console.log(\"Bokeh: ERROR: Unable to run BokehJS code because BokehJS library is missing\");\n", " }\n", " }\n", " }, 10, root)\n", " }\n", "})(window);" ], "application/vnd.bokehjs_exec.v0+json": "" }, "metadata": { "application/vnd.bokehjs_exec.v0+json": { "id": "1002" } }, "output_type": "display_data" }, { "data": { "text/html": [ "\n", "\n", "\n", "\n", "\n", "\n", "
\n" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/javascript": [ "(function(root) {\n", " function embed_document(root) {\n", " \n", " var docs_json = {\"5a2ae302-bdaf-4536-b485-70c601597e5e\":{\"roots\":{\"references\":[{\"attributes\":{\"below\":[{\"id\":\"1186\",\"type\":\"LinearAxis\"}],\"center\":[{\"id\":\"1190\",\"type\":\"Grid\"},{\"id\":\"1194\",\"type\":\"Grid\"}],\"frame_height\":75,\"frame_width\":450,\"left\":[{\"id\":\"1191\",\"type\":\"CategoricalAxis\"}],\"renderers\":[{\"id\":\"1211\",\"type\":\"GlyphRenderer\"},{\"id\":\"1216\",\"type\":\"GlyphRenderer\"},{\"id\":\"1221\",\"type\":\"GlyphRenderer\"},{\"id\":\"1226\",\"type\":\"GlyphRenderer\"},{\"id\":\"1231\",\"type\":\"GlyphRenderer\"}],\"title\":{\"id\":\"1251\",\"type\":\"Title\"},\"toolbar\":{\"id\":\"1201\",\"type\":\"Toolbar\"},\"x_range\":{\"id\":\"1178\",\"type\":\"DataRange1d\"},\"x_scale\":{\"id\":\"1182\",\"type\":\"LinearScale\"},\"y_range\":{\"id\":\"1180\",\"type\":\"FactorRange\"},\"y_scale\":{\"id\":\"1184\",\"type\":\"CategoricalScale\"}},\"id\":\"1177\",\"subtype\":\"Figure\",\"type\":\"Plot\"},{\"attributes\":{\"line_color\":\"#1f77b4\",\"line_width\":2,\"x\":{\"field\":\"x\"},\"y\":{\"field\":\"y\"}},\"id\":\"1224\",\"type\":\"Line\"},{\"attributes\":{},\"id\":\"1184\",\"type\":\"CategoricalScale\"},{\"attributes\":{\"dimension\":1,\"ticker\":{\"id\":\"1192\",\"type\":\"CategoricalTicker\"}},\"id\":\"1194\",\"type\":\"Grid\"},{\"attributes\":{\"source\":{\"id\":\"1213\",\"type\":\"ColumnDataSource\"}},\"id\":\"1217\",\"type\":\"CDSView\"},{\"attributes\":{\"line_alpha\":0.1,\"line_color\":\"#1f77b4\",\"line_width\":2,\"x\":{\"field\":\"x\"},\"y\":{\"field\":\"y\"}},\"id\":\"1215\",\"type\":\"Line\"},{\"attributes\":{\"callback\":null,\"data\":{\"x\":{\"__ndarray__\":\"sU9/DENWUUD9UjC5uWYgQILQ4Y8FizBANCTRHZQrVUA=\",\"dtype\":\"float64\",\"shape\":[4]},\"y\":[\"Nanog\",\"Prdm14\",\"Rest\",\"Rex1\"]},\"selected\":{\"id\":\"1257\",\"type\":\"Selection\"},\"selection_policy\":{\"id\":\"1256\",\"type\":\"UnionRenderers\"}},\"id\":\"1208\",\"type\":\"ColumnDataSource\"},{\"attributes\":{\"callback\":null,\"data\":{\"x\":{\"__ndarray__\":\"NQRDAQxzTECd2nf/O7VUQA==\",\"dtype\":\"float64\",\"shape\":[2]},\"y\":[\"Nanog\",\"Nanog\"]},\"selected\":{\"id\":\"1259\",\"type\":\"Selection\"},\"selection_policy\":{\"id\":\"1258\",\"type\":\"UnionRenderers\"}},\"id\":\"1213\",\"type\":\"ColumnDataSource\"},{\"attributes\":{\"source\":{\"id\":\"1228\",\"type\":\"ColumnDataSource\"}},\"id\":\"1232\",\"type\":\"CDSView\"},{\"attributes\":{},\"id\":\"1258\",\"type\":\"UnionRenderers\"},{\"attributes\":{\"data_source\":{\"id\":\"1213\",\"type\":\"ColumnDataSource\"},\"glyph\":{\"id\":\"1214\",\"type\":\"Line\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"1215\",\"type\":\"Line\"},\"selection_glyph\":null,\"view\":{\"id\":\"1217\",\"type\":\"CDSView\"}},\"id\":\"1216\",\"type\":\"GlyphRenderer\"},{\"attributes\":{\"data_source\":{\"id\":\"1228\",\"type\":\"ColumnDataSource\"},\"glyph\":{\"id\":\"1229\",\"type\":\"Line\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"1230\",\"type\":\"Line\"},\"selection_glyph\":null,\"view\":{\"id\":\"1232\",\"type\":\"CDSView\"}},\"id\":\"1231\",\"type\":\"GlyphRenderer\"},{\"attributes\":{\"callback\":null},\"id\":\"1178\",\"type\":\"DataRange1d\"},{\"attributes\":{\"overlay\":{\"id\":\"1266\",\"type\":\"BoxAnnotation\"}},\"id\":\"1197\",\"type\":\"BoxZoomTool\"},{\"attributes\":{\"line_color\":\"#1f77b4\",\"line_width\":2,\"x\":{\"field\":\"x\"},\"y\":{\"field\":\"y\"}},\"id\":\"1219\",\"type\":\"Line\"},{\"attributes\":{\"callback\":null,\"factors\":[\"Rex1\",\"Rest\",\"Prdm14\",\"Nanog\"]},\"id\":\"1180\",\"type\":\"FactorRange\"},{\"attributes\":{\"line_alpha\":0.1,\"line_color\":\"#1f77b4\",\"line_width\":2,\"x\":{\"field\":\"x\"},\"y\":{\"field\":\"y\"}},\"id\":\"1230\",\"type\":\"Line\"},{\"attributes\":{\"line_alpha\":0.1,\"line_color\":\"#1f77b4\",\"line_width\":2,\"x\":{\"field\":\"x\"},\"y\":{\"field\":\"y\"}},\"id\":\"1225\",\"type\":\"Line\"},{\"attributes\":{},\"id\":\"1255\",\"type\":\"BasicTickFormatter\"},{\"attributes\":{},\"id\":\"1253\",\"type\":\"CategoricalTickFormatter\"},{\"attributes\":{\"data_source\":{\"id\":\"1223\",\"type\":\"ColumnDataSource\"},\"glyph\":{\"id\":\"1224\",\"type\":\"Line\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"1225\",\"type\":\"Line\"},\"selection_glyph\":null,\"view\":{\"id\":\"1227\",\"type\":\"CDSView\"}},\"id\":\"1226\",\"type\":\"GlyphRenderer\"},{\"attributes\":{},\"id\":\"1256\",\"type\":\"UnionRenderers\"},{\"attributes\":{},\"id\":\"1187\",\"type\":\"BasicTicker\"},{\"attributes\":{},\"id\":\"1260\",\"type\":\"UnionRenderers\"},{\"attributes\":{\"source\":{\"id\":\"1208\",\"type\":\"ColumnDataSource\"}},\"id\":\"1212\",\"type\":\"CDSView\"},{\"attributes\":{},\"id\":\"1259\",\"type\":\"Selection\"},{\"attributes\":{\"fill_color\":{\"value\":\"#1f77b4\"},\"line_color\":{\"value\":\"#1f77b4\"},\"x\":{\"field\":\"x\"},\"y\":{\"field\":\"y\"}},\"id\":\"1209\",\"type\":\"Circle\"},{\"attributes\":{},\"id\":\"1192\",\"type\":\"CategoricalTicker\"},{\"attributes\":{},\"id\":\"1257\",\"type\":\"Selection\"},{\"attributes\":{\"data_source\":{\"id\":\"1208\",\"type\":\"ColumnDataSource\"},\"glyph\":{\"id\":\"1209\",\"type\":\"Circle\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"1210\",\"type\":\"Circle\"},\"selection_glyph\":null,\"view\":{\"id\":\"1212\",\"type\":\"CDSView\"}},\"id\":\"1211\",\"type\":\"GlyphRenderer\"},{\"attributes\":{\"source\":{\"id\":\"1223\",\"type\":\"ColumnDataSource\"}},\"id\":\"1227\",\"type\":\"CDSView\"},{\"attributes\":{\"ticker\":{\"id\":\"1187\",\"type\":\"BasicTicker\"}},\"id\":\"1190\",\"type\":\"Grid\"},{\"attributes\":{\"callback\":null,\"data\":{\"x\":{\"__ndarray__\":\"kyR0fEtrUUBbri037BVZQA==\",\"dtype\":\"float64\",\"shape\":[2]},\"y\":[\"Rex1\",\"Rex1\"]},\"selected\":{\"id\":\"1265\",\"type\":\"Selection\"},\"selection_policy\":{\"id\":\"1264\",\"type\":\"UnionRenderers\"}},\"id\":\"1228\",\"type\":\"ColumnDataSource\"},{\"attributes\":{},\"id\":\"1200\",\"type\":\"HelpTool\"},{\"attributes\":{\"formatter\":{\"id\":\"1253\",\"type\":\"CategoricalTickFormatter\"},\"ticker\":{\"id\":\"1192\",\"type\":\"CategoricalTicker\"}},\"id\":\"1191\",\"type\":\"CategoricalAxis\"},{\"attributes\":{\"fill_alpha\":{\"value\":0.1},\"fill_color\":{\"value\":\"#1f77b4\"},\"line_alpha\":{\"value\":0.1},\"line_color\":{\"value\":\"#1f77b4\"},\"x\":{\"field\":\"x\"},\"y\":{\"field\":\"y\"}},\"id\":\"1210\",\"type\":\"Circle\"},{\"attributes\":{\"line_color\":\"#1f77b4\",\"line_width\":2,\"x\":{\"field\":\"x\"},\"y\":{\"field\":\"y\"}},\"id\":\"1229\",\"type\":\"Line\"},{\"attributes\":{\"active_drag\":\"auto\",\"active_inspect\":\"auto\",\"active_multi\":null,\"active_scroll\":\"auto\",\"active_tap\":\"auto\",\"tools\":[{\"id\":\"1195\",\"type\":\"PanTool\"},{\"id\":\"1196\",\"type\":\"WheelZoomTool\"},{\"id\":\"1197\",\"type\":\"BoxZoomTool\"},{\"id\":\"1198\",\"type\":\"SaveTool\"},{\"id\":\"1199\",\"type\":\"ResetTool\"},{\"id\":\"1200\",\"type\":\"HelpTool\"}]},\"id\":\"1201\",\"type\":\"Toolbar\"},{\"attributes\":{},\"id\":\"1262\",\"type\":\"UnionRenderers\"},{\"attributes\":{},\"id\":\"1265\",\"type\":\"Selection\"},{\"attributes\":{\"callback\":null,\"data\":{\"x\":{\"__ndarray__\":\"mRuTvBg1K0CdSj4az0gzQA==\",\"dtype\":\"float64\",\"shape\":[2]},\"y\":[\"Rest\",\"Rest\"]},\"selected\":{\"id\":\"1263\",\"type\":\"Selection\"},\"selection_policy\":{\"id\":\"1262\",\"type\":\"UnionRenderers\"}},\"id\":\"1223\",\"type\":\"ColumnDataSource\"},{\"attributes\":{\"text\":\"\"},\"id\":\"1251\",\"type\":\"Title\"},{\"attributes\":{},\"id\":\"1261\",\"type\":\"Selection\"},{\"attributes\":{},\"id\":\"1264\",\"type\":\"UnionRenderers\"},{\"attributes\":{},\"id\":\"1182\",\"type\":\"LinearScale\"},{\"attributes\":{\"source\":{\"id\":\"1218\",\"type\":\"ColumnDataSource\"}},\"id\":\"1222\",\"type\":\"CDSView\"},{\"attributes\":{},\"id\":\"1195\",\"type\":\"PanTool\"},{\"attributes\":{},\"id\":\"1196\",\"type\":\"WheelZoomTool\"},{\"attributes\":{\"data_source\":{\"id\":\"1218\",\"type\":\"ColumnDataSource\"},\"glyph\":{\"id\":\"1219\",\"type\":\"Line\"},\"hover_glyph\":null,\"muted_glyph\":null,\"nonselection_glyph\":{\"id\":\"1220\",\"type\":\"Line\"},\"selection_glyph\":null,\"view\":{\"id\":\"1222\",\"type\":\"CDSView\"}},\"id\":\"1221\",\"type\":\"GlyphRenderer\"},{\"attributes\":{},\"id\":\"1198\",\"type\":\"SaveTool\"},{\"attributes\":{\"axis_label\":\"b*\",\"formatter\":{\"id\":\"1255\",\"type\":\"BasicTickFormatter\"},\"ticker\":{\"id\":\"1187\",\"type\":\"BasicTicker\"}},\"id\":\"1186\",\"type\":\"LinearAxis\"},{\"attributes\":{\"bottom_units\":\"screen\",\"fill_alpha\":{\"value\":0.5},\"fill_color\":{\"value\":\"lightgrey\"},\"left_units\":\"screen\",\"level\":\"overlay\",\"line_alpha\":{\"value\":1.0},\"line_color\":{\"value\":\"black\"},\"line_dash\":[4,4],\"line_width\":{\"value\":2},\"render_mode\":\"css\",\"right_units\":\"screen\",\"top_units\":\"screen\"},\"id\":\"1266\",\"type\":\"BoxAnnotation\"},{\"attributes\":{\"line_alpha\":0.1,\"line_color\":\"#1f77b4\",\"line_width\":2,\"x\":{\"field\":\"x\"},\"y\":{\"field\":\"y\"}},\"id\":\"1220\",\"type\":\"Line\"},{\"attributes\":{},\"id\":\"1199\",\"type\":\"ResetTool\"},{\"attributes\":{},\"id\":\"1263\",\"type\":\"Selection\"},{\"attributes\":{\"line_color\":\"#1f77b4\",\"line_width\":2,\"x\":{\"field\":\"x\"},\"y\":{\"field\":\"y\"}},\"id\":\"1214\",\"type\":\"Line\"},{\"attributes\":{\"callback\":null,\"data\":{\"x\":{\"__ndarray__\":\"Vom5oATsGEAs0bXYlT8lQA==\",\"dtype\":\"float64\",\"shape\":[2]},\"y\":[\"Prdm14\",\"Prdm14\"]},\"selected\":{\"id\":\"1261\",\"type\":\"Selection\"},\"selection_policy\":{\"id\":\"1260\",\"type\":\"UnionRenderers\"}},\"id\":\"1218\",\"type\":\"ColumnDataSource\"}],\"root_ids\":[\"1177\"]},\"title\":\"Bokeh Application\",\"version\":\"1.4.0\"}};\n", " var render_items = [{\"docid\":\"5a2ae302-bdaf-4536-b485-70c601597e5e\",\"roots\":{\"1177\":\"631c007c-99b5-4856-9a46-a1377f85e1c0\"}}];\n", " root.Bokeh.embed.embed_items_notebook(docs_json, render_items);\n", "\n", " }\n", " if (root.Bokeh !== undefined) {\n", " embed_document(root);\n", " } else {\n", " var attempts = 0;\n", " var timer = setInterval(function(root) {\n", " if (root.Bokeh !== undefined) {\n", " clearInterval(timer);\n", " embed_document(root);\n", " } else {\n", " attempts++;\n", " if (attempts > 100) {\n", " clearInterval(timer);\n", " console.log(\"Bokeh: ERROR: Unable to run BokehJS code because BokehJS library is missing\");\n", " }\n", " }\n", " }, 10, root)\n", " }\n", "})(window);" ], "application/vnd.bokehjs_exec.v0+json": "" }, "metadata": { "application/vnd.bokehjs_exec.v0+json": { "id": "1177" } }, "output_type": "display_data" } ], "source": [ "sub_df = df_mle.loc[df_mle[\"parameter\"] == \"alpha\", :]\n", "p1 = bokeh.io.show(\n", " bebi103.viz.plot_with_error_bars(\n", " centers=sub_df[\"mle\"].values,\n", " confs=sub_df[[\"conf_int_low\", \"conf_int_high\"]].values,\n", " names=sub_df['gene'],\n", " x_axis_label='α*',\n", " frame_height=75,\n", " )\n", ")\n", "\n", "sub_df = df_mle.loc[df_mle[\"parameter\"] == \"b\", :]\n", "p2 = bokeh.io.show(\n", " bebi103.viz.plot_with_error_bars(\n", " centers=sub_df[\"mle\"].values,\n", " confs=sub_df[[\"conf_int_low\", \"conf_int_high\"]].values,\n", " names=sub_df['gene'],\n", " x_axis_label='b*',\n", " frame_height=75,\n", " )\n", ")\n", "\n", "bokeh.layouts.gridplot([[p1, p2]]);" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Implementation of MLE bootstrapping in the bebi103 package\n", "\n", "The functionality we have developed in this lesson for bootstrapping MLE estimates is available in the function `bebi103.draw_bs_reps_mle()`. It has call signature\n", "\n", "```python\n", "bebi103.draw_bs_reps_mle(\n", " mle_fun,\n", " gen_fun,\n", " data,\n", " mle_args=(),\n", " gen_args=(),\n", " size=1,\n", " n_jobs=1,\n", " progress_bar=False,\n", " rg=None,\n", "):\n", "```\n", "\n", "Here, `mle_fun` is a function with call signature `mle_fun(data, *mle_fun_args)` that computes the maximum likelihood estimate from the data. `gen_fun` is a function with call signature `gen_fun(params, *gen_args, size, rg)` that generates bootstrap samples to be used to make bootstrap replicates of the maximum likelihood estimates. Here, `params` is a tuple of the parameters of the model. Note that not all of the inputs for `gen_fun` are necessarily used, but having them allows for flexibility. The `bebi103.draw_bs_reps_mle()` function allows for both parametric and nonparametric bootstrapping.\n", "\n", "As an example, we will repeat the bootstrapped MLE confidence region calculation for Nanog, except we will do it nonparametrically this time." ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 1666/1666 [01:07<00:00, 24.81it/s]\n", "100%|██████████| 1668/1668 [01:07<00:00, 24.80it/s]\n", "100%|██████████| 1666/1666 [01:07<00:00, 24.70it/s]\n" ] }, { "data": { "text/plain": [ "array([[ 1.05978581, 57.32856127],\n", " [ 1.5373875 , 82.13496011]])" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# This is gen_fun, for nonparametric, just a resampling\n", "# params = (alpha, beta), but is ignored\n", "# gen_params = (n,); this is used for nonparametric sample\n", "def resample_fun(params, n, size, rg):\n", " return rg.choice(n, size=size)\n", "\n", "\n", "bs_reps = bebi103.draw_bs_reps_mle(\n", " mle_iid_nbinom,\n", " resample_fun,\n", " df['Nanog'].values,\n", " gen_args=(df['Nanog'].values, ),\n", " size=5000,\n", " n_jobs=3,\n", " progress_bar=True,\n", ")\n", "\n", "# Show the confidence interval\n", "np.percentile(bs_reps, [2.5, 97.5], axis=0)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The confidence intervals are slightly different than in the parametric case, but are quite similar." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Computing environment" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPython 3.7.5\n", "IPython 7.1.1\n", "\n", "numpy 1.17.3\n", "scipy 1.3.1\n", "pandas 0.24.2\n", "tqdm 4.38.0\n", "bokeh 1.4.0\n", "bebi103 0.0.44\n", "jupyterlab 1.2.3\n" ] } ], "source": [ "%load_ext watermark\n", "\n", "%watermark -v -p numpy,scipy,pandas,tqdm,bokeh,bebi103,jupyterlab" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.5" } }, "nbformat": 4, "nbformat_minor": 4 }