{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"hide_input": true
},
"source": [
"# Mosquito Regressions "
]
},
{
"cell_type": "markdown",
"metadata": {
"hide_input": false
},
"source": [
"The idea behind this notebook is to combine data from the Mosquito Habitat Mapper with data from another GLOBE protocol, such as air temperature or precipitation. The goal is to provide tools for examining the relationship between the two protocols using Weighted Least Squares Regression. "
]
},
{
"cell_type": "markdown",
"metadata": {
"heading_collapsed": true,
"hide_input": true
},
"source": [
"### Importing Required Modules"
]
},
{
"cell_type": "markdown",
"metadata": {
"hidden": true,
"hide_input": true
},
"source": [
"A few Python modules and tools are required to run this script."
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"code_folding": [
0
],
"hidden": true,
"hide_input": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" ^ [code] (for designation: designating hidden code blocks)\n"
]
}
],
"source": [
"# subroutine for designating a code block\n",
"def designate(title, section='main'):\n",
" \"\"\"Designate a code block with a title so that the code may be hidden and reopened.\n",
" \n",
" Arguments:\n",
" title: str, title for code block\n",
" section='main': str, section title\n",
" \n",
" Returns:\n",
" None\n",
" \"\"\"\n",
" \n",
" # begin designation\n",
" designation = ' ' * 20\n",
" \n",
" # if marked for user parameters\n",
" if section == 'settings':\n",
" \n",
" # begin designation with indicator\n",
" designation = '*** settings -----> '\n",
" \n",
" # add code designator\n",
" designation += '^ [code] (for {}: {})'.format(section, title)\n",
" \n",
" # print\n",
" print(designation)\n",
" \n",
" return None\n",
"\n",
"# apply to itself\n",
"designate('designating hidden code blocks', 'designation')"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"code_folding": [],
"hidden": true,
"hide_input": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" ^ [code] (for main: importing Python system tools)\n"
]
}
],
"source": [
"designate('importing Python system tools')\n",
"\n",
"# import os and sys modules for system controls\n",
"import os\n",
"import sys\n",
"\n",
"# set runtime warnings to ignore\n",
"import warnings\n",
"\n",
"# import requests and json modules for making API requests\n",
"import requests\n",
"import json\n",
"\n",
"# import fuzzywuzzy for fuzzy name matching\n",
"from fuzzywuzzy import fuzz\n",
"\n",
"# import datetime module for manipulating date and time formats\n",
"from datetime import datetime, timedelta\n",
"\n",
"# import pandas for dataframe manipulation\n",
"import pandas"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"hidden": true,
"hide_input": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" ^ [code] (for main: importing Python mathematical modules)\n"
]
}
],
"source": [
"designate('importing Python mathematical modules')\n",
"\n",
"# import numpy for math\n",
"from numpy import array, isnan\n",
"from numpy import exp, sqrt, log, log10, sign, abs\n",
"from numpy import arcsin, arcsinh, sin, cos, pi\n",
"from numpy import average, std, histogram, percentile\n",
"from numpy.random import random, choice\n",
"\n",
"# import scipy for scientific computing\n",
"from scipy import stats\n",
"from scipy.optimize import curve_fit\n",
"\n",
"# import sci-kit for linear regressions\n",
"from sklearn.neighbors import BallTree\n",
"from sklearn.preprocessing import PolynomialFeatures\n",
"from sklearn.linear_model import LinearRegression, PoissonRegressor"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"code_folding": [],
"hidden": true,
"hide_input": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" ^ [code] (for main: importing Python visualization modules)\n"
]
}
],
"source": [
"designate('importing Python visualization modules')\n",
"\n",
"# import bokeh for plotting graphs\n",
"from bokeh.plotting import figure\n",
"from bokeh.io import output_notebook, show\n",
"from bokeh.layouts import row as Row\n",
"from bokeh.models import HoverTool, ColumnDataSource\n",
"from bokeh.models import Circle, LinearAxis, Range1d\n",
"\n",
"# import ipyleaflet and branca for plotting maps\n",
"from ipyleaflet import Map, Marker, basemaps, CircleMarker, LayerGroup\n",
"from ipyleaflet import WidgetControl, ScaleControl, FullScreenControl, LayersControl\n",
"from branca.colormap import linear as Linear, StepColormap\n",
"\n",
"# import iPython for javascript based notebook controls\n",
"from IPython.display import Javascript, display, FileLink\n",
"\n",
"# import ipywidgets for additional widgets\n",
"from ipywidgets import Label, HTML, Button, Output, Box, VBox, HBox"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"code_folding": [
3
],
"hidden": true,
"hide_input": true,
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" ^ [code] (for tools: inferring fuzzy match)\n"
]
}
],
"source": [
"designate('inferring fuzzy match', 'tools')\n",
"\n",
"# subroutine for fuzzy matching\n",
"def fuzzy_match(text, options):\n",
" \"\"\"Infer closest match of text from a list of options.\n",
" \n",
" Arguments:\n",
" text: str, entered text\n",
" options: list of str, the options\n",
" \n",
" Returns:\n",
" str, the closest match\n",
" \"\"\"\n",
" \n",
" # perform fuzzy search to get closest match\n",
" fuzzies = [(option, fuzz.ratio(text, option)) for option in options]\n",
" fuzzies.sort(key=lambda pair: pair[1], reverse=True)\n",
" inference = fuzzies[0][0]\n",
" \n",
" return inference"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"code_folding": [
3
],
"hidden": true,
"hide_input": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" ^ [code] (for tools: truncating field names)\n"
]
}
],
"source": [
"designate('truncating field names', 'tools')\n",
"\n",
"# truncate field names to first capital\n",
"def truncate_field_name(name, size=5, minimum=4, maximum=15):\n",
"    \"\"\"Truncate a name to the first capital letter past the minimum.\n",
" \n",
" Arguments:\n",
" name: str, the name for truncation\n",
" size: the final size of the truncation\n",
" minimum=4: int, minimum length of name\n",
" maximum=15: int, maximum length of name\n",
" \n",
"    Returns:\n",
" str, truncated name\n",
" \"\"\"\n",
" \n",
" # chop name at maximum and capitalize\n",
" name = name[-maximum:]\n",
" name = name[0].capitalize() + name[1:]\n",
" \n",
" # make stub starting at minimum length\n",
" length = minimum\n",
" stub = name[-length:]\n",
" while not stub[0].isupper():\n",
" \n",
" # add to length\n",
" length += 1\n",
" stub = name[-length:]\n",
" \n",
" # only pass size\n",
" stub = stub[:size]\n",
" \n",
" return stub"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"code_folding": [
3
],
"hidden": true,
"hide_input": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" ^ [code] (for tools: entitling a name by capitalizing)\n"
]
}
],
"source": [
"designate('entitling a name by capitalizing', 'tools')\n",
"\n",
"# entitle function to capitalize a word for a title\n",
"def make_title(word):\n",
" \"\"\"Entitle a word by capitalizing the first letter.\n",
" \n",
" Arguments:\n",
" word: str\n",
" \n",
" Returns:\n",
" str\n",
" \"\"\"\n",
" \n",
" # capitalize first letter\n",
" word = word[0].upper() + word[1:]\n",
" \n",
" return word"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"code_folding": [
3
],
"hidden": true,
"hide_input": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" ^ [code] (for tools: resolving country name and code)\n"
]
}
],
"source": [
"designate('resolving country name and code', 'tools')\n",
"\n",
"# resolving country name and codes\n",
"def resolve_country_code(country, code):\n",
" \"\"\"Resolve the country code from given information.\n",
" \n",
" Arguments:\n",
" country: str, country name as input\n",
" code: str, country code as input\n",
" \n",
" Returns:\n",
" (str, str) tuple, the country name and country code\n",
" \"\"\"\n",
" \n",
" # check for code\n",
" if code:\n",
" \n",
" # find closest matching code\n",
" code = fuzzy_match(code, [member for member in codes.values()])\n",
" country = countries[code]\n",
" \n",
" # if no code, but a country is given\n",
" if not code and country:\n",
" \n",
" # find closest matching country\n",
" country = fuzzy_match(country, [member for member in codes.keys()])\n",
" code = codes[country]\n",
" \n",
" # if there's no code, check the country\n",
" if not code and not country:\n",
" \n",
" # default to all countries\n",
" country = 'All countries'\n",
" code = ''\n",
" \n",
" return country, code"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"code_folding": [
3
],
"hidden": true,
"hide_input": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" ^ [code] (for introspection: scanning notebook for cells)\n"
]
}
],
"source": [
"designate('scanning notebook for cells', 'introspection')\n",
"\n",
"# scan notebook for cell information\n",
"def get_cell_info():\n",
" \"\"\"Scan the notebook and collect cell information.\n",
"\n",
" Arguments:\n",
" None\n",
"\n",
" Returns:\n",
" list of dicts\n",
" \"\"\"\n",
"\n",
" # open the notebook file \n",
" with open('regressions_ksenia_2.ipynb', 'r', encoding='utf-8') as pointer:\n",
" \n",
" # and read its contents\n",
" contents = json.loads(pointer.read())\n",
"\n",
" # get all cells\n",
" cells = contents['cells']\n",
"\n",
" return cells"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"hidden": true,
"hide_input": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" ^ [code] (for main: defining global variables)\n"
]
}
],
"source": [
"designate('defining global variables')\n",
"\n",
"# ignore runtime warnings\n",
"warnings.filterwarnings('ignore')\n",
"\n",
"# set pandas options\n",
"pandas.set_option(\"display.max_rows\", None)\n",
"pandas.set_option(\"display.max_columns\", None)\n",
"\n",
"# begin optimizations list for previous optimizations\n",
"optimizations = []\n",
"\n",
"# establish genera and colors\n",
"classification = ['Unknown', 'Other', 'Aedes', 'Anopheles', 'Culex']\n",
"colors = ['gray', 'green', 'crimson', 'orange', 'magenta']\n",
"\n",
"# create indicator colors to be used on plots\n",
"indicators = {genus: color for genus, color in zip(classification, colors)}\n",
"indicators.update({'All': 'blue'})\n",
"\n",
"# initiate regression modes\n",
"regressions = {mode: {} for mode in ('linear', 'quadratic', 'exponential', 'power', 'gaussian')}\n",
"\n",
"# define cancellation message\n",
"cancellation = 'no fit achieved'\n",
"\n",
"# initialize memory dictionary for latitude, longitude measurements\n",
"memory = {}\n",
"\n",
"# define template for units\n",
"template = {'distance': '(km)', 'interval': '(d)', 'lag': '(d)', 'confidence': '', 'cutoff': '', 'inclusion': ''}\n",
"template.update({'mode': '', 'genus': '', 'records': '', 'pairs': '', 'coverage': ''})\n",
"template.update({'s.e.': '(larvae)', 'correlation': '', 'R^2': '', 'pvalue': '', 'equation': ''})\n",
"template.update({'slope': '(larvae/feature)', 'center': '(feature)', 'onset': '(feature)'})\n",
"template.update({'curvature': '(larvae/feature^2)', 'height': '(larvae)', 'rate': '(/feature)'})\n",
"template.update({'power': '', 'spread': '(feature^2)'})\n",
"\n",
"# define units\n",
"making = lambda unit: lambda feature: unit.replace('feature', truncate_field_name(feature))\n",
"units = {field: making(unit) for field, unit in template.items()}\n",
"\n",
"# make doppelganger for navigation\n",
"doppelganger = get_cell_info()"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"code_folding": [],
"hidden": true,
"hide_input": false,
"scrolled": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" ^ [code] (for main: import status)\n",
"modules imported.\n"
]
}
],
"source": [
"designate('import status')\n",
"\n",
"# print status\n",
"print('modules imported.')"
]
},
{
"cell_type": "markdown",
"metadata": {
"heading_collapsed": true,
"hide_input": true
},
"source": [
"### Notes on Navigation"
]
},
{
"cell_type": "markdown",
"metadata": {
"hidden": true,
"hide_input": false
},
"source": [
"#### General Organization:\n",
"\n",
"\n",
"- The notebook is organized in two main sections, with documentation near the top and user settings and plots in the second half. Relevant subroutines are generally presented in the documentation sections, or at the end of the preceding section.\n",
"\n",
"\n",
"- There are several sections that require the input of user selected parameters. Click Apply to see the effect of changing those parameters on that section only, then Propagate to propagate the changes down the notebook. Clicking Both will do both these actions.\n",
"\n",
"#### Running Cells:\n",
"\n",
"\n",
"- Upon loading the notebook, most plots will not be visible. It is necessary to run all the code by selecting \"Restart & Run All\" from the \"Kernel\" menu and clicking \"Restart and Run All Cells\" to confirm.\n",
"\n",
"\n",
"- This action may be performed at any time, for instance after altering the parameters or changing the code in other ways.\n",
"\n",
"\n",
"- Alternatively, any single block of code may be rerun by highlighting the block and pressing Shift-Return.\n",
"\n",
"\n",
"- Also, under the \"Cell\" menu is the option to \"Run All Below\" the currently selected cell, or to simply \"Run Cells\" that have been selected.\n",
"\n",
"\n",
"#### Processing Indicator:\n",
"\n",
"- In the upper righthand corner it says \"Python 3\" with a circle. If this circle is black, it means the program is still processing. A hollow circle indicates all processing is done.\n",
"\n",
"\n",
"#### Collapsible Headings and Code Blocks:\n",
"\n",
"- The Jupyter notebook format features collapsible code sections and headings. An entire section may be collapsed by clicking on the downward pointing triangle at the left of the heading. \n",
"\n",
"\n",
"- Likewise, blocks of code are loaded hidden from view, and designated with '[code] ^'. Click on the '[code] ^' text and select '^' from the toolbar next to \"Download\" to expand the code. Blocks with user parameters to enter are marked with *** settings ---->.\n",
"\n",
"\n",
"- All code blocks may be hidden or exposed by toggling the eye icon in the toolbar.\n",
"\n",
"\n",
"- Large output boxes may be collapsed to scrollable window by clicking to the left, and may also be collapsed completely by double-clicking in the same area. Clicking on the \"...\" will reopen the area.\n",
"\n",
"\n",
"#### Hosting by myBinder:\n",
"\n",
"\n",
"- This notebook is hosted by myBinder.org in order to maintain its interactivity within a browser without the user needing an established Python environment. Unfortunately, connection with myBinder.org will break after 10 minutes of inactivity. In order to reconnect you may use the link under \"Browser Link\" to reload.\n",
"\n",
"\n",
"- The state of the notebook may be saved by clicking the leftmost cloud icon in the toolbar to the right of the Download button. This saves the notebook to the browser. The rightmost cloud icon can then retrieve this saved state in a newly opened copy. Often reopening a saved version comes with all code blocks visible, so toggle this using the eye icon in the toolbar.\n",
"\n",
"\n",
"- The following browser link will reload the notebook in case the connection is lost:\n",
"https://mybinder.org/v2/git/https%3A%2F%2Fmattbandel%40bitbucket.org%2Fmattbandel%2Fglobe-mosquitoes-regressions.git/master?filepath=regressions.ipynb"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"code_folding": [
3
],
"hidden": true,
"hide_input": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" ^ [code] (for navigation: looking for particular cell)\n"
]
}
],
"source": [
"designate('looking for particular cell', 'navigation')\n",
"\n",
"# function to look for cells with a particular text snippet\n",
"def seek_text_in_cell(text):\n",
" \"\"\"Look for a particular text amongst the cells.\n",
" \n",
" Arguments:\n",
" text: str, the text to search for\n",
" \n",
" Returns:\n",
" list of int, the cell indices.\n",
" \"\"\"\n",
" \n",
" # search for cells \n",
" indices = []\n",
" for index, cell in enumerate(doppelganger):\n",
" \n",
" # search for text in source\n",
" if any([text in line.replace(\"'{}'\".format(text), '') for line in cell['source']]):\n",
" \n",
" # add to list\n",
" indices.append(index)\n",
" \n",
" return indices"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"code_folding": [
3
],
"hidden": true,
"hide_input": true,
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" ^ [code] (for navigation: jumping to a particular cell)\n"
]
}
],
"source": [
"designate('jumping to a particular cell', 'navigation')\n",
"\n",
"# jump to a particular cell\n",
"def jump_to_cell(identifier):\n",
" \"\"\"Jump to a particular cell.\n",
" \n",
" Arguments:\n",
" identifier: int or str\n",
" \n",
" Returns:\n",
" None\n",
" \"\"\"\n",
" \n",
" # try to look for a string\n",
" try:\n",
" \n",
" # assuming string, take first index with string\n",
"    index = seek_text_in_cell(identifier)[0]\n",
" \n",
" # otherwise assume int\n",
" except (TypeError, IndexError):\n",
" \n",
" # index is identifier\n",
" index = identifier \n",
" \n",
" # scroll to cell\n",
" command = 'IPython.notebook.scroll_to_cell({})'.format(index)\n",
" display(Javascript(command))\n",
" \n",
" return"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"code_folding": [
3
],
"hidden": true,
"hide_input": true,
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" ^ [code] (for navigation: executing cell range by text)\n"
]
}
],
"source": [
"designate('executing cell range by text', 'navigation')\n",
"\n",
"# execute cell range command\n",
"def execute_cell_range(start, finish):\n",
" \"\"\"Execute a cell range based on text snippets.\n",
" \n",
" Arguments:\n",
" start: str, text from beginning cell of range\n",
" finish: str, text from ending cell of range\n",
" \n",
" Returns:\n",
" None\n",
" \"\"\"\n",
" \n",
" # find start and finish indices, adding 1 to be inclusive\n",
" opening = seek_text_in_cell(start)[0] \n",
" closing = seek_text_in_cell(finish)[0]\n",
" bracket = (opening, closing)\n",
" \n",
" # make command\n",
" command = 'IPython.notebook.execute_cell_range' + str(bracket)\n",
" \n",
" # perform execution\n",
" display(Javascript(command))\n",
" \n",
" return None"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"code_folding": [
3
],
"hidden": true,
"hide_input": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" ^ [code] (for navigation: refreshing cells by relative position)\n"
]
}
],
"source": [
"designate('refreshing cells by relative position', 'navigation')\n",
"\n",
"# execute cell range command\n",
"def refresh_cells_by_position(start, finish=None):\n",
" \"\"\"Refresh a particular cell relative to current cell.\n",
" \n",
" Arguments:\n",
" start: int, the first cell offset\n",
" finish=None: int, the second cell offset\n",
" \n",
" Returns:\n",
" None\n",
" \"\"\"\n",
" \n",
" # make offset into a string\n",
" stringify = lambda offset: str(offset) if offset < 0 else '+' + str(offset)\n",
" \n",
" # default finish to start\n",
"    finish = finish if finish is not None else start\n",
" \n",
" # make command\n",
" command = 'IPython.notebook.execute_cell_range('\n",
" command += 'IPython.notebook.get_selected_index()' + stringify(start) + ','\n",
" command += 'IPython.notebook.get_selected_index()' + stringify(finish + 1) + ')'\n",
" \n",
" # perform execution\n",
" display(Javascript(command))\n",
" \n",
" return None"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"code_folding": [
3
],
"hidden": true,
"hide_input": true,
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" ^ [code] (for navigation: revealing open cells)\n"
]
}
],
"source": [
"designate('revealing open cells', 'navigation')\n",
"\n",
"# outline headers\n",
"def reveal_open_cells(cells):\n",
" \"\"\"Outline the headers and collapsed or uncollapsed state.\n",
" \n",
" Arguments:\n",
" cells: dict\n",
" \n",
" Returns:\n",
" list of int, the indices of visible cells\n",
" \"\"\"\n",
" \n",
" # search through all cells for headers\n",
" indices = []\n",
" visible = True\n",
" for index, cell in enumerate(cells):\n",
" \n",
" # check for text\n",
" header = False\n",
" if any(['###' in text for text in cell['source']]):\n",
" \n",
" # check for header and visible state\n",
" header = True\n",
" visible = True\n",
" if 'heading_collapsed' in cell['metadata'].keys(): \n",
"\n",
" # set visible flag\n",
" visible = not cell['metadata']['heading_collapsed']\n",
"\n",
" # if either header or visible\n",
" if header or visible: \n",
"\n",
" # add to indices\n",
" indices.append(index)\n",
" \n",
" return indices"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"code_folding": [
3
],
"hidden": true,
"hide_input": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" ^ [code] (for navigation: gauging cell size)\n"
]
}
],
"source": [
"designate('gauging cell size', 'navigation')\n",
"\n",
"# measure a cell's line count and graphics\n",
"def measure_cell_info(cell):\n",
" \"\"\"Gauge a cell's line count and graphic size.\n",
" \n",
" Arguments:\n",
" cell: cell dict\n",
" \n",
" Returns:\n",
" (int, boolean) tuple, line count and graphic boolean\n",
" \"\"\"\n",
" \n",
" # check for display data\n",
" graphic = False\n",
" displays = [entry for entry in cell.setdefault('outputs', []) if entry['output_type'] == 'display_data']\n",
" if len(displays) > 0:\n",
" \n",
" # check for many displays or one long one\n",
" if len(displays) > 2 or '…' in displays[0]['data']['text/plain'][0]:\n",
"\n",
" # switch graphic to true\n",
" graphic = True\n",
"\n",
" # determine total lines of text in source, 2 by default\n",
" length = 2\n",
" \n",
" # determine executions\n",
" executions = [entry for entry in cell.setdefault('outputs', []) if entry['output_type'] == 'execute_result']\n",
" for execution in executions:\n",
" \n",
" # add to length\n",
" length += execution['execution_count']\n",
" \n",
" # check hide-input state\n",
" if not cell['metadata'].setdefault('hide_input', False):\n",
" \n",
" # add lines to source\n",
" source = cell['source']\n",
" for line in source:\n",
"\n",
" # split on newlines\n",
" length += sum([int(len(line) / 100) + 1 for line in line.split('\\n')])\n",
"\n",
" return length, graphic"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {
"code_folding": [
3
],
"hidden": true,
"hide_input": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" ^ [code] (for navigation: bookmarking cells for screenshotting)\n"
]
}
],
"source": [
"designate('bookmarking cells for screenshotting', 'navigation')\n",
"\n",
"# bookmark which cells to scroll to\n",
"def bookmark_cells(cells):\n",
" \"\"\"Bookmark which cells to scroll to.\n",
"\n",
" Arguments:\n",
" cells: list of dicts\n",
" visibles: list of ints\n",
"\n",
" Returns:\n",
" list of ints\n",
" \"\"\"\n",
"\n",
" # set page length criterion and initialize counters\n",
" criterion = 15\n",
" accumulation = criterion + 1\n",
"\n",
" # determine scroll indices\n",
" bookmarks = []\n",
" visibles = reveal_open_cells(cells)\n",
" for index in visibles:\n",
"\n",
" # measure cell and add to total\n",
" cell = cells[index]\n",
" length, graphic = measure_cell_info(cell)\n",
" accumulation += length\n",
" \n",
" # compare to criterion\n",
" if accumulation > criterion or graphic:\n",
"\n",
" # add to scrolls and reset\n",
" bookmarks.append(index)\n",
" accumulation = length\n",
"\n",
" # for a graphic, make sure accumulation is already maxed\n",
" if graphic:\n",
" \n",
" # add to accumulation\n",
" accumulation = criterion + 1\n",
"\n",
" return bookmarks"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {
"code_folding": [
3
],
"hidden": true,
"hide_input": true,
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" ^ [code] (for buttons: propagating setting changes across cells)\n"
]
}
],
"source": [
"designate('propagating setting changes across cells', 'buttons')\n",
"\n",
"# def propagate\n",
"def propagate_setting_changes(start, finish, finishii, descriptions=['Apply', 'Propagate', 'Both']):\n",
" \"\"\"Propagate changes across all code cells given by the headings.\n",
" \n",
" Arguments:\n",
" start: str, top header\n",
" finish: str, update stopping point\n",
" finishii: str, propagate stopping point\n",
" descriptions: list of str\n",
" \n",
" Returns:\n",
" None\n",
" \"\"\"\n",
" \n",
" # define jump points\n",
" cues = [(start, finish), (finish, finishii), (start, finishii)]\n",
" \n",
" # make buttons\n",
" buttons = []\n",
" buttoning = lambda start, finish: lambda _: execute_cell_range(start, finish)\n",
" for description, cue in zip(descriptions, cues):\n",
"\n",
" # make button\n",
" button = Button(description=description)\n",
" button.on_click(buttoning(*cue))\n",
" buttons.append(button)\n",
"\n",
" # display\n",
" display(HBox(buttons))\n",
" \n",
" return None"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {
"code_folding": [
3
],
"hidden": true,
"hide_input": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" ^ [code] (for buttons: navigating to main sections)\n"
]
}
],
"source": [
"designate('navigating to main sections', 'buttons')\n",
"\n",
"# present buttons to jump to particular parts of the notebook\n",
"def navigate_notebook():\n",
" \"\"\"Guide the user towards regression sections with buttons.\n",
" \n",
" Arguments:\n",
" None\n",
" \n",
" Returns:\n",
" None\n",
" \"\"\"\n",
"\n",
" # define jump points\n",
" descriptions = ['Top', 'Settings', 'Filter', 'Weights', 'Data', 'Map']\n",
"    cues = ['# Mosquito Regressions', '### Setting the Parameters', '### Filtering Records']\n",
" cues += ['### Defining the Weighting Scheme', '### Viewing the Data Table', '### Visualizing on a Map', ]\n",
" \n",
" # make buttons\n",
" buttons = []\n",
" buttoning = lambda cue: lambda _: jump_to_cell(cue)\n",
" for description, cue in zip(descriptions, cues):\n",
"\n",
" # make button\n",
" button = Button(description=description)\n",
" button.on_click(buttoning(cue))\n",
" buttons.append(button)\n",
"\n",
" # display\n",
" display(HBox(buttons))\n",
" \n",
" return None"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {
"code_folding": [
3
],
"hidden": true,
"hide_input": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" ^ [code] (for buttons: guiding to regression modes)\n"
]
}
],
"source": [
"designate('guiding to regression modes', 'buttons')\n",
"\n",
"# present buttons to choose the regression part of the notebook\n",
"def jump_to_regression():\n",
" \"\"\"Guide the user towards regression sections with buttons.\n",
" \n",
" Arguments:\n",
" None\n",
" \n",
" Returns:\n",
" None\n",
" \"\"\"\n",
"\n",
" # make buttons\n",
" buttons = []\n",
" buttoning = lambda mode: lambda _: jump_to_cell('### ' + mode.capitalize() + ' Regression')\n",
" for mode in regressions.keys():\n",
"\n",
" # make button\n",
" button = Button(description=mode.capitalize())\n",
" button.on_click(buttoning(mode))\n",
" buttons.append(button)\n",
"\n",
" # display\n",
" display(HBox(buttons))\n",
" \n",
" return None"
]
},
{
"cell_type": "markdown",
"metadata": {
"hide_input": true
},
"source": [
"### Getting Started"
]
},
{
"cell_type": "markdown",
"metadata": {
"hide_input": true
},
"source": [
"- Select \"Restart & Run All\" from the Kernel menu, and confirm by clicking on \"Restart and Run All Cells\" and wait for the processing to stop (the black circle in the upper right corner next to \"Python 3\" will turn hollow).\n",
"\n",
"\n",
"- Use a Settings button from a navigation menu like the one below to navigate to the Settings section.\n",
"\n",
"\n",
"- Find the ^ [code] block marked with *** settings ----->, and open it using the \"^\" button in the toolbar at the top of the page. Begin inputting your settings and apply the changes with the buttons. Then move on to the next section and apply those settings as well."
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {
"hide_input": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" ^ [code] (for main: navigation buttons)\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "fde83cde28cd4fa096a519dd6f425d4a",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(Button(description='Top', style=ButtonStyle()), Button(description='Settings', style=ButtonStyl…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "2e01dc09c2f9476db2c9e6da1a665fac",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(Button(description='Linear', style=ButtonStyle()), Button(description='Quadratic', style=Button…"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"designate('navigation buttons')\n",
"\n",
"# set two navigation buttons\n",
"navigate_notebook()\n",
"jump_to_regression()"
]
},
{
"cell_type": "markdown",
"metadata": {
"heading_collapsed": true,
"hide_input": true
},
"source": [
"### Background: A Quick Statistics Rundown"
]
},
{
"cell_type": "markdown",
"metadata": {
"hidden": true
},
"source": [
"#### Definition of a linear regression\n",
"\n",
"A straight line can be represented geometrically in the following way:\n",
"\n",
"\\begin{align*}\n",
"y=\\beta_0+\\beta_1 X + \\varepsilon\n",
"\\end{align*}\n",
"\n",
"where $X$ is the independent variable, $y$ is the dependent variable, $\\beta_1$ is the slope of the line, $\\beta_0$ is the intercept, and $\\varepsilon$ is the error term. Any particular line has a single value for $\\beta_0$ and for $\\beta_1$. Thus the above equation describes a family of lines, each with a unique value for $\\beta_0$ and for $\\beta_1$. \n",
"\n",
"If $X$ represents air temperature at a sampling site, for instance, and $y$ represents the number of mosquito larvae found there, then $\\beta_0$ describes the average number of larvae at zero degrees, and $\\beta_1$ describes the average change in the number of larvae for every single degree increase. The equation serves as a model for the relationship between air temperature and mosquito counts.\n",
"\n",
"Given a set of observations represented as points along an X and y axis, regression can estimate the values for $\\beta_0$ and $\\beta_1$ and give insight into a relationship between the dependent and independent variables.\n",
"\n",
"#### Some notes\n",
"\n",
"There are several points to make about this process:\n",
"\n",
"- The family of lines must be specified beforehand. $y=\\beta_0+\\beta_1 X$ for instance, only describes a family of straight lines. The best fitting straight line may be a poor description of data with a curving relationship. To this end, six different families (called \"modes\" here) are used in this notebook, each with particular characteristics. \n",
"\n",
"\n",
"- For some modes, the best fitting parameters may be estimated simply. In these cases, the Ordinary Least Squares equation is sufficient to find the best fit with one calculation. In other cases, however, the fit must be found with Weighted Least Squares, needing an initial starting guess, and several subsequent iterations to find the best fit.\n",
"\n",
"\n",
"- The initial guess must already be somewhat close to the best fitting parameters, or there is a chance the nonlinear algorithm will find only a local best and not a global best. There may be several \"basins of attraction,\" and an initial guess in the wrong basin will lead to only a local best. \n",
"\n",
"\n",
"- In some cases, the nonlinear algorithm fails to find a fit at all, generally because the data is not distributed in a way that suggests the proposed relationship strongly enough.\n",
"\n",
"\n",
"- In other cases, nonlinear or linear regression can produce seemingly absurd results. For example, an entire Gaussian curve has a bell shape, but just one of the tails is a good approximation to an exponential curve. If the data distribution does not clearly suggest a Gaussian curve, the regression may find that fitting a huge Gaussian with only its tail immersed amongst the data points gives a closer fit to the data than a more reasonably sized complete Gaussian.\n",
"\n",
"#### Statistical tools\n",
"\n",
"- Standard error is an excellent tool for judging how well a model fits the data.\n",
" 1) The [standard error of estimate](https://en.wikipedia.org/wiki/Standard_error) ($s_e$) is the typical difference found between the estimates from the regression and the actual observations. In particular, it is the standard deviation of the sampling distribution.\n",
"\n",
"- The best fitting line may still be a poor description of the relationship. There are a few summary statistics given here to indicate the quality of the fit:\n",
" \n",
" 1) [Pearson's correlation coefficient](https://en.wikipedia.org/wiki/Pearson_correlation_coefficient) ($\\rho$) is a unitless statistic between -1 and 1 that represents the linear correlation between X and y. A value of 1 represents a positive linear relationship between the dependent and independent variable. A value of 0 represents no linear correlation, and a value of -1 indicates a negative linear relationship.\n",
" \n",
" 2) The [coefficient of determination](https://en.wikipedia.org/wiki/Coefficient_of_determination) ($R^2$) quantifies how much of the variation in y can be explained by the model. It is a unitless statistic between 0 and 1. An $R^2$ of 1 can indicate that the model fits the observations perfectly whereas a value of 0 can indicate that the model fits the data poorly. **Note** that $R^2$ will always increase with more predictors but that *does not mean* that the model is better.\n",
" \n",
" \n",
"- Correlation and $R^2$ are excellent for understanding how well a model fits the data if we have linear data. Standard error is useful for nonlinear models. In general, these measures are mostly used for linear relationships, and are less frequently used for nonlinear modes. The goal is to fit the data as closely as possible, but not at the expense of its ability to generalize to the population as a whole.\n",
"\n",
"#### Caution\n",
"\n",
"- Note also that the statistics above are susceptible to a number of problems. First, if the dataset is small, then correlation and $R^2$ may not be quite accurate. Second, there is a chance that the observations are not representative of the population, and the correlation hinted at by the model is due to sampling bias. A fun explanation of this is the [Datasaurus](https://www.autodeskresearch.com/publications/samestats). All of these plots have the same summary statistics!\n",
"![Datasaurus](https://d2f99xq7vri1nk.cloudfront.net/DinoSequentialSmaller.gif \"datasaurus\")\n",
"\n",
"\n",
"#### Hypothesis testing\n",
"\n",
"- Additionally, a probability (p-value) is calculated to assess the statistical significance of the relationship between X and y. Remember that a probability ranges from 0 to 1. This [hypothesis test](http://www.biostathandbook.com/hypothesistesting.html) is structured as follows:\n",
"\n",
" 1) Assert the null hypothesis ($H_0$). $H_0$ is that there is no linear relationship between X and y.\n",
" \n",
" 2) Choose a critical value ($\\alpha$ -- typically set to 0.05 but is sometimes set to a value as high as 0.1 and as low as 0.001) which we will use to conduct a Hypothesis Test.\n",
" \n",
" 3) Next we calculate a [test statistic](https://en.wikipedia.org/wiki/Pearson_correlation_coefficient#Testing_using_Student's_t-distribution) ($t* = r\\sqrt{\\frac{n-2}{1-r^2}}$ where $r$ is the sample correlation coefficient and $n$ is the number of samples). Use the Test Statistic to find the corresponding p-value (use a [table](http://www.ttable.org/) or the [calculator](http://www.ttable.org/student-t-value-calculator.html)). Note that the Test Statistic will differ between hypotheses.\n",
" \n",
" 4) Compare the p-value found in step 3 to the $\\alpha$ value selected in step 2. A p-value less than $\\alpha$ means that we reject the null hypothesis which means that there is not no linear relationship between X and y which often means that there is a linear relationship between X and y. A p-value greater than $\\alpha$ means that we fail to reject the null hypothesis meaning that there is likely no linear relationship between X and y. It is important to choose a critical value before performing the study, because it is very easy to grant your study significance by calculating the p-value first and then choosing a critical value higher than the one you calculated (called [\"p-hacking\"](https://en.wikipedia.org/wiki/Data_dredging)).\n",
"\n",
"#### Considerations\n",
"\n",
"- Domain knowledge is key for translating the regression models to real situations. The model can only explain the given observations. In particular, a regression model is an interpolation model. We should be wary of [extrapolation](https://online.stat.psu.edu/stat501/lesson/12/12.8) beyond the range of the data unless it is properly justifiable -- remember to ask yourself, is this model logical?\n",
"\n",
"\n",
"- Consider confounding variables while conducting your analysis. A statistically significant model that reasonably models a relationship may still be misleading due to confounding variables. For instance, rainier weather is usually associated with cooler temperatures. An observation of a large number of larvae on a day with cooler temperatures may really reflect the relationship between larvae and rainfall. In this example, precipitation would be a confounding variable.\n",
"\n",
"\n",
"- Remember! Correlation is not causation. A representative model that models the data well and is free from confounding variables can only describe a correlation between the dependent and independent variables. We cannot answer the question of causation with regression.\n",
"\n",
"\n",
"#### Checking your model\n",
"\n",
"First, note that the true representation of linear data is $y=\\beta_0+\\beta_1 X + \\varepsilon$ and $\\hat{y}=\\hat{\\beta}_0+\\hat{\\beta}_1 X$ is our model. We can find an estimation of the error, $\\varepsilon$, by: $y-\\hat{y} = \\hat{\\varepsilon}$ where $\\hat{\\varepsilon}$ is the estimated error.\n",
"\n",
"\n",
"An excellent [diagnostic](http://sphweb.bumc.bu.edu/otlt/MPH-Modules/BS/R/R5_Correlation-Regression/R5_Correlation-Regression7.html) for linear regression is a plot of the residuals versus fitted values plot. The residuals are an estimation of the error as seen above and the fitted values are our estimated response. In the plot below, we see the residuals on the y-axis and the fitted values on the x-axis. We can learn a lot from the following figure:\n",
"![ConstantVar](https://www.researchgate.net/profile/John_Hodsoll/publication/318911883/figure/fig1/AS:535566758277120@1504700473370/Diagnostic-plots-of-the-residuals-plotted-against-fitted-values-of-regression-model-to.png \"constantvar\")\n",
"\n",
" A) Suggests that a linear regression could be a very good model.\n",
" \n",
" B) Suggests that we should use Weighted Least Squares to account for non constant variance\n",
" \n",
" C) Suggests that our data is not independent. This could suggest that some of our predictors are correlated and we should consider removing some of them. It could also suggest that our data is correlated in time or space and should be processed using time series or spatial statistics methods.\n",
" \n",
" D) Suggests we do not have a linear relationship and should consider non linear methodology.\n",
"\n",
"\n",
"Note that Ordinary Least Squares assumes a constant variance among the errors meaning that the errors around each observation have the same standard deviation. Weighted Least Squares allows for non constant variance.\n",
"\n",
"\n",
"For this particular study, there is an additional level of complexity because data from two protocols are being combined, and are therefore not necessarily concurrent. To this end, observations from one protocol are paired with observations from another protocol and weighted according to their closeness in space and time.\n",
"\n",
"#### To be continued + Summary\n",
"\n",
"Details of the Linear and Non-Linear Weighted Least Squares algorithms used in this notebook are found in the following sections, as well as the statistical methods for measuring model quality, and specific descriptions of each regression mode.\n",
"\n",
"In summary, the process will be performed as follows:\n",
"\n",
"- Retrieve records from the GLOBE API, process the data, and prune off outliers.\n",
"- Assemble associations between the two protocols, weighing them according to the chosen weighting parameters.\n",
"- Perform Linear Weighted Least Squares on an approximate problem to get reasonable starting parameter values.\n",
"- Evaluate the model according to standard error, Pearson's correlation coefficient, coefficient of determination, and p-value."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### What the code below is doing:\n",
"\n",
"#### Summary\n",
"\n",
"In this notebook, we are looking at the relationship between data from the MHM and other GLOBE data. In order to do so, we perform semi-complex regressions to fit a handful of different curves. See more info on which curves we use further down.\n",
"\n",
"#### Under the hood\n",
"\n",
"In particular, we use a non linear method to fit the curves called Non-Linear Weighted Least Squares. This method requires initial guesses for the parameters of each function. These parameters are estimated by getting the coefficients from a [Poisson Regression](http://www.adasis-events.com/statistics-blog/2012/11/26/what-is-the-difference-between-linear-logistic-and-poisson-r.html) (implemented using scikit-learn's [sklearn.linear_model.PoissonRegressor](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.PoissonRegressor.html#sklearn.linear_model.PoissonRegressor)). These parameters are then used in the Levenberg-Marquardt algorithm (an algorithm for Non-Linear Weighted Least Squares) which was implemented using the scipy function [scipy.optimize.curve_fit](https://docs.scipy.org/doc/scipy/reference/generated/scipy.optimize.curve_fit.html)."
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {
"hidden": true,
"hide_input": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" ^ [code] (for main: navigation buttons)\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "9e240c17119e474c8bcead31eedf4e2a",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(Button(description='Top', style=ButtonStyle()), Button(description='Settings', style=ButtonStyl…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "986e8f7939b4436b84fb1af46684286b",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(Button(description='Linear', style=ButtonStyle()), Button(description='Quadratic', style=Button…"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"designate('navigation buttons')\n",
"\n",
"# set two navigation buttons\n",
"navigate_notebook()\n",
"jump_to_regression()"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {
"code_folding": [
3
],
"hidden": true,
"hide_input": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" ^ [code] (for graphing: notching ticks in a range)\n"
]
}
],
"source": [
"designate('notching ticks in a range', 'graphing')\n",
"\n",
"# notch function to get evenly spaced points in a range\n",
"def set_ticks(left, right, number=100):\n",
" \"\"\"Notch a number of evenly spaced ticks along a span.\n",
" \n",
" Arguments:\n",
" left: float, left axis boundary\n",
" right: float, right axis boundary\n",
" number=100: int, the number of intervals (the returned list has number + 1 ticks)\n",
" \n",
" Returns:\n",
" list of floats, tick positions from left to right inclusive\n",
" \"\"\"\n",
" \n",
" # get chunk length\n",
" chunk = (right - left) / number\n",
" ticks = [left + index * chunk for index in range(number + 1)]\n",
" \n",
" return ticks"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {
"code_folding": [
3
],
"hidden": true,
"hide_input": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" ^ [code] (for graphing: sketching functions)\n"
]
}
],
"source": [
"designate('sketching functions', 'graphing')\n",
"\n",
"# sketch a function\n",
"def sketch_plot(*functions, legend=None, span=(-5, 5), title=\"[TEMP TITLE]\", xlab=\"[TEMP X LABEL]\", ylab=\"[TEMP Y LABEL]\"):\n",
"    \"\"\"Sketch one or more functions as lines on a single bokeh figure.\n",
"    \n",
"    Arguments:\n",
"        *functions: unpacked list of function objects\n",
"        legend=None: list of str, one label per function (defaults to '1', '2', ...)\n",
"        span=(-5, 5): tuple of floats, the x-axis range\n",
"        title: str, the plot title\n",
"        xlab: str, the x-axis label\n",
"        ylab: str, the y-axis label\n",
"    \n",
"    Returns:\n",
"        None\n",
"    \"\"\"\n",
"    \n",
"    # begin curve (bokeh axis labels must be plain strings, not lists)\n",
"    curve = figure(x_range=span, plot_width=300, plot_height=300, title=title)\n",
"    curve.xaxis.axis_label = xlab\n",
"    curve.yaxis.axis_label = ylab\n",
"    \n",
"    # set colors\n",
"    colors = ['red', 'green', 'blue', 'violet', 'cyan', 'orange']\n",
"    \n",
"    # set default legend\n",
"    if not legend:\n",
"    \n",
"        # label each curve by its position\n",
"        legend = [str(index + 1) for index, _ in enumerate(functions)]\n",
"    \n",
"    # get points\n",
"    xs = set_ticks(*span)\n",
"    \n",
"    # plot functions\n",
"    for function, color, name in zip(functions, colors, legend):\n",
"        if function == arcsinh:\n",
"            # halve the input for arcsinh so its curve is comparable in scale\n",
"            points = [{'x': x, 'y': function(x/2)} for x in xs]\n",
"        else:\n",
"            # graph line\n",
"            points = [{'x': x, 'y': function(x)} for x in xs]\n",
"        table = ColumnDataSource(pandas.DataFrame(points))\n",
"        curve.line(source=table, x='x', y='y', color=color, line_width=1, legend_label=name)\n",
"    \n",
"    # add hover annotations and legend\n",
"    annotations = [('x', '@x'), ('y', '@y')]\n",
"    hover = HoverTool(tooltips=annotations)\n",
"    curve.add_tools(hover)\n",
"    curve.legend.location='top_left'\n",
"    \n",
"    # show the results\n",
"    output_notebook()\n",
"    show(curve)\n",
"    \n",
"    return None"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {
"code_folding": [
3
],
"hidden": true,
"hide_input": true,
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" ^ [code] (for graphing: annotating graphs)\n"
]
}
],
"source": [
"designate('annotating graphs', 'graphing')\n",
"\n",
"# annotate graphs subroutine\n",
"def annotate_plot(graph, annotations):\n",
" \"\"\"Annotate the graph with annotations.\n",
" \n",
" Arguments:\n",
" graph: bokeh graph object\n",
" annotations: list of (str, str) tuples\n",
" \n",
" Returns:\n",
" graph object\n",
" \"\"\"\n",
" \n",
" # set up hover summary\n",
" summary = \"\"\"\n",
" \n",
" \n",
" \"\"\"\n",
" \n",
" # add annotations\n",
" for field, value in annotations:\n",
" \n",
" # add to summary\n",
" summary += '{}: {}<br>'.format(field, value)\n",
"\n",
" # setup hovertool\n",
" hover = HoverTool()\n",
" hover.tooltips = summary\n",
" graph.add_tools(hover)\n",
" \n",
" # set up graph legend\n",
" graph.legend.location='top_left'\n",
" graph.legend.click_policy='hide'\n",
" \n",
" return graph"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {
"code_folding": [],
"hidden": true,
"hide_input": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" ^ [code] (for regressions: issuing a ticket)\n"
]
}
],
"source": [
"designate('issuing a ticket', 'regressions')\n",
"\n",
"# function to issue a default ticket based on settings\n",
"def issue_ticket(settings, genus, mode):\n",
" \"\"\"Issue a blank ticket with default settings.\n",
" \n",
" Arguments:\n",
" settings: dict\n",
" genus: str\n",
" mode: str\n",
" \n",
" Returns:\n",
" dict\n",
" \"\"\"\n",
" \n",
" # begin ticket with all values set to zero\n",
" ticket = {parameter: 0 for parameter, _ in units.items()}\n",
" \n",
" # update with settings\n",
" ticket.update(settings)\n",
" \n",
" # add other default settings\n",
" ticket.update({'genus': genus, 'mode': mode, 'pvalue': 1.0, 'equation': cancellation})\n",
" ticket.update({'coefficients': [0] * (regressions[mode]['polynomial'] + 2)})\n",
" ticket.update({'curve': [0] * regressions[mode]['requirement']})\n",
" \n",
" return ticket"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {
"code_folding": [],
"hidden": true,
"hide_input": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" ^ [code] (for regressions: running a regression on samples)\n"
]
}
],
"source": [
"designate('running a regression on samples', 'regressions')\n",
"\n",
"# generalized regression function\n",
"def regress(samples, ticket, spy=False, calculate=True):\n",
" \"\"\"Perform regression on the samples, based on the submission ticket.\n",
" \n",
" Arguments:\n",
" samples: list of dicts, the samples\n",
" ticket: dict, the settings\n",
" approximation: float, approximate value for zero\n",
" spy: boolean, verify initial regression fits with plots?\n",
" calculate: boolean, calculate jacobian directly?\n",
" \n",
" Returns:\n",
" dict, the report\n",
" \"\"\" \n",
" \n",
" # resolve regression styles\n",
" mode = ticket['mode']\n",
" size = ticket['records']\n",
" \n",
" # try to run regression\n",
" try:\n",
" \n",
" # get coefficients from linear model\n",
" coefficients = get_coefficients_linear_regression(samples, mode)\n",
"\n",
" # fit non linear model\n",
" curve = perform_nonlinear_regression(samples, mode, coefficients, calculate)\n",
"\n",
" # check the two fits against each other\n",
" if spy:\n",
"\n",
" # verify\n",
" check_optimized_params_with_initial(samples, mode, coefficients, curve)\n",
"\n",
" # assess the model\n",
" assessment = check_model_fit(samples, mode, curve, size)\n",
" ticket.update(assessment)\n",
" \n",
" # but skip for math errors\n",
" except (ZeroDivisionError, TypeError, ValueError, RuntimeError):\n",
" \n",
" # skip\n",
" pass\n",
" \n",
" return ticket"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {
"code_folding": [],
"hidden": true,
"hide_input": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" ^ [code] (for regressions: assessing a model)\n"
]
}
],
"source": [
"designate('assessing a model', 'regressions')\n",
"\n",
"# assess model fit\n",
"def check_model_fit(samples, mode, curve, size):\n",
" \"\"\"Assess the model by comparing predictions to targets\n",
" \n",
" Arguments:\n",
" samples: list of dicts\n",
" mode: str, the regression mode\n",
" curve: list of floats, the regression parameters\n",
" size: int, number of records\n",
" \n",
" Returns:\n",
" dict\n",
" \"\"\"\n",
" \n",
" # make predictions using model\n",
" matrix = [sample['x'] for sample in samples]\n",
" weights = [sample['weight'] for sample in samples]\n",
" truths = [sample['y'] for sample in samples]\n",
" predictions = [regressions[mode]['function'](entry, *curve) for entry in matrix]\n",
" \n",
" # get validation scores\n",
" validation = calculate_corr_coef(truths, predictions, weights, size)\n",
" \n",
" # create equation\n",
" equation = regressions[mode]['equation']\n",
" for parameter, place in zip(curve, ('β0', 'β1','β2')):\n",
" \n",
" # replace in equation\n",
" equation = equation.replace(place, '{}'.format(round(float(parameter), 2)))\n",
"\n",
" # get critical points\n",
" criticals = {name: round(float(quantity), 4) for name, quantity in zip(regressions[mode]['names'], curve)}\n",
"\n",
" # make assessment\n",
" assessment = {'curve': curve, 'equation': equation}\n",
" assessment.update(validation)\n",
" assessment.update(criticals)\n",
" \n",
" return assessment"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {
"code_folding": [],
"hidden": true,
"hide_input": true,
"scrolled": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" ^ [code] (for performing: performing regression study)\n"
]
}
],
"source": [
"designate('performing regression study', 'performing')\n",
"\n",
"# perform regression mode on mosquitoes data\n",
"def perform(mode, associations, spy=False, calculate=True):\n",
" \"\"\"Perform a mode of regression on a set of associations.\n",
" \n",
" Arguments:\n",
" mode: str, the mode of regression\n",
" associations: list of dicts\n",
" spy=True: boolean, observe initial parameter fits?\n",
" calculate: boolean, calculate Jacobian directly?\n",
" \n",
" Returns:\n",
" None\n",
" \"\"\"\n",
" \n",
" # make graph\n",
" graph, panda = plot_scatter_plot(associations, mode, spy, calculate)\n",
"\n",
" # show the results\n",
" output_notebook()\n",
" show(graph)\n",
"\n",
" # get columns and add units\n",
" columns = ['genus', 'records', 'pairs', 'coverage', 'pvalue', 'correlation', 'R^2', 's.e.', 'equation']\n",
" columns += regressions[mode]['names']\n",
" panda = panda[columns]\n",
" panda.columns = [column + units[column](feature) for column in columns]\n",
"\n",
" # show panda\n",
" display(panda)\n",
" \n",
" return None"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {
"code_folding": [],
"hidden": true,
"hide_input": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" ^ [code] (for regressions: studying all genera)\n"
]
}
],
"source": [
"designate('studying all genera', 'regressions')\n",
"\n",
"# function to run regression on the associations and return reports per genus\n",
"def study(associations, mode, spy=True, calculate=True):\n",
" \"\"\"Study the data under a regression mode.\n",
" \n",
" Arguments:\n",
" associations: list of dicts\n",
" mode: str, the regression mode\n",
" spy: boolean, plot initial linear fits?\n",
" calculate: boolean, calculate jacobian directly?\n",
" \n",
" Returns:\n",
" list of dicts\n",
" \"\"\"\n",
" \n",
" # go through each genus, running regression\n",
" reports = []\n",
" for genus in ['All', 'Aedes', 'Anopheles', 'Culex', 'Other', 'Unknown']:\n",
" \n",
" # begin ticket\n",
" ticket = issue_ticket(settings, genus, mode)\n",
" \n",
" # perform subsampling by genus and make the samples\n",
" subset = get_subset_of_data(associations, genus)\n",
" samples = assemble_samples(subset)\n",
" coverage = round(len(subset) / len(data), 2)\n",
" ticket.update({'records': len(subset), 'pairs': len(samples), 'coverage': coverage})\n",
" \n",
" # fit the regressor\n",
" report = regress(samples, ticket, spy, calculate)\n",
" reports.append(report)\n",
" \n",
" return reports"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {
"code_folding": [],
"hidden": true,
"hide_input": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" ^ [code] (for debugging: interpolating between parameters)\n"
]
}
],
"source": [
"designate('interpolating between parameters', 'debugging')\n",
"\n",
"# define interpolation function\n",
"def interpolate(start, finish, number=5):\n",
" \"\"\"Interpolate between start and finish parameters.\n",
" \n",
" Arguments:\n",
" start: list of floats\n",
" finish: list of floats\n",
" number=5: int, number of total curves\n",
" \n",
" Returns:\n",
" list of lists of floats, the curves\n",
" \"\"\"\n",
" \n",
" # find points for each pair\n",
" pairs = zip(start, finish)\n",
" tuplets = []\n",
" for first, last in pairs:\n",
" \n",
" # get chunk\n",
" chunk = (last - first) / (number - 1)\n",
" tuplet = [first + chunk * index for index in range(number)]\n",
" tuplets.append(tuplet)\n",
" \n",
" # zip together sets\n",
" curves = [curve for curve in zip(*tuplets)]\n",
" \n",
" return curves"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {
"code_folding": [],
"hidden": true,
"hide_input": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" ^ [code] (for debugging: verifying optimized model)\n"
]
}
],
"source": [
"designate('verifying optimized model', 'debugging')\n",
"\n",
"# verify with a plot the optimized model\n",
"def check_optimized_params_with_initial(samples, mode, coefficients, curve):\n",
" \"\"\"Verify the fit of the optimized model compared to the initial estimates.\n",
" \n",
" Arguments:\n",
" samples: list of dicts\n",
" mode: str, regression mode\n",
" coefficients: list of floats, coefficients of linear model\n",
" curve: list of floats, parameters of nonlinear model\n",
" \n",
" Returns:\n",
" None\n",
" \"\"\"\n",
" \n",
" # define points\n",
" independents = [sample['x'] for sample in samples]\n",
" dependents = [sample['y'] for sample in samples]\n",
" sizes = [sample['size'] for sample in samples]\n",
" \n",
" # get ticks\n",
" ticks = set_ticks(min(independents), max(independents), 100)\n",
" \n",
" # make predictions\n",
" initials = regressions[mode]['initial'](*coefficients)\n",
" approximation = [regressions[mode]['function'](tick, *initials) for tick in ticks]\n",
" regression = [regressions[mode]['function'](tick, *curve) for tick in ticks]\n",
"\n",
" # print comparison\n",
" print('{} samples'.format(len(samples)))\n",
" print('{} (coefficients)'.format([round(float(entry), 8) for entry in coefficients]))\n",
" print('{} (initials)'.format([round(float(entry), 8) for entry in initials]))\n",
" print('{} (curve)'.format([round(float(entry), 8) for entry in curve]))\n",
" \n",
" # make figure\n",
" graph = figure()\n",
" graph.circle(x=independents, y=dependents, size=sizes, color='gray', fill_alpha=0.05)\n",
" \n",
" # add lines\n",
" graph.line(x=ticks, y=approximation, color='blue', line_width=3, legend_label='linear')\n",
" graph.line(x=ticks, y=regression, color='green', line_width=3, legend_label='nonlinear')\n",
" \n",
" # show the results\n",
" output_notebook()\n",
" show(graph)\n",
"\n",
" return None"
]
},
{
"cell_type": "markdown",
"metadata": {
"heading_collapsed": true,
"hide_input": true
},
"source": [
"### Data Preparation"
]
},
{
"cell_type": "markdown",
"metadata": {
"hidden": true
},
"source": [
"The process begins with **calling the GLOBE API**:"
]
},
{
"cell_type": "markdown",
"metadata": {
"hidden": true
},
"source": [
"https://www.globe.gov/en/globe-data/globe-api"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {
"code_folding": [
3
],
"hidden": true,
"hide_input": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" ^ [code] (for api: calling the api)\n"
]
}
],
"source": [
"designate('calling the api', 'api')\n",
"\n",
"# call the api with protocol and country code\n",
"def query_api(protocol, code, beginning, ending, sample=False, timeout=60):\n",
"    \"\"\"Call the GLOBE API for a protocol's records over a date range.\n",
"    \n",
"    Arguments:\n",
"        protocol: str, the protocol\n",
"        code: str, the country code ('' or None queries all countries)\n",
"        beginning: str, the beginning date\n",
"        ending: str, the ending date\n",
"        sample=False: boolean, only get small sampling?\n",
"        timeout=60: float, seconds to wait for a response before giving up\n",
"    \n",
"    Returns:\n",
"        list of dicts, the records\n",
"    \"\"\"\n",
"    \n",
"    # default to all countries unless a code is specified\n",
"    extension = 'country/' if code else ''\n",
"    extensionii = '&countrycode=' + code if code else ''\n",
"    \n",
"    # assemble the url for the API call \n",
"    url = 'https://api.globe.gov/search/v1/measurement/protocol/measureddate/' + extension\n",
"    url += '?protocols=' + protocol\n",
"    url += '&startdate=' + beginning \n",
"    url += '&enddate=' + ending\n",
"    url += extensionii\n",
"\n",
"    # geojson parameter toggles between formats\n",
"    url += '&geojson=FALSE'\n",
"    \n",
"    # sample parameter returns small sample set if true\n",
"    url += '&sample=' + str(sample).upper()\n",
"\n",
"    # make the API call; a timeout keeps a dead server from hanging the notebook\n",
"    request = requests.get(url, timeout=timeout)\n",
"    raw = json.loads(request.text)\n",
"    \n",
"    return raw"
]
},
{
"cell_type": "markdown",
"metadata": {
"hidden": true
},
"source": [
"After retrieving the data, several steps are taken to prepare the data. Initially, the data is returned in a nested structure. It is useful to **flatten this nesting** so that all fields are readily accessible. "
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {
"code_folding": [
3
],
"hidden": true,
"hide_input": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" ^ [code] (for processing: flattening records)\n"
]
}
],
"source": [
"designate('flattening records', 'processing')\n",
"\n",
"# function to flatten a nested list into a single-level structure\n",
"def flatten_dict_list(record, label=None):\n",
" \"\"\"Flatten each record into a single level.\n",
"\n",
" Note: keys from all nesting levels are merged into one namespace, so\n",
" duplicate field names at different depths will overwrite each other.\n",
"\n",
" Arguments:\n",
" record: dict, a record\n",
" label: str, key from last nesting\n",
"\n",
" Returns:\n",
" dict\n",
" \"\"\"\n",
"\n",
" # initiate dictionary\n",
" flattened = {}\n",
"\n",
" # try to flatten the record\n",
" try:\n",
"\n",
" # go through each field\n",
" for field, info in record.items():\n",
"\n",
" # and flatten the smaller records found there (recursion ends at non-dicts)\n",
" flattened.update(flatten_dict_list(info, field))\n",
"\n",
" # otherwise record is a terminal entry (it has no .items method)\n",
" except AttributeError:\n",
"\n",
" # so update the dictionary with the record under the parent's key\n",
" flattened.update({label: record})\n",
"\n",
" return flattened"
]
},
{
"cell_type": "markdown",
"metadata": {
"hidden": true,
"hide_input": true
},
"source": [
"Additionally, it can be useful to **abbreviate the field names** as the initial field names are often quite long."
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {
"code_folding": [
3
],
"hidden": true,
"hide_input": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" ^ [code] (for processing: abbreviating records)\n"
]
}
],
"source": [
"designate('abbreviating records', 'processing')\n",
"\n",
"# function to abbreviate the fields of a record\n",
"def abbreviate_field_name(record, primary=True, removal=True):\n",
" \"\"\"Abbreviate certain fields in the record for easier manipulation later.\n",
" \n",
" Arguments:\n",
" record: dict\n",
" primary=True: boolean, primary record?\n",
" removal=True: boolean, remove original fields?\n",
" \n",
" Returns:\n",
" dict\n",
" \"\"\"\n",
" \n",
" # define abbreviations dictionary for primary records\n",
" abbreviations = {}\n",
" abbreviations['count'] = larvae\n",
" abbreviations['genus'] = 'mosquitohabitatmapperGenus'\n",
" abbreviations['source'] = 'mosquitohabitatmapperWaterSource'\n",
" abbreviations['stage'] = 'mosquitohabitatmapperLastIdentifyStage'\n",
" abbreviations['type'] = 'mosquitohabitatmapperWaterSourceType'\n",
" abbreviations['measured'] = 'mosquitohabitatmapperMeasuredAt'\n",
" abbreviations['habitat'] = 'mosquitohabitatmapperWaterSourcePhotoUrls'\n",
" abbreviations['body'] = 'mosquitohabitatmapperLarvaFullBodyPhotoUrls'\n",
" abbreviations['abdomen'] = 'mosquitohabitatmapperAbdomenCloseupPhotoUrls'\n",
" \n",
" # if a secondary record\n",
" if not primary:\n",
"\n",
" # define abbreviations dictionary for secondary protocol\n",
" abbreviations = {}\n",
" abbreviations['feature'] = feature\n",
" abbreviations['measured'] = measured\n",
"\n",
" # and each abbreviation\n",
" for abbreviation, field in abbreviations.items():\n",
"\n",
" # copy new field from old, or None if nonexistent\n",
" record[abbreviation] = record.setdefault(field, None)\n",
" \n",
" # remove original field if desired\n",
" if removal:\n",
" \n",
" # remove field\n",
" del(record[field])\n",
" \n",
" return record"
]
},
{
"cell_type": "markdown",
"metadata": {
"hidden": true
},
"source": [
"As all measurements are recorded in reference to UTC time, it is helpful to **convert the measurements to local times**. This is accomplished by adjusting the hour according to the longitude. Though this may not accurately reflect the local time in a political sense as it ignores daylight savings time and time zone boundaries, it is perhaps a more accurate measure in the astronomical sense."
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {
"code_folding": [
3
],
"hidden": true,
"hide_input": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" ^ [code] (for processing: synchronizing times with longitudes)\n"
]
}
],
"source": [
"designate('synchronizing times with longitudes', 'processing')\n",
"\n",
"# shift each measurement's UTC timestamp toward astronomical local time\n",
"def sync_time_with_long(record):\n",
"    \"\"\"Synchronize the measured times with longitudes.\n",
"    \n",
"    Arguments:\n",
"        record: dict\n",
"    \n",
"    Returns:\n",
"        dict\n",
"    \"\"\"\n",
"\n",
"    # parse the UTC timestamp string, retaining both the datetime and its date\n",
"    measurement = datetime.strptime(record['measured'], \"%Y-%m-%dT%H:%M:%S\")\n",
"    record['time'] = measurement\n",
"    record['date'] = measurement.date()\n",
"    \n",
"    # approximate the local offset from the longitude (360 degrees = 24 hours)\n",
"    offset = int(round(record['longitude'] * 24 / 360, 0))\n",
"    record['hour'] = measurement + timedelta(hours=offset)\n",
"\n",
"    return record"
]
},
{
"cell_type": "markdown",
"metadata": {
"hidden": true,
"hide_input": false
},
"source": [
"The larvae count data are initially returned as strings. In order to analyze the data, we need to **convert the numerical strings into number data types**. Additionally, some of the data is entered as a range (e.g., '1-25'), or as a more complicated string ('more than 100'). These strings will be converted to floats using the following rules:\n",
"- a string such as '50' is converted to its floating point equivalent (50)\n",
"- a range such as '1-25' is converted to its average (13)\n",
"- a more complicated string, such as 'more than 100' is converted to its nearest number (100)"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {
"code_folding": [
3
],
"hidden": true,
"hide_input": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" ^ [code] (for processing: converting strings to numbers)\n"
]
}
],
"source": [
"designate('converting strings to numbers', 'processing')\n",
"\n",
"# function to convert a string into a floating point number\n",
"def convert_str_to_float(record, field, name):\n",
"    \"\"\"Translate info given as a string or range of numbers into a numerical type.\n",
"    \n",
"    Arguments:\n",
"        record: dict\n",
"        field: str, the field to get converted\n",
"        name: str, the name of the new field\n",
"    \n",
"    Returns:\n",
"        float\n",
"    \"\"\"\n",
"    \n",
"    # grab the raw entry and attempt a direct float conversion\n",
"    entry = record[field]\n",
"    try:\n",
"        \n",
"        # simple numeric strings convert directly\n",
"        value = float(entry)\n",
"    \n",
"    # a plain conversion failed\n",
"    except ValueError:\n",
"        \n",
"        # next interpret the entry as a hyphenated range, e.g. '1-25'\n",
"        try:\n",
"            \n",
"            # a range becomes the average of its two endpoints (unpacking into\n",
"            # anything but exactly two pieces also raises ValueError)\n",
"            low, high = entry.split('-')\n",
"            value = (float(low.strip()) + float(high.strip())) / 2\n",
"        \n",
"        # not a range either\n",
"        except ValueError:\n",
"            \n",
"            # fall back to the digits embedded in the text, e.g. 'more than 100'\n",
"            value = float(''.join([character for character in entry if character.isdigit()]))\n",
"    \n",
"    # add new field\n",
"    record[name] = value\n",
"    \n",
"    return record"
]
},
{
"cell_type": "markdown",
"metadata": {
"hidden": true,
"hide_input": true
},
"source": [
"Also, some steps have been taken towards **mosquito genus identification**. The three noteworthy genera in terms of potentially carrying diseases are Aedes, Anopheles, and Culex. If the identification process did not lead to one of these three genera, the genus is regarded as \"Other.\" If the identification process was not fully carried out, the genus is regarded as \"Unknown.\""
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {
"code_folding": [
3
],
"hidden": true,
"hide_input": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" ^ [code] (for processing: identifying mosquito genera)\n"
]
}
],
"source": [
"designate('identifying mosquito genera', 'processing')\n",
"\n",
"# function to identify the mosquito genera based on last stage of identification\n",
"def identify_mosquito_genera(record):\n",
"    \"\"\"Identify the genera from a record.\n",
"    \n",
"    Arguments:\n",
"        record: dict\n",
"    \n",
"    Returns:\n",
"        dict\n",
"    \"\"\"\n",
"\n",
"    # only fill in a genus when the record has none\n",
"    if record['genus'] is None:\n",
"\n",
"        # an identification that never progressed past the 'identify' stage\n",
"        # (or has no stage at all) is 'Unknown'; any other final stage means\n",
"        # the process ended outside the noteworthy genera, hence 'Other'\n",
"        unfinished = record['stage'] in (None, 'identify')\n",
"        record['genus'] = 'Unknown' if unfinished else 'Other'\n",
"    \n",
"    return record"
]
},
{
"cell_type": "markdown",
"metadata": {
"hidden": true,
"hide_input": true
},
"source": [
"Also, many of the records contain photo urls. The **photo urls will be parsed** and the file names formatted according to a naming convention."
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {
"code_folding": [
3
],
"hidden": true,
"hide_input": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" ^ [code] (for processing: localizing latitude and longitude)\n"
]
}
],
"source": [
"designate('localizing latitude and longitude', 'processing')\n",
"\n",
"# specify the location code for the photo based on its geo coordinates\n",
"def location_code_for_photo_naming(latitude, longitude):\n",
"    \"\"\"Specify the location code for the photo naming convention.\n",
"    \n",
"    Arguments:\n",
"        latitude: float, the latitude\n",
"        longitude: float, the longitude\n",
"    \n",
"    Returns:\n",
"        str, the latlon code\n",
"    \"\"\"\n",
"    \n",
"    # hemisphere letters chosen by the sign of each coordinate\n",
"    hemisphere = 'S' if latitude < 0 else 'N'\n",
"    meridian = 'W' if longitude < 0 else 'E'\n",
"    \n",
"    # truncate each coordinate to whole degrees and left-pad to three digits\n",
"    degrees = ('000' + str(abs(int(latitude))))[-3:]\n",
"    degreesii = ('000' + str(abs(int(longitude))))[-3:]\n",
"    \n",
"    return hemisphere + degrees + meridian + degreesii"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {
"code_folding": [
3
],
"hidden": true,
"hide_input": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" ^ [code] (for processing: applying photo naming convention)\n"
]
}
],
"source": [
"designate('applying photo naming convention', 'processing')\n",
"\n",
"# apply the naming convention to a photo url to make a file name\n",
"def photo_naming(urls, code, latitude, longitude, time):\n",
"    \"\"\"Apply the naming convention to a group of urls\n",
"    \n",
"    Arguments:\n",
"        urls: list of str, the photo urls\n",
"        code: str, the photo sector code\n",
"        latitude: float, the latitude\n",
"        longitude: float, the longitude\n",
"        time: datetime object, the measurement time\n",
"    \n",
"    Returns:\n",
"        list of str, the filenames\n",
"    \"\"\"\n",
"    \n",
"    # assemble the shared stem: protocol tag, latlon code, timestamp, sector code\n",
"    stem = 'GLOBEMHM_' + location_code_for_photo_naming(latitude, longitude) + '_'\n",
"    stem += time.strftime('%Y%m%dT%H%MZ') + '_' + code\n",
"    \n",
"    # append a 1-based index and the url's unique id (second-to-last path piece)\n",
"    names = [stem + str(index + 1) + '_' + url.split('/')[-2] + '.jpg' for index, url in enumerate(urls)]\n",
"    \n",
"    return names"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {
"code_folding": [
3
],
"hidden": true,
"hide_input": true,
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" ^ [code] (for processing: parsing photo urls)\n"
]
}
],
"source": [
"designate('parsing photo urls', 'processing')\n",
"\n",
"# function for parsing photo urls\n",
"def parse_photo_info(record):\n",
"    \"\"\"Parse photo url information, collecting original urls, thumbnail urls,\n",
"    and convention-formatted file names across all photo sectors.\n",
"    \n",
"    Arguments:\n",
"        record: dict\n",
"    \n",
"    Returns:\n",
"        dict\n",
"    \"\"\"\n",
"\n",
"    # dictionary of photo sector codes\n",
"    sectors = {'habitat': 'WS', 'body': 'FB', 'abdomen': 'AB'}\n",
"\n",
"    # initialize fields for each sector and parse urls\n",
"    record['originals'] = []\n",
"    record['thumbs'] = []\n",
"    record['photos'] = []\n",
"    for field, stub in sectors.items():\n",
"        \n",
"        # split on semicolon, and keep all fragments with 'original'\n",
"        datum = record[field] or ''\n",
"        originals = [url.strip() for url in datum.split(';') if 'original' in url]\n",
"        \n",
"        # sort by the unique identifier as the number before the last slash\n",
"        originals.sort(key=lambda url: url.split('/')[-2])\n",
"        record['originals'] += originals\n",
"        \n",
"        # get the thumbnail versions\n",
"        thumbs = [url.split('original')[0] + 'small.jpg' for url in originals]\n",
"        record['thumbs'] += thumbs\n",
"        \n",
"        # apply the naming convention with this sector's code ('stub'); the\n",
"        # previous revision passed an undefined name 'code' here\n",
"        photos = photo_naming(originals, stub, record['latitude'], record['longitude'], record['time'])\n",
"        record['photos'] += photos\n",
"    \n",
"    return record"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {
"code_folding": [
3
],
"hidden": true,
"hide_input": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" ^ [code] (for processing: processing records)\n"
]
}
],
"source": [
"designate('processing records', 'processing')\n",
"\n",
"# function for processing records\n",
"def process_records(records, primary=True):\n",
"    \"\"\"Process all records: flatten, abbreviate, localize times, and apply\n",
"    the protocol-specific conversions.\n",
"    \n",
"    Arguments:\n",
"        records: list of dicts\n",
"        primary=True: boolean, primary record?\n",
"    \n",
"    Returns:\n",
"        list of dicts\n",
"    \"\"\"\n",
"    \n",
"    # flatten and abbreviate all records, then localize the timestamps\n",
"    records = [flatten_dict_list(record) for record in records]\n",
"    records = [abbreviate_field_name(record, primary) for record in records]\n",
"    records = [sync_time_with_long(record) for record in records]\n",
"    \n",
"    # process primary records\n",
"    if primary:\n",
"        \n",
"        # convert larvae counts to floats, resolve genera, and parse photo urls\n",
"        records = [convert_str_to_float(record, 'count', 'larvae') for record in records]\n",
"        records = [identify_mosquito_genera(record) for record in records]\n",
"        records = [parse_photo_info(record) for record in records]\n",
"    \n",
"    # process secondary records\n",
"    if not primary:\n",
"        \n",
"        # NOTE(review): 'feature' is a module-level setting naming the secondary\n",
"        # protocol's field of interest, defined in an earlier settings cell\n",
"        records = [convert_str_to_float(record, 'feature', feature) for record in records]\n",
"        \n",
"    return records"
]
},
{
"cell_type": "markdown",
"metadata": {
"hidden": true,
"hide_input": false
},
"source": [
"Finally, it is sometimes the case that records contain **potential outliers**. For instance, an entry of '1000000' for larvae counts is suspicious because likely no one counted one million larvae. These data can skew analysis and dwarf the rest of the data in graphs. While there are [numerous ways](https://machinelearningmastery.com/how-to-use-statistics-to-identify-outliers-in-data/) to detect and remove these outliers, the method implemented in this notebook uses the [interquartile range](https://machinelearningmastery.com/how-to-use-statistics-to-identify-outliers-in-data/).\n",
"\n",
"We can remove these outliers by setting a threshold for the upper quartile boundary. We then calculate the interquartile range by finding the $x$th percentile and the $100-x$th percentile and subtracting the two.\n",
"\n",
"We then calculate a cutoff value at 1.5 multiplied by the interquartile range and subtract this value from the lower quartile and add it to the upper quartile.\n",
"\n",
"Any value between the two values is considered valid and any value outside of that range is considered an outlier and is removed from the dataset."
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {
"hidden": true,
"hide_input": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" ^ [code] (for main: navigation buttons)\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "5acaca56d2c64dc0a025ffbee7ad94b7",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(Button(description='Top', style=ButtonStyle()), Button(description='Settings', style=ButtonStyl…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "f2022a5064d24ab9b672ae363bf506c2",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(Button(description='Linear', style=ButtonStyle()), Button(description='Quadratic', style=Button…"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"designate('navigation buttons')\n",
"\n",
"# set two navigation buttons\n",
"navigate_notebook()\n",
"jump_to_regression()"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {
"code_folding": [],
"hidden": true,
"hide_input": false,
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" ^ [code] (for pruning: outlier pruning)\n"
]
}
],
"source": [
"designate('outlier pruning', 'pruning')\n",
"\n",
"# function to prune away outlying observations\n",
"def remove_outliers(records, field, threshold=75):\n",
"    \"\"\"Prune away outlying observations based on the interquartile range.\n",
"    \n",
"    Arguments:\n",
"        records: list of dicts, the records\n",
"        field: str, field under inspection\n",
"        threshold: float, upper percentile; default = 75\n",
"    \n",
"    Returns:\n",
"        tuple of two lists of dicts, (pruned records, outliers)\n",
"    \"\"\"\n",
"\n",
"    # compute the interquartile range between the upper and lower percentiles\n",
"    values = [record[field] for record in records]\n",
"    q_l, q_u = percentile(values, 100 - threshold), percentile(values, threshold)\n",
"    iqr = q_u - q_l\n",
"    \n",
"    # place the fences 1.5 interquartile ranges beyond each quartile\n",
"    cut_off = iqr * 1.5\n",
"    lower, upper = q_l - cut_off, q_u + cut_off\n",
"    \n",
"    # annotate every record with the fences and collect those outside them\n",
"    outliers = []\n",
"    for record in records:\n",
"\n",
"        # set the quartiles\n",
"        record['lq'] = lower\n",
"        record['uq'] = upper\n",
"\n",
"        # if either fence is exceeded, the record is an outlier\n",
"        if record[field] < lower or record[field] > upper:\n",
"\n",
"            # append to outliers\n",
"            outliers.append(record)\n",
"    \n",
"    # keep exactly the records not collected as outliers (the previous revision\n",
"    # used strict inequalities here, silently dropping boundary-valued records)\n",
"    records = [record for record in records if lower <= record[field] <= upper]\n",
"    \n",
"    return records, outliers"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {
"code_folding": [
3
],
"hidden": true,
"hide_input": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" ^ [code] (for filtering: sifting data through filter)\n"
]
}
],
"source": [
"designate('sifting data through filter', 'filtering')\n",
"\n",
"# function to sift data through filters\n",
"def filter_data_by_field(records, parameters, fields, functions, symbols):\n",
"    \"\"\"Sift records according to parameters.\n",
"    \n",
"    Arguments:\n",
"        records: list of dicts\n",
"        parameters: list of settings\n",
"        fields: list of str\n",
"        functions: list of function objects\n",
"        symbols: list of str\n",
"    \n",
"    Returns:\n",
"        list of dicts, str\n",
"    \"\"\"\n",
"    \n",
"    # begin criteria string\n",
"    criteria = ''\n",
"\n",
"    # filter primaries based on parameters\n",
"    for parameter, field, function, symbol in zip(parameters, fields, functions, symbols):\n",
"\n",
"        # a parameter of None means the criterion is inactive\n",
"        if parameter is not None:\n",
"\n",
"            # only filter on fields actually present (guarding against an\n",
"            # empty record list, which previously raised IndexError)\n",
"            if records and field in records[0].keys():\n",
"            \n",
"                # keep only the records passing the criterion function\n",
"                records = [record for record in records if function(record[field], parameter)]\n",
"\n",
"                # add to criteria string\n",
"                criteria += '{} {} {}\\n'.format(field, symbol, parameter)\n",
"    \n",
"    # sort data by date and add an index\n",
"    records.sort(key=lambda record: record['date'])\n",
"    [record.update({'index': index}) for index, record in enumerate(records)]\n",
"\n",
"    return records, criteria"
]
},
{
"cell_type": "code",
"execution_count": 47,
"metadata": {
"code_folding": [
3
],
"hidden": true,
"hide_input": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" ^ [code] (for histograms: chopping data up into bins)\n"
]
}
],
"source": [
"designate('chopping data up into bins', 'histograms')\n",
"\n",
"# chopping data into histogram bars\n",
"def get_bins_width_min_max_for_hist(observations, width=1, limit=1000):\n",
"    \"\"\"Chop the observations from the records up into bars\n",
"    \n",
"    Arguments:\n",
"        observations: list of floats\n",
"        width=1: float, minimum width of each bar\n",
"        limit: int, maximum number of bins\n",
"    \n",
"    Returns:\n",
"        (int, float, float, float) tuple: the number of bins, the bin width,\n",
"        and the minimum and maximum bin edges\n",
"    \"\"\"\n",
"\n",
"    # widen the bars by factors of 10 until the bin count drops under the limit\n",
"    bins = limit + 1\n",
"    width = width / 10\n",
"    while bins > limit:\n",
"        \n",
"        # multiply width by 10\n",
"        width *= 10\n",
"        \n",
"        # start the edges half a bar below the lowest whole-width boundary and\n",
"        # count the bins needed to span the observations\n",
"        minimum = (int(min(observations) / width) * width) - (width * 0.5)\n",
"        bins = int((max(observations) - min(observations)) / width) + 1\n",
"\n",
"    # set the maximum to cover an even number of bins (the previous in-loop\n",
"    # maximum had a misplaced parenthesis but was dead code, overwritten here)\n",
"    maximum = minimum + bins * width\n",
"\n",
"    return bins, width, minimum, maximum"
]
},
{
"cell_type": "code",
"execution_count": 48,
"metadata": {
"code_folding": [
3
],
"hidden": true,
"hide_input": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" ^ [code] (for histograms: zooming in on best view)\n"
]
}
],
"source": [
"designate('zooming in on best view', 'histograms')\n",
"\n",
"# function to define horizontal and vertical ranges of the graph\n",
"def zoom_to_best_x_y_ranges(observations, counts, width, percent=1):\n",
"    \"\"\"Zoom in on the best horizontal and vertical view ranges.\n",
"    \n",
"    Arguments:\n",
"        observations: list of float\n",
"        counts: list of counts per bin\n",
"        width: width of each bin\n",
"        percent: float, the percentile margin\n",
"    \n",
"    Returns:\n",
"        tuple of tuples of floats, the view boundaries\n",
"    \"\"\"\n",
"    \n",
"    # stretch the horizontal view one bar width beyond the trimmed percentiles\n",
"    horizontal = (percentile(observations, percent) - width, percentile(observations, 100 - percent) + width)\n",
"    \n",
"    # anchor the vertical view at zero with 10% headroom over the tallest bar\n",
"    vertical = (0, max(counts) * 1.1)\n",
"    \n",
"    return horizontal, vertical"
]
},
{
"cell_type": "code",
"execution_count": 49,
"metadata": {
"code_folding": [],
"hidden": true,
"hide_input": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" ^ [code] (for histograms: begin drafting a histogram)\n"
]
}
],
"source": [
"designate('begin drafting a histogram', 'histograms')\n",
"\n",
"# function for drafting a histogram\n",
"def draft_histogram(field, horizontal, vertical, mean, deviation):\n",
"    \"\"\"Draft a histogram with beginning boundary information.\n",
"    \n",
"    Arguments:\n",
"        field: str\n",
"        horizontal: (float, float) tuple, the horizontal extent\n",
"        vertical: (float, float) tuple, the vertical extent\n",
"        mean: float, the mean of the observations\n",
"        deviation: standard deviation of the observations\n",
"    \n",
"    Returns:\n",
"        bokeh figure object\n",
"    \"\"\"\n",
"    \n",
"    # create parameters dictionary for histogram labels and plot dimensions\n",
"    # (make_title is a helper defined in an earlier cell)\n",
"    parameters = {}\n",
"    parameters['title'] = 'Histogram for {}'.format(make_title(field))\n",
"    parameters['x_axis_label'] = '{}'.format(field)\n",
"    parameters['y_axis_label'] = 'observations'\n",
"    parameters['x_range'] = horizontal\n",
"    parameters['y_range'] = vertical\n",
"    parameters['plot_height'] = 400\n",
"    parameters['plot_width'] = 450\n",
"    \n",
"    # set an extra x axis in z-score units, mapping the horizontal extent\n",
"    # through (x - mean) / deviation\n",
"    starting = (horizontal[0] - mean) / deviation\n",
"    ending = (horizontal[1] - mean) / deviation\n",
"    parameters['extra_x_ranges'] = {'z-score': Range1d(start=starting, end=ending)}\n",
"    \n",
"    # initialize the bokeh graph with the parameters\n",
"    gram = figure(**parameters)\n",
"    \n",
"    # label the z-score axis with the distribution's summary statistics\n",
"    formats = round(mean, 2), round(deviation, 2)\n",
"    label = 'Overlay of normal distribution (mean={}, std={})'.format(*formats)\n",
"    gram.add_layout(LinearAxis(x_range_name='z-score', axis_label=label), 'above')\n",
"    \n",
"    # add hover annotations; the @ fields refer to ColumnDataSource columns\n",
"    # supplied later by the bar and curve drawing routines\n",
"    annotations = [('{}:'.format(truncate_field_name(field, 6)), '@left to @right')]\n",
"    annotations += [('Observations:', '@ys')]\n",
"    annotations += [('Z-score:', '@scores')]\n",
"    \n",
"    # activate the hover tool\n",
"    hover = HoverTool(tooltips=annotations)\n",
"    gram.add_tools(hover)\n",
"    \n",
"    return gram"
]
},
{
"cell_type": "code",
"execution_count": 50,
"metadata": {
"code_folding": [],
"hidden": true,
"hide_input": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" ^ [code] (for histograms: blocking in bars on the histogram)\n"
]
}
],
"source": [
"designate('blocking in bars on the histogram', 'histograms')\n",
"\n",
"# function to block in bars on the histogram\n",
"def draw_bars(gram, counts, edges, mean, deviation):\n",
"    \"\"\"Block in bars on the histogram.\n",
"    \n",
"    Arguments:\n",
"        gram: bokeh figure\n",
"        counts: list of floats, the bin counts\n",
"        edges: list of floats, the bin edges\n",
"        mean: float\n",
"        deviation: float\n",
"    \n",
"    Returns:\n",
"        bokeh figure\n",
"    \"\"\"\n",
"    \n",
"    # pair up consecutive edges to find each bar's midpoint and z-score\n",
"    middles = []\n",
"    scores = []\n",
"    for left, right in zip(edges[:-1], edges[1:]):\n",
"        \n",
"        # midpoint of the bar, and its distance from the mean in deviations\n",
"        middle = (right + left) / 2\n",
"        middles.append(middle)\n",
"        scores.append((middle - mean) / deviation)\n",
"    \n",
"    # accumulate the info into a bokeh data table\n",
"    columns = {'ys': counts, 'left': edges[:-1], 'right': edges[1:], 'scores': scores, 'xs': middles}\n",
"    table = ColumnDataSource(columns)\n",
"    \n",
"    # draw white-edged, light green bars spanning each bin from 0 to its count\n",
"    bars = {'source': table, 'left': 'left', 'right': 'right', 'bottom': 0, 'top': 'ys'}\n",
"    bars.update({'line_color': 'white', 'fill_color': 'lightgreen'})\n",
"    gram.quad(**bars)\n",
"    \n",
"    return gram"
]
},
{
"cell_type": "code",
"execution_count": 51,
"metadata": {
"code_folding": [],
"hidden": true,
"hide_input": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" ^ [code] (for histograms: normalizing observations to a normal distribution)\n"
]
}
],
"source": [
"designate('normalizing observations to a normal distribution', 'histograms')\n",
"\n",
"# function to produce the normalization curve\n",
"def draw_gaussian_on_bargraph(gram, counts, edges, mean, deviation):\n",
"    \"\"\"Normalize the observations by drawing the normal distribution.\n",
"    \n",
"    Arguments:\n",
"        gram: bokeh figure\n",
"        counts: list of floats, the bin counts\n",
"        edges: list of floats, the bin edges\n",
"        mean: float\n",
"        deviation: float\n",
"    \n",
"    Returns:\n",
"        bokeh figure\n",
"    \"\"\"\n",
"    \n",
"    # create line from z-score of -4 to 4\n",
"    scores = [tick * 0.01 - 4.0 for tick in range(801)]\n",
"    xs = [(score * deviation) + mean for score in scores]\n",
"    \n",
"    # create gaussian function scaled so its area matches the histogram's area\n",
"    area = sum([count * (right - left) for count, left, right in zip(counts, edges[:-1], edges[1:])])\n",
"    height = area / (deviation * sqrt(2 * pi))\n",
"    normalizing = lambda x: height * exp(-(x - mean) ** 2 / (2 * deviation ** 2))\n",
"    ys = [normalizing(x) for x in xs]\n",
"    ys = [round(y, 3) for y in ys]\n",
"\n",
"    # make column object ('left'/'right' duplicate 'xs' so the figure's shared\n",
"    # hover tool finds its columns); an unused 'summary' local was removed\n",
"    table = ColumnDataSource({'xs': xs, 'ys': ys, 'scores': scores, 'left': xs, 'right': xs})\n",
"    gram.line(source=table, x='xs', y='ys', color='blue')\n",
"    \n",
"    # draw vertical lines at each whole z-score from -3 to 3\n",
"    for score in (-3, -2, -1, 0, 1, 2, 3):\n",
"        \n",
"        # each line runs from the baseline up to the curve\n",
"        xs = [(deviation * score) + mean] * 2\n",
"        ys = [0, normalizing(xs[0])]\n",
"        table = ColumnDataSource({'xs': xs, 'ys': ys, 'scores': [score, score], 'left': xs, 'right': xs})\n",
"        gram.line(source=table, x='xs', y='ys', color='blue')\n",
"    \n",
"    return gram"
]
},
{
"cell_type": "code",
"execution_count": 52,
"metadata": {
"code_folding": [],
"hidden": true,
"hide_input": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" ^ [code] (for histograms: constructing a bar graph)\n"
]
}
],
"source": [
"designate('constructing a bar graph', 'histograms')\n",
"\n",
"# function for constructing a histogram\n",
"def construct_bargraph(records, field, width=1):\n",
"    \"\"\"Make a histogram from the dataset.\n",
"    \n",
"    Arguments:\n",
"        records: list of dicts, the records\n",
"        field: str, the field of interest\n",
"        width=1: int, the width of each histogram bar\n",
"    \n",
"    Returns:\n",
"        bokeh figure object\n",
"    \"\"\"\n",
"    \n",
"    # gather up observations\n",
"    observations = [record[field] for record in records]\n",
"    \n",
"    # separate into bins\n",
"    bins, width, minimum, maximum = get_bins_width_min_max_for_hist(observations, width)\n",
"    \n",
"    # get the counts and edges of each bin\n",
"    counts, edges = histogram(observations, bins=bins, range=(minimum, maximum))\n",
"    \n",
"    # get the zoom coordinates\n",
"    horizontal, vertical = zoom_to_best_x_y_ranges(observations, counts, width)\n",
"\n",
"    # get the normal distribution, defaulting to a small deviation in case of\n",
"    # zero so the later divisions by the deviation cannot fail\n",
"    mean = average(observations)\n",
"    deviation = max([0.000001, std(observations)])\n",
"    \n",
"    # begin histogram\n",
"    gram = draft_histogram(field, horizontal, vertical, mean, deviation)\n",
"    \n",
"    # block in bars on the histogram\n",
"    gram = draw_bars(gram, counts, edges, mean, deviation)\n",
"    \n",
"    # draw in equivalent normal distribution\n",
"    gram = draw_gaussian_on_bargraph(gram, counts, edges, mean, deviation)\n",
"    \n",
"    return gram"
]
},
{
"cell_type": "markdown",
"metadata": {
"heading_collapsed": true
},
"source": [
"### Assembling Associations"
]
},
{
"cell_type": "markdown",
"metadata": {
"hidden": true
},
"source": [
"Because the two sets of measurements were not taken concurrently, there must be some criteria to determine when measurements from one protocol correspond to measurements from the other protocol. The method implemented here is a weighing function that determines how strongly to weigh the association between the two data sets, based on the following parameters:\n",
" \n",
"- distance: the distance in kilometers between measurements that will be granted full weight.\n",
" \n",
"- interval: the time interval in days between measurements that will be granted full weight.\n",
" \n",
"- lag: the time in days to anticipate an effect on mosquitoes from a secondary measurement some \n",
" days before.\n",
" \n",
"- confidence: the weight to grant a measurement twice the distance or interval. This determines how steeply the weighting shrinks as the intervals are surpassed. A high confidence will grant higher weights to data outside the intervals. A confidence of zero will have no tolerance for data slightly past the interval.\n",
" \n",
"- cutoff: the minimum weight to consider in the dataset. A cutoff of 0.1, for instance, will only retain data if the weight is at least 0.1.\n",
" \n",
"- inclusion: the maximum number of nearest secondary measurements to include for each mosquito measurement.\n",
"\n",
"The sketch below shows several examples differing in their confidence parameter."
]
},
{
"cell_type": "code",
"execution_count": 53,
"metadata": {
"code_folding": [],
"hidden": true,
"hide_input": true,
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" ^ [code] (for weighting: weighing function)\n"
]
}
],
"source": [
"designate('weighing function', 'weighting')\n",
"\n",
"# weigh a pair of records according to the space and time between them\n",
"def weigh_record_pairs_by_space_and_time(space, time, settings):\n",
"    \"\"\"Weigh the significance of a correlation based on the space and time between them\n",
"    \n",
"    Arguments:\n",
"        space: float, space in distance\n",
"        time: float, the time time in interval\n",
"        settings: dict\n",
"    \n",
"    Returns:\n",
"        float, the weight\n",
"    \"\"\"\n",
"    \n",
"    # unpack settings\n",
"    distance, interval, lag = settings['distance'], settings['interval'], settings['lag']\n",
"    confidence, cutoff = settings['confidence'], settings['cutoff']\n",
"    \n",
"    # gaussian falloff granting 'confidence' weight at twice the tolerance:\n",
"    # e^(-a d^2) = c at d = tolerance, so a = -ln(c) / tolerance^2\n",
"    def falloff(excess, tolerance):\n",
"        \n",
"        # zero weight without confidence, gaussian decay otherwise\n",
"        if confidence > 0:\n",
"            return exp((log(confidence) / tolerance ** 2) * excess ** 2)\n",
"        return 0.0\n",
"    \n",
"    # both factors default to full weight inside the tolerances\n",
"    factor = 1.0\n",
"    factorii = 1.0\n",
"    \n",
"    # beyond the distance tolerance, decay with the spatial excess\n",
"    if abs(space) > distance:\n",
"        factor = falloff(abs(space) - distance, distance)\n",
"    \n",
"    # beyond the interval tolerance (after shifting by the lag), decay likewise\n",
"    if abs(time - lag) > interval:\n",
"        factorii = falloff(abs(time - lag) - interval, interval)\n",
"    \n",
"    # combine the factors and zero out anything under the cutoff\n",
"    weight = factor * factorii\n",
"    weight *= int(weight >= cutoff)\n",
"    \n",
"    return weight"
]
},
{
"cell_type": "code",
"execution_count": 54,
"metadata": {
"hidden": true,
"hide_input": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" ^ [code] (for main: sketching weight function)\n"
]
},
{
"data": {
"text/html": [
"\n",
"
\\n\"+\n", " \"BokehJS does not appear to have successfully loaded. If loading BokehJS from CDN, this \\n\"+\n", " \"may be due to a slow or bad network connection. Possible fixes:\\n\"+\n", " \"
\\n\"+\n", " \"\\n\"+\n",
" \"from bokeh.resources import INLINE\\n\"+\n",
" \"output_notebook(resources=INLINE)\\n\"+\n",
" \"
\\n\"+\n",
" \"\\n\"+\n \"BokehJS does not appear to have successfully loaded. If loading BokehJS from CDN, this \\n\"+\n \"may be due to a slow or bad network connection. Possible fixes:\\n\"+\n \"
\\n\"+\n \"\\n\"+\n \"from bokeh.resources import INLINE\\n\"+\n \"output_notebook(resources=INLINE)\\n\"+\n \"
\\n\"+\n \"\\n\"+\n", " \"BokehJS does not appear to have successfully loaded. If loading BokehJS from CDN, this \\n\"+\n", " \"may be due to a slow or bad network connection. Possible fixes:\\n\"+\n", " \"
\\n\"+\n", " \"\\n\"+\n",
" \"from bokeh.resources import INLINE\\n\"+\n",
" \"output_notebook(resources=INLINE)\\n\"+\n",
" \"
\\n\"+\n",
" \"\\n\"+\n \"BokehJS does not appear to have successfully loaded. If loading BokehJS from CDN, this \\n\"+\n \"may be due to a slow or bad network connection. Possible fixes:\\n\"+\n \"
\\n\"+\n \"\\n\"+\n \"from bokeh.resources import INLINE\\n\"+\n \"output_notebook(resources=INLINE)\\n\"+\n \"
\\n\"+\n \"\\n\"+\n", " \"BokehJS does not appear to have successfully loaded. If loading BokehJS from CDN, this \\n\"+\n", " \"may be due to a slow or bad network connection. Possible fixes:\\n\"+\n", " \"
\\n\"+\n", " \"\\n\"+\n",
" \"from bokeh.resources import INLINE\\n\"+\n",
" \"output_notebook(resources=INLINE)\\n\"+\n",
" \"
\\n\"+\n",
" \"\\n\"+\n \"BokehJS does not appear to have successfully loaded. If loading BokehJS from CDN, this \\n\"+\n \"may be due to a slow or bad network connection. Possible fixes:\\n\"+\n \"
\\n\"+\n \"\\n\"+\n \"from bokeh.resources import INLINE\\n\"+\n \"output_notebook(resources=INLINE)\\n\"+\n \"
\\n\"+\n \"\\n\"+\n", " \"BokehJS does not appear to have successfully loaded. If loading BokehJS from CDN, this \\n\"+\n", " \"may be due to a slow or bad network connection. Possible fixes:\\n\"+\n", " \"
\\n\"+\n", " \"\\n\"+\n",
" \"from bokeh.resources import INLINE\\n\"+\n",
" \"output_notebook(resources=INLINE)\\n\"+\n",
" \"
\\n\"+\n",
" \"\\n\"+\n \"BokehJS does not appear to have successfully loaded. If loading BokehJS from CDN, this \\n\"+\n \"may be due to a slow or bad network connection. Possible fixes:\\n\"+\n \"
\\n\"+\n \"\\n\"+\n \"from bokeh.resources import INLINE\\n\"+\n \"output_notebook(resources=INLINE)\\n\"+\n \"
\\n\"+\n \"\\n\"+\n", " \"BokehJS does not appear to have successfully loaded. If loading BokehJS from CDN, this \\n\"+\n", " \"may be due to a slow or bad network connection. Possible fixes:\\n\"+\n", " \"
\\n\"+\n", " \"\\n\"+\n",
" \"from bokeh.resources import INLINE\\n\"+\n",
" \"output_notebook(resources=INLINE)\\n\"+\n",
" \"
\\n\"+\n",
" \"\\n\"+\n \"BokehJS does not appear to have successfully loaded. If loading BokehJS from CDN, this \\n\"+\n \"may be due to a slow or bad network connection. Possible fixes:\\n\"+\n \"
\\n\"+\n \"\\n\"+\n \"from bokeh.resources import INLINE\\n\"+\n \"output_notebook(resources=INLINE)\\n\"+\n \"
\\n\"+\n \"\\n\"+\n", " \"BokehJS does not appear to have successfully loaded. If loading BokehJS from CDN, this \\n\"+\n", " \"may be due to a slow or bad network connection. Possible fixes:\\n\"+\n", " \"
\\n\"+\n", " \"\\n\"+\n",
" \"from bokeh.resources import INLINE\\n\"+\n",
" \"output_notebook(resources=INLINE)\\n\"+\n",
" \"
\\n\"+\n",
" \"\\n\"+\n \"BokehJS does not appear to have successfully loaded. If loading BokehJS from CDN, this \\n\"+\n \"may be due to a slow or bad network connection. Possible fixes:\\n\"+\n \"
\\n\"+\n \"\\n\"+\n \"from bokeh.resources import INLINE\\n\"+\n \"output_notebook(resources=INLINE)\\n\"+\n \"
\\n\"+\n \"\\n\"+\n", " \"BokehJS does not appear to have successfully loaded. If loading BokehJS from CDN, this \\n\"+\n", " \"may be due to a slow or bad network connection. Possible fixes:\\n\"+\n", " \"
\\n\"+\n", " \"\\n\"+\n",
" \"from bokeh.resources import INLINE\\n\"+\n",
" \"output_notebook(resources=INLINE)\\n\"+\n",
" \"
\\n\"+\n",
" \"\\n\"+\n \"BokehJS does not appear to have successfully loaded. If loading BokehJS from CDN, this \\n\"+\n \"may be due to a slow or bad network connection. Possible fixes:\\n\"+\n \"
\\n\"+\n \"\\n\"+\n \"from bokeh.resources import INLINE\\n\"+\n \"output_notebook(resources=INLINE)\\n\"+\n \"
\\n\"+\n \"\\n\"+\n", " \"BokehJS does not appear to have successfully loaded. If loading BokehJS from CDN, this \\n\"+\n", " \"may be due to a slow or bad network connection. Possible fixes:\\n\"+\n", " \"
\\n\"+\n", " \"\\n\"+\n",
" \"from bokeh.resources import INLINE\\n\"+\n",
" \"output_notebook(resources=INLINE)\\n\"+\n",
" \"
\\n\"+\n",
" \"\\n\"+\n \"BokehJS does not appear to have successfully loaded. If loading BokehJS from CDN, this \\n\"+\n \"may be due to a slow or bad network connection. Possible fixes:\\n\"+\n \"
\\n\"+\n \"\\n\"+\n \"from bokeh.resources import INLINE\\n\"+\n \"output_notebook(resources=INLINE)\\n\"+\n \"
\\n\"+\n \"\\n\"+\n", " \"BokehJS does not appear to have successfully loaded. If loading BokehJS from CDN, this \\n\"+\n", " \"may be due to a slow or bad network connection. Possible fixes:\\n\"+\n", " \"
\\n\"+\n", " \"\\n\"+\n",
" \"from bokeh.resources import INLINE\\n\"+\n",
" \"output_notebook(resources=INLINE)\\n\"+\n",
" \"
\\n\"+\n",
" \"\\n\"+\n \"BokehJS does not appear to have successfully loaded. If loading BokehJS from CDN, this \\n\"+\n \"may be due to a slow or bad network connection. Possible fixes:\\n\"+\n \"
\\n\"+\n \"\\n\"+\n \"from bokeh.resources import INLINE\\n\"+\n \"output_notebook(resources=INLINE)\\n\"+\n \"
\\n\"+\n \"\\n\"+\n", " \"BokehJS does not appear to have successfully loaded. If loading BokehJS from CDN, this \\n\"+\n", " \"may be due to a slow or bad network connection. Possible fixes:\\n\"+\n", " \"
\\n\"+\n", " \"\\n\"+\n",
" \"from bokeh.resources import INLINE\\n\"+\n",
" \"output_notebook(resources=INLINE)\\n\"+\n",
" \"
\\n\"+\n",
" \"\\n\"+\n \"BokehJS does not appear to have successfully loaded. If loading BokehJS from CDN, this \\n\"+\n \"may be due to a slow or bad network connection. Possible fixes:\\n\"+\n \"
\\n\"+\n \"\\n\"+\n \"from bokeh.resources import INLINE\\n\"+\n \"output_notebook(resources=INLINE)\\n\"+\n \"
\\n\"+\n \"\\n\"+\n", " \"BokehJS does not appear to have successfully loaded. If loading BokehJS from CDN, this \\n\"+\n", " \"may be due to a slow or bad network connection. Possible fixes:\\n\"+\n", " \"
\\n\"+\n", " \"\\n\"+\n",
" \"from bokeh.resources import INLINE\\n\"+\n",
" \"output_notebook(resources=INLINE)\\n\"+\n",
" \"
\\n\"+\n",
" \"\\n\"+\n \"BokehJS does not appear to have successfully loaded. If loading BokehJS from CDN, this \\n\"+\n \"may be due to a slow or bad network connection. Possible fixes:\\n\"+\n \"
\\n\"+\n \"\\n\"+\n \"from bokeh.resources import INLINE\\n\"+\n \"output_notebook(resources=INLINE)\\n\"+\n \"
\\n\"+\n \"\\n\"+\n", " \"BokehJS does not appear to have successfully loaded. If loading BokehJS from CDN, this \\n\"+\n", " \"may be due to a slow or bad network connection. Possible fixes:\\n\"+\n", " \"
\\n\"+\n", " \"\\n\"+\n",
" \"from bokeh.resources import INLINE\\n\"+\n",
" \"output_notebook(resources=INLINE)\\n\"+\n",
" \"
\\n\"+\n",
" \"\\n\"+\n \"BokehJS does not appear to have successfully loaded. If loading BokehJS from CDN, this \\n\"+\n \"may be due to a slow or bad network connection. Possible fixes:\\n\"+\n \"
\\n\"+\n \"\\n\"+\n \"from bokeh.resources import INLINE\\n\"+\n \"output_notebook(resources=INLINE)\\n\"+\n \"
\\n\"+\n \"\n", " | genus | \n", "records | \n", "pairs | \n", "coverage | \n", "pvalue | \n", "correlation | \n", "R^2 | \n", "s.e.(larvae) | \n", "equation | \n", "onset(KitMg) | \n", "slope(larvae/KitMg) | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "All | \n", "112 | \n", "251 | \n", "0.10 | \n", "0.1923 | \n", "0.1241 | \n", "0.0154 | \n", "3.80 | \n", "y = 1.27+0.27 x | \n", "1.2733 | \n", "0.2725 | \n", "
1 | \n", "Aedes | \n", "3 | \n", "10 | \n", "0.00 | \n", "0.5763 | \n", "0.6175 | \n", "0.3813 | \n", "5.90 | \n", "y = -14.11+3.59 x | \n", "-14.1097 | \n", "3.5894 | \n", "
2 | \n", "Anopheles | \n", "0 | \n", "0 | \n", "0.00 | \n", "1.0000 | \n", "0.0000 | \n", "0.0000 | \n", "0.00 | \n", "no fit achieved | \n", "0.0000 | \n", "0.0000 | \n", "
3 | \n", "Culex | \n", "5 | \n", "15 | \n", "0.00 | \n", "0.9020 | \n", "0.0771 | \n", "0.0059 | \n", "1.31 | \n", "y = 8.64+-0.28 x | \n", "8.6400 | \n", "-0.2752 | \n", "
4 | \n", "Other | \n", "23 | \n", "51 | \n", "0.02 | \n", "0.0920 | \n", "0.3595 | \n", "0.1293 | \n", "4.48 | \n", "y = 1.49+0.91 x | \n", "1.4887 | \n", "0.9078 | \n", "
5 | \n", "Unknown | \n", "81 | \n", "175 | \n", "0.07 | \n", "0.0030 | \n", "0.3253 | \n", "0.1058 | \n", "1.73 | \n", "y = 3.63+-0.34 x | \n", "3.6265 | \n", "-0.3398 | \n", "
\\n\"+\n", " \"BokehJS does not appear to have successfully loaded. If loading BokehJS from CDN, this \\n\"+\n", " \"may be due to a slow or bad network connection. Possible fixes:\\n\"+\n", " \"
\\n\"+\n", " \"\\n\"+\n",
" \"from bokeh.resources import INLINE\\n\"+\n",
" \"output_notebook(resources=INLINE)\\n\"+\n",
" \"
\\n\"+\n",
" \"\\n\"+\n \"BokehJS does not appear to have successfully loaded. If loading BokehJS from CDN, this \\n\"+\n \"may be due to a slow or bad network connection. Possible fixes:\\n\"+\n \"
\\n\"+\n \"\\n\"+\n \"from bokeh.resources import INLINE\\n\"+\n \"output_notebook(resources=INLINE)\\n\"+\n \"
\\n\"+\n \"\n", " | genus | \n", "records | \n", "pairs | \n", "coverage | \n", "pvalue | \n", "correlation | \n", "R^2 | \n", "s.e.(larvae) | \n", "equation | \n", "center(KitMg) | \n", "height(larvae) | \n", "curvature(larvae/KitMg^2) | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "All | \n", "112 | \n", "251 | \n", "0.10 | \n", "0.1214 | \n", "0.1472 | \n", "0.0217 | \n", "3.78 | \n", "y = 3.64+-0.44 x+0.05 x^2 | \n", "3.6444 | \n", "-0.4425 | \n", "0.0471 | \n", "
1 | \n", "Aedes | \n", "3 | \n", "10 | \n", "0.00 | \n", "0.5723 | \n", "0.6224 | \n", "0.3874 | \n", "5.87 | \n", "y = -4.77+0.93 x+0.17 x^2 | \n", "-4.7730 | \n", "0.9257 | \n", "0.1719 | \n", "
2 | \n", "Anopheles | \n", "0 | \n", "0 | \n", "0.00 | \n", "1.0000 | \n", "0.0000 | \n", "0.0000 | \n", "0.00 | \n", "no fit achieved | \n", "0.0000 | \n", "0.0000 | \n", "0.0000 | \n", "
3 | \n", "Culex | \n", "5 | \n", "15 | \n", "0.00 | \n", "0.7858 | \n", "0.1691 | \n", "0.0286 | \n", "1.29 | \n", "y = 65.75+-13.5 x+0.73 x^2 | \n", "65.7550 | \n", "-13.4998 | \n", "0.7334 | \n", "
4 | \n", "Other | \n", "23 | \n", "51 | \n", "0.02 | \n", "0.0599 | \n", "0.3981 | \n", "0.1585 | \n", "4.41 | \n", "y = -4.73+2.56 x+-0.1 x^2 | \n", "-4.7257 | \n", "2.5605 | \n", "-0.0952 | \n", "
5 | \n", "Unknown | \n", "81 | \n", "175 | \n", "0.07 | \n", "0.0017 | \n", "0.3437 | \n", "0.1182 | \n", "1.72 | \n", "y = 5.35+-0.89 x+0.04 x^2 | \n", "5.3452 | \n", "-0.8887 | \n", "0.0382 | \n", "
\\n\"+\n", " \"BokehJS does not appear to have successfully loaded. If loading BokehJS from CDN, this \\n\"+\n", " \"may be due to a slow or bad network connection. Possible fixes:\\n\"+\n", " \"
\\n\"+\n", " \"\\n\"+\n",
" \"from bokeh.resources import INLINE\\n\"+\n",
" \"output_notebook(resources=INLINE)\\n\"+\n",
" \"
\\n\"+\n",
" \"\\n\"+\n \"BokehJS does not appear to have successfully loaded. If loading BokehJS from CDN, this \\n\"+\n \"may be due to a slow or bad network connection. Possible fixes:\\n\"+\n \"
\\n\"+\n \"\\n\"+\n \"from bokeh.resources import INLINE\\n\"+\n \"output_notebook(resources=INLINE)\\n\"+\n \"
\\n\"+\n \"\n", " | genus | \n", "records | \n", "pairs | \n", "coverage | \n", "pvalue | \n", "correlation | \n", "R^2 | \n", "s.e.(larvae) | \n", "equation | \n", "onset(KitMg) | \n", "rate(/KitMg) | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "All | \n", "112 | \n", "251 | \n", "0.10 | \n", "0.1430 | \n", "0.1393 | \n", "0.0192 | \n", "3.79 | \n", "y = e^(0.1 + 0.42* x) | \n", "0.0991 | \n", "0.4205 | \n", "
1 | \n", "Aedes | \n", "3 | \n", "10 | \n", "0.00 | \n", "0.5919 | \n", "0.5980 | \n", "0.3530 | \n", "6.03 | \n", "y = e^(0.21 + 0.88* x) | \n", "0.2138 | \n", "0.8800 | \n", "
2 | \n", "Anopheles | \n", "0 | \n", "0 | \n", "0.00 | \n", "1.0000 | \n", "0.0000 | \n", "0.0000 | \n", "0.00 | \n", "no fit achieved | \n", "0.0000 | \n", "0.0000 | \n", "
3 | \n", "Culex | \n", "5 | \n", "15 | \n", "0.00 | \n", "0.8985 | \n", "0.0798 | \n", "0.0064 | \n", "1.31 | \n", "y = e^(-0.05 + 2.24* x) | \n", "-0.0484 | \n", "2.2447 | \n", "
4 | \n", "Other | \n", "23 | \n", "51 | \n", "0.02 | \n", "0.1288 | \n", "0.3261 | \n", "0.1053 | \n", "4.55 | \n", "y = e^(0.08 + 1.51* x) | \n", "0.0790 | \n", "1.5093 | \n", "
5 | \n", "Unknown | \n", "81 | \n", "175 | \n", "0.07 | \n", "0.0020 | \n", "0.3377 | \n", "0.1137 | \n", "1.72 | \n", "y = e^(-0.26 + 1.89* x) | \n", "-0.2646 | \n", "1.8881 | \n", "
\\n\"+\n", " \"BokehJS does not appear to have successfully loaded. If loading BokehJS from CDN, this \\n\"+\n", " \"may be due to a slow or bad network connection. Possible fixes:\\n\"+\n", " \"
\\n\"+\n", " \"\\n\"+\n",
" \"from bokeh.resources import INLINE\\n\"+\n",
" \"output_notebook(resources=INLINE)\\n\"+\n",
" \"
\\n\"+\n",
" \"\\n\"+\n \"BokehJS does not appear to have successfully loaded. If loading BokehJS from CDN, this \\n\"+\n \"may be due to a slow or bad network connection. Possible fixes:\\n\"+\n \"
\\n\"+\n \"\\n\"+\n \"from bokeh.resources import INLINE\\n\"+\n \"output_notebook(resources=INLINE)\\n\"+\n \"
\\n\"+\n \"\n", " | genus | \n", "records | \n", "pairs | \n", "coverage | \n", "pvalue | \n", "correlation | \n", "R^2 | \n", "s.e.(larvae) | \n", "equation | \n", "onset(KitMg) | \n", "height(larvae) | \n", "power | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "All | \n", "112 | \n", "251 | \n", "0.10 | \n", "0.7564 | \n", "-0.0296 | \n", "-0.3046 | \n", "4.37 | \n", "y = 7.04 * (x - c)^0.97 | \n", "7.0444 | \n", "0.9710 | \n", "-0.0000 | \n", "
1 | \n", "Aedes | \n", "3 | \n", "10 | \n", "0.00 | \n", "0.5691 | \n", "0.6264 | \n", "0.3921 | \n", "5.85 | \n", "y = 3.24 * (x - c)^1.61 | \n", "3.2425 | \n", "1.6132 | \n", "1.3645 | \n", "
2 | \n", "Anopheles | \n", "0 | \n", "0 | \n", "0.00 | \n", "1.0000 | \n", "0.0000 | \n", "0.0000 | \n", "0.00 | \n", "no fit achieved | \n", "0.0000 | \n", "0.0000 | \n", "0.0000 | \n", "
3 | \n", "Culex | \n", "5 | \n", "15 | \n", "0.00 | \n", "0.9406 | \n", "0.0467 | \n", "-3.3051 | \n", "2.72 | \n", "y = 7.36 * (x - c)^2.05 | \n", "7.3566 | \n", "2.0519 | \n", "-0.0000 | \n", "
4 | \n", "Other | \n", "23 | \n", "51 | \n", "0.02 | \n", "0.0670 | \n", "0.3885 | \n", "0.1506 | \n", "4.43 | \n", "y = 1.97 * (x - c)^3.55 | \n", "1.9702 | \n", "3.5548 | \n", "0.5290 | \n", "
5 | \n", "Unknown | \n", "81 | \n", "175 | \n", "0.07 | \n", "0.0016 | \n", "0.3446 | \n", "0.1181 | \n", "1.72 | \n", "y = 10.91 * (x - c)^0.12 | \n", "10.9145 | \n", "0.1167 | \n", "-1.5706 | \n", "
\\n\"+\n", " \"BokehJS does not appear to have successfully loaded. If loading BokehJS from CDN, this \\n\"+\n", " \"may be due to a slow or bad network connection. Possible fixes:\\n\"+\n", " \"
\\n\"+\n", " \"\\n\"+\n",
" \"from bokeh.resources import INLINE\\n\"+\n",
" \"output_notebook(resources=INLINE)\\n\"+\n",
" \"
\\n\"+\n",
" \"\\n\"+\n \"BokehJS does not appear to have successfully loaded. If loading BokehJS from CDN, this \\n\"+\n \"may be due to a slow or bad network connection. Possible fixes:\\n\"+\n \"
\\n\"+\n \"\\n\"+\n \"from bokeh.resources import INLINE\\n\"+\n \"output_notebook(resources=INLINE)\\n\"+\n \"
\\n\"+\n \"\n", " | genus | \n", "records | \n", "pairs | \n", "coverage | \n", "pvalue | \n", "correlation | \n", "R^2 | \n", "s.e.(larvae) | \n", "equation | \n", "center(KitMg) | \n", "height(larvae) | \n", "spread(KitMg^2) | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "All | \n", "112 | \n", "251 | \n", "0.10 | \n", "1.0000 | \n", "0.0000 | \n", "0.0000 | \n", "0.00 | \n", "no fit achieved | \n", "0.0000 | \n", "0.0000 | \n", "0.0000 | \n", "
1 | \n", "Aedes | \n", "3 | \n", "10 | \n", "0.00 | \n", "0.5091 | \n", "0.6969 | \n", "0.4826 | \n", "5.40 | \n", "y = 10.63 * e^(-(x - 28.32)^2 / 2 * c) | \n", "10.6256 | \n", "28.3152 | \n", "4.1804 | \n", "
2 | \n", "Anopheles | \n", "0 | \n", "0 | \n", "0.00 | \n", "1.0000 | \n", "0.0000 | \n", "0.0000 | \n", "0.00 | \n", "no fit achieved | \n", "0.0000 | \n", "0.0000 | \n", "0.0000 | \n", "
3 | \n", "Culex | \n", "5 | \n", "15 | \n", "0.00 | \n", "1.0000 | \n", "0.0000 | \n", "0.0000 | \n", "0.00 | \n", "no fit achieved | \n", "0.0000 | \n", "0.0000 | \n", "0.0000 | \n", "
4 | \n", "Other | \n", "23 | \n", "51 | \n", "0.02 | \n", "0.0782 | \n", "0.3747 | \n", "0.1386 | \n", "4.46 | \n", "y = 12.96 * e^(-(x - 12.76)^2 / 2 * c) | \n", "12.9636 | \n", "12.7554 | \n", "40.5044 | \n", "
5 | \n", "Unknown | \n", "81 | \n", "175 | \n", "0.07 | \n", "0.0015 | \n", "0.3479 | \n", "0.1209 | \n", "1.72 | \n", "y = 2.05 * e^(-(x - 3.09)^2 / 2 * c) | \n", "2.0487 | \n", "3.0865 | \n", "10.5399 | \n", "