Source code for helpers

import os
import psutil
import random
import logging
import colorlog
import multiprocessing as mp
from typing import Iterable

import numpy as np
import pandas as pd
from scipy.stats import norm


def initialize_log():
    pid = os.getpid()
    handler = colorlog.StreamHandler()
    handler.setFormatter(colorlog.ColoredFormatter())

    logging.basicConfig(handlers=[handler])
    log = logging.getLogger(f'type_infer-{pid}')
    log_level = os.environ.get('TYPE_INFER_LOG', 'DEBUG')
    log.setLevel(log_level)
    return log


log = initialize_log()


def get_nr_procs(df=None):
    if 'MINDSDB_N_WORKERS' in os.environ:
        try:
            n = int(os.environ['MINDSDB_N_WORKERS'])
        except ValueError:
            n = 1
        return n
    elif os.name == 'nt':
        return 1
    else:
        available_mem = psutil.virtual_memory().available
        if df is not None:
            max_per_proc_usage = df.size
        else:
            max_per_proc_usage = 0.2 * pow(10, 9)  # multiplier * 1GB

        proc_count = int(min(mp.cpu_count() - 1, available_mem // max_per_proc_usage))

        return max(proc_count, 1)


def seed(seed_nr: int) -> None:
    np.random.seed(seed_nr)
    random.seed(seed_nr)


[docs]def is_nan_numeric(value: object) -> bool: """ Determines if **value** might be `nan` or `inf` or some other numeric value (i.e. which can be cast as `float`) that is not actually a number. """ # noqa if isinstance(value, np.ndarray) or (type(value) != str and isinstance(value, Iterable)): return False try: value = str(value) value = float(value) except Exception: return False try: if isinstance(value, float): a = int(value) # noqa isnan = False except Exception: isnan = True return isnan
[docs]def cast_string_to_python_type(string): """ Returns None, an integer, float or a string from a string""" if string is None or string == '': return None if string.isnumeric(): # Did you know you can write fractions in unicode, and they are numeric but can't be cast to integers !? try: return int(string) except Exception: return None try: return clean_float(string) except Exception: return string
def clean_float(val): if isinstance(val, (int, float)): return float(val) if isinstance(val, float): return val val = str(val).strip(' ') val = val.replace(',', '.') val = val.rstrip('"').lstrip('"') if val in ('', '.', 'None', 'nan'): return None try: return float(val) except Exception: return None def sample_data(df: pd.DataFrame) -> pd.DataFrame: population_size = len(df) if population_size <= 50: sample_size = population_size else: sample_size = int(round(_calculate_sample_size(population_size))) population_size = len(df) input_data_sample_indexes = random.sample(range(population_size), sample_size) return df.iloc[input_data_sample_indexes] def _calculate_sample_size( population_size, margin_error=.01, confidence_level=.995, sigma=1 / 2 ): """ Calculate the minimal sample size to use to achieve a certain margin of error and confidence level for a sample estimate of the population mean. Inputs ------- population_size: integer Total size of the population that the sample is to be drawn from. margin_error: number Maximum expected difference between the true population parameter, such as the mean, and the sample estimate. confidence_level: number in the interval (0, 1) If we were to draw a large number of equal-size samples from the population, the true population parameter should lie within this percentage of the intervals (sample_parameter - e, sample_parameter + e) where e is the margin_error. sigma: number The standard deviation of the population. For the case of estimating a parameter in the interval [0, 1], sigma=1/2 should be sufficient. """ alpha = 1 - confidence_level # dictionary of confidence levels and corresponding z-scores # computed via norm.ppf(1 - (alpha/2)), where norm is # a normal distribution object in scipy.stats. # Here, ppf is the percentile point function. zdict = { .90: 1.645, .91: 1.695, .99: 2.576, .97: 2.17, .94: 1.881, .93: 1.812, .95: 1.96, .98: 2.326, .96: 2.054, .92: 1.751 } if confidence_level in zdict: z = zdict[confidence_level] else: # Inf fix if alpha == 0.0: alpha += 0.001 z = norm.ppf(1 - (alpha / 2)) N = population_size M = margin_error numerator = z**2 * sigma**2 * (N / (N - 1)) denom = M**2 + ((z**2 * sigma**2) / (N - 1)) return numerator / denom