# -*- coding: utf-8 -*-
"""Created on Tue Oct 31 20:02:11 2017@author: liuzimu"""
import pandas as pd
import random, time
import numpy as np
"""1. Establish a random datasetThe dataset contains the following columns:Gender: randomly generate 'male' or 'female'Age: random integers between 22 and 65Salary: random integers between 3000 and 10000"""
# Reproducible synthetic dataset: fix the row count and seed the generator.
n_row = 1000
random.seed(50)

# Inclusive bounds for the two randomly drawn integer columns.
age_low, age_high = 22, 65
salary_low, salary_high = 3000, 10000

# The columns are drawn one after another (gender, then age, then salary),
# so the seeded generator is always consumed in the same order.
gender = pd.Series([random.choice(['male', 'female']) for _ in range(n_row)])
age = pd.Series([random.randint(age_low, age_high) for _ in range(n_row)])
salary = pd.Series(
    [random.randint(salary_low, salary_high) for _ in range(n_row)]
)

# Combine the three series into one frame keyed by column name.
df = pd.DataFrame({"gender": gender, "age": age, "salary": salary})
# The result of head() is not stored; it is only useful in an interactive
# session where the expression value is echoed.
df.head()
"""2. Create a dictionary of strings and their corresponding functions"average": numpy.mean function"sum": numpy.sum function"""
def str2func(x):
    """Return the numpy aggregation function registered under the name `x`.

    Supported names are "average" (numpy.mean) and "sum" (numpy.sum).
    Any other hashable name raises KeyError from the dictionary lookup.
    """
    aggregators = dict(average=np.mean, sum=np.sum)
    return aggregators[x]
"""3. Calculate square errorSuppose we want the variable x to fall on the [a, b] interval,then the calculation of the squared error is as followed:if x in [a, b], then SE=0if x > b, then SE=(x/b−1)^2if x < a,then SE=(1−x/a)^2"""
def get_se(x, rng):
    """Return the squared relative error of `x` against the interval `rng`.

    `rng` unpacks into a lower and an upper bound. The result is 0 when
    `x` lies inside the closed interval; otherwise it is the square of the
    distance to the violated bound, expressed relative to that bound
    (x / upper - 1 above the interval, 1 - x / lower in every other case).
    """
    lower, upper = rng
    if lower <= x <= upper:
        return 0
    # Dividing by the bound normalizes the miss so that quantities of very
    # different magnitude produce comparable errors.
    relative_miss = x / upper - 1 if x > upper else 1 - x / lower
    return relative_miss ** 2
"""4. Calculate mean-square error"""
def get_mse(data, rows, cols, funcs, rngs, n_cond):
    """Return the mean squared error of the current selection.

    For each (column, function, range) triple taken in lockstep from
    `cols`, `funcs` and `rngs`, the function is applied to that column of
    the rows of `data` where `rows == 1`, the result is scored with
    get_se against the range, and the score divided by `n_cond` is added
    to the running total.
    """
    total = 0.0
    for column, aggregate, bounds in zip(cols, funcs, rngs):
        statistic = aggregate(data.loc[rows == 1, column])
        total += get_se(statistic, bounds) / n_cond
    return total
"""5. Search functionThe variable rows is something like [1, 1, 0, 1, 0, 0, 0..., 0, 1, 0, 0, 0],in which 1 means the row number is selected.Set the mse and min_mse as "infinite" initially to make the code more elegant.a. Create an index array with n zeros.b. Calculate the mse of indexes which are zeros.c. Record the minimum mse as min_mse during step 2,and set the corresponding index as one.d. Compare the mse and min_mse then update the value of mse.e. Break the iteration if the mse cannot be lower anymore."""
def search(data, cols, funcs, rngs, threshold=10e-6):
    """Greedily select rows of `data` so its aggregates hit the target ranges.

    Starting from an empty selection, each pass tentatively adds every row
    that is not yet selected, scores the resulting selection with get_mse,
    and permanently keeps the row that gives the lowest score. The search
    stops when the score is no longer above `threshold`, when the best
    candidate would make the score worse, or when no candidate could be
    scored at all.

    Args:
        data: frame to select rows from; its index labels the returned mask.
        cols: column names, one per condition.
        funcs: aggregation names understood by str2func, one per condition.
        rngs: [low, high] target interval for each condition.
        threshold: score at or below which the search is considered done.

    Returns:
        A pandas Series indexed like `data`, holding 1 for selected rows
        and 0 for the others.
    """
    n_row = data.shape[0]
    n_cond = len(cols)
    # Selection mask: 1 marks a chosen row, 0 an available one.
    rows = pd.Series(np.zeros(n_row, dtype=np.int32), index=data.index)
    # Resolve the aggregation names to callables once, up front.
    funcs = [str2func(x) for x in funcs]
    i = 0
    mse = float('inf')
    while mse > threshold:
        min_mse = float('inf')
        # Reset per pass so a candidate from an earlier pass is never reused.
        min_mse_idx = None
        for idx in data.loc[rows == 0].index:
            # Try the row, score the selection, then undo the trial.
            rows.loc[idx] = 1
            tmp_mse = get_mse(data, rows, cols, funcs, rngs, n_cond)
            if tmp_mse < min_mse:
                min_mse = tmp_mse
                min_mse_idx = idx
            rows.loc[idx] = 0
        # No candidate exists when every row is already selected, the frame
        # is empty, or no trial scored below infinity (e.g. NaN scores).
        # Stop here instead of indexing with a missing label.
        if min_mse_idx is None or min_mse > mse:
            break
        mse = min_mse
        rows.loc[min_mse_idx] = 1
        # print loss
        print("%dtimes iteration, mse%.3f" % (i+1, mse))
        i += 1
    return rows
"""6. Test the search fucntion and show results."""
# Blank lines separate the demo output from anything printed before it.
print("\n" * 3)
print("Test search:")

# Start the clock just before the search; it is read again in the last print.
run_time = time.time()
idxs = search(
    data=df,
    cols=["age", "salary"],
    funcs=["average", "sum"],
    rngs=[[35, 40], [100000, 120000]],
)

# Keep only the rows the search marked with 1 and summarize them.
search_result = df.loc[idxs == 1]
average_age = search_result["age"].mean()
total_salary = search_result["salary"].sum()

print()
print("Target average age is 35 to 40 and target total salary is 100000 to 120000")
print("Average age is%.2fand total salary is%d" % (average_age, total_salary))
print("Run time is%.2fs" % (time.time() - run_time))