文章目录
12.对实验结果进行one-sided Wilcoxon rank-sum p-value
import scipy.stats as stats

# One-sided Mann-Whitney U test (a.k.a. Wilcoxon rank-sum test):
#   H0: our_model <= baseline
#   H1: our_model >  baseline
# Reject H0 when p-value < 0.05.
baseline = [1, 2, 3, 4, 5]
our_model = [6, 7, 8, 9]
# alternative="greater" makes the test one-sided in favour of the first sample.
# (A Welch t-test alternative would be stats.ttest_ind(x, y, equal_var=False).)
stat_val, p_val = stats.mannwhitneyu(our_model, baseline, alternative="greater")
# NOTE: the statistic is the Mann-Whitney U, not a t-statistic — the original
# print label was misleading.
print('Mann-Whitney U = %6.3f, p-value = %6.4f' % (stat_val, p_val))
# Example output: U = 20.000, p-value ~ 0.008 (exact method; the normal
# approximation gives ~0.01).  p < 0.05, so H0 is rejected and our_model
# is significantly greater than baseline.
11.张量维度操作(拼接、维度扩展、压缩、转置、重复……)
https://zhuanlan.zhihu.com/p/31495102
10.将一个字符序列转变为one-hot编码
"""
例如:
输入:['a','b','c','a']
输出:
[ [1,0,0],
[0,1,0],
[0,0,1],
[1,0,0]
]"""
def _label_string2matrix(nodes_labels_str):
b, c = np.unique(nodes_labels_str, return_inverse=True)
class_num = len(b)
sample_num = len(c)
class_num = class_num
nodes_labels = torch.zeros((sample_num, class_num))
i = 0
for la in c:
nodes_labels[i, la] = 1.0
i = i + 1
return nodes_labels
9. 多进程
类型一:多个进程执行相同的任务(进行模块切分)
from concurrent.futures import ProcessPoolExecutor

# Template for parallelizing a big loop such as:
#
#     for item in a_very_big_list:
#         ...per-item work...
#
# You must make sure the iterations are independent of each other,
# otherwise the loop cannot be parallelized.  The per-item work typically
# involves:
#   1. your logic functions
#   2. some parameters (read only)
#   3. some variables (that you want to collect / return)
#
# (The original skeleton wrapped these notes in a loop over an empty list
# whose body was only a discarded string literal — dead code, removed.)
a_very_big_list = []
parameters = None
variables = None
def _fun(list_split, parameters):
_variables = []
for item in list_split:
_variables = parameters
return _variables
def fun(a_very_big_list, parameters=None, workers=8):
    """Split the list into `workers` chunks and process them in parallel.

    Each chunk is handed to `_fun` in its own process via
    ProcessPoolExecutor.map; the per-chunk result lists are concatenated
    in chunk order and returned.

    :param a_very_big_list: full list of mutually independent work items
    :param parameters: read-only parameters forwarded to every worker
    :param workers: number of chunks / worker processes
    :return: concatenation of all per-chunk result lists
    """
    list_split = []
    step = int(len(a_very_big_list) / workers)
    for i in range(workers):
        if i != workers - 1:
            # print('slice: ', i * step, ' ', (i + 1) * step)
            split = a_very_big_list[i * step:(i + 1) * step]
        else:
            # The last chunk also takes the remainder left over by the
            # integer division.
            # print('slice: ', i * step)
            split = a_very_big_list[i * step:]
        list_split.append(split)
    variables = []
    # (Original message referenced "wblog_content_split", a leftover name
    # from the project this snippet was copied from.)
    print("number of chunks: ", len(list_split))
    with ProcessPoolExecutor(max_workers=workers) as executor:
        for _variables in executor.map(_fun,
                                       list_split,
                                       [parameters for i in range(workers)]):
            # Reassemble the per-process results.  How to combine them
            # depends on the data: for independent dicts use dict.update,
            # for lists concatenate with +.
            variables = variables + _variables
    return variables
类型二:多个进程执行不同的任务
import multiprocessing


def fun_1(parameter):
    # Placeholder task executed in its own process.
    pass


def fun_2(parameter):
    pass


def fun_3():
    pass


def fun_4():
    pass


# Execution roadmap of the two processes:
#     fun_1 | fun_2
#       |   |
#       v   v
#     fun_3 | fun_4
# Note that processes do NOT share data.  Do not try to use this pattern
# inside a class: several class objects would actually be created, and each
# process would only modify the variables of its own copy.
if __name__ == "__main__":
    # The __main__ guard is required by multiprocessing: on spawn-based
    # platforms (Windows, macOS) the child re-imports this module, and an
    # unguarded Process(...).start() would spawn processes recursively.
    parameter = None
    p1 = multiprocessing.Process(target=fun_1, args=(parameter,))
    p2 = multiprocessing.Process(target=fun_2, args=(parameter,))
    p1.start()
    p2.start()
    p1.join()   # wait for fun_1; fun_2 may still be running
    fun_3()
    p2.join()
    fun_4()
8. 打乱数据并保证索引按照正常的排序
# Shuffle all rows of the DataFrame (frac=1 samples 100% of rows without
# replacement) and rebuild a clean 0..n-1 index; drop=True discards the old,
# now-scrambled index instead of keeping it as a column.
df.sample(frac=1).reset_index(drop=True)
7. 解析json字符串并返回支持属性访问的对象
from argparse import Namespace
import json
def json_to_object(data):
    """Parse a JSON string into nested Namespace objects.

    Every JSON object becomes an argparse.Namespace, so fields can be read
    with attribute access (obj.key) instead of subscripting (obj["key"]).
    """
    def _as_namespace(mapping):
        return Namespace(**mapping)

    return json.loads(data, object_hook=_as_namespace)
# Example: load a JSON config file and expose its fields via attribute access
# (e.g. args.lr rather than args["lr"]).  Assumes "default.json" exists in the
# working directory.
with open("default.json") as f:
    args = json_to_object(f.read())
6. 概率化编程
def add_unknown_words(word_vecs, vocab, min_df=1, k=300):
    """Fill in random vectors for vocabulary words missing from word_vecs.

    For words that occur in at least `min_df` documents but have no
    pre-trained vector, create a fresh k-dimensional vector drawn uniformly
    from [-0.25, 0.25) — roughly the same variance as pre-trained ones.
    Mutates `word_vecs` in place and prints each word it adds.
    """
    for token in vocab:
        if token in word_vecs or vocab[token] < min_df:
            continue
        word_vecs[token] = np.random.uniform(-0.25, 0.25, k)
        print(token)
def pro(pro):
    # Template: execute a branch with probability `pro` (a float in [0, 1]).
    # NOTE(review): the parameter shadows the function name, so the function
    # cannot refer to itself inside the body — consider renaming one of them.
    if random.random() <= pro:
        pass  # place the probabilistic action here
5. 定义一段弃用的代码
有时候有些函数功能我们打算弃用,但是担心版本不兼容,为了保持接口的兼容性,可以仿照下面的代码进行编程
# Excerpt (from gensim) showing how to keep a deprecated method alive for
# backward compatibility: the decorator emits a warning, the body delegates
# to the replacement API so old callers keep working.
@deprecated("Method will be removed in 4.0.0, use self.wv.__getitem__() instead")
def __getitem__(self, words):
    """
    Deprecated. Use self.wv.__getitem__() instead.

    Refer to the documentation for `gensim.models.keyedvectors.Word2VecKeyedVectors.__getitem__`
    """
    # Pure delegation — no behavior of its own beyond the deprecation warning.
    return self.wv.__getitem__(words)
4. 计算文件行数
def count_lines(f):
    """Return the number of lines in file `f`, or 0 if `f` is not a regular file.

    :param f: path to the file
    :return: line count (int)
    """
    if not path.isfile(f):  # Test whether the path is a regular file
        return 0
    # `with` guarantees the handle is closed; the original called open(f)
    # inside the generator and never closed it (resource leak).
    # sum(1 for _ in fh) counts lines without loading the file into memory,
    # equivalent to sum([1 for line in fh]).
    with open(f) as fh:
        return sum(1 for _ in fh)
3. 计算一组文件里单词的词频
#! /usr/bin/env python
# -*- coding: utf-8 -*-
from concurrent.futures import ProcessPoolExecutor
from collections import Counter
def count_words(file):
    """Return a Counter of whitespace-separated token frequencies in `file`."""
    counts = Counter()
    with open(file, 'r') as handle:
        for line in handle:
            counts.update(line.strip().split())
    return counts
def count_textfiles(files, workers=1):
    """Aggregate word frequencies over a group of files in parallel.

    Each file is counted by `count_words` in a worker process; the partial
    Counters are merged into one total Counter and returned.
    """
    total = Counter()
    with ProcessPoolExecutor(max_workers=workers) as pool:
        for partial in pool.map(count_words, files):
            total.update(partial)
    return total
2. 代码计时
from time import time

t0 = time()
# ... the code you want to time goes here ...
# (The original placeholder `your code here` was not valid Python.)
t1 = time()
# Report the wall-clock time between the two time() calls.  The original
# message "make_directed: added missing edges {}s" was a leftover from the
# project this snippet was copied out of.
print('elapsed: {}s'.format(t1 - t0))
1. 创建一个可迭代的文件对象
class WalksCorpus(object):
    """A lazily re-iterable corpus over walk files stored on disk.

    A user-defined type is iterable as soon as it provides __iter__() (or
    __getitem__()); this class implements __iter__ as a generator so the
    walks are streamed from disk rather than held in memory, and the corpus
    can be iterated any number of times.

    Given file_list = ['output.walks.0', 'output.walks.1'] where
    'output.walks.0' contains
        8 2 4 8 2 31 34 28 24 28 25 28 3 10 3
        2 1 22 1 18 2 8 2 4 2 18 1 8 3 2 1
    and 'output.walks.1' contains
        32 25 26 32 29 3 33 31 9 33 16 34
        6 11 1 20 34 30 24 30 24 28 3 1 14
    iterating yields one list of token strings per line, e.g.
        ['8', '2', '4', ..., '3'], ['2', '1', '22', ..., '1'], ...

    For more on iteration and yield, see
    https://liam0205.me/2017/06/30/understanding-yield-in-python/
    """

    def __init__(self, file_list):
        """:param file_list: paths of the walk files written to disk by
        write_walks_to_disk"""
        self.file_list = file_list

    def __iter__(self):
        # Stream file by file, line by line; each line becomes a list of
        # whitespace-separated tokens.
        for walk_path in self.file_list:
            with open(walk_path, 'r') as handle:
                for row in handle:
                    yield row.split()
# WalksCorpus expects a LIST of file paths.  Passing a bare string (as the
# original did) would make __iter__ loop over the string character by
# character, treating each single character as a filename.
walk_files = ["your_file.txt"]
walks = WalksCorpus(walk_files)