Usage of np.ceil() and np.repeat() in Python

This article introduces array operations with Python's NumPy library, covering practical techniques such as initializing arrays, rounding elements up, repeating a matrix along a given axis, and reshaping the result.

import numpy as np

# Initialize a 2x10 array of zeros, then use integer-array (fancy) indexing
# to set column 4 of every row to 1.
a = np.zeros((2, 10))
a[np.arange(2), 4] = 1
print("Array after indexed assignment:", a)

# np.ceil rounds each element up, toward positive infinity.
b = np.array([0.34, 1.3, 0.75, -0.23, -1.1])
b = np.ceil(b)
print("Values after ceiling:", b)
Array after indexed assignment: [[0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]]
Values after ceiling: [ 1.  2.  1. -0. -1.]
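Note that np.ceil() rounds toward positive infinity, which is why -0.23 becomes -0. rather than -1. For contrast with NumPy's other rounding functions (np.floor() rounds toward negative infinity, np.trunc() toward zero), a minimal sketch on the same input:

import numpy as np

x = np.array([0.34, 1.3, 0.75, -0.23, -1.1])
print(np.ceil(x))   # [ 1.  2.  1. -0. -1.]  toward +inf
print(np.floor(x))  # [ 0.  1.  0. -1. -2.]  toward -inf
print(np.trunc(x))  # [ 0.  1.  0. -0. -1.]  toward zero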
print('Usage of np.repeat()')
t = np.array([[1, 2, 4], [-0.23, 0.45, -1.2], [3.2, 0.44, -0.65]])
# Repeat each row 3 times along axis 0: (3, 3) -> (9, 3)
t0 = np.repeat(t, 3, axis=0)
# Reshape back to 3 rows, so each original row appears 3 times side by side: (3, 9)
t0_reshape = t0.reshape(t.shape[0], -1)
# Repeat each element 2 times along axis 1: (3, 3) -> (3, 6)
t1 = np.repeat(t, 2, axis=1)
# Reshape back to 3 columns: (6, 3)
t1_reshape = t1.reshape(-1, t.shape[1])

print('Input:', t, t.shape)
print("Repeated along axis 0:", t0, t0.shape)
print("Repeated along axis 0 and reshaped:", t0_reshape, t0_reshape.shape)
print("Repeated along axis 1:", t1, t1.shape)
print("Repeated along axis 1 and reshaped:", t1_reshape, t1_reshape.shape)

Usage of np.repeat()
Input: [[ 1.    2.    4.  ]
 [-0.23  0.45 -1.2 ]
 [ 3.2   0.44 -0.65]] (3, 3)
Repeated along axis 0: [[ 1.    2.    4.  ]
 [ 1.    2.    4.  ]
 [ 1.    2.    4.  ]
 [-0.23  0.45 -1.2 ]
 [-0.23  0.45 -1.2 ]
 [-0.23  0.45 -1.2 ]
 [ 3.2   0.44 -0.65]
 [ 3.2   0.44 -0.65]
 [ 3.2   0.44 -0.65]] (9, 3)
Repeated along axis 0 and reshaped: [[ 1.    2.    4.    1.    2.    4.    1.    2.    4.  ]
 [-0.23  0.45 -1.2  -0.23  0.45 -1.2  -0.23  0.45 -1.2 ]
 [ 3.2   0.44 -0.65  3.2   0.44 -0.65  3.2   0.44 -0.65]] (3, 9)
Repeated along axis 1: [[ 1.    1.    2.    2.    4.    4.  ]
 [-0.23 -0.23  0.45  0.45 -1.2  -1.2 ]
 [ 3.2   3.2   0.44  0.44 -0.65 -0.65]] (3, 6)
Repeated along axis 1 and reshaped: [[ 1.    1.    2.  ]
 [ 2.    4.    4.  ]
 [-0.23 -0.23  0.45]
 [ 0.45 -1.2  -1.2 ]
 [ 3.2   3.2   0.44]
 [ 0.44 -0.65 -0.65]] (6, 3)
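The repeat-then-reshape pattern above is closely related to np.tile(): repeating rows along axis 0 and reshaping back to 3 rows gives the same result as tiling the whole matrix horizontally, whereas np.repeat() along axis 1 duplicates individual elements in place, which tiling cannot express. A minimal sketch to verify the equivalence:

import numpy as np

t = np.array([[1, 2, 4], [-0.23, 0.45, -1.2], [3.2, 0.44, -0.65]])

# Each row of np.tile(t, (1, 3)) is [row, row, row], matching t0_reshape above.
print(np.array_equal(np.tile(t, (1, 3)),
                     np.repeat(t, 3, axis=0).reshape(t.shape[0], -1)))  # True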