1.完整代码
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
import lightgbm as lgb
# Read the two input tables from the working directory.
df_original = pd.read_csv("train_data.csv")
df_submit = pd.read_csv("sample_submission.csv")

# Row count of the training table, captured before stacking.
# NOTE(review): presumably used later to split `df` back into its
# train / submission parts -- confirm against the rest of the script.
n_original = len(df_original)

# Stack training rows on top of submission rows and renumber the rows 0..N-1
# so the combined frame has a single contiguous index.
df = pd.concat([df_original, df_submit], axis=0, ignore_index=True)
def siRNA_feat_builder(s: pd.Series, anti: bool = False):
name = "anti" if anti else "sense"
df = s.to_frame()
df[f"feat_siRNA_{
name}_seq_len"] = s.str.len()
nucleotides = "AUGC"
for pos in [0, -1]:
for c in nucleotides:
df[f"feat_siRNA_{
name}_seq_{
c}_{
'front' if pos == 0 else 'back'}"] = (s.str[pos] == c)
patterns = [
("AA", "UU"), ("GA", "UU"), ("CA", "UU"), ("UA", "UU"),
("UU", "AA"), ("UU"