简易node ts代码给json进行sort或者diff

compare工具的问题在于:如果这些json的节点未排序,比对结果基本上会显示全部不同。
现在用node写几行代码,既可以输出两个json的不同之处,也可以输出一个所有属性节点都有序的json。

注意:这里数组元素也会以对象形式显示,对象的key即为数组元素的index。因为如果按数组处理,就需要处理数组元素是对象的情况:如果数组的元素是对象,那么排序规则是什么?(或者数组应该原样显示?那么如果是字符串数组,难道不排序一下?)
所以没有太完美的解决办法。反正我们的目的是比对和查找,不用于生产,那么统一处理带来的代码简洁可以弥补这点瑕疵。

tsconfig.json

{
  "compilerOptions": {
    "module": "commonjs",
    "target": "es6",
    "strict": true,
    "strictFunctionTypes": true,
    "strictPropertyInitialization": true,
    "strictNullChecks": true,
    "allowJs": true,
    "checkJs": true,
    "removeComments": true,
    "noImplicitAny": true,
    "noImplicitThis": true,
    "noImplicitReturns": true,
    "noImplicitOverride": true,
    "sourceMap": true,
    "declaration": true,
    "rootDir":"./src",
    "outDir":"./out",
    "experimentalDecorators": true,
    "watch":true
  },
  "include":[
    "./src/**/*"
  ],
  "exclude":[
//    "./src/test/*"
  ]
}

package.json

{
  "name": "json-sort-diff",
  "version": "1.0.0",
  "scripts": {
    "build": "tsc --build tsconfig.json"
  },
  "dependencies": {
    "@types/node": "^18.11.9"
  }
}

json-sort-diff.ts

const fs = require('fs');

/**
 * Command-line entry point, executed immediately when the script is loaded.
 *
 * - One path argument: prints that JSON file with its keys sorted at every level.
 * - Two path arguments: prints what differs going from the first file to the second.
 * - Any other argument count: prints usage examples and exits with status 0.
 */
(function main() {
    const argv = process.argv;
    // argv[0] ==> node
    // argv[1] ==> the js file

    // Reads a file as UTF-8 text and parses it as JSON. A leading byte-order
    // mark is removed first because JSON.parse rejects it with a SyntaxError.
    const readJson = (file: string): any => {
        const text: string = fs.readFileSync(file, 'utf8');
        return JSON.parse(text.charCodeAt(0) === 0xfeff ? text.slice(1) : text);
    };

    if (argv.length === 3) {
        const obj = readJson(argv[2]);
        console.log(JSON.stringify(orderedJson(obj), null, 2));
    } else if (argv.length === 4) {
        const from = readJson(argv[2]);
        const to = readJson(argv[3]);
        diff(from, to);
    } else {
        console.log("Examples:");
        console.log("1. Dump sorted json: node json-sort-diff.js path-of-json");
        console.log("2. Diff two json: node json-sort-diff.js path-of-from-json path-of-to-json");
        process.exit(0);
    }
})();

/**
 * Returns a copy of a parsed JSON value in which the keys of every object are
 * inserted in sorted order, at every nesting level.
 *
 * Arrays are intentionally converted to plain objects keyed by element index,
 * so they are handled by the same code path as objects.
 *
 * @param unordered - Any value produced by `JSON.parse`.
 * @returns `null` and primitives unchanged; for objects and arrays, a new
 *          plain object with recursively ordered content.
 */
function orderedJson(unordered: any): any {
    // `typeof null` is 'object', so null has to be excluded explicitly;
    // otherwise Object.keys(null) throws a TypeError.
    if (unordered === null || typeof unordered !== 'object') {
        return unordered;
    }
    return Object.keys(unordered).sort().reduce((obj: any, key: string) => {
        obj[key] = orderedJson(unordered[key]);
        return obj;
    }, {});
}

/**
 * Prints one line per node visited by `walk`: leaves are printed as
 * `path -> #(value)`, all other nodes as the bare path.
 *
 * @param obj - The value handed to `walk` as the traversal root.
 */
function dump(obj: any) {
    walk(obj, '', (nodePath, _key, nodeValue, isLeaf) => {
        const line = isLeaf ? `${nodePath} -> #(${nodeValue})` : nodePath;
        console.log(line);
    });
}

/**
 * Prints what is lost or changed when going from `from` to `to`.
 *
 * Two lists are written to the console:
 * - "List of not found": paths that exist in `from` but not in `to`. Only the
 *   topmost missing path of a missing subtree is listed.
 * - "List of different value": paths present on both sides whose values differ.
 *
 * The comparison is one-directional: keys that exist only in `to` are not
 * reported. Arrays are compared like objects keyed by element index.
 *
 * @param from - Parsed JSON used as the reference.
 * @param to - Parsed JSON compared against the reference.
 */
function diff(from: any, to: any) {
    const noFounds: string[] = [];
    const diffInfos: DiffInfo[] = [];

    // `typeof null` is 'object', so null must be excluded to be treated as a leaf.
    const isContainer = (v: unknown): v is Record<string, unknown> =>
        typeof v === 'object' && v !== null;

    // Primitives print exactly as in a template literal; containers are
    // serialized so they do not show up as "[object Object]".
    const show = (v: unknown): string => (isContainer(v) ? JSON.stringify(v) : String(v));

    // Walks both trees together. Working with real keys instead of splitting a
    // '/'-joined path keeps keys that themselves contain '/' intact.
    const visit = (fromNode: Record<string, unknown>, toNode: unknown, path: string): void => {
        for (const [key, value] of Object.entries(fromNode)) {
            const current = path + '/' + key;
            // Own-property check so inherited members such as "constructor"
            // are not mistaken for existing keys.
            if (!isContainer(toNode) || !Object.prototype.hasOwnProperty.call(toNode, key)) {
                noFounds.push(current);
                continue;
            }
            const other = toNode[key];
            if (isContainer(value)) {
                if (isContainer(other)) {
                    visit(value, other, current);
                } else {
                    // Container on one side, leaf on the other: report once
                    // here rather than descending into a non-object.
                    diffInfos.push({ path: current, from: value, to: other });
                }
            } else if (other !== value) {
                diffInfos.push({ path: current, from: value, to: other });
            }
        }
    };

    if (isContainer(from)) {
        visit(from, to, '');
    } else if (from !== to) {
        // Top-level primitive or null: there are no keys to walk.
        diffInfos.push({ path: '/', from, to });
    }

    console.log('\x1b[33m%s\x1b[0m', 'List of not found:');
    for (const noFound of noFounds.sort()) {
        console.log(noFound);
    }
    console.log('\x1b[33m%s\x1b[0m', 'List of different value:');
    for (const diffInfo of diffInfos.sort((a, b) => a.path.localeCompare(b.path))) {
        console.log(`${diffInfo.path} diff -> from:${show(diffInfo.from)}; to:${show(diffInfo.to)}`);
    }
}

/**
 * Depth-first traversal that invokes `callback` once for every entry reachable
 * from `object`, parents before their children.
 *
 * Paths are built by joining keys with '/', so a key that itself contains '/'
 * cannot be told apart from nesting in the resulting path string.
 *
 * @param object - Root value to traverse; `null`/`undefined` yield no callbacks.
 * @param path - Path prefix of `object`; pass `''` for the root.
 * @param callback - Receives the entry's path, key and value, plus `leaf`,
 *                   which is `false` for non-null objects/arrays (these are
 *                   descended into) and `true` for everything else.
 */
export function walk(object: any, path: string, callback: (path: string, key: string, value: any, leaf: boolean) => void) {
    // Object.entries throws on null/undefined; there is nothing to visit.
    if (object == null) {
        return;
    }
    for (const [key, value] of Object.entries(object)) {
        const current = path + '/' + key;
        // `typeof null` is 'object': without the null check a null value would
        // be reported as a branch and then crash the recursive call.
        const isBranch = typeof value === 'object' && value !== null;
        callback(current, key, value, !isBranch);
        if (isBranch) {
            walk(value, current, callback);
        }
    }
}

/** One entry of the "different value" list collected by `diff`. */
interface DiffInfo {
    /** '/'-joined path of the node, as printed in the report. */
    path: string;
    /** Value found at `path` in the `from` document. */
    from: any;
    /** Value found at `path` in the `to` document. */
    to: any;
}
很好,已收到擴充後的「my-app」完整目錄設計。以下提供針對該結構的檢視、優化建議與可落地的補充,確保契約單一真相、雙引擎協作、Artifact 驅動的 Mock/Flow、以及 CI/CD 一致性。 ## 綜合評估 - 優點 - repo 作為 Artifact 核心,涵蓋 api/flows/data/contracts/validators,契約先於實作,方向正確。 - cmd 與 internal 採用整潔架構分層,利於替換 repository 實作(記憶體/SQL)。 - web 支援雙模式資料訪問(Artifact/真實 API)且分層清晰(core/services、features/data-access)。 - scripts、deployments、docker、Makefile、CI 工作流完整,利於工程化落地。 - 可優化 - contracts 與 api 契約生成鏈路需補「單一真相輸出 openapi.yaml」並同步至 web/gen types 與 Go 端 oapi-codegen。 - repo/api/*.json 與 repo/flows/*.yaml 關係需要 index.json 統一管理 method/path → flow/data 定義。 - validators 的 zod.ts/ajv.ts 建議由腳本從 openapi.yaml 半自動生成,避免漂移。 - docker-compose 建議提供三容器:api、artifact-gateway、web,並提供 /mock、/repo、/api 三通道代理與熱重載。 - CI 建議加入 openapi-diff 檢查破壞性變更與 artifacts schema 校驗。 以下在不更動你既有主要結構的前提下,增補少量目錄與文件,令流水線可立即落地。 ## 建議最小增補 - repo/ - api/index.json:確保所有端點集中路由註冊。 - flows/ 中補充 users.*.flow.yaml 或以 users.flow.yaml 拆分為多文件便於維護。 - scripts/ - gen-openapi.sh:從 repo/contracts 合成 openapi.yaml,並生成後端/前端型別。 - validate-artifact.mjs:校驗 repo/api 與 flows/data 契約一致。 - web/ - environments/environment.ts 與 environment.prod.ts:加入 useArtifact 與 repoBaseUrl/mockBaseUrl/apiBaseUrl。 - core/devtools(可選):資料源切換面板與請求日誌面板。 - .github/workflows/ci.yml - 新增 openapi 生成、types 生成、go test、前端 build、artifact 驗證步驟。 - docker-compose.yml - 提供 dev 佈署:artifact-gateway、api、web(Web 使用 Angular dev server 並代理到 gateway/api)。 ## 修正版完整目錄(在你提供的基礎上,標註新增或重點文件) ``` my-app/ ├── repo/ │ ├── api/ │ │ ├── index.json # 路由索引(method+path → def/flow/data)(重點) │ │ ├── users.get.json │ │ ├── users.id.get.json │ │ └── users.post.json │ ├── contracts/ │ │ ├── common.yaml │ │ └── users.yaml │ ├── data/ │ │ ├── users.list.v1.json │ │ ├── users.detail.u_1.v1.json │ │ └── seed.users.v1.json │ ├── flows/ │ │ ├── users.list.flow.yaml # 建議:分檔 flows(清晰覆蓋 GET/POST) │ │ ├── users.detail.flow.yaml │ │ └── users.create.flow.yaml │ └── validators/ │ ├── zod.ts # 可由 openapi 生成(建議半自動) │ └── ajv.ts ├── cmd/ │ ├── api/ │ │ └── main.go │ └── artifact-gateway/ │ └── main.go # 讀取 
repo,掛載 /mock /repo,對照 index.json ├── internal/ │ ├── domain/ │ │ └── user.go │ ├── usecase/ │ │ └── user_usecase.go │ ├── repository/ │ │ ├── user_repo_mem.go │ │ └── user_repo_sql.go │ └── delivery/ │ └── http/ │ ├── middleware.go │ └── user_handler.go ├── platform/ │ ├── logger.go │ ├── config.go │ └── artifact/ │ ├── loader.go # 讀 index.json / flows / data │ └── validator.go # 走 ajv/jsonschema 校驗 ├── web/ │ ├── package.json │ ├── angular.json │ ├── tsconfig.json │ ├── proxy.conf.json │ └── src/ │ ├── main.ts │ ├── index.html │ ├── environments/ │ │ ├── environment.ts # { useArtifact: true, repoBaseUrl, mockBaseUrl, apiBaseUrl } │ │ └── environment.prod.ts # { useArtifact: false, ... } │ └── app/ │ ├── app.routes.ts │ ├── core/ │ │ ├── interceptors/ │ │ │ ├── artifact.interceptor.ts │ │ │ ├── auth.interceptor.ts │ │ │ └── error.interceptor.ts │ │ ├── guards/ │ │ │ └── auth.guard.ts │ │ └── services/ │ │ ├── artifact/ │ │ │ ├── artifact-client.service.ts │ │ │ ├── registry.service.ts │ │ │ └── validators.service.ts │ │ └── api/ │ │ └── api-client.service.ts │ └── features/ │ └── users/ │ ├── users.routes.ts │ ├── components/ │ │ ├── user-list/ │ │ └── user-detail/ │ └── data-access/ │ ├── users.api.service.ts │ ├── users.store.ts │ ├── users.effects.ts │ └── users.reducer.ts ├── scripts/ │ ├── gen-openapi.sh # 生成 openapi,Go/TS 型別 │ ├── validate-artifact.mjs # 校驗 repo api/flows/data 與 contracts 一致 │ ├── migrate-data.sh │ └── wait-for.sh ├── deployments/ │ ├── k8s/ │ │ ├── api-deployment.yaml │ │ ├── gateway-deployment.yaml │ │ └── ingress.yaml │ └── nginx/ │ └── default.conf ├── docker/ │ ├── api.Dockerfile │ ├── web.Dockerfile │ └── artifact-gateway.Dockerfile ├── docker-compose.yml # 建議包含 api、gateway、web ├── Makefile # 一鍵生成/啟動/驗證 ├── .gitignore ├── .env.example └── .github/ └── workflows/ ├── ci.yml └── cd.yml ``` ## 關鍵文件範例與建議 ### 1) repo/api/index.json(示例) ```json { "version": "1.0.0", "endpoints": [ { "id": "users.get", "method": "GET", "path": "/users", 
"flow": "users.list.flow.yaml", "responseSchema": "#/components/schemas/UserList" }, { "id": "users.id.get", "method": "GET", "path": "/users/:id", "flow": "users.detail.flow.yaml", "responseSchema": "#/components/schemas/User" }, { "id": "users.post", "method": "POST", "path": "/users", "flow": "users.create.flow.yaml", "requestSchema": "#/components/schemas/UserCreate", "responseSchema": "#/components/schemas/User" } ] } ``` ### 2) flows(最小樣例) ```yaml # repo/flows/users.list.flow.yaml steps: - op: filterAndPaginate args: _dataset: "users" page: "$request.query.page" size: "$request.query.size" q: "$request.query.q" fields: ["name","email"] sortBy: "name:asc" - op: respond args: status: 200 bodyFrom: "$ctx.result" ``` ### 3) web 環境切換(最小) ```ts // web/src/environments/environment.ts export const environment = { production: false, useArtifact: true, repoBaseUrl: 'http://localhost:8787/repo', mockBaseUrl: 'http://localhost:8787/mock', apiBaseUrl: 'http://localhost:8080' }; ``` ### 4) Angular 雙源客戶端(最小) ```ts @Injectable({ providedIn: 'root' }) export class UsersApiService { constructor( private artifact: ArtifactClientService, private api: ApiClientService ) {} getUsers(params: any) { return environment.useArtifact ? this.artifact.fetch<User[]>('users.get', params) : this.api.get<User[]>('/v1/users', { params }); } } ``` ### 5) CI 工作流關鍵步驟(摘要) ```yaml # .github/workflows/ci.yml - name: Generate OpenAPI and Types run: scripts/gen-openapi.sh - name: Validate Artifacts run: node scripts/validate-artifact.mjs - name: Go Test run: go test ./... 
- name: Web Build run: npm ci --prefix web && npm run build --prefix web ``` ### 6) docker-compose(建議) - 服務:artifact-gateway(:8787)、api(:8080)、web(:4200 dev 或 Nginx :80) - 目的:前端可透過 proxy.conf.json 訪問 /mock、/repo、/api,提升開發效率。 ## 下一步落地清單 - 補上 scripts/gen-openapi.sh 與 validate-artifact.mjs 最小版本。 - 實作 artifact-gateway 主程式與 flows 運行器(可先無 RBAC/Schema,逐步加)。 - 前端 ArtifactClientService 與 registry.service 讀取 repo/api/index.json,自動綁定端點。 - 在 Makefile 加入目標:gen.openapi、validate.artifact、dev.up(docker-compose)、gw.run、api.run、web.run。 如需,我可以直接提供上述關鍵文件的可運行樣板(Shell/Node/Go/TS),確保克隆即跑,並在 CI 內自動生成契約與型別。
最新发布
10-28
可是我使用repo api. 仍然還要加入open. Aip的設計嗎?下面提供一套可直接放入 myapp/repo 的「完整範例檔」與對應的最小可執行數據集,以及前端與 Gateway 的單元測試樣板。目標:clone 後不需後端即可用 /mock 與 /repo 跑起來,並在 CI 驗證 artifact 正確性。 # 目錄結構 ``` repo/ ├─ api/ │ ├─ index.json │ ├─ users.get.json │ ├─ users.id.get.json │ └─ users.post.json ├─ contracts/ │ ├─ common.yaml │ └─ users.yaml ├─ data/ │ ├─ users.list.v1.json │ ├─ users.detail.u_1.v1.json │ └─ seed.users.v1.json ├─ flows/ │ └─ users.flow.yaml └─ validators/ ├─ zod.ts └─ ajv.ts ``` --- ## repo/api/index.json ```json { "version": "1.0.0", "endpoints": { "GET /v1/users": "users.get.json", "GET /v1/users/{id}": "users.id.get.json", "POST /v1/users": "users.post.json" } } ``` ## repo/api/users.get.json ```json { "method": "GET", "path": "/v1/users", "request": { "query": { "page": { "type": "integer", "default": 1, "minimum": 1 }, "size": { "type": "integer", "default": 20, "minimum": 1, "maximum": 100 }, "q": { "type": "string", "optional": true } } }, "response": { "status": 200, "body": { "$ref": "../data/users.list.v1.json" } }, "errors": { "429": { "message": "too many requests" } }, "meta": { "contract": "../contracts/users.yaml#list", "rbac": "read" } } ``` ## repo/api/users.id.get.json ```json { "method": "GET", "path": "/v1/users/{id}", "request": { "params": { "id": { "type": "string", "pattern": "^u_\\w+" } } }, "response": { "status": 200, "body": { "$ref": "../data/users.detail.u_1.v1.json" } }, "errors": { "404": { "message": "user not found" } }, "meta": { "contract": "../contracts/users.yaml#getById", "rbac": "read" } } ``` ## repo/api/users.post.json ```json { "method": "POST", "path": "/v1/users", "request": { "headers": { "content-type": { "const": "application/json" } }, "body": { "type": "object", "required": ["email", "name"], "properties": { "email": { "type": "string", "format": "email" }, "name": { "type": "string", "minLength": 1 } } } }, "response": { "status": 201, "body": { "id": "u_seed_new", "email": "new@example.com", "name": "New User" } }, "errors": { 
"400": { "message": "invalid body" }, "409": { "message": "email already exists" } }, "meta": { "contract": "../contracts/users.yaml#create", "rbac": "write", "idStrategy": "uuid" } } ``` --- ## repo/contracts/common.yaml ```yaml version: 1 name: common components: schemas: ErrorResponse: type: object required: [error] properties: error: type: string PageMeta: type: object required: [page, size, total] properties: page: { type: integer, minimum: 1 } size: { type: integer, minimum: 1, maximum: 100 } total: { type: integer, minimum: 0 } security: rbac: roles: - admin - user rules: read: anyOf: - role: admin - role: user write: anyOf: - role: admin ``` ## repo/contracts/users.yaml ```yaml version: 1 name: users components: schemas: User: type: object required: [id, email, name] properties: id: { type: string } email: { type: string, format: email } name: { type: string } CreateUserRequest: type: object required: [email, name] properties: email: { type: string, format: email } name: { type: string, minLength: 1 } UsersListResponse: type: object required: [items, total] properties: items: type: array items: $ref: '#/components/schemas/User' total: { type: integer, minimum: 0 } endpoints: list: method: GET path: /v1/users query: page: { type: integer, minimum: 1 } size: { type: integer, minimum: 1, maximum: 100 } q: { type: string } responses: "200": content: application/json: schema: $ref: '#/components/schemas/UsersListResponse' rbac: read getById: method: GET path: /v1/users/{id} params: id: { type: string } responses: "200": content: application/json: schema: $ref: '#/components/schemas/User' "404": content: application/json: schema: $ref: '../common.yaml#/components/schemas/ErrorResponse' rbac: read create: method: POST path: /v1/users requestBody: content: application/json: schema: $ref: '#/components/schemas/CreateUserRequest' responses: "201": content: application/json: schema: $ref: '#/components/schemas/User' "400": content: application/json: schema: $ref: 
'../common.yaml#/components/schemas/ErrorResponse' "409": content: application/json: schema: $ref: '../common.yaml#/components/schemas/ErrorResponse' rbac: write deprecations: - note: "v1 list will move to cursor pagination in v2" ``` --- ## repo/data/users.list.v1.json ```json { "items": [ { "id": "u_1", "email": "a@example.com", "name": "Alice" }, { "id": "u_2", "email": "b@example.com", "name": "Bob" } ], "total": 2 } ``` ## repo/data/users.detail.u_1.v1.json ```json { "id": "u_1", "email": "a@example.com", "name": "Alice" } ``` ## repo/data/seed.users.v1.json ```json [ { "id": "u_1", "email": "a@example.com", "name": "Alice" }, { "id": "u_2", "email": "b@example.com", "name": "Bob" } ] ``` --- ## repo/flows/users.flow.yaml(完整流程示例) ```yaml usecase: "users module" shared: datasets: users: "../data/seed.users.v1.json" validators: listReq: "contracts.users.endpoints.list.query" listRes: "contracts.users.components.schemas.UsersListResponse" userRes: "contracts.users.components.schemas.User" rbac: read: "contracts.common.security.rbac.rules.read" write: "contracts.common.security.rbac.rules.write" flows: list: route: "GET /v1/users" steps: - rbac: require: read # 依 common.yaml 的 rbac 規則,檢查 x-role - validate: target: listReq # 驗證 query 參數 - query: from: users # 使用 datasets.users 資料 op: filterAndPaginate args: page: "$request.query.page" size: "$request.query.size" q: "$request.query.q" - validate: target: listRes # 回傳前驗證 schema - respond: status: 200 getById: route: "GET /v1/users/{id}" steps: - rbac: { require: read } - query: from: users op: findById args: { id: "$request.params.id" } - if: when: "$ctx.result == null" then: - respond: { status: 404, body: { error: "user not found" } } else: - validate: { target: userRes } - respond: { status: 200 } create: route: "POST /v1/users" steps: - rbac: { require: write } - validate: target: "contracts.users.components.schemas.CreateUserRequest" source: "$request.body" - query: from: users op: checkEmailUnique args: { 
email: "$request.body.email" } - if: when: "$ctx.unique == false" then: [ { respond: { status: 409, body: { error: "email already exists" } } } ] else: - transform: op: assignId args: { strategy: "uuid" } - respond: status: 201 bodyFrom: "$ctx.newUser" errors: - match: "validation_error" respond: { status: 400, body: { error: "invalid request" } } - match: "rbac_denied" respond: { status: 403, body: { error: "forbidden" } } ``` 說明: - filterAndPaginate、findById、checkEmailUnique、assignId 這些運算可由 Gateway 內建或以 JS 執行器提供(Goja)。 --- ## repo/validators/zod.ts前端 Zod 驗證器) ```ts import { z } from 'zod'; // 共用 export const errorResponseSchema = z.object({ error: z.string() }); // Users export const userSchema = z.object({ id: z.string(), email: z.string().email(), name: z.string() }); export const usersListResponseSchema = z.object({ items: z.array(userSchema), total: z.number().nonnegative() }); // create export const createUserRequestSchema = z.object({ email: z.string().email(), name: z.string().min(1) }); export type User = z.infer<typeof userSchema>; export type UsersListResponse = z.infer<typeof usersListResponseSchema>; export type CreateUserRequest = z.infer<typeof createUserRequestSchema>; ``` ## repo/validators/ajv.ts(若採 AJV) ```ts import Ajv from 'ajv'; export function makeAjv() { const ajv = new Ajv({ allErrors: true, strict: false }); // 可動態載入 contracts/*.yaml 轉為 JSON schema 後 addSchema return ajv; } ``` --- # 最小可執行數據集與單元測試 以下提供三種測試層級的最小樣本。 ## 1) 前端單元測試(Jest/Karma,驗證 artifact 與 Zod 一致) web/src/app/core/artifact/artifact.spec.ts ```ts import { usersListResponseSchema, userSchema } from '../../../../repo/validators/zod'; import data from '../../../../repo/data/users.list.v1.json'; describe('Artifact data schema', () => { it('users.list matches schema', () => { const parsed = usersListResponseSchema.parse(data); expect(parsed.items.length).toBeGreaterThan(0); parsed.items.forEach(u => expect(userSchema.parse(u)).toBeTruthy()); }); }); ``` 說明: - 配置 tsconfig / jest 允許 
import JSON(或使用 raw-loader/vite json import)。 - Angular 預設 Karma 也可運行,按專案測試框架調整。 ## 2) Node 腳本驗證(CI 用) scripts/validate-artifact.mjs ```js import fs from 'node:fs/promises'; import { fileURLToPath } from 'node:url'; import path from 'node:path'; import { z } from 'zod'; const __filename = fileURLToPath(import.meta.url); const __dirname = path.dirname(__filename); const userSchema = z.object({ id: z.string(), email: z.string().email(), name: z.string() }); const listSchema = z.object({ items: z.array(userSchema), total: z.number().nonnegative() }); async function main() { const base = path.join(__dirname, '..', 'repo', 'data'); const list = JSON.parse(await fs.readFile(path.join(base, 'users.list.v1.json'), 'utf-8')); listSchema.parse(list); const detail = JSON.parse(await fs.readFile(path.join(base, 'users.detail.u_1.v1.json'), 'utf-8')); userSchema.parse(detail); console.log('Artifact validation OK'); } main().catch(e => { console.error(e); process.exit(1); }); ``` CI 片段(.github/workflows/ci.yml) ```yaml - uses: actions/setup-node@v4 with: { node-version: '20' } - run: node scripts/validate-artifact.mjs ``` ## 3) Artifact Gateway 的 E2E 測試(Go,使用 httptest) cmd/artifact-gateway/main_test.go(示意) ```go package main import ( "net/http" "net/http/httptest" "os" "path/filepath" "testing" "github.com/gin-gonic/gin" ) func TestMockListUsers(t *testing.T) { gin.SetMode(gin.TestMode) repoBase := filepath.Join("..", "..", "repo") os.Setenv("REPO_BASE", repoBase) r := gin.Default() r.Static("/repo", repoBase) r.Any("/mock/*any", func(c *gin.Context) { ep, err := matchEndpoint(repoBase, c.Request.Method, c.Param("any")[1:]) if err != nil { c.JSON(http.StatusNotFound, gin.H{"error": err.Error()}) return } status := 200 if s, ok := ep.Response["status"].(float64); ok { status = int(s) } body := ep.Response["body"] c.JSON(status, body) }) w := httptest.NewRecorder() req := httptest.NewRequest("GET", "/mock/v1/users?page=1&size=10", nil) r.ServeHTTP(w, req) if w.Code != 
http.StatusOK { t.Fatalf("status: %d, body: %s", w.Code, w.Body.String()) } if got := w.Body.String(); len(got) == 0 { t.Fatalf("empty body") } } ``` --- # 開發與啟動提示 - 本地開發 - 啟動 Artifact Gateway:`make gw.run`(或 docker-compose 啟動) - 前端以環境開關使用 `/mock` 或 `/repo`: - `/mock/v1/users` -> 根據 repo/api/ 與 flows 返回 - `/repo/data/users.list.v1.json` -> 直接讀靜態資料 - Docker 組合(在 docker-compose.yml 已示範) - Web 靜態暴露 /repo,並代理 `/mock` 至 Gateway --- # 後續延伸 - 在 Gateway 增加 flows 運算子實作:filterAndPaginate、findById、checkEmailUnique、assignId(可用 Goja 執行 TS/JS)。 - 以 js-yaml 解析 contracts/*.yaml,附加 request/response schema 驗證與 `x-role` RBAC 檢查。 - 提供 CLI:`gw validate` 檢查 repo 完整性(registry 對應檔案存在、$ref 可解析、schema 驗證通過)。 以上檔案可直接放入專案對應路徑,配合前文提供的 Gateway 與 Angular 設定即可起跑。
10-28
import osimport numpy as npimport pandas as pdfrom tqdm import tqdmimport jsonfrom collections import defaultdict def read_xyz_file(file_path): """ 读取XYZ格式的分子结构文件 返回原子类型列表和坐标数组 """ try: with open(file_path, 'r') as f: lines = f.readlines() # 跳过空行 lines = [line.strip() for line in lines if line.strip()] if len(lines) < 2: raise ValueError(f"文件 {file_path} 内容不足") num_atoms = int(lines[0]) atoms = [] coords = [] for i in range(2, 2 + num_atoms): if i >= len(lines): raise ValueError(f"文件 {file_path} 原子数量与声明不符") parts = lines[i].split() if len(parts) < 4: continue atom = parts[0] try: x, y, z = map(float, parts[1:4]) atoms.append(atom) coords.append([x, y, z]) except ValueError: continue return atoms, np.array(coords) except Exception as e: print(f"读取文件 {file_path} 时出错: {e}") return [], np.array([]) def extract_basic_features(atoms, coords): """ 从分子结构中提取基本特征 """ if len(atoms) == 0: return {} # 计算质心 centroid = np.mean(coords, axis=0) # 计算原子间距离矩阵 n_atoms = len(atoms) dist_matrix = np.zeros((n_atoms, n_atoms)) for i in range(n_atoms): for j in range(i+1, n_atoms): dist = np.linalg.norm(coords[i] - coords[j]) dist_matrix[i, j] = dist dist_matrix[j, i] = dist # 简单统计特征 features = { 'num_atoms': n_atoms, 'centroid': centroid.tolist(), 'max_distance': np.max(dist_matrix), 'min_distance': np.min(dist_matrix[np.nonzero(dist_matrix)]) if n_atoms > 1 else 0, 'element_counts': {elem: atoms.count(elem) for elem in set(atoms)} } return features def process_reaction_folder(reaction_folder): """ 处理单个反应文件夹的数据 """ # 查找文件夹中的XYZ文件 xyz_files = [f for f in os.listdir(reaction_folder) if f.endswith('.xyz')] # 确定文件类型 reactant_file = None ts_file = None product_file = None for f in xyz_files: if 'reactant' in f.lower() or 'r_' in f.lower(): reactant_file = os.path.join(reaction_folder, f) elif 'ts' in f.lower() or 'transition' in f.lower(): ts_file = os.path.join(reaction_folder, f) elif 'product' in f.lower() or 'p_' in f.lower(): product_file = os.path.join(reaction_folder, f) # 
如果无法通过文件名判断,尝试按顺序分配 if not all([reactant_file, ts_file, product_file]) and len(xyz_files) >= 3: xyz_files.sort() reactant_file, ts_file, product_file = [os.path.join(reaction_folder, f) for f in xyz_files[:3]] # 读取结构文件 reactant_atoms, reactant_coords = read_xyz_file(reactant_file) if reactant_file else ([], []) ts_atoms, ts_coords = read_xyz_file(ts_file) if ts_file else ([], []) product_atoms, product_coords = read_xyz_file(product_file) if product_file else ([], []) # 检查数据有效性 if len(reactant_atoms) == 0 or len(ts_atoms) == 0 or len(product_atoms) == 0: return None # 提取特征 reactant_features = extract_basic_features(reactant_atoms, reactant_coords) ts_features = extract_basic_features(ts_atoms, ts_coords) product_features = extract_basic_features(product_atoms, product_coords) # 计算反应物与过渡态之间的位移 # 注意:根据赛题说明,原子顺序已经对齐 displacement = ts_coords - reactant_coords return { 'reaction_id': os.path.basename(reaction_folder), 'reactant': { 'atoms': reactant_atoms, 'coords': reactant_coords, 'features': reactant_features }, 'ts': { 'atoms': ts_atoms, 'coords': ts_coords, 'features': ts_features }, 'product': { 'atoms': product_atoms, 'coords': product_coords, 'features': product_features }, 'displacement': displacement } def explore_dataset(data_root): """ 探索数据集,统计基本信息 """ # 获取所有反应文件夹 reaction_folders = [os.path.join(data_root, d) for d in os.listdir(data_root) if os.path.isdir(os.path.join(data_root, d))] print(f"找到 {len(reaction_folders)} 个反应文件夹") # 处理每个反应 reactions_data = [] element_counts = defaultdict(int) atom_counts = [] failed_reactions = [] for folder in tqdm(reaction_folders, desc="处理反应数据"): try: reaction_data = process_reaction_folder(folder) if reaction_data: reactions_data.append(reaction_data) # 统计元素出现次数 for atom in reaction_data['reactant']['atoms']: element_counts[atom] += 1 # 统计原子数量 atom_counts.append(len(reaction_data['reactant']['atoms'])) else: failed_reactions.append(folder) except Exception as e: print(f"处理反应 {folder} 时出错: {e}") 
failed_reactions.append(folder) # 数据集统计信息 dataset_info = { 'total_reactions': len(reactions_data), 'failed_reactions': len(failed_reactions), 'elements': dict(element_counts), 'avg_atoms': np.mean(atom_counts) if atom_counts else 0, 'min_atoms': min(atom_counts) if atom_counts else 0, 'max_atoms': max(atom_counts) if atom_counts else 0, 'sample_reactions': reactions_data[:3] if reactions_data else [] # 保存前3个反应作为样本 } return dataset_info, reactions_data def save_dataset_info(dataset_info, output_path): """保存数据集信息到JSON文件""" # 将numpy数组转换为列表,以便JSON序列化 def convert_numpy(obj): if isinstance(obj, np.ndarray): return obj.tolist() elif isinstance(obj, np.integer): return int(obj) elif isinstance(obj, np.floating): return float(obj) return obj with open(output_path, 'w') as f: json.dump(dataset_info, f, indent=2, default=convert_numpy) def main(): # 设置数据路径 - 根据你的实际路径修改 train_data_path = "train_data" # 确保数据路径存在 if not os.path.exists(train_data_path): print(f"错误: 数据路径 '{train_data_path}' 不存在") return # 探索数据集 print("开始探索数据集...") dataset_info, reactions_data = explore_dataset(train_data_path) # 打印数据集信息 print(f"\n数据集统计:") print(f"成功处理反应: {dataset_info['total_reactions']}") print(f"失败反应: {dataset_info['failed_reactions']}") print(f"元素分布: {dataset_info['elements']}") print(f"平均原子数: {dataset_info['avg_atoms']:.2f}") print(f"最小原子数: {dataset_info['min_atoms']}") print(f"最大原子数: {dataset_info['max_atoms']}") # 保存数据集信息 save_dataset_info(dataset_info, 'dataset_info.json') print("数据集信息已保存到 dataset_info.json") # 保存处理后的数据(可选) if reactions_data: # 你可以选择保存处理后的数据供后续使用 np.save('processed_reactions.npy', reactions_data) print("处理后的反应数据已保存到 processed_reactions.npy") if __name__ == "__main__": main() In [ ] import numpy as npimport pandas as pdimport matplotlib.pyplot as pltimport seaborn as snsfrom collections import Counterimport json # 设置中文字体支持(如果需要) plt.rcParams['font.sans-serif'] = ['SimHei'] # 用来正常显示中文标签 plt.rcParams['axes.unicode_minus'] = False # 用来正常显示负号 # 加载之前保存的数据集信息with 
open('dataset_info.json', 'r') as f: dataset_info = json.load(f) print("数据集详细信息:")print(f"总反应数: {dataset_info['total_reactions']}")print(f"元素分布: O(氧):{dataset_info['elements']['O']}, C(碳):{dataset_info['elements']['C']}, H(氢):{dataset_info['elements']['H']}, N(氮):{dataset_info['elements']['N']}")print(f"平均原子数: {dataset_info['avg_atoms']:.2f}")print(f"原子数范围: {dataset_info['min_atoms']} - {dataset_info['max_atoms']}") corrected_elements = { 'O': dataset_info['elements']['O'], 'C': dataset_info['elements']['C'], 'H': dataset_info['elements']['H'], 'N': dataset_info['elements']['N'] } # 可视化元素分布 plt.figure(figsize=(10, 6)) plt.bar(corrected_elements.keys(), corrected_elements.values()) plt.title('元素分布') plt.xlabel('元素') plt.ylabel('出现次数') plt.tight_layout() plt.savefig('element_distribution.png') plt.show() # 加载处理后的反应数据try: reactions_data = np.load('processed_reactions.npy', allow_pickle=True) print(f"成功加载 {len(reactions_data)} 个反应的数据") # 分析反应类型 reaction_types = [] for reaction in reactions_data[:100]: # 只分析前100个反应以节省时间 reactant_atoms = reaction['reactant']['atoms'] product_atoms = reaction['product']['atoms'] # 简单的反应类型判断(基于原子数量变化) atom_diff = len(product_atoms) - len(reactant_atoms) if atom_diff > 0: reaction_types.append('加成反应') elif atom_diff < 0: reaction_types.append('消除反应') else: reaction_types.append('重排反应') # 统计反应类型分布 type_counts = Counter(reaction_types) print("反应类型分布(基于原子数变化):") for rtype, count in type_counts.items(): print(f"{rtype}: {count}") # 可视化反应类型分布 plt.figure(figsize=(8, 6)) plt.pie(type_counts.values(), labels=type_counts.keys(), autopct='%1.1f%%') plt.title('反应类型分布') plt.savefig('reaction_type_distribution.png') plt.show() except FileNotFoundError: print("未找到 processed_reactions.npy 文件") reactions_data = [] # 创建分子描述符函数def create_molecular_descriptors(atoms, coords): """创建分子描述符""" if len(atoms) == 0: return {} # 原子质量字典 atomic_masses = { 'H': 1.008, 'C': 12.011, 'N': 14.007, 'O': 15.999, 'F': 18.998, 'P': 30.974, 'S': 32.065, 'Cl': 35.453 } # 计算分子质量 
mass = sum(atomic_masses.get(atom, 12.0) for atom in atoms) # 计算质心 centroid = np.mean(coords, axis=0) # 计算惯性矩 centered_coords = coords - centroid inertia_tensor = np.zeros((3, 3)) for i in range(len(atoms)): x, y, z = centered_coords[i] mass_i = atomic_masses.get(atoms[i], 12.0) inertia_tensor[0, 0] += mass_i * (y**2 + z**2) inertia_tensor[1, 1] += mass_i * (x**2 + z**2) inertia_tensor[2, 2] += mass_i * (x**2 + y**2) inertia_tensor[0, 1] -= mass_i * x * y inertia_tensor[0, 2] -= mass_i * x * z inertia_tensor[1, 2] -= mass_i * y * z inertia_tensor[1, 0] = inertia_tensor[0, 1] inertia_tensor[2, 0] = inertia_tensor[0, 2] inertia_tensor[2, 1] = inertia_tensor[1, 2] # 计算惯性矩的主值 principal_moments = np.linalg.eigvalsh(inertia_tensor) # 计算回转半径 gyration_radius = np.sqrt(np.sum(principal_moments) / mass) if mass > 0 else 0 return { 'molecular_mass': mass, 'centroid': centroid.tolist(), 'principal_moments': principal_moments.tolist(), 'gyration_radius': gyration_radius } # 创建特征数据集if len(reactions_data) > 0: features_list = [] for i, reaction in enumerate(reactions_data[:500]): # 只处理前500个反应以节省时间 if i % 100 == 0: print(f"处理第 {i} 个反应...") reaction_id = reaction['reaction_id'] # 反应物特征 reactant_features = create_molecular_descriptors( reaction['reactant']['atoms'], reaction['reactant']['coords'] ) # 产物特征 product_features = create_molecular_descriptors( reaction['product']['atoms'], reaction['product']['coords'] ) # 过渡态特征 ts_features = create_molecular_descriptors( reaction['ts']['atoms'], reaction['ts']['coords'] ) # 计算反应物和产物之间的差异 mass_change = product_features['molecular_mass'] - reactant_features['molecular_mass'] gyration_change = product_features['gyration_radius'] - reactant_features['gyration_radius'] features = { 'reaction_id': reaction_id, 'num_atoms': len(reaction['reactant']['atoms']), 'reactant_mass': reactant_features['molecular_mass'], 'product_mass': product_features['molecular_mass'], 'ts_mass': ts_features['molecular_mass'], 'mass_change': mass_change, 
'reactant_gyration': reactant_features['gyration_radius'], 'product_gyration': product_features['gyration_radius'], 'ts_gyration': ts_features['gyration_radius'], 'gyration_change': gyration_change } features_list.append(features) # 创建DataFrame并保存 feature_df = pd.DataFrame(features_list) feature_df.to_csv('molecular_features.csv', index=False) print("分子特征已保存到 molecular_features.csv") # 可视化特征分布 fig, axes = plt.subplots(2, 2, figsize=(12, 10)) axes[0, 0].hist(feature_df['num_atoms'], bins=20, alpha=0.7, color='skyblue') axes[0, 0].set_title('原子数分布') axes[0, 0].set_xlabel('原子数') axes[0, 0].set_ylabel('频率') axes[0, 1].hist(feature_df['reactant_mass'], bins=20, alpha=0.7, color='lightgreen') axes[0, 1].set_title('反应物分子质量分布') axes[0, 1].set_xlabel('分子质量') axes[0, 1].set_ylabel('频率') axes[1, 0].hist(feature_df['mass_change'], bins=20, alpha=0.7, color='lightcoral') axes[1, 0].set_title('质量变化分布') axes[1, 0].set_xlabel('质量变化') axes[1, 0].set_ylabel('频率') axes[1, 1].hist(feature_df['gyration_change'], bins=20, alpha=0.7, color='gold') axes[1, 1].set_title('回转半径变化分布') axes[1, 1].set_xlabel('回转半径变化') axes[1, 1].set_ylabel('频率') plt.tight_layout() plt.savefig('feature_distributions.png') plt.show() # 计算特征之间的相关性 correlation_matrix = feature_df[['num_atoms', 'reactant_mass', 'product_mass', 'mass_change', 'reactant_gyration', 'product_gyration', 'gyration_change']].corr() plt.figure(figsize=(10, 8)) sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0) plt.title('特征相关性矩阵') plt.tight_layout() plt.savefig('feature_correlation.png') plt.show() print("\n下一步建议:")print("1. 基于特征数据集探索化学反应模式")print("2. 开始构建机器学习模型")print("3. 
考虑使用图神经网络(GNN)处理分子结构") In [ ] import numpy as npimport pandas as pdfrom sklearn.ensemble import RandomForestRegressorfrom sklearn.model_selection import train_test_splitfrom sklearn.metrics import mean_squared_errorfrom sklearn.preprocessing import StandardScaler # 加载特征数据try: feature_df = pd.read_csv('molecular_features.csv') print(f"成功加载 {len(feature_df)} 个反应的特征数据") # 准备特征和目标变量 # 这里我们使用简单的特征来预测过渡态的回转半径 # 在实际应用中,你需要预测原子坐标,这需要更复杂的模型 X = feature_df[['num_atoms', 'reactant_mass', 'product_mass', 'reactant_gyration', 'product_gyration']].values y = feature_df['ts_gyration'].values # 数据标准化 scaler = StandardScaler() X_scaled = scaler.fit_transform(X) # 划分训练集和测试集 X_train, X_test, y_train, y_test = train_test_split( X_scaled, y, test_size=0.2, random_state=42 ) # 训练随机森林模型 print("训练随机森林模型...") model = RandomForestRegressor(n_estimators=100, random_state=42) model.fit(X_train, y_train) # 评估模型 y_pred = model.predict(X_test) mse = mean_squared_error(y_test, y_pred) rmse = np.sqrt(mse) print(f"模型性能:") print(f"均方根误差(RMSE): {rmse:.4f}") print(f"平均回转半径: {np.mean(y):.4f}") print(f"RMSE/平均值: {rmse/np.mean(y)*100:.2f}%") # 特征重要性 feature_importance = pd.DataFrame({ 'feature': ['num_atoms', 'reactant_mass', 'product_mass', 'reactant_gyration', 'product_gyration'], 'importance': model.feature_importances_ }).sort_values('importance', ascending=False) print("\n特征重要性:") print(feature_importance) except FileNotFoundError: print("未找到 molecular_features.csv 文件,请先运行数据分析脚本") print("\n下一步建议:")print("1. 这个基线模型只是预测了一个简单的物理量")print("2. 实际任务需要预测完整的3D分子结构")print("3. 
考虑使用图神经网络(GNN)或生成模型来处理这个任务") In [ ] import paddleimport paddle.nn as nnimport paddle.nn.functional as Fimport numpy as npimport matplotlib.pyplot as pltfrom tqdm import tqdmimport osimport json # 设置随机种子以确保可重复性 paddle.seed(42) np.random.seed(42) # 定义原子类型到索引的映射 ATOM_MAPPING = {'H': 0, 'C': 1, 'N': 2, 'O': 3} NUM_ATOM_TYPES = len(ATOM_MAPPING) class MolecularFeatureExtractor(nn.Layer): """分子特征提取器 - 不使用图结构""" def __init__(self, input_dim, hidden_dim=64): super(MolecularFeatureExtractor, self).__init__() self.linear1 = nn.Linear(input_dim, hidden_dim) self.linear2 = nn.Linear(hidden_dim, hidden_dim) self.bn = nn.BatchNorm1D(hidden_dim) def forward(self, x): x = F.relu(self.linear1(x)) x = self.bn(x) x = F.relu(self.linear2(x)) # 全局平均池化 x = paddle.mean(x, axis=1, keepdim=False) return x class TransitionStatePredictor(nn.Layer): """基于PaddlePaddle的过渡态预测模型 - 不使用图结构""" def __init__(self, feature_dim, hidden_dim=64, output_dim=3, max_atoms=23): super(TransitionStatePredictor, self).__init__() self.max_atoms = max_atoms # 反应物和产物的特征提取器 self.reactant_encoder = MolecularFeatureExtractor(feature_dim, hidden_dim) self.product_encoder = MolecularFeatureExtractor(feature_dim, hidden_dim) # 融合层 self.fusion_linear1 = nn.Linear(hidden_dim * 2, hidden_dim) self.fusion_linear2 = nn.Linear(hidden_dim, hidden_dim) # 坐标预测层 self.coord_predictor = nn.Linear(hidden_dim, output_dim * max_atoms) def forward(self, reactant_feat, product_feat): # 编码反应物 rx = self.reactant_encoder(reactant_feat) # 编码产物 px = self.product_encoder(product_feat) # 融合反应物和产物信息 # 修正:使用paddle.concat而不是pandas.concat,并且axis=1 fused = paddle.concat([rx, px], axis=1) fused = F.relu(self.fusion_linear1(fused)) fused = F.relu(self.fusion_linear2(fused)) # 预测坐标位移 coord_shift = self.coord_predictor(fused) coord_shift = coord_shift.reshape([-1, self.max_atoms, 3]) return coord_shift def create_molecular_features(atoms, coords, max_atoms=23): """创建分子特征向量""" num_nodes = min(len(atoms), max_atoms) # 创建节点特征 (原子类型 + 坐标) node_features = [] 
for i in range(num_nodes): atom = atoms[i] coord = coords[i] # 原子类型 one-hot 编码 atom_idx = ATOM_MAPPING.get(atom, 0) atom_one_hot = [0] * NUM_ATOM_TYPES atom_one_hot[atom_idx] = 1 # 组合特征: 原子类型 + 坐标 node_feature = atom_one_hot + coord.tolist() node_features.append(node_feature) # 如果原子数少于最大值,填充零向量 if num_nodes < max_atoms: for _ in range(max_atoms - num_nodes): node_features.append([0] * (NUM_ATOM_TYPES + 3)) # 展平为单个向量 flattened_features = np.array(node_features).flatten() return flattened_features def prepare_paddle_data(reactions_data, max_atoms=23): """准备PaddlePaddle数据""" reactant_features = [] product_features = [] displacements = [] for reaction in tqdm(reactions_data, desc="准备分子特征数据"): # 处理反应物 reactant_atoms = reaction['reactant']['atoms'] reactant_coords = reaction['reactant']['coords'] reactant_feat = create_molecular_features(reactant_atoms, reactant_coords, max_atoms) # 处理产物 product_atoms = reaction['product']['atoms'] product_coords = reaction['product']['coords'] product_feat = create_molecular_features(product_atoms, product_coords, max_atoms) # 获取位移(反应物到过渡态) displacement = reaction['displacement'] # 如果原子数少于最大值,填充零 if displacement.shape[0] < max_atoms: padding = np.zeros((max_atoms - displacement.shape[0], 3)) displacement = np.vstack([displacement, padding]) reactant_features.append(reactant_feat) product_features.append(product_feat) displacements.append(displacement.flatten().astype('float32')) # 转换为Paddle张量 reactant_features = paddle.to_tensor(np.array(reactant_features, dtype='float32')) product_features = paddle.to_tensor(np.array(product_features, dtype='float32')) displacements = paddle.to_tensor(np.array(displacements, dtype='float32')) return reactant_features, product_features, displacements def train_paddle_model(model, data, num_epochs=50, learning_rate=0.001): """训练PaddlePaddle模型""" reactant_features, product_features, displacements = data # 定义优化器和损失函数 optimizer = paddle.optimizer.Adam(parameters=model.parameters(), 
learning_rate=learning_rate) criterion = nn.MSELoss() train_losses = [] # 创建数据集 dataset = [(reactant_features[i], product_features[i], displacements[i]) for i in range(len(reactant_features))] for epoch in range(num_epochs): model.train() total_loss = 0 for i in tqdm(range(len(dataset)), desc=f"Epoch {epoch+1}/{num_epochs}"): rx, px, disp = dataset[i] # 前向传播 pred_displacement = model(rx.unsqueeze(0), px.unsqueeze(0)) # 计算损失 loss = criterion(pred_displacement, disp.reshape(pred_displacement.shape)) # 反向传播 loss.backward() optimizer.step() optimizer.clear_grad() total_loss += loss.numpy()[0] avg_loss = total_loss / len(dataset) train_losses.append(avg_loss) print(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.6f}") return model, train_losses # 主程序if __name__ == "__main__": # 加载处理后的反应数据 try: reactions_data = np.load('processed_reactions.npy', allow_pickle=True) print(f"成功加载 {len(reactions_data)} 个反应的数据") # 只使用前100个反应进行演示(减少计算时间) sample_reactions = reactions_data[:100] # 准备数据 max_atoms = 23 data = prepare_paddle_data(sample_reactions, max_atoms) # 初始化模型 feature_dim = max_atoms * (NUM_ATOM_TYPES + 3) # 每个分子的特征维度 model = TransitionStatePredictor(feature_dim, hidden_dim=64, output_dim=3, max_atoms=max_atoms) # 训练模型 print("开始训练PaddlePaddle模型...") trained_model, train_losses = train_paddle_model( model, data, num_epochs=10, learning_rate=0.001 ) # 保存模型 paddle.save(trained_model.state_dict(), 'ts_predictor_paddle.pdparams') print("模型已保存到 ts_predictor_paddle.pdparams") # 绘制训练损失曲线 plt.figure(figsize=(10, 6)) plt.plot(train_losses) plt.title('训练损失曲线') plt.xlabel('Epoch') plt.ylabel('Loss') plt.savefig('training_loss_paddle.png') plt.show() except FileNotFoundError: print("未找到 processed_reactions.npy 文件,请先运行数据预处理脚本") In [ ] import paddleimport paddle.nn as nnimport numpy as npimport osimport json # 简单的线性插值模型class LinearInterpolationModel: """简单的线性插值模型 - 作为基线""" def predict(self, reactant_coords, product_coords): """ 使用简单的线性插值预测过渡态 假设过渡态位于反应物和产物的中间位置 """ return (reactant_coords 
+ product_coords) / 2 # 主程序if __name__ == "__main__": # 加载处理后的反应数据 try: reactions_data = np.load('processed_reactions.npy', allow_pickle=True) print(f"成功加载 {len(reactions_data)} 个反应的数据") # 使用简单的线性插值模型 model = LinearInterpolationModel() # 评估模型 total_error = 0 count = 0 for reaction in reactions_data[:100]: # 只评估前100个反应 reactant_coords = reaction['reactant']['coords'] product_coords = reaction['product']['coords'] true_ts_coords = reaction['ts']['coords'] # 预测过渡态 pred_ts_coords = model.predict(reactant_coords, product_coords) # 计算误差 error = np.sqrt(np.mean((true_ts_coords - pred_ts_coords)**2)) total_error += error count += 1 if count % 10 == 0: print(f"反应 {count}, RMSE: {error:.4f} Å") avg_error = total_error / count print(f"平均RMSE: {avg_error:.4f} Å") # 保存简单的预测结果 predictions = [] for reaction in reactions_data[:100]: reactant_coords = reaction['reactant']['coords'] product_coords = reaction['product']['coords'] pred_ts_coords = model.predict(reactant_coords, product_coords) # 格式化预测结果 coords_list = [] for i, atom in enumerate(reaction['reactant']['atoms']): coords_list.append([atom, float(pred_ts_coords[i, 0]), float(pred_ts_coords[i, 1]), float(pred_ts_coords[i, 2])]) predictions.append({ "reaction_id": reaction['reaction_id'], "coordinates": json.dumps(coords_list) }) # 保存为CSV文件 import csv with open('simple_predictions.csv', 'w', newline='') as csvfile: fieldnames = ['reaction_id', 'coordinates'] writer = csv.DictWriter(csvfile, fieldnames=fieldnames) writer.writeheader() for pred in predictions: writer.writerow(pred) print("简单预测结果已保存到 simple_predictions.csv") # 生成XYZ文件 os.makedirs('ts_predictions', exist_ok=True) for reaction in reactions_data[:100]: reactant_coords = reaction['reactant']['coords'] product_coords = reaction['product']['coords'] pred_ts_coords = model.predict(reactant_coords, product_coords) # 保存为XYZ格式 xyz_file = os.path.join('ts_predictions', f"{reaction['reaction_id']}.xyz") with open(xyz_file, 'w') as f: 
f.write(f"{len(reaction['reactant']['atoms'])}\n") f.write(f"Predicted transition state for {reaction['reaction_id']}\n") for i, atom in enumerate(reaction['reactant']['atoms']): f.write(f"{atom} {pred_ts_coords[i, 0]:.6f} {pred_ts_coords[i, 1]:.6f} {pred_ts_coords[i, 2]:.6f}\n") # 压缩XYZ文件 import zipfile with zipfile.ZipFile("ts_predictions.zip", 'w') as zipf: for root, dirs, files in os.walk('ts_predictions'): for file in files: if file.endswith('.xyz'): zipf.write(os.path.join(root, file), file) print("XYZ文件已压缩为 ts_predictions.zip") except FileNotFoundError: print("未找到 processed_reactions.npy 文件,请先运行数据预处理脚本") In [12] import subprocessimport pandas as pd # 检查评估脚本是否存在if os.path.exists('get_score.py'): try: # 运行评估脚本 result = subprocess.run(['python', 'get_score.py', 'simple_predictions.csv'], capture_output=True, text=True) print("评估脚本输出:") print("STDOUT:", result.stdout) print("STDERR:", result.stderr) print("返回码:", result.returncode) # 检查是否生成了result.txt if os.path.exists('result.txt'): with open('result.txt', 'r') as f: result_content = f.read() print("result.txt内容:") print(result_content) except Exception as e: print(f"运行评估脚本时出错: {e}")else: print("未找到 get_score.py 评估脚本") 评估脚本输出: STDOUT: 评测结果已保存至: /home/aistudio/result.txt STDERR: 返回码: 0 result.txt内容: 评测结果: 评测失败: 提交文件不存在: simple_predictions.csv 状态码: 1 ================================================== 原始JSON数据: { "score": 0.0, "errorMsg": "评测失败: 提交文件不存在: simple_predictions.csv", "code": 1, "data": [ { "score": 0 } ] } In [14] import osimport numpy as npimport jsonimport csvimport zipfilefrom tqdm import tqdm # 重新定义读取XYZ文件的函数,确保可用def read_xyz_file(file_path): """读取XYZ格式的分子结构文件""" try: with open(file_path, 'r') as f: lines = f.readlines() # 跳过空行 lines = [line.strip() for line in lines if line.strip()] if len(lines) < 2: raise ValueError(f"文件 {file_path} 内容不足") num_atoms = int(lines[0]) atoms = [] coords = [] for i in range(2, 2 + num_atoms): if i >= len(lines): raise ValueError(f"文件 {file_path} 原子数量与声明不符") parts = 
lines[i].split() if len(parts) < 4: continue atom = parts[0] try: x, y, z = map(float, parts[1:4]) atoms.append(atom) coords.append([x, y, z]) except ValueError: continue return atoms, np.array(coords) except Exception as e: print(f"读取文件 {file_path} 时出错: {e}") return [], np.array([]) # 线性插值模型class LinearInterpolationModel: """简单的线性插值模型 - 作为基线""" def predict(self, reactant_coords, product_coords): """ 使用简单的线性插值预测过渡态 假设过渡态位于反应物和产物的中间位置 """ return (reactant_coords + product_coords) / 2 def generate_predictions_for_test_set(test_data_path, output_csv="pre.csv"): """为测试集生成预测""" # 检查测试集路径是否存在 if not os.path.exists(test_data_path): print(f"错误: 测试集路径 '{test_data_path}' 不存在") return None # 获取所有测试反应文件夹 test_folders = [f for f in os.listdir(test_data_path) if os.path.isdir(os.path.join(test_data_path, f))] # 确保有500个测试反应 if len(test_folders) != 500: print(f"警告: 测试集包含 {len(test_folders)} 个反应,但预期有 500 个") print(f"找到 {len(test_folders)} 个测试反应") # 使用线性插值模型 model = LinearInterpolationModel() predictions = [] # 处理每个测试反应 for folder in tqdm(test_folders, desc="生成测试集预测"): reaction_id = folder # 直接使用文件夹名作为反应ID try: # 尝试不同的文件命名约定 reactant_files = [ os.path.join(test_data_path, folder, "reactant.xyz"), os.path.join(test_data_path, folder, "r.xyz"), os.path.join(test_data_path, folder, "R.xyz") ] product_files = [ os.path.join(test_data_path, folder, "product.xyz"), os.path.join(test_data_path, folder, "p.xyz"), os.path.join(test_data_path, folder, "P.xyz") ] # 找到实际存在的反应物文件 reactant_file = None for f in reactant_files: if os.path.exists(f): reactant_file = f break # 找到实际存在的产物文件 product_file = None for f in product_files: if os.path.exists(f): product_file = f break if not reactant_file or not product_file: print(f"警告: 反应 {reaction_id} 缺少反应物或产物文件") # 添加空预测 predictions.append({ "reaction_id": reaction_id, "coordinates": "[]" }) continue # 读取反应物和产物结构 reactant_atoms, reactant_coords = read_xyz_file(reactant_file) product_atoms, product_coords = read_xyz_file(product_file) # 确保反应物和产物原子数相同 if 
len(reactant_atoms) != len(product_atoms): print(f"警告: 反应 {reaction_id} 的反应物和产物原子数不同") # 使用最小原子数 min_atoms = min(len(reactant_atoms), len(product_atoms)) reactant_atoms = reactant_atoms[:min_atoms] reactant_coords = reactant_coords[:min_atoms] product_atoms = product_atoms[:min_atoms] product_coords = product_coords[:min_atoms] # 预测过渡态 pred_ts_coords = model.predict(reactant_coords, product_coords) # 格式化坐标数据为JSON字符串 - 修复JSON格式 coords_list = [] for i, atom in enumerate(reactant_atoms): coords_list.append([ atom, float(pred_ts_coords[i, 0]), float(pred_ts_coords[i, 1]), float(pred_ts_coords[i, 2]) ]) # 将坐标列表转换为JSON字符串,确保格式正确 coords_json = json.dumps(coords_list) # 添加到预测结果 predictions.append({ "reaction_id": reaction_id, "coordinates": coords_json }) except Exception as e: print(f"处理反应 {reaction_id} 时出错: {e}") # 添加空预测 predictions.append({ "reaction_id": reaction_id, "coordinates": "[]" }) # 按反应ID排序预测结果 predictions.sort(key=lambda x: x["reaction_id"]) # 保存为CSV文件 with open(output_csv, 'w', newline='', encoding='utf-8') as csvfile: fieldnames = ['reaction_id', 'coordinates'] writer = csv.DictWriter(csvfile, fieldnames=fieldnames) writer.writeheader() for pred in predictions: writer.writerow(pred) print(f"预测结果已保存到 {output_csv}") print(f"共生成 {len(predictions)} 个预测") return output_csv def generate_xyz_files_for_test_set(test_data_path, output_dir="ts_predictions"): """为测试集生成XYZ文件""" # 检查测试集路径是否存在 if not os.path.exists(test_data_path): print(f"错误: 测试集路径 '{test_data_path}' 不存在") return None # 获取所有测试反应文件夹 test_folders = [f for f in os.listdir(test_data_path) if os.path.isdir(os.path.join(test_data_path, f))] print(f"找到 {len(test_folders)} 个测试反应") # 使用线性插值模型 model = LinearInterpolationModel() # 创建输出目录 os.makedirs(output_dir, exist_ok=True) # 处理每个测试反应 for folder in tqdm(test_folders, desc="生成XYZ文件"): reaction_id = folder # 直接使用文件夹名作为反应ID try: # 尝试不同的文件命名约定 reactant_files = [ os.path.join(test_data_path, folder, "reactant.xyz"), os.path.join(test_data_path, folder, "r.xyz"), 
os.path.join(test_data_path, folder, "R.xyz") ] product_files = [ os.path.join(test_data_path, folder, "product.xyz"), os.path.join(test_data_path, folder, "p.xyz"), os.path.join(test_data_path, folder, "P.xyz") ] # 找到实际存在的反应物文件 reactant_file = None for f in reactant_files: if os.path.exists(f): reactant_file = f break # 找到实际存在的产物文件 product_file = None for f in product_files: if os.path.exists(f): product_file = f break if not reactant_file or not product_file: print(f"警告: 反应 {reaction_id} 缺少反应物或产物文件") continue # 读取反应物和产物结构 reactant_atoms, reactant_coords = read_xyz_file(reactant_file) product_atoms, product_coords = read_xyz_file(product_file) # 确保反应物和产物原子数相同 if len(reactant_atoms) != len(product_atoms): print(f"警告: 反应 {reaction_id} 的反应物和产物原子数不同") # 使用最小原子数 min_atoms = min(len(reactant_atoms), len(product_atoms)) reactant_atoms = reactant_atoms[:min_atoms] reactant_coords = reactant_coords[:min_atoms] product_atoms = product_atoms[:min_atoms] product_coords = product_coords[:min_atoms] # 预测过渡态 pred_ts_coords = model.predict(reactant_coords, product_coords) # 为每个反应创建子文件夹 reaction_dir = os.path.join(output_dir, reaction_id) os.makedirs(reaction_dir, exist_ok=True) # 保存为XYZ格式 xyz_file = os.path.join(reaction_dir, "ts.xyz") with open(xyz_file, 'w') as f: f.write(f"{len(reactant_atoms)}\n") f.write(f"Predicted transition state for {reaction_id}\n") for i, atom in enumerate(reactant_atoms): f.write(f"{atom} {pred_ts_coords[i, 0]:.6f} {pred_ts_coords[i, 1]:.6f} {pred_ts_coords[i, 2]:.6f}\n") except Exception as e: print(f"处理反应 {reaction_id} 时出错: {e}") # 压缩XYZ文件 zip_path = "ts_predictions.zip" with zipfile.ZipFile(zip_path, 'w') as zipf: for root, dirs, files in os.walk(output_dir): for file in files: if file.endswith('.xyz'): file_path = os.path.join(root, file) # 在ZIP中保持目录结构 arcname = os.path.relpath(file_path, output_dir) zipf.write(file_path, arcname) print(f"XYZ文件已压缩为 {zip_path}") return zip_path # 主程序if __name__ == "__main__": # 设置测试集路径 test_data_path = "test_data" 
# 根据实际路径修改 # 生成CSV预测文件 csv_file = generate_predictions_for_test_set(test_data_path) # 生成XYZ文件 zip_file = generate_xyz_files_for_test_set(test_data_path) # 评估预测结果 if csv_file and os.path.exists('get_score.py'): import subprocess print("\n运行评估脚本...") result = subprocess.run(['python', 'get_score.py', csv_file], capture_output=True, text=True) print("评估脚本输出:") print("STDOUT:", result.stdout) print("STDERR:", result.stderr) print("返回码:", result.returncode) # 检查是否生成了result.txt if os.path.exists('result.txt'): with open('result.txt', 'r') as f: result_content = f.read() print("result.txt内容:") print(result_content) print("\n下一步:") print("1. 检查预测文件格式是否正确") print("2. 如果评估通过,提交 CSV 文件和 ZIP 文件") print("3. 如果评估失败,检查错误信息并修正") 写出与此代码相似的代码,使其能在飞桨aistudio中运行成功且能在比赛中提交成功,保留提供代码中对JSON数据的处理
09-10
评论
成就一亿技术人!
拼手气红包6.0元
还能输入1000个字符
 
红包 添加红包
表情包 插入表情
 条评论被折叠 查看
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值