OpenCL C Built-In Functions



//
// OpenCL C Built-In Functions
//

//  Work-Item Functions
uint get_work_dim();
size_t get_global_size(uint dimindex);
size_t get_global_id(uint dimindex);
size_t get_local_size(uint dimindex);
size_t get_local_id(uint dimindex);
size_t get_num_groups(uint dimindex);
size_t get_group_id(uint dimindex);
size_t get_global_offset(uint dimindex);
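
/*
   Example (an illustrative sketch, not part of the original listing): a
   one-dimensional vector add using get_global_id. The kernel name and its
   parameters are assumptions; launch it with one work-item per element.
*/
kernel void vec_add(global const float *a, global const float *b, global float *c)
{
    size_t i = get_global_id(0);   // this work-item's unique index in dimension 0
    c[i] = a[i] + b[i];
}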




// Math Functions
#define M_E        2.71828182845904523536
#define M_LOG2E    1.44269504088896340736
#define M_LOG10E   0.434294481903251827651
#define M_LN2      0.693147180559945309417
#define M_LN10     2.30258509299404568402
#define M_PI       3.14159265358979323846
#define M_PI_2     1.57079632679489661923
#define M_PI_4     0.785398163397448309616
#define M_1_PI     0.318309886183790671538
#define M_2_PI     0.636619772367581343076
#define M_2_SQRTPI 1.12837916709551257390
#define M_SQRT2    1.41421356237309504880
#define M_SQRT1_2  0.707106781186547524401


gentype acos(gentype x);             // Compute the arc cosine of x.
gentype acosh(gentype x);            // Compute the inverse hyperbolic cosine of x.
gentype acospi(gentype x);           // Compute acos(x) / π.
gentype asin(gentype x);             // Compute the arc sine of x.
gentype asinh(gentype x);            // Compute the inverse hyperbolic sine of x.
gentype asinpi(gentype x);           // Compute asin(x) / π.
gentype atan(gentype y_over_x);      // Compute the arc tangent of y_over_x.
gentype atan2(gentype y, gentype x); // Compute the arc tangent of y / x.
gentype atanh(gentype x);            // Compute the hyperbolic arc tangent of x.
gentype atanpi(gentype x);           // Compute atan(x) / π.
gentype atan2pi(gentype y, gentype x); // Compute atan2(y, x) / π.
gentype cbrt(gentype x);             // Compute the cube root of x.
gentype ceil(gentype x);             // Round to an integral value using the round-to-positive-infinity rounding mode.
gentype copysign(gentype x, gentype y); // Returns x with its sign changed to match the sign of y.
gentype cos(gentype x);              // Compute the cosine of x.
gentype cosh(gentype x);             // Compute the hyperbolic cosine of x.
gentype cospi(gentype x);            // Compute cos(πx).
gentype erfc(gentype x);             // Compute the complementary error function 1.0 - erf(x).
gentype erf(gentype x);              // Compute the error function of x.
gentype exp(gentype x);              // Compute the base-e exponential of x.
gentype exp2(gentype x);             // Compute the base-2 exponential of x.
gentype exp10(gentype x);            // Compute the base-10 exponential of x.
gentype expm1(gentype x);            // Compute e^x - 1.0.
gentype fabs(gentype x);             // Compute the absolute value of a floating-point number.
gentype fdim(gentype x, gentype y);  // Returns x - y if x > y, +0 if x is less than or equal to y.
gentype floor(gentype x);            // Round to an integral value using the round-to-negative-infinity rounding mode.
gentype fma(gentype a, gentype b, gentype c);      /* Returns the correctly rounded floating-point representation of the
                                                      sum of c with the infinitely precise product of a and b. Rounding of
                                                      intermediate products does not occur. Edge case behavior is per the
                                                      IEEE 754-2008 standard. */
gentype fmax(gentype x, gentype y);
gentypef fmax(gentypef x, float y);
gentyped fmax(gentyped x, double y);  /* Returns y if x < y; otherwise it returns x. If one argument is a NaN,
                                         fmax() returns the other argument. If both arguments are NaNs,
                                         fmax() returns a NaN. */

gentype fmin(gentype x, gentype y);
gentypef fmin(gentypef x, float y);
gentyped fmin(gentyped x, double y);  /* Returns y if y < x; otherwise it returns x. If one argument is a NaN,
                                         fmin() returns the other argument. If both arguments are NaNs,
                                         fmin() returns a NaN. */
gentype fmod(gentype x, gentype y);  // Returns x - y * trunc(x / y).
gentype fract(gentype x, global gentype *iptr);
gentype fract(gentype x, local gentype *iptr);
gentype fract(gentype x, private gentype *iptr);    // Returns fmin(x - floor(x), 0x1.fffffep-1f). floor(x) is returned in iptr.
gentype frexp(gentype x, global intn *exp);
gentype frexp(gentype x, local intn *exp);
gentype frexp(gentype x, private intn *exp);        /* Extract the mantissa and exponent from x. For each component the
                                                       mantissa returned is a float with magnitude in the interval [1/2, 1)
                                                       or 0. Each component of x equals the mantissa returned * 2^exponent. */
gentype hypot(gentype x, gentype y);                // Compute the square root of x^2 + y^2 without undue overflow or underflow.
gentype ldexp(gentype x, intn exp);
gentype ldexp(gentype x, int exp);   // Returns x * 2^exp
gentype lgamma(gentype x);
gentype lgamma_r(gentype x, global intn *signp);
gentype lgamma_r(gentype x, local intn *signp);
gentype lgamma_r(gentype x, private intn *signp);    /* Compute the log gamma function of x. lgamma_r also returns
                                                        the sign of the gamma function in signp. */

gentype log(gentype x);                 // Compute the natural logarithm of x.
gentype log2(gentype x);                // Compute the base-2 logarithm of x.
gentype log10(gentype x);               // Compute the base-10 logarithm of x.
gentype log1p(gentype x);               // Compute log_e(1.0 + x).
gentype logb(gentype x);                // Compute the exponent of x, which is the integral part of log_r |x|.
gentype mad(gentype a, gentype b, gentype c);  /* mad approximates a * b + c. Whether or how the product of a * b
                                                  is rounded and how supernormal or subnormal intermediate products
                                                  are handled are not defined. mad is intended to be used where
                                                  speed is preferred over accuracy. */
gentype maxmag(gentype x, gentype y); // Returns x if |x| > |y|, y if |y| > |x|, otherwise fmax(x, y).
gentype minmag(gentype x, gentype y); // Returns x if |x| < |y|, y if |y| < |x|, otherwise fmin(x, y).
gentype modf(gentype x, global gentype *iptr);
gentype modf(gentype x, local gentype *iptr);
gentype modf(gentype x, private gentype *iptr); /* Decompose a floating-point number. The modf function breaks the
                                                   argument x into integral and fractional parts, each of which has
                                                   the same sign as the argument. It stores the integral part in the
                                                   object pointed to by iptr and returns the fractional part. */
float nan(uint nancode);
floatn nan(uintn nancode);
double nan(ulong nancode);
doublen nan(ulongn nancode);  /* Returns a quiet NaN. The nancode may be placed in the significand
                                 of the resulting NaN. */
gentype nextafter(gentype x, gentype y);  /* Compute the next representable single- or double-precision
                                             floating-point value following x in the direction of y. Thus, if y
                                             is less than x, nextafter returns the largest representable
                                             floating-point number less than x. */

gentype pow(gentype x, gentype y);      // Compute x to the power y.
gentype pown(gentype x, intn y);        // Compute x to the power y, where y is an integer.
gentype powr(gentype x, gentype y);     // Compute x to the power y, where x >= 0.
gentype remainder(gentype x, gentype y); /* Compute the value r such that r = x - n * y, where n is the
                                            integer nearest the exact value of x / y. If there are two integers
                                            closest to x / y, n will be the even one. If r is zero, it is given
                                            the same sign as x. */
gentype remquo(gentype x, gentype y, global gentypei *quo);
gentype remquo(gentype x, gentype y, local gentypei *quo);
gentype remquo(gentype x, gentype y, private gentypei *quo); /*
                                        Compute the value r such that r = x - n * y, where n is the
                                        integer nearest the exact value of x / y. If there are two integers
                                        closest to x / y, n will be the even one. If r is zero, it is given
                                        the same sign as x.
                                        This is the same value that is returned by the remainder function.
                                        remquo also calculates the lower seven bits of the integral quotient
                                        x / y and gives that value the same sign as x / y. It stores this
                                        signed value in the object pointed to by quo. */

gentype rint(gentype x);            // Round to an integral value (using the round-to-nearest rounding mode) in floating-point format.
gentype rootn(gentype x, intn y);   // Compute x to the power 1 / y.
gentype round(gentype x);           // Return the integral value nearest to x, rounding halfway cases away from zero, regardless of the current rounding direction.
gentype rsqrt(gentype x);           // Compute the inverse square root of x.
gentype sin(gentype x);             // Compute the sine of x.

gentype sincos(gentype x, global gentype *cosval);
gentype sincos(gentype x, local gentype *cosval);
gentype sincos(gentype x, private gentype *cosval);    /* Compute the sine and cosine of x. The computed sine is the
                                                          return value and the computed cosine is returned in cosval. */
gentype sinh(gentype x);        // Compute the hyperbolic sine of x.
gentype sinpi(gentype x);       // Compute sin(πx).
gentype sqrt(gentype x);        // Compute the square root of x.
gentype tan(gentype x);         // Compute the tangent of x.
gentype tanh(gentype x);        // Compute the hyperbolic tangent of x.
gentype tanpi(gentype x);       // Compute tan(πx).
gentype tgamma(gentype x);      // Compute the gamma function.
gentype trunc(gentype x);       // Round to an integral value using the round-to-zero rounding mode.

gentypef half_cos(gentypef x);  // Compute the cosine of x. x must be in the range -2^16 to +2^16.
gentypef half_divide(gentypef x, gentypef y);  // Compute x / y.
gentypef half_exp(gentypef x);      // Compute the base-e exponential of x.
gentypef half_exp2(gentypef x);     // Compute the base-2 exponential of x.
gentypef half_exp10(gentypef x);    // Compute the base-10 exponential of x.
gentypef half_log(gentypef x);      // Compute the natural logarithm of x.
gentypef half_log2(gentypef x);     // Compute the base-2 logarithm of x.
gentypef half_log10(gentypef x);    // Compute the base-10 logarithm of x.
gentypef half_powr(gentypef x, gentypef y);   // Compute x to the power y, where x >= 0.
gentypef half_recip(gentypef x);    // Compute the reciprocal of x.
gentypef half_rsqrt(gentypef x);    // Compute the inverse square root of x.
gentypef half_sin(gentypef x);      // Compute the sine of x. x must be in the range -2^16 to +2^16.
gentypef half_sqrt(gentypef x);     // Compute the square root of x.
gentypef half_tan(gentypef x);      // Compute the tangent of x. x must be in the range -2^16 to +2^16.
gentypef native_cos(gentypef x);    // Compute the cosine of x over an implementation-defined range. The maximum error is implementation-defined.
gentypef native_divide(gentypef x, gentypef y);   // Compute x / y over an implementation-defined range. The maximum error is implementation-defined.

gentypef native_exp(gentypef x);    // Compute the base-e exponential of x over an implementation-defined range. The maximum error is implementation-defined.
gentypef native_exp2(gentypef x);   // Compute the base-2 exponential of x over an implementation-defined range. The maximum error is implementation-defined.
gentypef native_exp10(gentypef x);  // Compute the base-10 exponential of x over an implementation-defined range. The maximum error is implementation-defined.
gentypef native_log(gentypef x);    // Compute the natural logarithm of x over an implementation-defined range. The maximum error is implementation-defined.
gentypef native_log2(gentypef x);   // Compute the base-2 logarithm of x over an implementation-defined range. The maximum error is implementation-defined.
gentypef native_log10(gentypef x);  // Compute the base-10 logarithm of x over an implementation-defined range. The maximum error is implementation-defined.
gentypef native_recip(gentypef x);  // Compute the reciprocal of x over an implementation-defined range. The maximum error is implementation-defined.
gentypef native_rsqrt(gentypef x);  // Compute the inverse square root of x over an implementation-defined range. The maximum error is implementation-defined.
gentypef native_sin(gentypef x);    // Compute the sine of x over an implementation-defined range. The maximum error is implementation-defined.
gentypef native_sqrt(gentypef x);   // Compute the square root of x over an implementation-defined range. The maximum error is implementation-defined.
gentypef native_tan(gentypef x);    // Compute the tangent of x over an implementation-defined range. The maximum error is implementation-defined.
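
/*
   Example (an illustrative sketch): trading accuracy for speed with a
   native_ variant. Whether native_sin is faster, and how accurate it is,
   are implementation-defined, as noted above. Names are assumptions.
*/
kernel void apply_window(global float *data, float freq)
{
    size_t i = get_global_id(0);
    data[i] *= native_sin(freq * (float)i);   // implementation-defined range and error
}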




// Integer Functions

ugentype abs(gentype x);                  // Returns |x|.
ugentype abs_diff(gentype x, gentype y);  // Returns |x - y| without modulo overflow.
gentype add_sat(gentype x, gentype y);    // Returns x + y and saturates the result.
gentype hadd(gentype x, gentype y);       // Returns (x + y) >> 1. The intermediate sum does not modulo overflow.
gentype rhadd(gentype x, gentype y);      // Returns (x + y + 1) >> 1. The intermediate sum does not modulo overflow.
gentype clamp(gentype x, gentype minval, gentype maxval);    // Returns min(max(x, minval), maxval).
gentype clamp(gentype x, sgentype minval, sgentype maxval);  // Results are undefined if minval > maxval.
gentype clz(gentype x);                   // Returns the number of leading 0 bits in x, starting at the most significant bit position.
gentype mad_hi(gentype a, gentype b, gentype c);    // Returns mul_hi(a, b) + c.
gentype mad_sat(gentype a, gentype b, gentype c);   // Returns a * b + c and saturates the result.
gentype max(gentype x, gentype y);
gentype max(gentype x, sgentype y);   // Returns y if x < y; otherwise it returns x.
gentype min(gentype x, gentype y);
gentype min(gentype x, sgentype y);   // Returns y if y < x; otherwise it returns x.

gentype mul_hi(gentype x, gentype y); // Computes x * y and returns the high half of the product of x and y.
gentype rotate(gentype v, gentype i);  /* For each element in v, the bits are shifted left by the number of
                                          bits given by the corresponding element in i (subject to the usual
                                          shift modulo rules described in the "Shift Operators" subsection of
                                          "Vector Operators" in Chapter 4). Bits shifted off the left side of
                                          the element are shifted back in from the right. */
gentype sub_sat(gentype x, gentype y);   // Returns x - y and saturates the result.
short upsample(char hi, uchar lo);
ushort upsample(uchar hi, uchar lo);
shortn upsample(charn hi, ucharn lo);
ushortn upsample(ucharn hi, ucharn lo);
int upsample(short hi, ushort lo);
uint upsample(ushort hi, ushort lo);
intn upsample(shortn hi, ushortn lo);
uintn upsample(ushortn hi, ushortn lo);
long upsample(int hi, uint lo);
ulong upsample(uint hi, uint lo);
longn upsample(intn hi, uintn lo);
ulongn upsample(uintn hi, uintn lo);
/*
If hi and lo are scalar:
result = ((short)hi << 8) | lo
result = ((ushort)hi << 8) | lo
result = ((int)hi << 16) | lo
result = ((uint)hi << 16) | lo
result = ((long)hi << 32) | lo
result = ((ulong)hi << 32) | lo
If hi and lo are vectors, then for each element of the vector:
result[i] = ((short)hi[i] << 8) | lo[i]
result[i] = ((ushort)hi[i] << 8) | lo[i]
result[i] = ((int)hi[i] << 16) | lo[i]
result[i] = ((uint)hi[i] << 16) | lo[i]
result[i] = ((long)hi[i] << 32) | lo[i]
result[i] = ((ulong)hi[i] << 32) | lo[i]
*/
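
/*
   Example (an illustrative sketch): reassembling 16-bit samples stored as
   separate high/low byte streams with upsample. Names are assumptions.
*/
kernel void pack_bytes(global const uchar *hi, global const uchar *lo, global ushort *out)
{
    size_t i = get_global_id(0);
    out[i] = upsample(hi[i], lo[i]);   // ((ushort)hi[i] << 8) | lo[i]
}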

gentype mad24(gentype x, gentype y, gentype z);  /* Multiply two 24-bit integer values x and y using mul24 and
                                                    add the 32-bit integer result to the 32-bit integer z. */
gentype mul24(gentype x, gentype y);             /* Multiply two 24-bit integer values x and y. x and y are 32-bit
                                                    integers, but only the low 24 bits are used to perform the
                                                    multiplication. mul24 should be used only when values in x and y
                                                    are in the range [-2^23, 2^23 - 1] if x and y are signed integers,
                                                    or in the range [0, 2^24 - 1] if x and y are unsigned integers.
                                                    If x and y are not in this range, the multiplication result is
                                                    implementation-defined. */






// Common Functions
gentype clamp(gentype x, gentype minval, gentype maxval);
gentypef clamp(gentypef x, float minval, float maxval);
gentyped clamp(gentyped x, double minval, double maxval); /* Returns fmin(fmax(x, minval), maxval).
                                                             Results are undefined if minval > maxval. */

gentype degrees(gentype radians);  // Converts radians to degrees; i.e., (180 / π) * radians.
gentype max(gentype x, gentype y);
gentypef max(gentypef x, float y);
gentyped max(gentyped x, double y);   /* Returns y if x < y; otherwise it returns x. This is similar to fmax
                                         described in Table 5.2 except that if x or y is infinite or NaN, the
                                         return values are undefined. */
gentype min(gentype x, gentype y);
gentypef min(gentypef x, float y);
gentyped min(gentyped x, double y);   /* Returns y if y < x; otherwise it returns x. This is similar to fmin
                                         described in Table 5.2 except that if x or y is infinite or NaN, the
                                         return values are undefined. */
gentype mix(gentype x, gentype y, gentype a);
gentypef mix(gentypef x, gentypef y, float a);
gentyped mix(gentyped x, gentyped y, double a); /* Returns the linear blend of x and y implemented as
                                                   x + (y - x) * a.
                                                   a must be a value in the range 0.0 … 1.0. If a is not in this
                                                   range, the return values are undefined. */
gentype radians(gentype degrees); // Converts degrees to radians; i.e., (π / 180) * degrees.


gentype step(gentype edge, gentype x);
gentypef step(float edge, gentypef x);
gentyped step(double edge, gentyped x); /* Returns 0.0 if x < edge; otherwise it returns 1.0. The step function
                                           can be used to create a discontinuous jump at an arbitrary point. */
gentype smoothstep(gentype edge0, gentype edge1, gentype x);
gentypef smoothstep(float edge0, float edge1, gentypef x);
gentyped smoothstep(double edge0, double edge1, gentyped x);
/*
    Returns 0.0 if x <= edge0 and 1.0 if x >= edge1 and performs a smooth
    Hermite interpolation between 0 and 1 when edge0 < x < edge1. This is
    useful in cases where a threshold function with a smooth transition is
    needed.
    This is equivalent to the following, where t is the same type as x:
        t = clamp((x - edge0) / (edge1 - edge0), 0, 1);
        return t * t * (3 - 2 * t);
    The results are undefined if edge0 >= edge1 or if x, edge0, or edge1
    is a NaN.
*/
gentype sign(gentype x); // Returns 1.0 if x > 0, -0.0 if x = -0.0, +0.0 if x = +0.0, or -1.0 if x < 0. Returns 0.0 if x is a NaN.



// Geometric Functions







float4 cross(float4 p0, float4 p1);
float3 cross(float3 p0, float3 p1);
double4 cross(double4 p0, double4 p1);
double3 cross(double3 p0, double3 p1);
/*
Returns the cross-product of p0.xyz and p1.xyz. The w component of a
4-component vector result will be 0. The cross-product is specified only
for 3- and 4-component vectors.
*/
float dot(gentypef p0, gentypef p1);
double dot(gentyped p0, gentyped p1);   // Returns the dot product of p0 and p1.
float distance(gentypef p0, gentypef p1);
double distance(gentyped p0, gentyped p1);   // Returns the distance between p0 and p1. This is calculated as length(p0 - p1).
float length(gentypef p);
double length(gentyped p);
/*
Returns the length of vector p, i.e., sqrt(p.x^2 + p.y^2 + …).
The length is calculated without overflow or extraordinary precision
loss due to underflow.
*/
gentypef normalize(gentypef p);
gentyped normalize(gentyped p);
/*
Returns a vector in the same direction as p but with a length of 1.
normalize(p) returns p if all elements of p are zero.
normalize(p) returns a vector full of NaNs if any element is a NaN.
normalize(p) for which any element in p is infinite proceeds as if the
elements in p were replaced as follows:
for (i = 0; i < sizeof(p) / sizeof(p[0]); i++)
    p[i] = isinf(p[i]) ? copysign(1.0, p[i]) : 0.0 * p[i];
*/
float fast_distance(gentypef p0, gentypef p1);  // Returns fast_length(p0 - p1).
float fast_length(gentypef p);        /* Returns the length of vector p computed as
                                         half_sqrt(p.x^2 + p.y^2 + …). */
gentypef fast_normalize(gentypef p);
/*
    Returns a vector in the same direction as p but with a length of 1.
    fast_normalize is computed as p * half_sqrt(p.x^2 + p.y^2 + …).
    The result will be within 8192 ulps error from the infinitely precise
    result of
        if (all(p == 0.0f))
            result = p;
        else
            result = p / sqrt(p.x^2 + p.y^2 + …);
    It has the following exceptions:
    - If the sum of squares is greater than FLT_MAX, the value of the
      floating-point values in the result vector is undefined.
    - If the sum of squares is less than FLT_MIN, the implementation may
      return p.
    - If the device is in "denorms are flushed to zero" mode, individual
      operand elements with magnitude less than sqrt(FLT_MIN) may be
      flushed to zero before proceeding with the calculation.
*/
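
/*
   Example (an illustrative sketch): renormalizing direction vectors, using
   fast_normalize when the 8192-ulp bound above is acceptable. The kernel
   name and the use_fast flag are assumptions.
*/
kernel void renormalize(global float4 *dirs, int use_fast)
{
    size_t i = get_global_id(0);
    dirs[i] = use_fast ? fast_normalize(dirs[i]) : normalize(dirs[i]);
}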



// Relational Functions

int isequal(float x, float y);
int isequal(double x, double y);
intn isequal(floatn x, floatn y);
longn isequal(doublen x, doublen y);  // Returns the component-wise compare of x == y.
int isnotequal(float x, float y);
int isnotequal(double x, double y);
intn isnotequal(floatn x, floatn y);
longn isnotequal(doublen x, doublen y);   // Returns the component-wise compare of x != y.
int isgreater(float x, float y);
int isgreater(double x, double y);
intn isgreater(floatn x, floatn y);
longn isgreater(doublen x, doublen y);    // Returns the component-wise compare of x > y.
int isgreaterequal(float x, float y);
int isgreaterequal(double x, double y);
intn isgreaterequal(floatn x, floatn y);
longn isgreaterequal(doublen x, doublen y);  // Returns the component-wise compare of x >= y.
int isless(float x, float y);
int isless(double x, double y);
intn isless(floatn x, floatn y);
longn isless(doublen x, doublen y);    // Returns the component-wise compare of x < y.
int islessequal(float x, float y);
int islessequal(double x, double y);
intn islessequal(floatn x, floatn y);
longn islessequal(doublen x, doublen y);   // Returns the component-wise compare of x <= y.
int islessgreater(float x, float y);
int islessgreater(double x, double y);
intn islessgreater(floatn x, floatn y);
longn islessgreater(doublen x, doublen y);   // Returns the component-wise compare of (x < y) || (x > y).



int isfinite(float x);
int isfinite(double x);
intn isfinite(floatn x);
longn isfinite(doublen x);  // Tests for the finite value of x.
int isinf(float x);
int isinf(double x);
intn isinf(floatn x);
longn isinf(doublen x);  // Tests for the infinite value (positive or negative) of x.
int isnan(float x);
int isnan(double x);
intn isnan(floatn x);
longn isnan(doublen x);  // Tests for a NaN.
int isnormal(float x);
int isnormal(double x);
intn isnormal(floatn x);
longn isnormal(doublen x);   // Tests for a normal value (i.e., x is neither zero, denormal, infinite, nor NaN).
int isordered(float x, float y);
int isordered(double x, double y);
intn isordered(floatn x, floatn y);
longn isordered(doublen x, doublen y); // Tests if arguments are ordered. isordered takes arguments x and y and returns the result isequal(x, x) && isequal(y, y).
int isunordered(float x, float y);
int isunordered(double x, double y);
intn isunordered(floatn x, floatn y);
longn isunordered(doublen x, doublen y);
/*
Tests if arguments are unordered. isunordered takes arguments x and y,
returning non-zero if x or y is NaN, and zero otherwise.
*/
int signbit(float x);
int signbit(double x);
intn signbit(floatn x);
longn signbit(doublen x);
/*
Tests for the sign bit. The scalar version of the function returns a 1 if
the sign bit in the floating-point value of x is set, else it returns 0.
The vector version of the function returns the following for each
component: -1 if the sign bit in the floating-point value is set, else 0.
*/

int any(sgentype x); // Returns 1 if the most significant bit in any component of x is set; otherwise returns 0.
int all(sgentype x); // Returns 1 if the most significant bit in all components of x is set; otherwise returns 0.
gentype bitselect(gentype a, gentype b, gentype c); 
/*
Each bit of the result is the corresponding bit of a if the corresponding
bit of c is 0. Otherwise it is the corresponding bit of b.
*/
gentype select(gentype a, gentype b, sgentype c);
gentype select(gentype a, gentype b, ugentype c);
/*
For each component of a vector type:
    result[i] = (MSB of c[i] is set) ? b[i] : a[i]
For a scalar type:
    result = c ? b : a
sgentype and ugentype must have the same number of elements and bits as
gentype.
*/
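
/*
   Example (an illustrative sketch): branchless clamping of negatives to zero
   with select. A vector compare yields -1 (all bits set) per true lane, so
   its MSB picks the second argument. The kernel name is an assumption.
*/
kernel void relu4(global float4 *v)
{
    size_t i = get_global_id(0);
    int4 isneg = v[i] < (float4)(0.0f);          // -1 where true, 0 where false
    v[i] = select(v[i], (float4)(0.0f), isneg);  // take 0.0f where the MSB of isneg is set
}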






// Synchronization Functions

void barrier(cl_mem_fence_flags flags);   /* All work-items in a work-group must execute the barrier before any
                                             are allowed to continue. flags is CLK_LOCAL_MEM_FENCE,
                                             CLK_GLOBAL_MEM_FENCE, or a combination of the two. */
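
/*
   Example (an illustrative sketch): a work-group sum reduction in local
   memory. Every work-item must reach every barrier; the kernel assumes the
   work-group size is a power of two. Names are assumptions.
*/
kernel void group_sum(global const float *in, global float *out, local float *tmp)
{
    size_t lid = get_local_id(0);
    tmp[lid] = in[get_global_id(0)];
    barrier(CLK_LOCAL_MEM_FENCE);                 // make all stores to tmp visible
    for (size_t s = get_local_size(0) / 2; s > 0; s >>= 1) {
        if (lid < s)
            tmp[lid] += tmp[lid + s];
        barrier(CLK_LOCAL_MEM_FENCE);             // outside the if: all work-items execute it
    }
    if (lid == 0)
        out[get_group_id(0)] = tmp[0];            // one partial sum per work-group
}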








// Vector Data Load and Store Functions
/*
We use the generic type name gentype to indicate the scalar built-in data
types char, uchar, short, ushort, int, uint, long, ulong, float, or double.
We use the generic type name gentypen to indicate the n-element vectors of
gentype elements, and the type names floatn, doublen, and halfn to represent
n-element vectors of float, double, and half elements, respectively. The
suffix n is also used in the function names (such as vloadn, vstoren),
where n = 2, 3, 4, 8, or 16.
*/
gentypen vloadn(size_t offset, const global gentype *p);
gentypen vloadn(size_t offset, const local gentype *p);
gentypen vloadn(size_t offset, const constant gentype *p);
gentypen vloadn(size_t offset, const private gentype *p);

void vstoren(gentypen data, size_t offset, global gentype *p);
void vstoren(gentypen data, size_t offset, local gentype *p);
void vstoren(gentypen data, size_t offset, private gentype *p);
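
/*
   Example (an illustrative sketch): scaling four floats at a time with
   vload4/vstore4. The offset argument counts in whole vectors, so element
   index = offset * 4. The kernel name is an assumption.
*/
kernel void scale4(global float *data, float k)
{
    size_t i = get_global_id(0);     // one work-item per float4
    float4 v = vload4(i, data);      // reads data[4*i .. 4*i+3]
    vstore4(v * k, i, data);
}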

float vload_half(size_t offset, const global half *p);
float vload_half(size_t offset, const local half *p);
float vload_half(size_t offset, const constant half *p);
float vload_half(size_t offset, const private half *p);

floatn vload_halfn(size_t offset, const global half *p);
floatn vload_halfn(size_t offset, const local half *p);
floatn vload_halfn(size_t offset, const constant half *p);
floatn vload_halfn(size_t offset, const private half *p);


void vstore_half(float data, size_t offset, global half *p);
void vstore_half_rte(float data, size_t offset, global half *p);
void vstore_half_rtz(float data, size_t offset, global half *p);
void vstore_half_rtp(float data, size_t offset, global half *p);
void vstore_half_rtn(float data, size_t offset, global half *p);
void vstore_half(float data, size_t offset, local half *p);
void vstore_half_rte(float data, size_t offset, local half *p);
void vstore_half_rtz(float data, size_t offset, local half *p);
void vstore_half_rtp(float data, size_t offset, local half *p);
void vstore_half_rtn(float data, size_t offset, local half *p);
void vstore_half(float data, size_t offset, private half *p);
void vstore_half_rte(float data, size_t offset, private half *p);
void vstore_half_rtz(float data, size_t offset, private half *p);
void vstore_half_rtp(float data, size_t offset, private half *p);
void vstore_half_rtn(float data, size_t offset, private half *p);


void vstore_halfn(floatn data, size_t offset, global half *p);
void vstore_halfn_rte(floatn data, size_t offset, global half *p);
void vstore_halfn_rtz(floatn data, size_t offset, global half *p);
void vstore_halfn_rtp(floatn data, size_t offset, global half *p);
void vstore_halfn_rtn(floatn data, size_t offset, global half *p);
void vstore_halfn(floatn data, size_t offset, local half *p);
void vstore_halfn_rte(floatn data, size_t offset, local half *p);
void vstore_halfn_rtz(floatn data, size_t offset, local half *p);
void vstore_halfn_rtp(floatn data, size_t offset, local half *p);
void vstore_halfn_rtn(floatn data, size_t offset, local half *p);
void vstore_halfn(floatn data, size_t offset, private half *p);
void vstore_halfn_rte(floatn data, size_t offset, private half *p);
void vstore_halfn_rtz(floatn data, size_t offset, private half *p);
void vstore_halfn_rtp(floatn data, size_t offset, private half *p);
void vstore_halfn_rtn(floatn data, size_t offset, private half *p);

floatn vloada_halfn(size_t offset, const global half *p);
floatn vloada_halfn(size_t offset, const local half *p);
floatn vloada_halfn(size_t offset, const constant half *p);
floatn vloada_halfn(size_t offset, const private half *p);

void vstorea_halfn(floatn data, size_t offset, global half *p);
void vstorea_halfn_rte(floatn data, size_t offset, global half *p);
void vstorea_halfn_rtz(floatn data, size_t offset, global half *p);
void vstorea_halfn_rtp(floatn data, size_t offset, global half *p);
void vstorea_halfn_rtn(floatn data, size_t offset, global half *p);
void vstorea_halfn(floatn data, size_t offset, local half *p);
void vstorea_halfn_rte(floatn data, size_t offset, local half *p);
void vstorea_halfn_rtz(floatn data, size_t offset, local half *p);
void vstorea_halfn_rtp(floatn data, size_t offset, local half *p);
void vstorea_halfn_rtn(floatn data, size_t offset, local half *p);
void vstorea_halfn(floatn data, size_t offset, private half *p);
void vstorea_halfn_rte(floatn data, size_t offset, private half *p);
void vstorea_halfn_rtz(floatn data, size_t offset, private half *p);
void vstorea_halfn_rtp(floatn data, size_t offset, private half *p);
void vstorea_halfn_rtn(floatn data, size_t offset, private half *p);


void vstore_half(double data, size_t offset, global half *p);
void vstore_half_rte(double data, size_t offset, global half *p);
void vstore_half_rtz(double data, size_t offset, global half *p);
void vstore_half_rtp(double data, size_t offset, global half *p);
void vstore_half_rtn(double data, size_t offset, global half *p);
void vstore_half(double data, size_t offset, local half *p);
void vstore_half_rte(double data, size_t offset, local half *p);
void vstore_half_rtz(double data, size_t offset, local half *p);
void vstore_half_rtp(double data, size_t offset, local half *p);
void vstore_half_rtn(double data, size_t offset, local half *p);
void vstore_half(double data, size_t offset, private half *p);
void vstore_half_rte(double data, size_t offset, private half *p);
void vstore_half_rtz(double data, size_t offset, private half *p);
void vstore_half_rtp(double data, size_t offset, private half *p);
void vstore_half_rtn(double data, size_t offset, private half *p);


void vstore_halfn(doublen data, size_t offset, global half *p);
void vstore_halfn_rte(doublen data, size_t offset, global half *p);
void vstore_halfn_rtz(doublen data, size_t offset, global half *p);
void vstore_halfn_rtp(doublen data, size_t offset, global half *p);
void vstore_halfn_rtn(doublen data, size_t offset, global half *p);
void vstore_halfn(doublen data, size_t offset, local half *p);
void vstore_halfn_rte(doublen data, size_t offset, local half *p);
void vstore_halfn_rtz(doublen data, size_t offset, local half *p);
void vstore_halfn_rtp(doublen data, size_t offset, local half *p);
void vstore_halfn_rtn(doublen data, size_t offset, local half *p);
void vstore_halfn(doublen data, size_t offset, private half *p);
void vstore_halfn_rte(doublen data, size_t offset, private half *p);
void vstore_halfn_rtz(doublen data, size_t offset, private half *p);
void vstore_halfn_rtp(doublen data, size_t offset, private half *p);
void vstore_halfn_rtn(doublen data, size_t offset, private half *p);


void vstorea_halfn(doublen data, size_t offset, global half *p);
void vstorea_halfn_rte(doublen data, size_t offset, global half *p);
void vstorea_halfn_rtz(doublen data, size_t offset, global half *p);
void vstorea_halfn_rtp(doublen data, size_t offset, global half *p);
void vstorea_halfn_rtn(doublen data, size_t offset, global half *p);
void vstorea_halfn(doublen data, size_t offset, local half *p);
void vstorea_halfn_rte(doublen data, size_t offset, local half *p);
void vstorea_halfn_rtz(doublen data, size_t offset, local half *p);
void vstorea_halfn_rtp(doublen data, size_t offset, local half *p);
void vstorea_halfn_rtn(doublen data, size_t offset, local half *p);
void vstorea_halfn(doublen data, size_t offset, private half *p);
void vstorea_halfn_rte(doublen data, size_t offset, private half *p);
void vstorea_halfn_rtz(doublen data, size_t offset, private half *p);
void vstorea_halfn_rtp(doublen data, size_t offset, private half *p);
void vstorea_halfn_rtn(doublen data, size_t offset, private half *p);





// Async Copy and Prefetch Functions

event_t async_work_group_copy(local gentype *dst, const global gentype *src, size_t num_gentypes, event_t event);
event_t async_work_group_copy(global gentype *dst, const local gentype *src, size_t num_gentypes, event_t event);
event_t async_work_group_strided_copy(local gentype *dst, const global gentype *src, size_t num_gentypes, size_t src_stride, event_t event);
event_t async_work_group_strided_copy(global gentype *dst, const local gentype *src, size_t num_gentypes, size_t dst_stride, event_t event);
void wait_group_events(int num_events, event_t *event_list);
void prefetch(const global gentype *p, size_t num_gentypes);
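
/*
   Example (an illustrative sketch): staging one tile of global memory into
   local memory with async_work_group_copy. All work-items in the group must
   call the copy and the wait with the same arguments. TILE and the kernel
   name are assumptions; launch with a work-group size of TILE.
*/
#define TILE 128
kernel void stage_tile(global const float *src, global float *dst, local float *tile)
{
    event_t e = async_work_group_copy(tile, src + get_group_id(0) * TILE, TILE, 0);
    wait_group_events(1, &e);                               // block until the copy completes
    size_t lid = get_local_id(0);
    dst[get_group_id(0) * TILE + lid] = tile[lid] * 2.0f;   // use the staged data
}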




// Atomic Functions

int atomic_add(volatile global int *p, int val);
unsigned int atomic_add(volatile global unsigned int *p,unsigned int val);
int atomic_add(volatile local int *p, int val);
unsigned int atomic_add(volatile local unsigned int *p,unsigned int val);

int atomic_sub(volatile global int *p, int val);
unsigned int atomic_sub(volatile global unsigned int *p, unsigned int val);
int atomic_sub(volatile local int *p, int val);
unsigned int atomic_sub(volatile local unsigned int *p, unsigned int val);

int atomic_xchg(volatile global int *p, int val);
unsigned int atomic_xchg(volatile global unsigned int *p,unsigned int val);
float atomic_xchg(volatile global float *p, float val);
int atomic_xchg(volatile local int *p, int val);
unsigned int atomic_xchg(volatile local unsigned int *p,  unsigned int val);
float atomic_xchg(volatile local float *p, float val);


int atomic_inc(volatile global int *p);
unsigned int atomic_inc(volatile global unsigned int *p);
int atomic_inc(volatile local int *p);
unsigned int atomic_inc(volatile local unsigned int *p);


int atomic_dec(volatile global int *p);
unsigned int atomic_dec(volatile global unsigned int *p);
int atomic_dec(volatile local int *p);
unsigned int atomic_dec(volatile local unsigned int *p);


int atomic_cmpxchg(volatile global int *p, int cmp, int val);
unsigned int atomic_cmpxchg(volatile global unsigned int *p, unsigned int cmp, unsigned int val);
int atomic_cmpxchg(volatile local int *p, int cmp, int val);
unsigned int atomic_cmpxchg(volatile local unsigned int *p, unsigned int cmp, unsigned int val);


int atomic_min(volatile global int *p, int val);
unsigned int atomic_min(volatile global unsigned int *p, unsigned int val);
int atomic_min(volatile local int *p, int val);
unsigned int atomic_min(volatile local unsigned int *p, unsigned int val);


int atomic_max(volatile global int *p, int val);
unsigned int atomic_max(volatile global unsigned int *p, unsigned int val);
int atomic_max(volatile local int *p, int val);
unsigned int atomic_max(volatile local unsigned int *p, unsigned int val);


int atomic_and(volatile global int *p, int val);
unsigned int atomic_and(volatile global unsigned int *p, unsigned int val);
int atomic_and(volatile local int *p, int val);
unsigned int atomic_and(volatile local unsigned int *p, unsigned int val);


int atomic_or(volatile global int *p, int val);
unsigned int atomic_or(volatile global unsigned int *p,unsigned int val);
int atomic_or(volatile local int *p, int val);
unsigned int atomic_or(volatile local unsigned int *p, unsigned int val);


int atomic_xor(volatile global int *p, int val);
unsigned int atomic_xor(volatile global unsigned int *p, unsigned int val);
int atomic_xor(volatile local int *p, int val);
unsigned int atomic_xor(volatile local unsigned int *p, unsigned int val);
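
/*
   Example (an illustrative sketch): a 256-bin histogram using atomic_inc so
   that concurrent work-items update the shared counters without losing
   increments. The kernel name is an assumption; bins must be zeroed first.
*/
kernel void histogram256(global const uchar *data, volatile global unsigned int *bins)
{
    size_t i = get_global_id(0);
    atomic_inc(&bins[data[i]]);   // returns the old value, ignored here
}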


// Miscellaneous Vector Functions

int vec_step(gentype a);
int vec_step(gentypen a);
int vec_step(char3 a);
int vec_step(uchar3 a);
int vec_step(short3 a);
int vec_step(ushort3 a);
int vec_step(half3 a);
int vec_step(int3 a);
int vec_step(uint3 a);
int vec_step(long3 a);
int vec_step(ulong3 a);
int vec_step(float3 a);
int vec_step(double3 a);
int vec_step(type a);
/*
Returns the number of elements in the given vector (or vector type name).
Scalar arguments return 1; the 3-component vector types return 4.
*/

gentypen shuffle(gentypem x, ugentypen mask);
gentypen shuffle2(gentypem x, gentypem y, ugentypen mask);
/*
Construct a permutation of elements from x (shuffle) or from x and y
(shuffle2), selected by the corresponding elements of mask. The mask
elements must have the same size in bits as the result elements.
*/
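
/*
   Example (an illustrative sketch): reversing the lanes of a float4 with
   shuffle. uint4 is used for the mask because its elements match the bit
   width of float. The helper name is an assumption.
*/
float4 reverse4(float4 v)
{
    const uint4 mask = (uint4)(3, 2, 1, 0);   // pick v's elements in reverse order
    return shuffle(v, mask);
}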




// Image Read and Write Functions

float4 read_imagef(image2d_t image,  sampler_t sampler, float2 coord);
float4 read_imagef(image2d_t image,  sampler_t sampler, int2 coord);
int4 read_imagei(image2d_t image,  sampler_t sampler, float2 coord);
int4 read_imagei(image2d_t image,  sampler_t sampler, int2 coord);
uint4 read_imageui(image2d_t image, sampler_t sampler,  float2 coord);
uint4 read_imageui(image2d_t image,  sampler_t sampler,  int2 coord);
float4 read_imagef(image3d_t image,  sampler_t sampler,  float4 coord);
float4 read_imagef(image3d_t image,  sampler_t sampler, int4 coord);
int4 read_imagei(image3d_t image,  sampler_t sampler,  float4 coord);
int4 read_imagei(image3d_t image, sampler_t sampler, int4 coord);
uint4 read_imageui(image3d_t image,  sampler_t sampler,  float4 coord);
uint4 read_imageui(image3d_t image,  sampler_t sampler, int4 coord);
void write_imagef(image2d_t image,  int2 coord,  float4 color);
void write_imagei(image2d_t image, int2 coord,  int4 color);
void write_imageui(image2d_t image, int2 coord, uint4 color);
void write_imagef(image3d_t image, int4 coord, float4 color);
void write_imagei(image3d_t image, int4 coord,  int4 color);
void write_imageui(image3d_t image, int4 coord, uint4 color);
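
/*
   Example (an illustrative sketch): horizontally flipping a 2-D image. The
   program-scope sampler uses unnormalized integer coordinates with nearest
   filtering; the kernel name is an assumption. Launch with one work-item
   per pixel.
*/
constant sampler_t flip_sampler = CLK_NORMALIZED_COORDS_FALSE |
                                  CLK_ADDRESS_CLAMP_TO_EDGE   |
                                  CLK_FILTER_NEAREST;

kernel void flip_h(read_only image2d_t src, write_only image2d_t dst)
{
    int2 pos = (int2)((int)get_global_id(0), (int)get_global_id(1));
    int w = get_image_width(src);
    float4 px = read_imagef(src, flip_sampler, (int2)(w - 1 - pos.x, pos.y));
    write_imagef(dst, pos, px);
}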

int get_image_width(image2d_t image);
int get_image_width(image3d_t image); // Returns the image width in pixels.
int get_image_height(image2d_t image);
int get_image_height(image3d_t image);   // Returns the image height in pixels.
int get_image_depth(image3d_t image);    // Returns the image depth in pixels.
int2 get_image_dim(image2d_t image);     // Returns the 2D image dimensions in an int2. The width is returned in the x component and the height in the y component.
int4 get_image_dim(image3d_t image);     // Returns the 3D image dimensions in an int4. The width is returned in the x component, the height in the y component, and the depth in the z component.

int get_image_channel_data_type(image2d_t image);
int get_image_channel_data_type(image3d_t image);   /* Returns the channel data type of the image. Valid values are
                                                        CLK_SNORM_INT8
                                                        CLK_SNORM_INT16
                                                        CLK_UNORM_INT8
                                                        CLK_UNORM_INT16
                                                        CLK_UNORM_SHORT_565
                                                        CLK_UNORM_SHORT_555
                                                        CLK_UNORM_SHORT_101010
                                                        CLK_SIGNED_INT8
                                                        CLK_SIGNED_INT16
                                                        CLK_SIGNED_INT32
                                                        CLK_UNSIGNED_INT8
                                                        CLK_UNSIGNED_INT16
                                                        CLK_UNSIGNED_INT32
                                                        CLK_HALF_FLOAT
                                                        CLK_FLOAT
                                                        */
int get_image_channel_order(image2d_t image);
int get_image_channel_order(image3d_t image);         /*  Returns the image channel order. Valid values are
                                                        CLK_A
                                                        CLK_R
                                                        CLK_Rx
                                                        CLK_RG
                                                        CLK_RGx
                                                        CLK_RGB
                                                        CLK_RGBx
                                                        CLK_RGBA
                                                        CLK_ARGB
                                                        CLK_BGRA
                                                        CLK_INTENSITY
                                                        CLK_LUMINANCE
                                                        */


BUG -- Linker flags (Release): -Wl,--gc-sections -Wl,--as-needed -Wl,--no-undefined -- Linker flags (Debug): -Wl,--gc-sections -Wl,--as-needed -Wl,--no-undefined -- ccache: NO -- Precompiled headers: NO -- Extra dependencies: dl m pthread rt -- 3rdparty dependencies: -- -- OpenCV modules: -- To be built: calib3d core dnn features2d flann gapi highgui imgcodecs imgproc ml objdetect photo stitching ts video videoio -- Disabled: world -- Disabled by dependency: - -- Unavailable: java python2 python3 -- Applications: tests perf_tests apps -- Documentation: NO -- Non-free algorithms: NO -- -- GUI: GTK3 -- GTK+: YES (ver 3.24.20) -- GThread : YES (ver 2.64.6) -- GtkGlExt: NO -- VTK support: NO -- -- Media I/O: -- ZLib: /usr/lib/x86_64-linux-gnu/libz.so (ver 1.2.11) -- JPEG: /usr/lib/x86_64-linux-gnu/libjpeg.so (ver 80) -- WEBP: build (ver encoder: 0x020f) -- PNG: /usr/lib/x86_64-linux-gnu/libpng.so (ver 1.6.37) -- TIFF: /usr/lib/x86_64-linux-gnu/libtiff.so (ver / ) -- JPEG 2000: OpenJPEG (ver 2.4.0) -- OpenEXR: /usr/lib/x86_64-linux-gnu/libImath.so /usr/lib/x86_64-linux-gnu/libIlmImf.so /usr/lib/x86_64-linux-gnu/libIex.so /usr/lib/x86_64-linux-gnu/libHalf.so /usr/lib/x86_64-linux-gnu/libIlmThread.so (ver 2_3) -- HDR: YES -- SUNRASTER: YES -- PXM: YES -- PFM: YES -- -- Video I/O: -- DC1394: YES (2.2.5) -- FFMPEG: YES -- avcodec: YES (58.54.100) -- avformat: YES (58.29.100) -- avutil: YES (56.31.100) -- swscale: YES (5.5.100) -- avresample: NO -- GStreamer: YES (1.16.3) -- v4l/v4l2: YES (linux/videodev2.h) -- -- Parallel framework: pthreads -- -- Trace: YES (with Intel ITT) -- -- Other third-party libraries: -- VA: NO -- Lapack: NO -- Eigen: YES (ver 3.3.7) -- Custom HAL: NO -- Protobuf: build (3.19.1) -- -- OpenCL: YES (no extra features) -- Include path: /home/hyzk/Downloads/opencv-4.7.0/3rdparty/include/opencl/1.2 -- Link libraries: Dynamic load -- -- Python (for build): /usr/bin/python3 -- -- Java: -- ant: NO -- JNI: NO -- Java wrappers: NO -- Java tests: NO -- -- Install to: /usr/local
07-20
PowerShell 7 环境已加载 (版本: 7.5.2) PowerShell 7 环境已加载 (版本: 7.5.2) PS C:\Users\Administrator\Desktop> cd E:\PyTorch_Build\pytorch PS E:\PyTorch_Build\pytorch> .\pytorch_env\Scripts\activate (pytorch_env) PS E:\PyTorch_Build\pytorch> # 退出虚拟环境 (pytorch_env) PS E:\PyTorch_Build\pytorch> deactivate PS E:\PyTorch_Build\pytorch> PS E:\PyTorch_Build\pytorch> # 删除旧环境 PS E:\PyTorch_Build\pytorch> Remove-Item -Recurse -Force .\pytorch_env PS E:\PyTorch_Build\pytorch> Remove-Item -Recurse -Force .\cuda_env PS E:\PyTorch_Build\pytorch> PS E:\PyTorch_Build\pytorch> # 创建新虚拟环境 PS E:\PyTorch_Build\pytorch> python -m venv rtx5070_env PS E:\PyTorch_Build\pytorch> .\rtx5070_env\Scripts\activate (rtx5070_env) PS E:\PyTorch_Build\pytorch> (rtx5070_env) PS E:\PyTorch_Build\pytorch> # 安装基础编译工具 (rtx5070_env) PS E:\PyTorch_Build\pytorch> pip install -U pip setuptools wheel ninja cmake Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple Requirement already satisfied: pip in e:\pytorch_build\pytorch\rtx5070_env\lib\site-packages (22.3.1) Collecting pip Using cached https://pypi.tuna.tsinghua.edu.cn/packages/b7/3f/945ef7ab14dc4f9d7f40288d2df998d1837ee0888ec3659c813487572faa/pip-25.2-py3-none-any.whl (1.8 MB) Requirement already satisfied: setuptools in e:\pytorch_build\pytorch\rtx5070_env\lib\site-packages (65.5.0) Collecting setuptools Using cached https://pypi.tuna.tsinghua.edu.cn/packages/a3/dc/17031897dae0efacfea57dfd3a82fdd2a2aeb58e0ff71b77b87e44edc772/setuptools-80.9.0-py3-none-any.whl (1.2 MB) Collecting wheel Using cached https://pypi.tuna.tsinghua.edu.cn/packages/0b/2c/87f3254fd8ffd29e4c02732eee68a83a1d3c346ae39bc6822dcbcb697f2b/wheel-0.45.1-py3-none-any.whl (72 kB) Collecting ninja Using cached https://pypi.tuna.tsinghua.edu.cn/packages/29/45/c0adfbfb0b5895aa18cec400c535b4f7ff3e52536e0403602fc1a23f7de9/ninja-1.13.0-py3-none-win_amd64.whl (309 kB) Collecting cmake Using cached https://pypi.tuna.tsinghua.edu.cn/packages/7c/d0/73cae88d8c25973f2465d5a4457264f95617c16ad321824ed4c243734511/cmake-4.1.0-py3-none-win_amd64.whl (37.6 MB) ERROR: To modify pip, please run the following command: E:\PyTorch_Build\pytorch\rtx5070_env\Scripts\python.exe -m pip install -U pip setuptools wheel ninja cmake [notice] A new release of pip available: 22.3.1 -> 25.2 [notice] To update, run: python.exe -m pip install --upgrade pip (rtx5070_env) PS E:\PyTorch_Build\pytorch> (rtx5070_env) PS E:\PyTorch_Build\pytorch> # 验证 CUDA 安装 (rtx5070_env) PS E:\PyTorch_Build\pytorch> nvcc --version # 应显示 CUDA 12.x nvcc: NVIDIA (R) Cuda compiler driver Copyright (c) 2005-2025 NVIDIA Corporation Built on Wed_Jul_16_20:06:48_Pacific_Daylight_Time_2025 Cuda compilation tools, release 13.0, V13.0.48 Build cuda_13.0.r13.0/compiler.36260728_0 (rtx5070_env) PS E:\PyTorch_Build\pytorch> # 正确更新 pip 和工具链 (rtx5070_env) PS E:\PyTorch_Build\pytorch> python -m pip install -U pip setuptools wheel ninja cmake Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple Requirement already satisfied: pip in e:\pytorch_build\pytorch\rtx5070_env\lib\site-packages (22.3.1) Collecting pip Using cached https://pypi.tuna.tsinghua.edu.cn/packages/b7/3f/945ef7ab14dc4f9d7f40288d2df998d1837ee0888ec3659c813487572faa/pip-25.2-py3-none-any.whl (1.8 MB) Requirement already satisfied: setuptools in e:\pytorch_build\pytorch\rtx5070_env\lib\site-packages (65.5.0) Collecting setuptools Using cached https://pypi.tuna.tsinghua.edu.cn/packages/a3/dc/17031897dae0efacfea57dfd3a82fdd2a2aeb58e0ff71b77b87e44edc772/setuptools-80.9.0-py3-none-any.whl (1.2 MB) Collecting wheel Using 
cached https://pypi.tuna.tsinghua.edu.cn/packages/0b/2c/87f3254fd8ffd29e4c02732eee68a83a1d3c346ae39bc6822dcbcb697f2b/wheel-0.45.1-py3-none-any.whl (72 kB) Collecting ninja Using cached https://pypi.tuna.tsinghua.edu.cn/packages/29/45/c0adfbfb0b5895aa18cec400c535b4f7ff3e52536e0403602fc1a23f7de9/ninja-1.13.0-py3-none-win_amd64.whl (309 kB) Collecting cmake Using cached https://pypi.tuna.tsinghua.edu.cn/packages/7c/d0/73cae88d8c25973f2465d5a4457264f95617c16ad321824ed4c243734511/cmake-4.1.0-py3-none-win_amd64.whl (37.6 MB) Installing collected packages: wheel, setuptools, pip, ninja, cmake Attempting uninstall: setuptools Found existing installation: setuptools 65.5.0 Uninstalling setuptools-65.5.0: Successfully uninstalled setuptools-65.5.0 Attempting uninstall: pip Found existing installation: pip 22.3.1 Uninstalling pip-22.3.1: Successfully uninstalled pip-22.3.1 Successfully installed cmake-4.1.0 ninja-1.13.0 pip-25.2 setuptools-80.9.0 wheel-0.45.1 (rtx5070_env) PS E:\PyTorch_Build\pytorch> (rtx5070_env) PS E:\PyTorch_Build\pytorch> # 验证版本 (rtx5070_env) PS E:\PyTorch_Build\pytorch> pip --version # 应显示 25.2+ pip 25.2 from E:\PyTorch_Build\pytorch\rtx5070_env\lib\site-packages\pip (python 3.10) (rtx5070_env) PS E:\PyTorch_Build\pytorch> cmake --version # 应显示 4.1.0+ cmake version 4.1.0 CMake suite maintained and supported by Kitware (kitware.com/cmake). (rtx5070_env) PS E:\PyTorch_Build\pytorch> ninja --version # 应显示 1.13.0+ 1.13.0.git.kitware.jobserver-pipe-1 (rtx5070_env) PS E:\PyTorch_Build\pytorch> # 设置 CUDA 12.1 环境变量 (rtx5070_env) PS E:\PyTorch_Build\pytorch> $env:CUDA_PATH = "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.1" (rtx5070_env) PS E:\PyTorch_Build\pytorch> $env:PATH = "$env:CUDA_PATH\bin;" + $env:PATH (rtx5070_env) PS E:\PyTorch_Build\pytorch> (rtx5070_env) PS E:\PyTorch_Build\pytorch> # 验证 CUDA 版本 (rtx5070_env) PS E:\PyTorch_Build\pytorch> nvcc --version # 应显示 release 12.1 nvcc: NVIDIA (R) Cuda compiler driver Copyright (c) 2005-2025 NVIDIA Corporation Built on Wed_Jul_16_20:06:48_Pacific_Daylight_Time_2025 Cuda compilation tools, release 13.0, V13.0.48 Build cuda_13.0.r13.0/compiler.36260728_0 (rtx5070_env) PS E:\PyTorch_Build\pytorch> (rtx5070_env) PS E:\PyTorch_Build\pytorch> # 设置 cuDNN 路径(根据实际安装位置) (rtx5070_env) PS E:\PyTorch_Build\pytorch> $env:CUDNN_INCLUDE_DIR = "$env:CUDA_PATH\include" (rtx5070_env) PS E:\PyTorch_Build\pytorch> $env:CUDNN_LIBRARY = "$env:CUDA_PATH\lib\x64\cudnn.lib" (rtx5070_env) PS E:\PyTorch_Build\pytorch> # 安装必要依赖 (rtx5070_env) PS E:\PyTorch_Build\pytorch> pip install pyyaml numpy typing_extensions Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple Collecting pyyaml Using cached https://pypi.tuna.tsinghua.edu.cn/packages/b5/84/0fa4b06f6d6c958d207620fc60005e241ecedceee58931bb20138e1e5776/PyYAML-6.0.2-cp310-cp310-win_amd64.whl (161 kB) Collecting numpy Using cached https://pypi.tuna.tsinghua.edu.cn/packages/a3/dd/4b822569d6b96c39d1215dbae0582fd99954dcbcf0c1a13c61783feaca3f/numpy-2.2.6-cp310-cp310-win_amd64.whl (12.9 MB) Collecting typing_extensions Using cached https://pypi.tuna.tsinghua.edu.cn/packages/18/67/36e9267722cc04a6b9f15c7f3441c2363321a3ea07da7ae0c0707beb2a9c/typing_extensions-4.15.0-py3-none-any.whl (44 kB) Installing collected packages: typing_extensions, pyyaml, numpy Successfully installed numpy-2.2.6 pyyaml-6.0.2 typing_extensions-4.15.0 (rtx5070_env) PS E:\PyTorch_Build\pytorch> (rtx5070_env) PS E:\PyTorch_Build\pytorch> # 安装 GPU 相关依赖 (rtx5070_env) PS E:\PyTorch_Build\pytorch> pip install mkl mkl-include 
intel-openmp Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple Collecting mkl Using cached https://pypi.tuna.tsinghua.edu.cn/packages/91/ae/025174ee141432b974f97ecd2aea529a3bdb547392bde3dd55ce48fe7827/mkl-2025.2.0-py2.py3-none-win_amd64.whl (153.6 MB) Collecting mkl-include Using cached https://pypi.tuna.tsinghua.edu.cn/packages/06/87/3eee37bf95c6b820b6394ad98e50132798514ecda1b2584c71c2c96b973c/mkl_include-2025.2.0-py2.py3-none-win_amd64.whl (1.3 MB) Collecting intel-openmp Using cached https://pypi.tuna.tsinghua.edu.cn/packages/89/ed/13fed53fcc7ea17ff84095e89e63418df91d4eeefdc74454243d529bf5a3/intel_openmp-2025.2.1-py2.py3-none-win_amd64.whl (34.0 MB) Collecting tbb==2022.* (from mkl) Using cached https://pypi.tuna.tsinghua.edu.cn/packages/4e/d2/01e2a93f9c644585088188840bf453f23ed1a2838ec51d5ba1ada1ebca71/tbb-2022.2.0-py3-none-win_amd64.whl (420 kB) Collecting intel-cmplr-lib-ur==2025.2.1 (from intel-openmp) Using cached https://pypi.tuna.tsinghua.edu.cn/packages/a8/70/938e81f58886fd4e114d5a5480d98c1396e73e40b7650f566ad0c4395311/intel_cmplr_lib_ur-2025.2.1-py2.py3-none-win_amd64.whl (1.2 MB) Collecting umf==0.11.* (from intel-cmplr-lib-ur==2025.2.1->intel-openmp) Using cached https://pypi.tuna.tsinghua.edu.cn/packages/33/a0/c8d755f08f50ddd99cb4a29a7e950ced7a0903cb72253e57059063609103/umf-0.11.0-py2.py3-none-win_amd64.whl (231 kB) Collecting tcmlib==1.* (from tbb==2022.*->mkl) Using cached https://pypi.tuna.tsinghua.edu.cn/packages/91/7b/e30c461a27b97e0090e4db822eeb1d37b310863241f8c3ee56f68df3e76e/tcmlib-1.4.0-py2.py3-none-win_amd64.whl (370 kB) Installing collected packages: tcmlib, mkl-include, umf, tbb, intel-cmplr-lib-ur, intel-openmp, mkl Successfully installed intel-cmplr-lib-ur-2025.2.1 intel-openmp-2025.2.1 mkl-2025.2.0 mkl-include-2025.2.0 tbb-2022.2.0 tcmlib-1.4.0 umf-0.11.0 (rtx5070_env) PS E:\PyTorch_Build\pytorch> # 安装必要依赖 (rtx5070_env) PS E:\PyTorch_Build\pytorch> pip install pyyaml numpy typing_extensions Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple Requirement already satisfied: pyyaml in e:\pytorch_build\pytorch\rtx5070_env\lib\site-packages (6.0.2) Requirement already satisfied: numpy in e:\pytorch_build\pytorch\rtx5070_env\lib\site-packages (2.2.6) Requirement already satisfied: typing_extensions in e:\pytorch_build\pytorch\rtx5070_env\lib\site-packages (4.15.0) (rtx5070_env) PS E:\PyTorch_Build\pytorch> (rtx5070_env) PS E:\PyTorch_Build\pytorch> # 安装 GPU 相关依赖 (rtx5070_env) PS E:\PyTorch_Build\pytorch> pip install mkl mkl-include intel-openmp Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple Requirement already satisfied: mkl in e:\pytorch_build\pytorch\rtx5070_env\lib\site-packages (2025.2.0) Requirement already satisfied: mkl-include in e:\pytorch_build\pytorch\rtx5070_env\lib\site-packages (2025.2.0) Requirement already satisfied: intel-openmp in e:\pytorch_build\pytorch\rtx5070_env\lib\site-packages (2025.2.1) Requirement already satisfied: tbb==2022.* in e:\pytorch_build\pytorch\rtx5070_env\lib\site-packages (from mkl) (2022.2.0) Requirement already satisfied: intel-cmplr-lib-ur==2025.2.1 in e:\pytorch_build\pytorch\rtx5070_env\lib\site-packages (from intel-openmp) (2025.2.1) Requirement already satisfied: umf==0.11.* in e:\pytorch_build\pytorch\rtx5070_env\lib\site-packages (from intel-cmplr-lib-ur==2025.2.1->intel-openmp) (0.11.0) Requirement already satisfied: tcmlib==1.* in e:\pytorch_build\pytorch\rtx5070_env\lib\site-packages (from tbb==2022.*->mkl) (1.4.0) (rtx5070_env) PS E:\PyTorch_Build\pytorch> # 设置编译参数 (rtx5070_env) 
PS E:\PyTorch_Build\pytorch> $env:USE_CUDA=1 (rtx5070_env) PS E:\PyTorch_Build\pytorch> $env:USE_CUDNN=1 (rtx5070_env) PS E:\PyTorch_Build\pytorch> $env:CMAKE_GENERATOR="Ninja" (rtx5070_env) PS E:\PyTorch_Build\pytorch> $env:MAX_JOBS=8 # 根据 CPU 核心数设置 (rtx5070_env) PS E:\PyTorch_Build\pytorch> (rtx5070_env) PS E:\PyTorch_Build\pytorch> # 运行编译 (rtx5070_env) PS E:\PyTorch_Build\pytorch> python setup.py install ` >> --cmake ` >> --cmake-only ` >> --cmake-generator="Ninja" ` >> --verbose ` >> -DCMAKE_CUDA_COMPILER="${env:CUDA_PATH}\bin\nvcc.exe" ` >> -DCUDNN_INCLUDE_DIR="${env:CUDNN_INCLUDE_DIR}" ` >> -DCUDNN_LIBRARY="${env:CUDNN_LIBRARY}" ` >> -DTORCH_CUDA_ARCH_LIST="8.9;9.0;12.0" Building wheel torch-2.9.0a0+git2d31c3d option --cmake-generator not recognized (rtx5070_env) PS E:\PyTorch_Build\pytorch> python rtx5070_test.py ============================================================ Traceback (most recent call last): File "E:\PyTorch_Build\pytorch\rtx5070_test.py", line 39, in <module> verify_gpu_support() File "E:\PyTorch_Build\pytorch\rtx5070_test.py", line 6, in verify_gpu_support if not torch.cuda.is_available(): AttributeError: module 'torch' has no attribute 'cuda' (rtx5070_env) PS E:\PyTorch_Build\pytorch> # 设置编译架构参数 (rtx5070_env) PS E:\PyTorch_Build\pytorch> $env:TORCH_CUDA_ARCH_LIST="8.9;9.0;12.0" (rtx5070_env) PS E:\PyTorch_Build\pytorch> (rtx5070_env) PS E:\PyTorch_Build\pytorch> # 使用正确的编译命令 (rtx5070_env) PS E:\PyTorch_Build\pytorch> python setup.py install ` >> --cmake ` >> --verbose ` >> -DCMAKE_CUDA_COMPILER="${env:CUDA_PATH}\bin\nvcc.exe" ` >> -DCUDNN_INCLUDE_DIR="${env:CUDNN_INCLUDE_DIR}" ` >> -DCUDNN_LIBRARY="${env:CUDNN_LIBRARY}" ` >> -DCMAKE_GENERATOR="Ninja" ` >> -DUSE_CUDA=ON ` >> -DUSE_CUDNN=ON Building wheel torch-2.9.0a0+git2d31c3d option -D not recognized (rtx5070_env) PS E:\PyTorch_Build\pytorch> python enhanced_test.py ============================================================ Python 版本: 3.10.10 Traceback (most recent call last): File "E:\PyTorch_Build\pytorch\enhanced_test.py", line 64, in <module> verify_installation() File "E:\PyTorch_Build\pytorch\enhanced_test.py", line 11, in verify_installation print(f"\nPyTorch 版本: {torch.__version__}") AttributeError: module 'torch' has no attribute '__version__' (rtx5070_env) PS E:\PyTorch_Build\pytorch> # 清除之前的构建 (rtx5070_env) PS E:\PyTorch_Build\pytorch> python setup.py clean --all Building wheel torch-2.9.0a0+git2d31c3d E:\PyTorch_Build\pytorch\rtx5070_env\lib\site-packages\setuptools\config\_apply_pyprojecttoml.py:82: SetuptoolsDeprecationWarning: `project.license` as a TOML table is deprecated !! ******************************************************************************** Please use a simple string containing a SPDX expression for `project.license`. You can also use `project.license-files`. (Both options available on setuptools>=77.0.0). By 2026-Feb-18, you need to update your project and remove deprecated calls or your builds will no longer be supported. See https://packaging.python.org/en/latest/guides/writing-pyproject-toml/#license for details. ******************************************************************************** !! corresp(dist, value, root_dir) usage: setup.py [global_opts] cmd1 [cmd1_opts] [cmd2 [cmd2_opts] ...] or: setup.py --help [cmd1 cmd2 ...] 
(rtx5070_env) PS E:\PyTorch_Build\pytorch>
(rtx5070_env) PS E:\PyTorch_Build\pytorch> # Set the build architecture parameters
(rtx5070_env) PS E:\PyTorch_Build\pytorch> $env:TORCH_CUDA_ARCH_LIST="8.9;9.0;12.0"
(rtx5070_env) PS E:\PyTorch_Build\pytorch>
(rtx5070_env) PS E:\PyTorch_Build\pytorch> # Use the correct build command (Windows-specific)
(rtx5070_env) PS E:\PyTorch_Build\pytorch> python setup.py install `
>> --cmake `
>> --cmake-args="-DCMAKE_CUDA_COMPILER='$env:CUDA_PATH\bin\nvcc.exe' `
>> -DCUDNN_INCLUDE_DIR='$env:CUDNN_INCLUDE_DIR' `
>> -DCUDNN_LIBRARY='$env:CUDNN_LIBRARY' `
>> -DCMAKE_GENERATOR='Ninja' `
>> -DUSE_CUDA=ON `
>> -DUSE_CUDNN=ON" `
>> --verbose `
>> --jobs=$env:MAX_JOBS
Building wheel torch-2.9.0a0+git2d31c3d
option --cmake-args not recognized

(rtx5070_env) PS E:\PyTorch_Build\pytorch> # Use the official PyTorch build tooling
(rtx5070_env) PS E:\PyTorch_Build\pytorch> pip install -U setuptools wheel
Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
Requirement already satisfied: setuptools in e:\pytorch_build\pytorch\rtx5070_env\lib\site-packages (80.9.0)
Requirement already satisfied: wheel in e:\pytorch_build\pytorch\rtx5070_env\lib\site-packages (0.45.1)

(rtx5070_env) PS E:\PyTorch_Build\pytorch> python setup.py bdist_wheel
Building wheel torch-2.9.0a0+git2d31c3d
-- Building version 2.9.0a0+git2d31c3d
E:\PyTorch_Build\pytorch\rtx5070_env\lib\site-packages\setuptools\_distutils\_msvccompiler.py:12: UserWarning: _get_vc_env is private; find an alternative (pypa/distutils#340)
  warnings.warn(
-- Checkout nccl release tag: v2.27.5-1
cmake -GNinja -DBUILD_PYTHON=True -DBUILD_TEST=True -DCMAKE_BUILD_TYPE=Release -DCMAKE_GENERATOR=Ninja -DCMAKE_INSTALL_PREFIX=E:\PyTorch_Build\pytorch\torch -DCMAKE_PREFIX_PATH=E:\PyTorch_Build\pytorch\rtx5070_env\Lib\site-packages -DCUDNN_INCLUDE_DIR=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.1\include -DCUDNN_LIBRARY=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.1\lib\x64\cudnn.lib -DPython_EXECUTABLE=E:\PyTorch_Build\pytorch\rtx5070_env\Scripts\python.exe -DPython_NumPy_INCLUDE_DIR=E:\PyTorch_Build\pytorch\rtx5070_env\lib\site-packages\numpy\_core\include -DTORCH_BUILD_VERSION=2.9.0a0+git2d31c3d -DTORCH_CUDA_ARCH_LIST=8.9;9.0;12.0 -DUSE_CUDA=1 -DUSE_CUDNN=1 -DUSE_NUMPY=True E:\PyTorch_Build\pytorch
CMake Deprecation Warning at CMakeLists.txt:18 (cmake_policy):
  The OLD behavior for policy CMP0126 will be removed from a future version
  of CMake.  The cmake-policies(7) manual explains that the OLD behaviors of
  all policies are deprecated and that a policy should be set to OLD only
  under specific short-term circumstances.  Projects should be ported to the
  NEW behavior and not rely on setting a policy to OLD.
-- The CXX compiler identification is MSVC 19.44.35215.0
-- The C compiler identification is MSVC 19.44.35215.0
-- Detecting CXX compiler ABI info
-- Detecting CXX compiler ABI info - done
-- Check for working CXX compiler: C:/Program Files (x86)/Microsoft Visual Studio/2022/BuildTools/VC/Tools/MSVC/14.44.35207/bin/Hostx64/x64/cl.exe - skipped
-- Detecting CXX compile features
-- Detecting CXX compile features - done
-- Detecting C compiler ABI info
-- Detecting C compiler ABI info - done
-- Check for working C compiler: C:/Program Files (x86)/Microsoft Visual Studio/2022/BuildTools/VC/Tools/MSVC/14.44.35207/bin/Hostx64/x64/cl.exe - skipped
-- Detecting C compile features
-- Detecting C compile features - done
-- Not forcing any particular BLAS to be found
CMake Warning at CMakeLists.txt:425 (message):
  TensorPipe cannot be used on Windows. Set it to OFF
CMake Warning at CMakeLists.txt:427 (message):
  KleidiAI cannot be used on Windows. Set it to OFF
CMake Warning at CMakeLists.txt:439 (message):
  Libuv is not installed in current conda env. Set USE_DISTRIBUTED to OFF.
  Please run command 'conda install -c conda-forge libuv=1.39' to install libuv.
-- Performing Test C_HAS_AVX_1
-- Performing Test C_HAS_AVX_1 - Success
-- Performing Test C_HAS_AVX2_1
-- Performing Test C_HAS_AVX2_1 - Success
-- Performing Test C_HAS_AVX512_1
-- Performing Test C_HAS_AVX512_1 - Success
-- Performing Test CXX_HAS_AVX_1
-- Performing Test CXX_HAS_AVX_1 - Success
-- Performing Test CXX_HAS_AVX2_1
-- Performing Test CXX_HAS_AVX2_1 - Success
-- Performing Test CXX_HAS_AVX512_1
-- Performing Test CXX_HAS_AVX512_1 - Success
-- Current compiler supports avx2 extension. Will build perfkernels.
-- Performing Test COMPILER_SUPPORTS_HIDDEN_VISIBILITY
-- Performing Test COMPILER_SUPPORTS_HIDDEN_VISIBILITY - Failed
-- Performing Test COMPILER_SUPPORTS_HIDDEN_INLINE_VISIBILITY
-- Performing Test COMPILER_SUPPORTS_HIDDEN_INLINE_VISIBILITY - Failed
-- Could not find hardware support for NEON on this machine.
-- No OMAP3 processor on this machine.
-- No OMAP4 processor on this machine.
-- Compiler does not support SVE extension. Will not build perfkernels.
CMake Warning at CMakeLists.txt:845 (message):
  x64 operating system is required for FBGEMM.  Not compiling with FBGEMM.
  Turn this warning off by USE_FBGEMM=OFF.
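If distributed training is not needed, the libuv warning above can be sidestepped rather than satisfied: the build reads the same switch the warning names as an environment variable, like the other USE_* flags in this log (assumption: set it before configuring).

$env:USE_DISTRIBUTED = "0"   # skip distributed support; removes the libuv requirement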
-- Performing Test HAS/UTF_8
-- Performing Test HAS/UTF_8 - Success
-- Found CUDA: E:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v13.0 (found version "13.0")
-- The CUDA compiler identification is NVIDIA 13.0.48 with host compiler MSVC 19.44.35215.0
-- Detecting CUDA compiler ABI info
-- Detecting CUDA compiler ABI info - done
-- Check for working CUDA compiler: E:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v13.0/bin/nvcc.exe - skipped
-- Detecting CUDA compile features
-- Detecting CUDA compile features - done
-- Found CUDAToolkit: E:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v13.0/include (found version "13.0.48")
-- PyTorch: CUDA detected: 13.0
-- PyTorch: CUDA nvcc is: E:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v13.0/bin/nvcc.exe
-- PyTorch: CUDA toolkit directory: E:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v13.0
-- PyTorch: Header version is: 13.0
-- Found Python: E:\PyTorch_Build\pytorch\rtx5070_env\Scripts\python.exe (found version "3.10.10") found components: Interpreter
CMake Warning at cmake/public/cuda.cmake:140 (message):
  Failed to compute shorthash for libnvrtc.so
Call Stack (most recent call first):
  cmake/Dependencies.cmake:44 (include)
  CMakeLists.txt:873 (include)
-- Could NOT find CUDNN (missing: CUDNN_LIBRARY_PATH CUDNN_INCLUDE_PATH)
CMake Warning at cmake/public/cuda.cmake:201 (message):
  Cannot find cuDNN library. Turning the option off
Call Stack (most recent call first):
  cmake/Dependencies.cmake:44 (include)
  CMakeLists.txt:873 (include)
-- Could NOT find CUSPARSELT (missing: CUSPARSELT_LIBRARY_PATH CUSPARSELT_INCLUDE_PATH)
CMake Warning at cmake/public/cuda.cmake:226 (message):
  Cannot find cuSPARSELt library. Turning the option off
Call Stack (most recent call first):
  cmake/Dependencies.cmake:44 (include)
  CMakeLists.txt:873 (include)
-- Could NOT find CUDSS (missing: CUDSS_LIBRARY_PATH CUDSS_INCLUDE_PATH)
CMake Warning at cmake/public/cuda.cmake:242 (message):
  Cannot find CUDSS library. Turning the option off
Call Stack (most recent call first):
  cmake/Dependencies.cmake:44 (include)
  CMakeLists.txt:873 (include)
-- USE_CUFILE is set to 0. Compiling without cuFile support
CMake Warning at cmake/public/cuda.cmake:317 (message):
  pytorch is not compatible with `CMAKE_CUDA_ARCHITECTURES` and will ignore
  its value. Please configure `TORCH_CUDA_ARCH_LIST` instead.
Call Stack (most recent call first):
  cmake/Dependencies.cmake:44 (include)
  CMakeLists.txt:873 (include)
-- Added CUDA NVCC flags for: -gencode;arch=compute_89,code=sm_89;-gencode;arch=compute_90,code=sm_90;-gencode;arch=compute_120,code=sm_120
CMake Warning at cmake/Dependencies.cmake:95 (message):
  Not compiling with XPU. Could NOT find SYCL. Suppress this warning with -DUSE_XPU=OFF.
Call Stack (most recent call first):
  CMakeLists.txt:873 (include)
-- Building using own protobuf under third_party per request.
-- Use custom protobuf build.
CMake Warning at cmake/ProtoBuf.cmake:37 (message):
  Ancient protobuf forces CMake compatibility
Call Stack (most recent call first):
  cmake/ProtoBuf.cmake:87 (custom_protobuf_find)
  cmake/Dependencies.cmake:107 (include)
  CMakeLists.txt:873 (include)
CMake Deprecation Warning at third_party/protobuf/cmake/CMakeLists.txt:2 (cmake_minimum_required):
  Compatibility with CMake < 3.10 will be removed from a future version of
  CMake.  Update the VERSION argument <min> value.  Or, use the <min>...<max>
  syntax to tell CMake that the project requires at least <min> but has been
  updated to work with policies introduced by <max> or earlier.
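Here is the root of the cuDNN failure: the configure command above passed CUDNN_INCLUDE_DIR and CUDNN_LIBRARY under ...\CUDA\v12.1\..., but the toolkit actually detected is v13.0, so CMake reports "Could NOT find CUDNN" and forces USE_CUDNN off. Before reconfiguring, point both variables at a cuDNN build that matches CUDA 13. A sketch; the v9.12 directory (including its odd CUNND spelling) is inferred from the CMAKE_PREFIX_PATH later in this log and should be replaced by whatever actually exists on disk:

$env:CUDNN_INCLUDE_DIR = "E:\Program Files\NVIDIA\CUNND\v9.12\include"
$env:CUDNN_LIBRARY     = "E:\Program Files\NVIDIA\CUNND\v9.12\lib\x64\cudnn.lib"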
--
-- 3.13.0.0
-- Performing Test CMAKE_HAVE_LIBC_PTHREAD
-- Performing Test CMAKE_HAVE_LIBC_PTHREAD - Failed
-- Looking for pthread_create in pthreads
-- Looking for pthread_create in pthreads - not found
-- Looking for pthread_create in pthread
-- Looking for pthread_create in pthread - not found
-- Found Threads: TRUE
-- Caffe2 protobuf include directory: $<BUILD_INTERFACE:E:/PyTorch_Build/pytorch/third_party/protobuf/src>$<INSTALL_INTERFACE:include>
-- Trying to find preferred BLAS backend of choice: MKL
-- MKL_THREADING = OMP
-- Looking for sys/types.h
-- Looking for sys/types.h - found
-- Looking for stdint.h
-- Looking for stdint.h - found
-- Looking for stddef.h
-- Looking for stddef.h - found
-- Check size of void*
-- Check size of void* - done
-- MKL_THREADING = OMP
CMake Warning at cmake/Dependencies.cmake:213 (message):
  MKL could not be found. Defaulting to Eigen
Call Stack (most recent call first):
  CMakeLists.txt:873 (include)
CMake Warning at cmake/Dependencies.cmake:279 (message):
  Preferred BLAS (MKL) cannot be found, now searching for a general BLAS library
Call Stack (most recent call first):
  CMakeLists.txt:873 (include)
-- MKL_THREADING = OMP
-- Checking for [mkl_intel_lp64 - mkl_intel_thread - mkl_core - libiomp5md]
--   Library mkl_intel_lp64: not found
-- Checking for [mkl_intel - mkl_intel_thread - mkl_core - libiomp5md]
--   Library mkl_intel: not found
-- Checking for [mkl_intel_lp64 - mkl_intel_thread - mkl_core]
--   Library mkl_intel_lp64: not found
-- Checking for [mkl_intel - mkl_intel_thread - mkl_core]
--   Library mkl_intel: not found
-- Checking for [mkl_intel_lp64 - mkl_sequential - mkl_core]
--   Library mkl_intel_lp64: not found
-- Checking for [mkl_intel - mkl_sequential - mkl_core]
--   Library mkl_intel: not found
-- Checking for [mkl_intel_lp64 - mkl_core - libiomp5md - pthread]
--   Library mkl_intel_lp64: not found
-- Checking for [mkl_intel - mkl_core - libiomp5md - pthread]
--   Library mkl_intel: not found
-- Checking for [mkl_intel_lp64 - mkl_core - pthread]
--   Library mkl_intel_lp64: not found
-- Checking for [mkl_intel - mkl_core - pthread]
--   Library mkl_intel: not found
-- Checking for [mkl - guide - pthread - m]
--   Library mkl: not found
-- MKL library not found
-- Checking for [blis]
--   Library blis: BLAS_blis_LIBRARY-NOTFOUND
-- Checking for [Accelerate]
--   Library Accelerate: BLAS_Accelerate_LIBRARY-NOTFOUND
-- Checking for [vecLib]
--   Library vecLib: BLAS_vecLib_LIBRARY-NOTFOUND
-- Checking for [flexiblas]
--   Library flexiblas: BLAS_flexiblas_LIBRARY-NOTFOUND
-- Checking for [openblas]
--   Library openblas: BLAS_openblas_LIBRARY-NOTFOUND
-- Checking for [openblas - pthread - m]
--   Library openblas: BLAS_openblas_LIBRARY-NOTFOUND
-- Checking for [openblas - pthread - m - gomp]
--   Library openblas: BLAS_openblas_LIBRARY-NOTFOUND
-- Checking for [libopenblas]
--   Library libopenblas: BLAS_libopenblas_LIBRARY-NOTFOUND
-- Checking for [goto2 - gfortran]
--   Library goto2: BLAS_goto2_LIBRARY-NOTFOUND
-- Checking for [goto2 - gfortran - pthread]
--   Library goto2: BLAS_goto2_LIBRARY-NOTFOUND
-- Checking for [acml - gfortran]
--   Library acml: BLAS_acml_LIBRARY-NOTFOUND
-- Checking for [blis]
--   Library blis: BLAS_blis_LIBRARY-NOTFOUND
-- Could NOT find Atlas (missing: Atlas_CBLAS_INCLUDE_DIR Atlas_CLAPACK_INCLUDE_DIR Atlas_CBLAS_LIBRARY Atlas_BLAS_LIBRARY Atlas_LAPACK_LIBRARY)
-- Checking for [ptf77blas - atlas - gfortran]
--   Library ptf77blas: BLAS_ptf77blas_LIBRARY-NOTFOUND
-- Checking for []
-- Looking for sgemm_
-- Looking for sgemm_ - not found
-- Cannot find a library with BLAS API. Not using BLAS.
-- Using pocketfft in directory: E:/PyTorch_Build/pytorch/third_party/pocketfft/
CMake Deprecation Warning at third_party/pthreadpool/CMakeLists.txt:1 (CMAKE_MINIMUM_REQUIRED):
  Compatibility with CMake < 3.10 will be removed from a future version of
  CMake.  Update the VERSION argument <min> value.  Or, use the <min>...<max>
  syntax to tell CMake that the project requires at least <min> but has been
  updated to work with policies introduced by <max> or earlier.
CMake Deprecation Warning at third_party/FXdiv/CMakeLists.txt:1 (CMAKE_MINIMUM_REQUIRED):
  Compatibility with CMake < 3.10 will be removed from a future version of
  CMake.  Update the VERSION argument <min> value.  Or, use the <min>...<max>
  syntax to tell CMake that the project requires at least <min> but has been
  updated to work with policies introduced by <max> or earlier.
CMake Deprecation Warning at third_party/cpuinfo/CMakeLists.txt:1 (CMAKE_MINIMUM_REQUIRED):
  Compatibility with CMake < 3.10 will be removed from a future version of
  CMake.  Update the VERSION argument <min> value.  Or, use the <min>...<max>
  syntax to tell CMake that the project requires at least <min> but has been
  updated to work with policies introduced by <max> or earlier.
-- The ASM compiler identification is MSVC
CMake Warning (dev) at rtx5070_env/Lib/site-packages/cmake/data/share/cmake-4.1/Modules/CMakeDetermineASMCompiler.cmake:234 (message):
  Policy CMP194 is not set: MSVC is not an assembler for language ASM.  Run
  "cmake --help-policy CMP194" for policy details.  Use the cmake_policy
  command to set the policy and suppress this warning.
Call Stack (most recent call first):
  third_party/XNNPACK/CMakeLists.txt:18 (PROJECT)
This warning is for project developers.  Use -Wno-dev to suppress it.
-- Found assembler: C:/Program Files (x86)/Microsoft Visual Studio/2022/BuildTools/VC/Tools/MSVC/14.44.35207/bin/Hostx64/x64/cl.exe
-- Building for XNNPACK_TARGET_PROCESSOR: x86_64
-- Generating microkernels.cmake
Duplicate microkernel definition: src\qs8-qc4w-packw\gen\qs8-qc4w-packw-x8c8-gemm-goi-avx256vnni.c and src\qs8-qc4w-packw\gen\qs8-qc4w-packw-x8c8-gemm-goi-avxvnni.c (1th function)
Duplicate microkernel definition: src\qs8-qc4w-packw\gen\qs8-qc4w-packw-x8c8-gemm-goi-avxvnni.c and src\qs8-qc4w-packw\gen\qs8-qc4w-packw-x8c8-gemm-goi-scalar.c
No microkernel found in src\reference\binary-elementwise.cc
No microkernel found in src\reference\packing.cc
No microkernel found in src\reference\unary-elementwise.cc
-- Found Git: E:/Program Files/Git/cmd/git.exe (found version "2.51.0.windows.1")
-- Google Benchmark version: v1.9.3, normalized to 1.9.3
-- Looking for shm_open in rt
-- Looking for shm_open in rt - not found
-- Performing Test HAVE_CXX_FLAG_WX
-- Performing Test HAVE_CXX_FLAG_WX - Success
-- Compiling and running to test HAVE_STD_REGEX
-- Performing Test HAVE_STD_REGEX -- success
-- Compiling and running to test HAVE_GNU_POSIX_REGEX
-- Performing Test HAVE_GNU_POSIX_REGEX -- failed to compile
-- Compiling and running to test HAVE_POSIX_REGEX
-- Performing Test HAVE_POSIX_REGEX -- failed to compile
-- Compiling and running to test HAVE_STEADY_CLOCK
-- Performing Test HAVE_STEADY_CLOCK -- success
-- Compiling and running to test HAVE_PTHREAD_AFFINITY
-- Performing Test HAVE_PTHREAD_AFFINITY -- failed to compile
CMake Deprecation Warning at third_party/ittapi/CMakeLists.txt:7 (cmake_minimum_required):
  Compatibility with CMake < 3.10 will be removed from a future version of
  CMake.  Update the VERSION argument <min> value.  Or, use the <min>...<max>
  syntax to tell CMake that the project requires at least <min> but has been
  updated to work with policies introduced by <max> or earlier.
CMake Warning at cmake/Dependencies.cmake:749 (message):
  FP16 is only cmake-2.8 compatible
Call Stack (most recent call first):
  CMakeLists.txt:873 (include)
CMake Deprecation Warning at third_party/FP16/CMakeLists.txt:1 (CMAKE_MINIMUM_REQUIRED):
  Compatibility with CMake < 3.10 will be removed from a future version of
  CMake.  Update the VERSION argument <min> value.  Or, use the <min>...<max>
  syntax to tell CMake that the project requires at least <min> but has been
  updated to work with policies introduced by <max> or earlier.
CMake Deprecation Warning at third_party/psimd/CMakeLists.txt:1 (CMAKE_MINIMUM_REQUIRED):
  Compatibility with CMake < 3.10 will be removed from a future version of
  CMake.  Update the VERSION argument <min> value.  Or, use the <min>...<max>
  syntax to tell CMake that the project requires at least <min> but has been
  updated to work with policies introduced by <max> or earlier.
-- Using third party subdirectory Eigen.
-- Found Python: E:\PyTorch_Build\pytorch\rtx5070_env\Scripts\python.exe (found version "3.10.10") found components: Interpreter Development.Module NumPy
-- Using third_party/pybind11.
-- pybind11 include dirs: E:/PyTorch_Build/pytorch/cmake/../third_party/pybind11/include
-- Could NOT find OpenTelemetryApi (missing: OpenTelemetryApi_INCLUDE_DIRS)
-- Using third_party/opentelemetry-cpp.
-- opentelemetry api include dirs: E:/PyTorch_Build/pytorch/cmake/../third_party/opentelemetry-cpp/api/include
-- Could NOT find MPI_C (missing: MPI_C_LIB_NAMES MPI_C_HEADER_DIR MPI_C_WORKS)
-- Could NOT find MPI_CXX (missing: MPI_CXX_LIB_NAMES MPI_CXX_HEADER_DIR MPI_CXX_WORKS)
-- Could NOT find MPI (missing: MPI_C_FOUND MPI_CXX_FOUND)
CMake Warning at cmake/Dependencies.cmake:894 (message):
  Not compiling with MPI. Suppress this warning with -DUSE_MPI=OFF
Call Stack (most recent call first):
  CMakeLists.txt:873 (include)
-- MKL_THREADING = OMP
-- Check OMP with lib C:/Program Files (x86)/Microsoft Visual Studio/2022/BuildTools/VC/Tools/MSVC/14.44.35207/lib/x64/libomp.lib and flags -openmp:experimental
-- MKL_THREADING = OMP
-- Check OMP with lib C:/Program Files (x86)/Microsoft Visual Studio/2022/BuildTools/VC/Tools/MSVC/14.44.35207/lib/x64/libomp.lib and flags -openmp:experimental
-- Found OpenMP_C: -openmp:experimental
-- Found OpenMP_CXX: -openmp:experimental
-- Found OpenMP: TRUE
-- Adding OpenMP CXX_FLAGS: -openmp:experimental
-- Will link against OpenMP libraries: C:/Program Files (x86)/Microsoft Visual Studio/2022/BuildTools/VC/Tools/MSVC/14.44.35207/lib/x64/libomp.lib
-- Found nvtx3: E:/PyTorch_Build/pytorch/third_party/NVTX/c/include
-- ROCM_PATH environment variable is not set and C:/opt/rocm does not exist. Building without ROCm support.
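MKL was installed with pip a few steps back, yet CMake's probe finds nothing: it searches the system library paths, not the virtualenv, so the build falls back to Eigen (USE_BLAS : 0 in the summary below). The PyTorch Windows build notes handle this by exporting the header and library locations through CMAKE_INCLUDE_PATH and LIB. A sketch under two assumptions: that the Intel wheels unpack into the venv's Library\ tree, and that the import libraries (which the plain mkl wheel does not ship) come from the mkl-devel wheel; verify both on disk before relying on it:

pip install mkl-devel                       # .lib import libraries; the 'mkl' wheel alone ships only DLLs
$env:CMAKE_INCLUDE_PATH = "E:\PyTorch_Build\pytorch\rtx5070_env\Library\include"   # assumed layout
$env:LIB = "E:\PyTorch_Build\pytorch\rtx5070_env\Library\lib;$env:LIB"             # assumed layout

Eigen works without any of this; it is simply slower than MKL for CPU linear algebra.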
-- Found Python3: E:\PyTorch_Build\pytorch\rtx5070_env\Scripts\python.exe (found version "3.10.10") found components: Interpreter
-- ONNX_PROTOC_EXECUTABLE: $<TARGET_FILE:protobuf::protoc>
-- Protobuf_VERSION: Protobuf_VERSION_NOTFOUND
Generated: E:/PyTorch_Build/pytorch/build/third_party/onnx/onnx/onnx_onnx_torch-ml.proto
Generated: E:/PyTorch_Build/pytorch/build/third_party/onnx/onnx/onnx-operators_onnx_torch-ml.proto
Generated: E:/PyTorch_Build/pytorch/build/third_party/onnx/onnx/onnx-data_onnx_torch.proto
--
-- ******** Summary ********
--   CMake version            : 4.1.0
--   CMake command            : E:/PyTorch_Build/pytorch/rtx5070_env/Lib/site-packages/cmake/data/bin/cmake.exe
--   System                   : Windows
--   C++ compiler             : C:/Program Files (x86)/Microsoft Visual Studio/2022/BuildTools/VC/Tools/MSVC/14.44.35207/bin/Hostx64/x64/cl.exe
--   C++ compiler version     : 19.44.35215.0
--   CXX flags                : /DWIN32 /D_WINDOWS /EHsc /Zc:__cplusplus /bigobj /FS /utf-8 -DUSE_PTHREADPOOL /EHsc /wd26812
--   Build type               : Release
--   Compile definitions      : ONNX_ML=1;ONNXIFI_ENABLE_EXT=1
--   CMAKE_PREFIX_PATH        : E:\PyTorch_Build\pytorch\rtx5070_env\Lib\site-packages;E:/Program Files/NVIDIA/CUNND/v9.12;E:\Program Files\NVIDIA\CUNND\v9.12;E:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v13.0;E:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v13.0;E:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v13.0
--   CMAKE_INSTALL_PREFIX     : E:/PyTorch_Build/pytorch/torch
--   CMAKE_MODULE_PATH        : E:/PyTorch_Build/pytorch/cmake/Modules;E:/PyTorch_Build/pytorch/cmake/public/../Modules_CUDA_fix
--
--   ONNX version             : 1.18.0
--   ONNX NAMESPACE           : onnx_torch
--   ONNX_USE_LITE_PROTO      : OFF
--   USE_PROTOBUF_SHARED_LIBS : OFF
--   ONNX_DISABLE_EXCEPTIONS  : OFF
--   ONNX_DISABLE_STATIC_REGISTRATION : OFF
--   ONNX_WERROR              : OFF
--   ONNX_BUILD_TESTS         : OFF
--   BUILD_SHARED_LIBS        : OFF
--
--   Protobuf compiler        : $<TARGET_FILE:protobuf::protoc>
--   Protobuf includes        :
--   Protobuf libraries       :
--   ONNX_BUILD_PYTHON        : OFF
-- Found CUDA with FP16 support, compiling with torch.cuda.HalfTensor
-- Adding -DNDEBUG to compile flags
-- Checking prototype magma_get_sgeqrf_nb for MAGMA_V2
-- Checking prototype magma_get_sgeqrf_nb for MAGMA_V2 - False
-- MAGMA not found. Compiling without MAGMA support
-- Could not find hardware support for NEON on this machine.
-- No OMAP3 processor on this machine.
-- No OMAP4 processor on this machine.
-- MKL_THREADING = OMP
-- Checking for [mkl_intel_lp64 - mkl_intel_thread - mkl_core - libiomp5md]
--   Library mkl_intel_lp64: not found
-- Checking for [mkl_intel - mkl_intel_thread - mkl_core - libiomp5md]
--   Library mkl_intel: not found
-- Checking for [mkl_intel_lp64 - mkl_intel_thread - mkl_core]
--   Library mkl_intel_lp64: not found
-- Checking for [mkl_intel - mkl_intel_thread - mkl_core]
--   Library mkl_intel: not found
-- Checking for [mkl_intel_lp64 - mkl_sequential - mkl_core]
--   Library mkl_intel_lp64: not found
-- Checking for [mkl_intel - mkl_sequential - mkl_core]
--   Library mkl_intel: not found
-- Checking for [mkl_intel_lp64 - mkl_core - libiomp5md - pthread]
--   Library mkl_intel_lp64: not found
-- Checking for [mkl_intel - mkl_core - libiomp5md - pthread]
--   Library mkl_intel: not found
-- Checking for [mkl_intel_lp64 - mkl_core - pthread]
--   Library mkl_intel_lp64: not found
-- Checking for [mkl_intel - mkl_core - pthread]
--   Library mkl_intel: not found
-- Checking for [mkl - guide - pthread - m]
--   Library mkl: not found
-- MKL library not found
-- Checking for [blis]
--   Library blis: BLAS_blis_LIBRARY-NOTFOUND
-- Checking for [Accelerate]
--   Library Accelerate: BLAS_Accelerate_LIBRARY-NOTFOUND
-- Checking for [vecLib]
--   Library vecLib: BLAS_vecLib_LIBRARY-NOTFOUND
-- Checking for [flexiblas]
--   Library flexiblas: BLAS_flexiblas_LIBRARY-NOTFOUND
-- Checking for [openblas]
--   Library openblas: BLAS_openblas_LIBRARY-NOTFOUND
-- Checking for [openblas - pthread - m]
--   Library openblas: BLAS_openblas_LIBRARY-NOTFOUND
-- Checking for [openblas - pthread - m - gomp]
--   Library openblas: BLAS_openblas_LIBRARY-NOTFOUND
-- Checking for [libopenblas]
--   Library libopenblas: BLAS_libopenblas_LIBRARY-NOTFOUND
-- Checking for [goto2 - gfortran]
--   Library goto2: BLAS_goto2_LIBRARY-NOTFOUND
-- Checking for [goto2 - gfortran - pthread]
--   Library goto2: BLAS_goto2_LIBRARY-NOTFOUND
-- Checking for [acml - gfortran]
--   Library acml: BLAS_acml_LIBRARY-NOTFOUND
-- Checking for [blis]
--   Library blis: BLAS_blis_LIBRARY-NOTFOUND
-- Could NOT find Atlas (missing: Atlas_CBLAS_INCLUDE_DIR Atlas_CLAPACK_INCLUDE_DIR Atlas_CBLAS_LIBRARY Atlas_BLAS_LIBRARY Atlas_LAPACK_LIBRARY)
-- Checking for [ptf77blas - atlas - gfortran]
--   Library ptf77blas: BLAS_ptf77blas_LIBRARY-NOTFOUND
-- Checking for []
-- Cannot find a library with BLAS API. Not using BLAS.
-- LAPACK requires BLAS
-- Cannot find a library with LAPACK API. Not using LAPACK.
disabling ROCM because NOT USE_ROCM is set
-- MIOpen not found. Compiling without MIOpen support
disabling MKLDNN because USE_MKLDNN is not set
-- {fmt} version: 11.2.0
-- Build type: Release
-- Using Kineto with CUPTI support
-- Configuring Kineto dependency:
--   KINETO_SOURCE_DIR = E:/PyTorch_Build/pytorch/third_party/kineto/libkineto
--   KINETO_BUILD_TESTS = OFF
--   KINETO_LIBRARY_TYPE = static
--   CUDA_SOURCE_DIR = E:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v13.0
--   CUDA_INCLUDE_DIRS = E:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v13.0/include
--   CUPTI_INCLUDE_DIR = E:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v13.0/extras/CUPTI/include
--   CUDA_cupti_LIBRARY = E:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v13.0/extras/CUPTI/lib64/cupti.lib
-- Found CUPTI
CMake Deprecation Warning at third_party/kineto/libkineto/CMakeLists.txt:7 (cmake_minimum_required):
  Compatibility with CMake < 3.10 will be removed from a future version of
  CMake.  Update the VERSION argument <min> value.  Or, use the <min>...<max>
  syntax to tell CMake that the project requires at least <min> but has been
  updated to work with policies introduced by <max> or earlier.
CMake Warning (dev) at third_party/kineto/libkineto/CMakeLists.txt:15 (find_package):
  Policy CMP0148 is not set: The FindPythonInterp and FindPythonLibs modules
  are removed.  Run "cmake --help-policy CMP0148" for policy details.  Use
  the cmake_policy command to set the policy and suppress this warning.
This warning is for project developers.  Use -Wno-dev to suppress it.
-- Found PythonInterp: E:/PyTorch_Build/pytorch/rtx5070_env/Scripts/python.exe (found version "3.10.10")
-- ROCM_SOURCE_DIR =
-- Kineto: FMT_SOURCE_DIR = E:/PyTorch_Build/pytorch/third_party/fmt
-- Kineto: FMT_INCLUDE_DIR = E:/PyTorch_Build/pytorch/third_party/fmt/include
-- CUPTI_INCLUDE_DIR = E:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v13.0/extras/CUPTI/include
-- ROCTRACER_INCLUDE_DIR = /include/roctracer
-- DYNOLOG_INCLUDE_DIR = E:/PyTorch_Build/pytorch/third_party/kineto/libkineto/third_party/dynolog/
-- IPCFABRIC_INCLUDE_DIR = E:/PyTorch_Build/pytorch/third_party/kineto/libkineto/third_party/dynolog//dynolog/src/ipcfabric/
-- Configured Kineto
-- Performing Test HAS/WD4624
-- Performing Test HAS/WD4624 - Success
-- Performing Test HAS/WD4068
-- Performing Test HAS/WD4068 - Success
-- Performing Test HAS/WD4067
-- Performing Test HAS/WD4067 - Success
-- Performing Test HAS/WD4267
-- Performing Test HAS/WD4267 - Success
-- Performing Test HAS/WD4661
-- Performing Test HAS/WD4661 - Success
-- Performing Test HAS/WD4717
-- Performing Test HAS/WD4717 - Success
-- Performing Test HAS/WD4244
-- Performing Test HAS/WD4244 - Success
-- Performing Test HAS/WD4804
-- Performing Test HAS/WD4804 - Success
-- Performing Test HAS/WD4273
-- Performing Test HAS/WD4273 - Success
-- Performing Test HAS_WNO_STRINGOP_OVERFLOW
-- Performing Test HAS_WNO_STRINGOP_OVERFLOW - Failed
--
-- Architecture: x64
-- Use the C++ compiler to compile (MI_USE_CXX=ON)
--
-- Library name     : mimalloc
-- Version          : 2.2.4
-- Build type       : release
-- C++ Compiler     : C:/Program Files (x86)/Microsoft Visual Studio/2022/BuildTools/VC/Tools/MSVC/14.44.35207/bin/Hostx64/x64/cl.exe
-- Compiler flags   : /Zc:__cplusplus
-- Compiler defines : MI_CMAKE_BUILD_TYPE=release;MI_BUILD_RELEASE
-- Link libraries   : psapi;shell32;user32;advapi32;bcrypt
-- Build targets    : static
--
CMake Error at CMakeLists.txt:1264 (add_subdirectory):
  The source directory E:/PyTorch_Build/pytorch/torch/headeronly does not
  contain a CMakeLists.txt file.
-- don't use NUMA
-- Looking for backtrace
-- Looking for backtrace - not found
-- Could NOT find Backtrace (missing: Backtrace_LIBRARY Backtrace_INCLUDE_DIR)
-- headers outputs:
torch\csrc\inductor\aoti_torch\generated\c_shim_cpu.h not found
torch\csrc\inductor\aoti_torch\generated\c_shim_cuda.h not found
torch\csrc\inductor\aoti_torch\generated\c_shim_aten.h not found
-- sources outputs:
-- declarations_yaml outputs:
-- Performing Test COMPILER_SUPPORTS_NO_AVX256_SPLIT
-- Performing Test COMPILER_SUPPORTS_NO_AVX256_SPLIT - Failed
-- Using ATen parallel backend: OMP
-- Could NOT find OpenSSL, try to set the path to OpenSSL root folder in the system variable OPENSSL_ROOT_DIR (missing: OPENSSL_CRYPTO_LIBRARY OPENSSL_INCLUDE_DIR)
-- Check size of long double
-- Check size of long double - done
-- Performing Test COMPILER_SUPPORTS_FLOAT128
-- Performing Test COMPILER_SUPPORTS_FLOAT128 - Failed
-- Performing Test COMPILER_SUPPORTS_SSE2
-- Performing Test COMPILER_SUPPORTS_SSE2 - Success
-- Performing Test COMPILER_SUPPORTS_SSE4
-- Performing Test COMPILER_SUPPORTS_SSE4 - Success
-- Performing Test COMPILER_SUPPORTS_AVX
-- Performing Test COMPILER_SUPPORTS_AVX - Success
-- Performing Test COMPILER_SUPPORTS_FMA4
-- Performing Test COMPILER_SUPPORTS_FMA4 - Success
-- Performing Test COMPILER_SUPPORTS_AVX2
-- Performing Test COMPILER_SUPPORTS_AVX2 - Success
-- Performing Test COMPILER_SUPPORTS_AVX512F
-- Performing Test COMPILER_SUPPORTS_AVX512F - Success
-- Found OpenMP_C: -openmp:experimental (found version "2.0")
-- Found OpenMP_CXX: -openmp:experimental (found version "2.0")
-- Found OpenMP_CUDA: -openmp (found version "2.0")
-- Found OpenMP: TRUE (found version "2.0")
-- Performing Test COMPILER_SUPPORTS_OPENMP
-- Performing Test COMPILER_SUPPORTS_OPENMP - Success
-- Performing Test COMPILER_SUPPORTS_OMP_SIMD
-- Performing Test COMPILER_SUPPORTS_OMP_SIMD - Failed
-- Performing Test COMPILER_SUPPORTS_WEAK_ALIASES
-- Performing Test COMPILER_SUPPORTS_WEAK_ALIASES - Failed
-- Performing Test COMPILER_SUPPORTS_BUILTIN_MATH
-- Performing Test COMPILER_SUPPORTS_BUILTIN_MATH - Failed
-- Performing Test COMPILER_SUPPORTS_SYS_GETRANDOM
-- Performing Test COMPILER_SUPPORTS_SYS_GETRANDOM - Failed
-- Configuring build for SLEEF-v3.8.0
   Target system: Windows-10.0.26100
   Target processor: AMD64
   Host system: Windows-10.0.26100
   Host processor: AMD64
   Detected C compiler: MSVC @ C:/Program Files (x86)/Microsoft Visual Studio/2022/BuildTools/VC/Tools/MSVC/14.44.35207/bin/Hostx64/x64/cl.exe
   CMake: 4.1.0
   Make program: E:/PyTorch_Build/pytorch/rtx5070_env/Scripts/ninja.exe
-- Using option `/D_CRT_SECURE_NO_WARNINGS /D_CRT_NONSTDC_NO_DEPRECATE ` to compile libsleef
-- Building shared libs : OFF
-- Building static test bins: OFF
-- MPFR : LIB_MPFR-NOTFOUND
-- GMP : LIBGMP-NOTFOUND
-- RT :
-- FFTW3 : LIBFFTW3-NOTFOUND
-- OPENSSL :
-- SDE : SDE_COMMAND-NOTFOUND
-- COMPILER_SUPPORTS_OPENMP : FALSE
AT_INSTALL_INCLUDE_DIR include/ATen/core
core header install: E:/PyTorch_Build/pytorch/build/aten/src/ATen/core/aten_interned_strings.h
core header install: E:/PyTorch_Build/pytorch/build/aten/src/ATen/core/enum_tag.h
core header install: E:/PyTorch_Build/pytorch/build/aten/src/ATen/core/TensorBody.h
-- NVSHMEM not found, not building with NVSHMEM support.
CMake Error at torch/CMakeLists.txt:3 (add_subdirectory):
  The source directory E:/PyTorch_Build/pytorch/torch/csrc does not contain
  a CMakeLists.txt file.
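These two CMake Errors (torch/headeronly and torch/csrc missing their CMakeLists.txt) are what actually abort the configure: the checkout itself is incomplete, plausibly damaged by the earlier cleanup attempts. No amount of reconfiguring succeeds until the tree is restored. A sketch of the standard repair:

git status                               # inspect what is missing or modified
git checkout -- torch                    # restore deleted tracked files under torch\
git submodule sync --recursive
git submodule update --init --recursive  # repair third_party checkouts too
Remove-Item -Recurse -Force build -ErrorAction SilentlyContinue
python setup.py bdist_wheel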
CMake Warning at CMakeLists.txt:1285 (message):
  Generated cmake files are only fully tested if one builds with system glog,
  gflags, and protobuf.  Other settings may generate files that are not well
  tested.
--
-- ******** Summary ********
-- General:
--   CMake version         : 4.1.0
--   CMake command         : E:/PyTorch_Build/pytorch/rtx5070_env/Lib/site-packages/cmake/data/bin/cmake.exe
--   System                : Windows
--   C++ compiler          : C:/Program Files (x86)/Microsoft Visual Studio/2022/BuildTools/VC/Tools/MSVC/14.44.35207/bin/Hostx64/x64/cl.exe
--   C++ compiler id       : MSVC
--   C++ compiler version  : 19.44.35215.0
--   Using ccache if found : OFF
--   CXX flags             : /DWIN32 /D_WINDOWS /EHsc /Zc:__cplusplus /bigobj /FS /utf-8 -DUSE_PTHREADPOOL -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOROCTRACER -DLIBKINETO_NOXPUPTI=ON -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE /wd4624 /wd4068 /wd4067 /wd4267 /wd4661 /wd4717 /wd4244 /wd4804 /wd4273
--   Shared LD flags       : /machine:x64 /ignore:4049 /ignore:4217 /ignore:4099
--   Static LD flags       : /machine:x64 /ignore:4049 /ignore:4217 /ignore:4099
--   Module LD flags       : /machine:x64 /ignore:4049 /ignore:4217 /ignore:4099
--   Build type            : Release
--   Compile definitions   : ONNX_ML=1;ONNXIFI_ENABLE_EXT=1;ONNX_NAMESPACE=onnx_torch;_CRT_SECURE_NO_DEPRECATE=1;USE_EXTERNAL_MZCRC;MINIZ_DISABLE_ZIP_READER_CRC32_CHECKS;EXPORT_AOTI_FUNCTIONS;WIN32_LEAN_AND_MEAN;_UCRT_LEGACY_INFINITY;NOMINMAX;USE_MIMALLOC
--   CMAKE_PREFIX_PATH     : E:\PyTorch_Build\pytorch\rtx5070_env\Lib\site-packages;E:/Program Files/NVIDIA/CUNND/v9.12;E:\Program Files\NVIDIA\CUNND\v9.12;E:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v13.0;E:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v13.0;E:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v13.0
--   CMAKE_INSTALL_PREFIX  : E:/PyTorch_Build/pytorch/torch
--   USE_GOLD_LINKER       : OFF
--
--   TORCH_VERSION         : 2.9.0
--   BUILD_STATIC_RUNTIME_BENCHMARK: OFF
--   BUILD_BINARY          : OFF
--   BUILD_CUSTOM_PROTOBUF : ON
--     Link local protobuf : ON
--   BUILD_PYTHON          : True
--     Python version      : 3.10.10
--     Python executable   : E:\PyTorch_Build\pytorch\rtx5070_env\Scripts\python.exe
--     Python library      : E:/Python310/libs/python310.lib
--     Python includes     : E:/Python310/Include
--     Python site-package : E:\PyTorch_Build\pytorch\rtx5070_env\Lib\site-packages
--   BUILD_SHARED_LIBS     : ON
--   CAFFE2_USE_MSVC_STATIC_RUNTIME : OFF
--   BUILD_TEST            : True
--   BUILD_JNI             : OFF
--   BUILD_MOBILE_AUTOGRAD : OFF
--   BUILD_LITE_INTERPRETER: OFF
--   INTERN_BUILD_MOBILE   :
--   TRACING_BASED         : OFF
--   USE_BLAS              : 0
--   USE_LAPACK            : 0
--   USE_ASAN              : OFF
--   USE_TSAN              : OFF
--   USE_CPP_CODE_COVERAGE : OFF
--   USE_CUDA              : 1
--     CUDA static link    : OFF
--     USE_CUDNN           : OFF
--     USE_CUSPARSELT      : OFF
--     USE_CUDSS           : OFF
--     USE_CUFILE          : OFF
--     CUDA version        : 13.0
--     USE_FLASH_ATTENTION : OFF
--     USE_MEM_EFF_ATTENTION : ON
--     CUDA root directory : E:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v13.0
--     CUDA library        : E:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v13.0/lib/x64/cuda.lib
--     cudart library      : E:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v13.0/lib/x64/cudart.lib
--     cublas library      : E:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v13.0/lib/x64/cublas.lib
--     cufft library       : E:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v13.0/lib/x64/cufft.lib
--     curand library      : E:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v13.0/lib/x64/curand.lib
--     cusparse library    : E:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v13.0/lib/x64/cusparse.lib
--     nvrtc               : E:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v13.0/lib/x64/nvrtc.lib
--     CUDA include path   : E:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v13.0/include
--     NVCC executable     : E:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v13.0/bin/nvcc.exe
--     CUDA compiler       : E:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v13.0/bin/nvcc.exe
--     CUDA flags          : -DLIBCUDACXX_ENABLE_SIMPLIFIED_COMPLEX_OPERATIONS -Xcompiler /Zc:__cplusplus -Xcompiler /w -w -Xcompiler /FS -Xfatbin -compress-all -DONNX_NAMESPACE=onnx_torch --use-local-env -gencode arch=compute_89,code=sm_89 -gencode arch=compute_90,code=sm_90 -gencode arch=compute_120,code=sm_120 -Xcudafe --diag_suppress=cc_clobber_ignored,--diag_suppress=field_without_dll_interface,--diag_suppress=base_class_has_different_dll_interface,--diag_suppress=dll_interface_conflict_none_assumed,--diag_suppress=dll_interface_conflict_dllexport_assumed,--diag_suppress=bad_friend_decl --Werror cross-execution-space-call --no-host-device-move-forward --expt-relaxed-constexpr --expt-extended-lambda -Xcompiler=/wd4819,/wd4503,/wd4190,/wd4244,/wd4251,/wd4275,/wd4522 -Wno-deprecated-gpu-targets --expt-extended-lambda -DCUB_WRAPPED_NAMESPACE=at_cuda_detail -DCUDA_HAS_FP16=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__
--     CUDA host compiler  :
--     CUDA --device-c     : OFF
--     USE_TENSORRT        :
--   USE_XPU               : OFF
--   USE_ROCM              : OFF
--   BUILD_NVFUSER         :
--   USE_EIGEN_FOR_BLAS    : ON
--   USE_EIGEN_FOR_SPARSE  : OFF
--   USE_FBGEMM            : OFF
--   USE_KINETO            : ON
--   USE_GFLAGS            : OFF
--   USE_GLOG              : OFF
--   USE_LITE_PROTO        : OFF
--   USE_PYTORCH_METAL     : OFF
--   USE_PYTORCH_METAL_EXPORT : OFF
--   USE_MPS               : OFF
--   CAN_COMPILE_METAL     :
--   USE_MKL               : OFF
--   USE_MKLDNN            : OFF
--   USE_UCC               : OFF
--   USE_ITT               : ON
--   USE_XCCL              : OFF
--   USE_NCCL              : OFF
--   Found NVSHMEM         :
--   USE_NNPACK            : OFF
--   USE_NUMPY             : ON
--   USE_OBSERVERS         : ON
--   USE_OPENCL            : OFF
--   USE_OPENMP            : ON
--   USE_MIMALLOC          : ON
--   USE_MIMALLOC_ON_MKL   : OFF
--   USE_VULKAN            : OFF
--   USE_PROF              : OFF
--   USE_PYTORCH_QNNPACK   : OFF
--   USE_XNNPACK           : ON
--   USE_DISTRIBUTED       : OFF
--   Public Dependencies   :
--   Private Dependencies  : Threads::Threads;pthreadpool;cpuinfo;XNNPACK;microkernels-prod;ittnotify;fp16;caffe2::openmp;fmt::fmt-header-only;kineto
--   Public CUDA Deps.     :
--   Private CUDA Deps.    : caffe2::curand;caffe2::cufft;caffe2::cublas;fmt::fmt-header-only;E:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v13.0/lib/x64/cudart_static.lib;CUDA::cusparse;CUDA::cufft;CUDA::cusolver;ATEN_CUDA_FILES_GEN_LIB
--   USE_COREML_DELEGATE   : OFF
--   BUILD_LAZY_TS_BACKEND : ON
--   USE_ROCM_KERNEL_ASSERT : OFF
-- Performing Test HAS_WMISSING_PROTOTYPES
-- Performing Test HAS_WMISSING_PROTOTYPES - Failed
-- Performing Test HAS_WERROR_MISSING_PROTOTYPES
-- Performing Test HAS_WERROR_MISSING_PROTOTYPES - Failed
-- Configuring incomplete, errors occurred!

(rtx5070_env) PS E:\PyTorch_Build\pytorch>
(rtx5070_env) PS E:\PyTorch_Build\pytorch> # Install the generated wheel
(rtx5070_env) PS E:\PyTorch_Build\pytorch> $wheelPath = Get-ChildItem dist\*.whl | Select-Object -First 1
Get-ChildItem: Cannot find path 'E:\PyTorch_Build\pytorch\dist' because it does not exist.
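Because the configure step failed, no wheel was ever produced and dist\ never came into existence, so the next two commands are doomed: $wheelPath ends up empty and pip is invoked with no requirement at all. A guard makes the failure mode explicit; a sketch:

$wheel = Get-ChildItem dist\*.whl -ErrorAction SilentlyContinue | Select-Object -First 1
if ($wheel) {
    pip install $wheel.FullName --force-reinstall --no-deps
} else {
    Write-Host "No wheel in dist\ - fix the CMake errors and rebuild first."
}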
(rtx5070_env) PS E:\PyTorch_Build\pytorch> pip install $wheelPath --force-reinstall --no-deps
ERROR: You must give at least one requirement to install (see "pip help install")

(rtx5070_env) PS E:\PyTorch_Build\pytorch> python diagnostic_test.py
==================================================
CUDA Toolkit verification:
✅ NVCC version:
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2025 NVIDIA Corporation
Built on Wed_Jul_16_20:06:48_Pacific_Daylight_Time_2025
Cuda compilation tools, release 13.0, V13.0.48
Build cuda_13.0.r13.0/compiler.36260728_0
✅ NVIDIA-SMI output:
Mon Sep  1 20:54:10 2025
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 580.97                 Driver Version: 580.97         CUDA Version: 13.0     |
+-----------------------------------------+------------------------+----------------------+
| GPU  Name                  Driver-Model | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|=========================================+========================+======================|
|   0  NVIDIA GeForce RTX 5070       WDDM |   00000000:01:00.0  On |                  N/A |
|  0%   35C    P3             16W /  250W |    1328MiB /  12227MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+

+-----------------------------------------------------------------------------------------+
| Processes:                                                                              |
|  GPU   GI   CI      PID   Type   Process name                               GPU Memory  |
|        ID   ID                                                              Usage       |
|=========================================================================================|
|    0   N/A  N/A    1124    C+G   ...yb3d8bbwe\WindowsTerminal.exe                 N/A   |
|    0   N/A  N/A    1288    C+G   ...les\Tencent\Weixin\Weixin.exe                 N/A   |
|    0   N/A  N/A    1776    C+G   C:\Windows\System32\dwm.exe                      N/A   |
|    0   N/A  N/A    2272    C+G   ...t\Edge\Application\msedge.exe                 N/A   |
|    0   N/A  N/A    3268    C+G   ...em32\ApplicationFrameHost.exe                 N/A   |
|    0   N/A  N/A    7860    C+G   C:\Windows\explorer.exe                          N/A   |
|    0   N/A  N/A    8004    C+G   ...indows\System32\ShellHost.exe                 N/A   |
|    0   N/A  N/A    8156    C+G   ...0.3405.125\msedgewebview2.exe                 N/A   |
|    0   N/A  N/A    8852    C+G   ..._cw5n1h2txyewy\SearchHost.exe                 N/A   |
|    0   N/A  N/A    8876    C+G   ...y\StartMenuExperienceHost.exe                 N/A   |
|    0   N/A  N/A   10540    C+G   ...0.3405.125\msedgewebview2.exe                 N/A   |
|    0   N/A  N/A   12380    C+G   ...5n1h2txyewy\TextInputHost.exe                 N/A   |
|    0   N/A  N/A   15340    C+G   ...acted\runtime\WeChatAppEx.exe                 N/A   |
|    0   N/A  N/A   18600    C+G   ...ntrolPanel\SystemSettings.exe                 N/A   |
+-----------------------------------------------------------------------------------------+
==================================================
❌ Fatal error occurred:
Traceback (most recent call last):
  File "E:\PyTorch_Build\pytorch\diagnostic_test.py", line 116, in <module>
    check_cuda_toolkit()
  File "E:\PyTorch_Build\pytorch\diagnostic_test.py", line 21, in check_cuda_toolkit
    cuda_path = os.environ.get('CUDA_PATH', 'not set')
NameError: name 'os' is not defined
Press Enter to exit...

(rtx5070_env) PS E:\PyTorch_Build\pytorch> # Uninstall existing versions
(rtx5070_env) PS E:\PyTorch_Build\pytorch> pip uninstall -y torch torchvision torchaudio
WARNING: Skipping torch as it is not installed.
WARNING: Skipping torchvision as it is not installed.
WARNING: Skipping torchaudio as it is not installed.
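The NameError at the end is an ordinary bug in diagnostic_test.py: the script calls os.environ.get without ever importing os. One line at the top of the script fixes it:

import os   # was missing; caused NameError: name 'os' is not defined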
(rtx5070_env) PS E:\PyTorch_Build\pytorch>
(rtx5070_env) PS E:\PyTorch_Build\pytorch> # Install a prebuilt version with RTX 5070 support
(rtx5070_env) PS E:\PyTorch_Build\pytorch> pip install --pre torch torchvision torchaudio `
>> --index-url https://download.pytorch.org/whl/nightly/cu121 `
>> --no-deps
Looking in indexes: https://download.pytorch.org/whl/nightly/cu121
Collecting torch
  Using cached https://download.pytorch.org/whl/nightly/cu121/torch-2.6.0.dev20241112%2Bcu121-cp310-cp310-win_amd64.whl (2456.2 MB)
Collecting torchvision
  Using cached https://download.pytorch.org/whl/nightly/cu121/torchvision-0.20.0.dev20241112%2Bcu121-cp310-cp310-win_amd64.whl (6.2 MB)
Collecting torchaudio
  Using cached https://download.pytorch.org/whl/nightly/cu121/torchaudio-2.5.0.dev20241112%2Bcu121-cp310-cp310-win_amd64.whl (4.2 MB)
Installing collected packages: torchaudio, torchvision, torch
Successfully installed torch-2.6.0.dev20241112+cu121 torchaudio-2.5.0.dev20241112+cu121 torchvision-0.20.0.dev20241112+cu121

(rtx5070_env) PS E:\PyTorch_Build\pytorch>
(rtx5070_env) PS E:\PyTorch_Build\pytorch> # Install required dependencies
(rtx5070_env) PS E:\PyTorch_Build\pytorch> pip install pyyaml numpy typing_extensions mkl mkl-include intel-openmp
Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
Requirement already satisfied: pyyaml in e:\pytorch_build\pytorch\rtx5070_env\lib\site-packages (6.0.2)
Requirement already satisfied: numpy in e:\pytorch_build\pytorch\rtx5070_env\lib\site-packages (2.2.6)
Requirement already satisfied: typing_extensions in e:\pytorch_build\pytorch\rtx5070_env\lib\site-packages (4.15.0)
Requirement already satisfied: mkl in e:\pytorch_build\pytorch\rtx5070_env\lib\site-packages (2025.2.0)
Requirement already satisfied: mkl-include in e:\pytorch_build\pytorch\rtx5070_env\lib\site-packages (2025.2.0)
Requirement already satisfied: intel-openmp in e:\pytorch_build\pytorch\rtx5070_env\lib\site-packages (2025.2.1)
Requirement already satisfied: tbb==2022.* in e:\pytorch_build\pytorch\rtx5070_env\lib\site-packages (from mkl) (2022.2.0)
Requirement already satisfied: intel-cmplr-lib-ur==2025.2.1 in e:\pytorch_build\pytorch\rtx5070_env\lib\site-packages (from intel-openmp) (2025.2.1)
Requirement already satisfied: umf==0.11.* in e:\pytorch_build\pytorch\rtx5070_env\lib\site-packages (from intel-cmplr-lib-ur==2025.2.1->intel-openmp) (0.11.0)
Requirement already satisfied: tcmlib==1.* in e:\pytorch_build\pytorch\rtx5070_env\lib\site-packages (from tbb==2022.*->mkl) (1.4.0)

(rtx5070_env) PS E:\PyTorch_Build\pytorch>
(rtx5070_env) PS E:\PyTorch_Build\pytorch> # Run the diagnostic test
(rtx5070_env) PS E:\PyTorch_Build\pytorch> python diagnostic_test.py
==================================================
CUDA Toolkit verification:
✅ NVCC version:
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2025 NVIDIA Corporation
Built on Wed_Jul_16_20:06:48_Pacific_Daylight_Time_2025
Cuda compilation tools, release 13.0, V13.0.48
Build cuda_13.0.r13.0/compiler.36260728_0
✅ NVIDIA-SMI output:
Mon Sep  1 20:55:52 2025
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 580.97                 Driver Version: 580.97         CUDA Version: 13.0     |
+-----------------------------------------+------------------------+----------------------+
| GPU  Name                  Driver-Model | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|=========================================+========================+======================|
|   0  NVIDIA GeForce RTX 5070       WDDM |   00000000:01:00.0  On |                  N/A |
|  0%   35C    P3             19W /  250W |    1346MiB /  12227MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+

+-----------------------------------------------------------------------------------------+
| Processes:                                                                              |
|  GPU   GI   CI      PID   Type   Process name                               GPU Memory  |
|        ID   ID                                                              Usage       |
|=========================================================================================|
|    0   N/A  N/A    1124    C+G   ...yb3d8bbwe\WindowsTerminal.exe                 N/A   |
|    0   N/A  N/A    1288    C+G   ...les\Tencent\Weixin\Weixin.exe                 N/A   |
|    0   N/A  N/A    1776    C+G   C:\Windows\System32\dwm.exe                      N/A   |
|    0   N/A  N/A    2272    C+G   ...t\Edge\Application\msedge.exe                 N/A   |
|    0   N/A  N/A    3268    C+G   ...em32\ApplicationFrameHost.exe                 N/A   |
|    0   N/A  N/A    7860    C+G   C:\Windows\explorer.exe                          N/A   |
|    0   N/A  N/A    8004    C+G   ...indows\System32\ShellHost.exe                 N/A   |
|    0   N/A  N/A    8156    C+G   ...0.3405.125\msedgewebview2.exe                 N/A   |
|    0   N/A  N/A    8852    C+G   ..._cw5n1h2txyewy\SearchHost.exe                 N/A   |
|    0   N/A  N/A    8876    C+G   ...y\StartMenuExperienceHost.exe                 N/A   |
|    0   N/A  N/A   10540    C+G   ...0.3405.125\msedgewebview2.exe                 N/A   |
|    0   N/A  N/A   12380    C+G   ...5n1h2txyewy\TextInputHost.exe                 N/A   |
|    0   N/A  N/A   15340    C+G   ...acted\runtime\WeChatAppEx.exe                 N/A   |
|    0   N/A  N/A   18600    C+G   ...ntrolPanel\SystemSettings.exe                 N/A   |
+-----------------------------------------------------------------------------------------+
==================================================
❌ Fatal error occurred:
Traceback (most recent call last):
  File "E:\PyTorch_Build\pytorch\diagnostic_test.py", line 116, in <module>
    check_cuda_toolkit()
  File "E:\PyTorch_Build\pytorch\diagnostic_test.py", line 21, in check_cuda_toolkit
    cuda_path = os.environ.get('CUDA_PATH', 'not set')
NameError: name 'os' is not defined
Press Enter to exit...
(rtx5070_env) PS E:\PyTorch_Build\pytorch>
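Two things are worth fixing before trusting this install. First, diagnostic_test.py still needs the import os fix above, which is why the identical traceback reappears. Second, and more important: the wheels just installed are cu121 nightlies from late 2024, and CUDA 12.1 predates the RTX 5070's sm_120 (Blackwell) architecture, so torch.cuda will most likely reject the GPU at runtime even though the import now works. At the time of writing, Blackwell support ships in the cu128 nightly wheels; verify against the current PyTorch install matrix before relying on this. A sketch:

pip uninstall -y torch torchvision torchaudio
pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu128

# Run from outside the source tree, then confirm sm_120 appears in the supported list:
python -c "import torch; print(torch.cuda.get_arch_list(), torch.cuda.is_available())"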