Paired Single C/C++ Extensions

Paired Single Type

The Paired Single extensions enable support for a new data type, __vec2x32float__, which represents a vector of two 32-bit floating-point numbers. When possible, variables with this type are allocated to the floating-point registers.

The __vec2x32float__ type supports brace-delimited constant initializers, like those used for arrays or structures in C. For example, the following code snippet initializes the ps0 half of the variable t with 1.0 and the ps1 half with -1.0.

__vec2x32float__ t = { 1.0f, -1.0f };

The ps0 and ps1 fields of the vector can be accessed with square brackets, as if the vector were a fixed-size array: index 0 selects ps0 and index 1 selects ps1. The following example returns the sum of the ps0 and ps1 halves of the parameter a.

/* Returns the sum of the two halves of a (element 0 plus element 1). */
float func(__vec2x32float__ a)
{
    float ps0 = a[0];
    float ps1 = a[1];

    return ps0 + ps1;
}

This section is subject to change in future releases: The Paired Single type is passed in registers following the same conventions as floating-point arguments. Similarly, the ps1 halves are saved and restored from the stack along with the ps0 halves.

Operations not listed above are not permitted with the __vec2x32float__ data type. For example, a Paired Single variable may not be cast to another type, added to another Paired Single with the + operator, or negated using the unary - operator.

Intrinsics

The following intrinsics are supported and compile directly to the underlying instruction.

/* Intrinsics taking one paired-single operand. */
__vec2x32float__ __PS_ABS(__vec2x32float__);
__vec2x32float__ __PS_NABS(__vec2x32float__);
__vec2x32float__ __PS_NEG(__vec2x32float__);
__vec2x32float__ __PS_RES(__vec2x32float__);
__vec2x32float__ __PS_RSQRTE(__vec2x32float__);
/* Intrinsics taking two paired-single operands. */
__vec2x32float__ __PS_ADD(__vec2x32float__, __vec2x32float__);
__vec2x32float__ __PS_DIV(__vec2x32float__, __vec2x32float__);
__vec2x32float__ __PS_MERGE00(__vec2x32float__, __vec2x32float__);
__vec2x32float__ __PS_MERGE01(__vec2x32float__, __vec2x32float__);
__vec2x32float__ __PS_MERGE10(__vec2x32float__, __vec2x32float__);
__vec2x32float__ __PS_MERGE11(__vec2x32float__, __vec2x32float__);
__vec2x32float__ __PS_MUL(__vec2x32float__, __vec2x32float__);
__vec2x32float__ __PS_MULS0(__vec2x32float__, __vec2x32float__);
__vec2x32float__ __PS_MULS1(__vec2x32float__, __vec2x32float__);
__vec2x32float__ __PS_SUB(__vec2x32float__, __vec2x32float__);
/* Intrinsics taking three paired-single operands. */
__vec2x32float__ __PS_MADD(__vec2x32float__, __vec2x32float__,  __vec2x32float__);
__vec2x32float__ __PS_MADDS0(__vec2x32float__, __vec2x32float__,  __vec2x32float__);
__vec2x32float__ __PS_MADDS1(__vec2x32float__, __vec2x32float__,  __vec2x32float__);
__vec2x32float__ __PS_MSUB(__vec2x32float__, __vec2x32float__,  __vec2x32float__);
__vec2x32float__ __PS_NMADD(__vec2x32float__, __vec2x32float__,  __vec2x32float__);
__vec2x32float__ __PS_NMSUB(__vec2x32float__, __vec2x32float__,  __vec2x32float__);
__vec2x32float__ __PS_SEL(__vec2x32float__, __vec2x32float__,  __vec2x32float__);
__vec2x32float__ __PS_SUM0(__vec2x32float__, __vec2x32float__,  __vec2x32float__);
__vec2x32float__ __PS_SUM1(__vec2x32float__, __vec2x32float__,  __vec2x32float__);

Quantized Loads and Stores

The quantized loads and stores can be accessed with the following functions. The W and I parameters must be compile-time constants: W must be 0 or 1, and I must be in the range 1 to 7. The GQR0 register is reserved for use by the compiler and runtime.

/* Quantized loads. W and I must be compile-time constants
   (W: 0 or 1; I: 1 to 7). */
__vec2x32float__ __PSQ_L(const void *p, int W, int I);
__vec2x32float__ __PSQ_LX(const void *p, int offset, int W, int I);
/* Quantized stores; the same restrictions on W and I apply.
   NOTE(review): p is declared const void * here even though these are
   store operations -- confirm against the shipped header. */
void __PSQ_ST(const void *p, __vec2x32float__, int W, int I);
void __PSQ_STX(const void *p, int offset, __vec2x32float__, int W, int I);

To read and write the GQR registers, the general read or write SPR intrinsics can be used:

/* Writes val to the special-purpose register identified by spr. */
void __MTSPR(unsigned int spr, unsigned int val);
/* Returns the value read from the special-purpose register identified by spr. */
unsigned int __MFSPR(unsigned int spr);

Sample Code

/* Paired Single intrinsic example.
This has not been validated for functional correctness!
*/

#include <ppc_ps.h>

/* Mtx points to the first of six paired-single elements; the routines in
   this sample access indices 0 through 5 only.
   Presumably a 3x4 row-major matrix with two adjacent columns packed per
   element -- TODO confirm.
   f32x2 is not declared in this sample; presumably it comes from
   <ppc_ps.h> -- TODO confirm. */
typedef f32x2 * Mtx;

/*
 * PSMTXIdentity: overwrites the six paired-single elements m[0]..m[5].
 *
 * With the merge results being (0, 1) and (1, 0), the stored pairs are
 *   m[0] = (1, 0)   m[1] = (0, 0)
 *   m[2] = (0, 1)   m[3] = (0, 0)
 *   m[4] = (0, 0)   m[5] = (1, 0)
 */
void PSMTXIdentity(Mtx m)
{
    f32x2 zeros = {0.0f, 0.0f};
    f32x2 ones = {1.0f, 1.0f};
    f32x2 zero_one;
    f32x2 one_zero;

    /* Build the two mixed pairs from the constant vectors. */
    zero_one = __PS_MERGE01(zeros, ones);
    one_zero = __PS_MERGE10(ones, zeros);

    /* The six stores are independent of one another; issue them in index order. */
    m[0] = one_zero;
    m[1] = zeros;
    m[2] = zero_one;
    m[3] = zeros;
    m[4] = zeros;
    m[5] = one_zero;
}

/*
 * PSMTXCopy: copies the six paired-single elements src[0]..src[5] to
 * dst[0]..dst[5], one element at a time in ascending index order.
 */
void PSMTXCopy(Mtx src, Mtx dst)
{
    int i;

    for (i = 0; i < 6; ++i) {
        dst[i] = src[i];
    }
}

/*
 * PSMTXConcat: combines the six elements of ma with the six elements of mb
 * and writes six result elements to mab.
 *
 * Treating each Mtx as a 3x4 row-major matrix (two adjacent columns per
 * element) with an implied fourth row of (0, 0, 0, 1), the result is the
 * product ma * mb.
 *
 * Every element of ma and mb is read before the first element of mab is
 * written.
 *
 * Naming: the digit is the row; "lo" is the element at the even index
 * (columns 0-1) and "hi" is the element at the odd index (columns 2-3).
 */
void PSMTXConcat(Mtx ma, Mtx mb, Mtx mab)
{
    f32x2 a0_lo, a0_hi, a1_lo, a1_hi, a2_lo, a2_hi;
    f32x2 b0_lo, b0_hi, b1_lo, b1_hi, b2_lo, b2_hi;
    f32x2 r0_lo, r0_hi, r1_lo, r1_hi, r2_lo, r2_hi;
    /* Multiplier pair (0, 1): used in the last accumulate of each "hi" result. */
    f32x2 sel_ps1 = {0.0f, 1.0f};

    /* Load both operands completely before anything is stored. */
    a0_lo = ma[0];
    a0_hi = ma[1];
    a1_lo = ma[2];
    a1_hi = ma[3];
    a2_lo = ma[4];
    a2_hi = ma[5];

    b0_lo = mb[0];
    b0_hi = mb[1];
    b1_lo = mb[2];
    b1_hi = mb[3];
    b2_lo = mb[4];
    b2_hi = mb[5];

    /* Result row 0. */
    r0_lo = __PS_MULS0(b0_lo, a0_lo);
    r0_lo = __PS_MADDS1(b1_lo, a0_lo, r0_lo);
    r0_lo = __PS_MADDS0(b2_lo, a0_hi, r0_lo);

    r0_hi = __PS_MULS0(b0_hi, a0_lo);
    r0_hi = __PS_MADDS1(b1_hi, a0_lo, r0_hi);
    r0_hi = __PS_MADDS0(b2_hi, a0_hi, r0_hi);
    r0_hi = __PS_MADDS1(sel_ps1, a0_hi, r0_hi);

    /* Result row 1. */
    r1_lo = __PS_MULS0(b0_lo, a1_lo);
    r1_lo = __PS_MADDS1(b1_lo, a1_lo, r1_lo);
    r1_lo = __PS_MADDS0(b2_lo, a1_hi, r1_lo);

    r1_hi = __PS_MULS0(b0_hi, a1_lo);
    r1_hi = __PS_MADDS1(b1_hi, a1_lo, r1_hi);
    r1_hi = __PS_MADDS0(b2_hi, a1_hi, r1_hi);
    r1_hi = __PS_MADDS1(sel_ps1, a1_hi, r1_hi);

    /* Result row 2. */
    r2_lo = __PS_MULS0(b0_lo, a2_lo);
    r2_lo = __PS_MADDS1(b1_lo, a2_lo, r2_lo);
    r2_lo = __PS_MADDS0(b2_lo, a2_hi, r2_lo);

    r2_hi = __PS_MULS0(b0_hi, a2_lo);
    r2_hi = __PS_MADDS1(b1_hi, a2_lo, r2_hi);
    r2_hi = __PS_MADDS0(b2_hi, a2_hi, r2_hi);
    r2_hi = __PS_MADDS1(sel_ps1, a2_hi, r2_hi);

    /* Write the result out in index order. */
    mab[0] = r0_lo;
    mab[1] = r0_hi;
    mab[2] = r1_lo;
    mab[3] = r1_hi;
    mab[4] = r2_lo;
    mab[5] = r2_hi;
}

Revision History

2014/03/10 Automated cleanup pass.
2013/05/08 Automated cleanup pass.
2013/03/15 Initial version.


CONFIDENTIAL