1 | e50bc5a4 | Alan Curry | ```
/*
``` |
---|---|---|---|

2 | d026b45e | Diego Biurrun | ```
* AltiVec-enhanced yuv2yuvX
``` |

3 | ```
*
``` |
||

4 | ```
* Copyright (C) 2004 Romain Dolbeau <romain@dolbeau.org>
``` |
||

5 | e281d684 | Diego Biurrun | ```
* based on the equivalent C code in swscale.c
``` |

6 | d026b45e | Diego Biurrun | ```
*
``` |

7 | ```
* This file is part of FFmpeg.
``` |
||

8 | ```
*
``` |
||

9 | 4d02387f | Diego Biurrun | ```
* FFmpeg is free software; you can redistribute it and/or
``` |

10 | ```
* modify it under the terms of the GNU Lesser General Public
``` |
||

11 | ```
* License as published by the Free Software Foundation; either
``` |
||

12 | ```
* version 2.1 of the License, or (at your option) any later version.
``` |
||

13 | d026b45e | Diego Biurrun | ```
*
``` |

14 | ```
* FFmpeg is distributed in the hope that it will be useful,
``` |
||

15 | ```
* but WITHOUT ANY WARRANTY; without even the implied warranty of
``` |
||

16 | 4d02387f | Diego Biurrun | ```
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
``` |

17 | ```
* Lesser General Public License for more details.
``` |
||

18 | d026b45e | Diego Biurrun | ```
*
``` |

19 | 4d02387f | Diego Biurrun | ```
* You should have received a copy of the GNU Lesser General Public
``` |

20 | ```
* License along with FFmpeg; if not, write to the Free Software
``` |
||

21 | b19bcbaa | Diego Biurrun | ```
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
``` |

22 | d026b45e | Diego Biurrun | ```
*/
``` |

23 | a2faa401 | Romain Dolbeau | |

/* All-zero `vector signed int` constant; users below cast it to the vector type they need. */
24 | 265a1ac7 | Alan Curry | #define vzero vec_splat_s32(0) |

25 | 8c266f0c | Romain Dolbeau | |

/**
 * Convert an array of 32-bit accumulators to 8-bit samples.
 *
 * Every output byte is (val[k] >> 19) clamped to the range 0..255.
 * Groups of 16 outputs are produced with AltiVec saturating packs; whatever
 * cannot be handled that way (a misaligned head of dest and the tail) is
 * done one element at a time.
 *
 * @param val   dstW input accumulators
 * @param dest  dstW output bytes
 * @param dstW  number of elements to convert
 */
static inline void
altivec_packIntArrayToCharArray(int *val, uint8_t* dest, int dstW)
{
    /* Shift count 19 in every lane, assembled as 10 + 9 (presumably because
     * the splat literal cannot encode 19 directly). */
    const vector unsigned int shift19 =
        vec_add(vec_splat_u32(10), vec_splat_u32(9));
    int pos = 0;

    if (((unsigned long)dest % 16) == 0) {
        /* Destination is 16-byte aligned: load four input vectors per
         * iteration straight from val. */
        while (pos < dstW - 15) {
            const int byteOff = pos << 2;
            vector signed int in0 = vec_ld(byteOff,      val);
            vector signed int in1 = vec_ld(byteOff + 16, val);
            vector signed int in2 = vec_ld(byteOff + 32, val);
            vector signed int in3 = vec_ld(byteOff + 48, val);

            vector unsigned short lo16 = vec_packsu(vec_sra(in0, shift19),
                                                    vec_sra(in1, shift19));
            vector unsigned short hi16 = vec_packsu(vec_sra(in2, shift19),
                                                    vec_sra(in3, shift19));
            vector unsigned char  out  = vec_packsu(lo16, hi16);

            vec_st(out, pos, dest);
            pos += 16;
        }
    } else {
        /* Destination is misaligned: emit scalar bytes until the store
         * address is aligned, then realign the loads from val with
         * vec_perm instead. */
        vector unsigned char loadPerm;
        vector signed int    carry;

        while (pos < dstW && (((unsigned long)dest + pos) % 16)) {
            int t = val[pos] >> 19;
            if (t < 0)
                dest[pos] = 0;
            else if (t > 255)
                dest[pos] = 255;
            else
                dest[pos] = t;
            pos++;
        }

        loadPerm = vec_lvsl(pos << 2, val);
        carry    = vec_ld(pos << 2, val);

        while (pos < dstW - 15) {
            const int byteOff = pos << 2;
            vector signed int raw1 = vec_ld(byteOff + 16, val);
            vector signed int raw2 = vec_ld(byteOff + 32, val);
            vector signed int raw3 = vec_ld(byteOff + 48, val);
            vector signed int raw4 = vec_ld(byteOff + 64, val);

            /* Stitch neighbouring aligned loads into the four vectors that
             * start at val[pos], val[pos + 4], val[pos + 8], val[pos + 12]. */
            vector signed int in0 = vec_perm(carry, raw1, loadPerm);
            vector signed int in1 = vec_perm(raw1,  raw2, loadPerm);
            vector signed int in2 = vec_perm(raw2,  raw3, loadPerm);
            vector signed int in3 = vec_perm(raw3,  raw4, loadPerm);

            vector unsigned short lo16 = vec_packsu(vec_sra(in0, shift19),
                                                    vec_sra(in1, shift19));
            vector unsigned short hi16 = vec_packsu(vec_sra(in2, shift19),
                                                    vec_sra(in3, shift19));
            vector unsigned char  out  = vec_packsu(lo16, hi16);

            vec_st(out, pos, dest);

            /* The last load of this round is the first one of the next. */
            carry = raw4;
            pos  += 16;
        }
    }

    /* Scalar tail for the elements the vector loops did not reach. */
    while (pos < dstW) {
        int t = val[pos] >> 19;
        if (t < 0)
            dest[pos] = 0;
        else if (t > 255)
            dest[pos] = 255;
        else
            dest[pos] = t;
        pos++;
    }
}

87 | |||

/**
 * Vertical filter producing 8-bit planes from 16-bit intermediate lines.
 *
 * For every output position x the luma result is
 *     clip8(((1 << 18) + sum_j lumSrc[j][x] * lumFilter[j]) >> 19)
 * and, when uDest is non-NULL, the two chroma planes are computed the same
 * way from chrSrc/chrFilter. The V samples are read 2048 entries after the
 * U samples within each chrSrc line.
 *
 * @param lumFilter      lumFilterSize luma coefficients
 * @param lumSrc         lumFilterSize pointers to luma input lines
 * @param lumFilterSize  number of luma taps
 * @param chrFilter      chrFilterSize chroma coefficients
 * @param chrSrc         chrFilterSize pointers to chroma input lines
 * @param chrFilterSize  number of chroma taps
 * @param dest           luma output, dstW bytes
 * @param uDest          U output, chrDstW bytes, or NULL to skip chroma
 * @param vDest          V output, chrDstW bytes (only used if uDest != NULL)
 * @param dstW           luma width
 * @param chrDstW        chroma width
 */
static inline void
yuv2yuvX_altivec_real(const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
                      const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize,
                      uint8_t *dest, uint8_t *uDest, uint8_t *vDest, int dstW, int chrDstW)
{
    /* Rounding term: half of the 1 << 19 divisor applied when packing. */
    const vector signed int vini = {(1 << 18), (1 << 18), (1 << 18), (1 << 18)};
    int x, tap;

    {
        DECLARE_ALIGNED(16, int, val)[dstW];

        /* Seed the accumulators, four at a time and then the remainder. */
        for (x = 0; x < dstW - 7; x += 4)
            vec_st(vini, x << 2, val);
        for (; x < dstW; x++)
            val[x] = (1 << 18);

        for (tap = 0; tap < lumFilterSize; tap++) {
            const int16_t *line = lumSrc[tap];

            /* Input line may be misaligned: keep a permute mask and the
             * previously loaded vector to rebuild each group of 8 samples. */
            vector unsigned char srcPerm = vec_lvsl(0, line);
            vector signed short  prev    = vec_ld(0, line);

            /* Broadcast lumFilter[tap] into all eight 16-bit lanes. */
            vector signed short  coeff     = vec_ld(tap << 1, lumFilter);
            vector unsigned char coeffPerm = vec_lvsl(tap << 1, lumFilter);
            coeff = vec_perm(coeff, coeff, coeffPerm);
            coeff = vec_splat(coeff, 0);

            for (x = 0; x < dstW - 7; x += 8) {
                const int accOff = x << 2;
                vector signed short next = vec_ld((x << 1) + 16, line);
                /* line[x] ... line[x + 7] */
                vector signed short pix  = vec_perm(prev, next, srcPerm);

                /* 16x16 -> 32 bit products, split into even and odd lanes,
                 * then re-interleaved into natural order. */
                vector signed int even = vec_mule(coeff, pix);
                vector signed int odd  = vec_mulo(coeff, pix);

                vector signed int acc0 = vec_ld(accOff,      val);
                vector signed int acc1 = vec_ld(accOff + 16, val);

                acc0 = vec_add(acc0, vec_mergeh(even, odd));
                acc1 = vec_add(acc1, vec_mergel(even, odd));

                vec_st(acc0, accOff,      val);
                vec_st(acc1, accOff + 16, val);

                prev = next;
            }
            for (; x < dstW; x++)
                val[x] += line[x] * lumFilter[tap];
        }
        altivec_packIntArrayToCharArray(val, dest, dstW);
    }

    if (uDest) {
        DECLARE_ALIGNED(16, int, u)[chrDstW];
        DECLARE_ALIGNED(16, int, v)[chrDstW];

        for (x = 0; x < chrDstW - 7; x += 4) {
            vec_st(vini, x << 2, u);
            vec_st(vini, x << 2, v);
        }
        for (; x < chrDstW; x++) {
            u[x] = (1 << 18);
            v[x] = (1 << 18);
        }

        for (tap = 0; tap < chrFilterSize; tap++) {
            const int16_t *line = chrSrc[tap];

            /* The same permute mask serves U and V: the V data starts
             * 2048 * 2 bytes later, which keeps the 16-byte phase. */
            vector unsigned char srcPerm = vec_lvsl(0, line);
            vector signed short  prevU   = vec_ld(0, line);
            vector signed short  prevV   = vec_ld(2048 << 1, line);

            /* Broadcast chrFilter[tap] into all eight 16-bit lanes. */
            vector signed short  coeff     = vec_ld(tap << 1, chrFilter);
            vector unsigned char coeffPerm = vec_lvsl(tap << 1, chrFilter);
            coeff = vec_perm(coeff, coeff, coeffPerm);
            coeff = vec_splat(coeff, 0);

            for (x = 0; x < chrDstW - 7; x += 8) {
                const int accOff = x << 2;
                vector signed short nextU = vec_ld((x << 1) + 16, line);
                vector signed short nextV = vec_ld(((x + 2048) << 1) + 16, line);

                /* line[x] ... line[x + 7] and line[x + 2048] ... line[x + 2055] */
                vector signed short pixU = vec_perm(prevU, nextU, srcPerm);
                vector signed short pixV = vec_perm(prevV, nextV, srcPerm);

                vector signed int evenU = vec_mule(coeff, pixU);
                vector signed int oddU  = vec_mulo(coeff, pixU);
                vector signed int evenV = vec_mule(coeff, pixV);
                vector signed int oddV  = vec_mulo(coeff, pixV);

                vector signed int accU0 = vec_ld(accOff,      u);
                vector signed int accU1 = vec_ld(accOff + 16, u);
                vector signed int accV0 = vec_ld(accOff,      v);
                vector signed int accV1 = vec_ld(accOff + 16, v);

                accU0 = vec_add(accU0, vec_mergeh(evenU, oddU));
                accU1 = vec_add(accU1, vec_mergel(evenU, oddU));
                accV0 = vec_add(accV0, vec_mergeh(evenV, oddV));
                accV1 = vec_add(accV1, vec_mergel(evenV, oddV));

                vec_st(accU0, accOff,      u);
                vec_st(accU1, accOff + 16, u);
                vec_st(accV0, accOff,      v);
                vec_st(accV1, accOff + 16, v);

                prevU = nextU;
                prevV = nextV;
            }
            for (; x < chrDstW; x++) {
                u[x] += line[x] * chrFilter[tap];
                v[x] += line[x + 2048] * chrFilter[tap];
            }
        }
        altivec_packIntArrayToCharArray(u, uDest, chrDstW);
        altivec_packIntArrayToCharArray(v, vDest, chrDstW);
    }
}

211 | 8c266f0c | Romain Dolbeau | |

/**
 * Horizontal scaling filter.
 *
 * For every output sample i:
 *     dst[i] = FFMIN((sum_j src[filterPos[i] + j] * filter[filterSize * i + j]) >> 7,
 *                    (1 << 15) - 1)
 *
 * Filter sizes 4, 8, 16 and any other multiple of 8 are vectorised; every
 * other size uses the plain C loop. The vector paths load filter rows with
 * aligned loads for sizes 4, 8 and 16 and with realigning loads otherwise.
 *
 * @param dst         dstW output samples
 * @param dstW        number of output samples
 * @param src         8-bit input samples
 * @param srcW        not used by this implementation
 * @param xInc        not used by this implementation
 * @param filter      dstW rows of filterSize coefficients
 * @param filterPos   first input position for every output sample
 * @param filterSize  number of taps per output sample
 */
static inline void hScale_altivec_real(int16_t *dst, int dstW,
                                       const uint8_t *src, int srcW,
                                       int xInc, const int16_t *filter,
                                       const int16_t *filterPos, int filterSize)
{
    register int i;
    DECLARE_ALIGNED(16, int, tempo)[4];

    /* The generic vector path below consumes taps in groups of 16 and then
     * at most one group of 8, so it is only exact for multiples of 8.
     * Everything else (except the dedicated 4-tap case) goes through the
     * scalar loop; previously sizes such as 12 or 20 silently lost their
     * last 4 taps. */
    if ((filterSize % 8) && filterSize != 4) {
        for (i = 0; i < dstW; i++) {
            register int j;
            register int srcPos = filterPos[i];
            register int val = 0;
            for (j = 0; j < filterSize; j++) {
                val += ((int)src[srcPos + j])*filter[filterSize*i + j];
            }
            dst[i] = FFMIN(val>>7, (1<<15)-1);
        }
        return;
    }

    switch (filterSize) {
    case 4:
        for (i = 0; i < dstW; i++) {
            register int srcPos = filterPos[i];

            vector unsigned char src_v0 = vec_ld(srcPos, src);
            /* Defined even when the second load is skipped; those lanes are
             * never selected for the bytes actually used. */
            vector unsigned char src_v1 = src_v0;
            vector unsigned char src_vF;
            vector signed short src_v, filter_v;
            vector signed int val_vEven, val_s;

            /* The 4 source bytes straddle two 16-byte blocks only when they
             * start at offset 13..15. The address is taken as unsigned so
             * the remainder can never be negative. */
            if ((((unsigned long)src + srcPos) % 16) > 12) {
                src_v1 = vec_ld(srcPos + 16, src);
            }
            src_vF = vec_perm(src_v0, src_v1, vec_lvsl(srcPos, src));

            /* Zero-extend bytes to shorts by merging with zero
             * (vec_unpackh would sign-extend). */
            src_v =
                (vector signed short)(vec_mergeh((vector unsigned char)vzero, src_vF));
            /* now put our elements in the even slots */
            src_v = vec_mergeh(src_v, (vector signed short)vzero);

            filter_v = vec_ld(i << 3, filter);
            /* The 3 above is 2 (filterSize == 4) + 1 (sizeof(short) == 2). */

            /* Only half of the loaded coefficients belong to this output:
             * the high or low half depending on (i << 3) % 16 (0 or 8).
             * vec_mule multiplies even lanes only, so spread the wanted
             * half into the even slots. */
            if ((i << 3) % 16)
                filter_v = vec_mergel(filter_v, (vector signed short)vzero);
            else
                filter_v = vec_mergeh(filter_v, (vector signed short)vzero);

            val_vEven = vec_mule(src_v, filter_v);
            val_s = vec_sums(val_vEven, vzero);
            vec_st(val_s, 0, tempo);
            /* vec_sums leaves the total in the last element. */
            dst[i] = FFMIN(tempo[3]>>7, (1<<15)-1);
        }
        break;

    case 8:
        for (i = 0; i < dstW; i++) {
            register int srcPos = filterPos[i];

            vector unsigned char src_v0 = vec_ld(srcPos, src);
            vector unsigned char src_v1 = src_v0;
            vector unsigned char src_vF;
            vector signed short src_v, filter_v;
            vector signed int val_v, val_s;

            /* 8 source bytes cross a 16-byte boundary from offset 9 on. */
            if ((((unsigned long)src + srcPos) % 16) > 8) {
                src_v1 = vec_ld(srcPos + 16, src);
            }
            src_vF = vec_perm(src_v0, src_v1, vec_lvsl(srcPos, src));

            /* Zero-extend bytes to shorts (vec_unpackh would sign-extend). */
            src_v =
                (vector signed short)(vec_mergeh((vector unsigned char)vzero, src_vF));
            filter_v = vec_ld(i << 4, filter);
            /* the 4 above is 3 (filterSize == 8) + 1 (sizeof(short) == 2) */

            val_v = vec_msums(src_v, filter_v, (vector signed int)vzero);
            val_s = vec_sums(val_v, vzero);
            vec_st(val_s, 0, tempo);
            dst[i] = FFMIN(tempo[3]>>7, (1<<15)-1);
        }
        break;

    case 16:
        for (i = 0; i < dstW; i++) {
            register int srcPos = filterPos[i];

            vector unsigned char src_v0 = vec_ld(srcPos, src);
            vector unsigned char src_v1 = vec_ld(srcPos + 16, src);
            vector unsigned char src_vF = vec_perm(src_v0, src_v1, vec_lvsl(srcPos, src));

            /* Zero-extend the low and high 8 bytes to shorts. */
            vector signed short src_vA =
                (vector signed short)(vec_mergeh((vector unsigned char)vzero, src_vF));
            vector signed short src_vB =
                (vector signed short)(vec_mergel((vector unsigned char)vzero, src_vF));

            vector signed short filter_v0 = vec_ld(i << 5, filter);
            vector signed short filter_v1 = vec_ld((i << 5) + 16, filter);
            /* the 5 above are 4 (filterSize == 16) + 1 (sizeof(short) == 2) */

            vector signed int val_acc = vec_msums(src_vA, filter_v0, (vector signed int)vzero);
            vector signed int val_v = vec_msums(src_vB, filter_v1, val_acc);

            vector signed int val_s = vec_sums(val_v, vzero);

            vec_st(val_s, 0, tempo);
            dst[i] = FFMIN(tempo[3]>>7, (1<<15)-1);
        }
        break;

    default:
        /* Any other multiple of 8: groups of 16 taps, then one group of 8. */
        for (i = 0; i < dstW; i++) {
            register int j;
            register int srcPos = filterPos[i];

            vector signed int val_s, val_v = (vector signed int)vzero;
            vector signed short filter_v0R = vec_ld(i * 2 * filterSize, filter);
            vector unsigned char permF = vec_lvsl((i * 2 * filterSize), filter);

            vector unsigned char src_v0 = vec_ld(srcPos, src);
            vector unsigned char permS = vec_lvsl(srcPos, src);

            for (j = 0 ; j < filterSize - 15; j += 16) {
                vector unsigned char src_v1 = vec_ld(srcPos + j + 16, src);
                vector unsigned char src_vF = vec_perm(src_v0, src_v1, permS);

                /* Zero-extend the low and high 8 bytes to shorts. */
                vector signed short src_vA =
                    (vector signed short)(vec_mergeh((vector unsigned char)vzero, src_vF));
                vector signed short src_vB =
                    (vector signed short)(vec_mergel((vector unsigned char)vzero, src_vF));

                vector signed short filter_v1R = vec_ld((i * 2 * filterSize) + (j * 2) + 16, filter);
                vector signed short filter_v2R = vec_ld((i * 2 * filterSize) + (j * 2) + 32, filter);
                vector signed short filter_v0 = vec_perm(filter_v0R, filter_v1R, permF);
                vector signed short filter_v1 = vec_perm(filter_v1R, filter_v2R, permF);

                vector signed int val_acc = vec_msums(src_vA, filter_v0, val_v);
                val_v = vec_msums(src_vB, filter_v1, val_acc);

                /* Carry the last loads over to the next group. */
                filter_v0R = filter_v2R;
                src_v0 = src_v1;
            }

            if (j < filterSize-7) {
                /* src_v0 and filter_v0R already hold the data for offset j. */
                vector unsigned char src_v1 = src_v0, src_vF;
                vector signed short src_v, filter_v1R, filter_v;

                /* j is a multiple of 16, so src + srcPos + j has the same
                 * 16-byte phase as src + srcPos. */
                if ((((unsigned long)src + srcPos) % 16) > 8) {
                    src_v1 = vec_ld(srcPos + j + 16, src);
                }
                src_vF = vec_perm(src_v0, src_v1, permS);

                /* Zero-extend bytes to shorts (vec_unpackh would sign-extend). */
                src_v =
                    (vector signed short)(vec_mergeh((vector unsigned char)vzero, src_vF));
                filter_v1R = vec_ld((i * 2 * filterSize) + (j * 2) + 16, filter);
                filter_v = vec_perm(filter_v0R, filter_v1R, permF);

                val_v = vec_msums(src_v, filter_v, val_v);
            }

            val_s = vec_sums(val_v, vzero);

            vec_st(val_s, 0, tempo);
            dst[i] = FFMIN(tempo[3]>>7, (1<<15)-1);
        }
    }
}

391 | b71cf33c | Romain Dolbeau | |

/**
 * Unscaled planar Y/U/V to packed conversion, output byte order Y0 U Y1 V.
 *
 * One chroma line is shared by two consecutive luma lines. When the width
 * is not a multiple of 16 the work is handed to yv12toyuy2().
 *
 * The vector path assumes:
 *   1) dst is 16-byte aligned
 *   2) dstStride is a multiple of 16
 *   3) width is a multiple of 16
 * Source pointers and strides may have any alignment: every row recomputes
 * its permute masks, and both the 32-pixel loop and the 16-pixel tail
 * realign their loads.
 *
 * @param c            scaler context; only c->srcW is read here
 * @param src          source planes: [0] = Y, [1] = U, [2] = V
 * @param srcStride    source strides: [0] = luma, [1] = chroma
 * @param srcSliceY    first destination line of this slice
 * @param srcSliceH    number of luma lines to convert
 * @param dstParam     destination planes; only [0] is written
 * @param dstStride_a  destination strides; only [0] is used
 * @return srcSliceH
 */
static inline int yv12toyuy2_unscaled_altivec(SwsContext *c, const uint8_t* src[], int srcStride[], int srcSliceY,
                                              int srcSliceH, uint8_t* dstParam[], int dstStride_a[])
{
    uint8_t *dst = dstParam[0] + dstStride_a[0]*srcSliceY;
    const uint8_t *ysrc = src[0];
    const uint8_t *usrc = src[1];
    const uint8_t *vsrc = src[2];
    const int width = c->srcW;
    const int height = srcSliceH;
    const int lumStride = srcStride[0];
    const int chromStride = srcStride[1];
    const int dstStride = dstStride_a[0];
    const int vertLumPerChroma = 2;
    int y;

    if (width&15) {
        yv12toyuy2(ysrc, usrc, vsrc, dst, c->srcW, srcSliceH, lumStride, chromStride, dstStride);
        return srcSliceH;
    }

    for (y = 0; y < height; y++) {
        /* Recomputed per row: the 16-byte phase of ysrc changes whenever
         * lumStride is not a multiple of 16. i is always a multiple of 16,
         * so one mask serves the whole row. */
        const vector unsigned char yperm = vec_lvsl(0, ysrc);
        int i;

        for (i = 0; i < width - 31; i += 32) {
            const unsigned int j = i >> 1;
            /* The last load of each group uses "+ size - 1" so that an
             * aligned row never touches the following 16-byte block. */
            vector unsigned char v_yA = vec_ld(i, ysrc);
            vector unsigned char v_yB = vec_ld(i + 16, ysrc);
            vector unsigned char v_yC = vec_ld(i + 31, ysrc);
            vector unsigned char v_y1 = vec_perm(v_yA, v_yB, yperm);
            vector unsigned char v_y2 = vec_perm(v_yB, v_yC, yperm);
            vector unsigned char v_uA = vec_ld(j, usrc);
            vector unsigned char v_uB = vec_ld(j + 15, usrc);
            vector unsigned char v_u = vec_perm(v_uA, v_uB, vec_lvsl(j, usrc));
            vector unsigned char v_vA = vec_ld(j, vsrc);
            vector unsigned char v_vB = vec_ld(j + 15, vsrc);
            vector unsigned char v_v = vec_perm(v_vA, v_vB, vec_lvsl(j, vsrc));
            /* U0 V0 U1 V1 ... */
            vector unsigned char v_uv_a = vec_mergeh(v_u, v_v);
            vector unsigned char v_uv_b = vec_mergel(v_u, v_v);
            /* Y0 U0 Y1 V0 ... */
            vector unsigned char v_yuy2_0 = vec_mergeh(v_y1, v_uv_a);
            vector unsigned char v_yuy2_1 = vec_mergel(v_y1, v_uv_a);
            vector unsigned char v_yuy2_2 = vec_mergeh(v_y2, v_uv_b);
            vector unsigned char v_yuy2_3 = vec_mergel(v_y2, v_uv_b);
            vec_st(v_yuy2_0, (i << 1), dst);
            vec_st(v_yuy2_1, (i << 1) + 16, dst);
            vec_st(v_yuy2_2, (i << 1) + 32, dst);
            vec_st(v_yuy2_3, (i << 1) + 48, dst);
        }
        if (i < width) {
            /* Remaining 16 pixels: 16 luma bytes and 8 bytes of each chroma
             * plane, realigned like the main loop. */
            const unsigned int j = i >> 1;
            vector unsigned char v_yA = vec_ld(i, ysrc);
            vector unsigned char v_yB = vec_ld(i + 15, ysrc);
            vector unsigned char v_y1 = vec_perm(v_yA, v_yB, yperm);
            vector unsigned char v_uA = vec_ld(j, usrc);
            vector unsigned char v_uB = vec_ld(j + 7, usrc);
            vector unsigned char v_u = vec_perm(v_uA, v_uB, vec_lvsl(j, usrc));
            vector unsigned char v_vA = vec_ld(j, vsrc);
            vector unsigned char v_vB = vec_ld(j + 7, vsrc);
            vector unsigned char v_v = vec_perm(v_vA, v_vB, vec_lvsl(j, vsrc));
            vector unsigned char v_uv_a = vec_mergeh(v_u, v_v);
            vector unsigned char v_yuy2_0 = vec_mergeh(v_y1, v_uv_a);
            vector unsigned char v_yuy2_1 = vec_mergel(v_y1, v_uv_a);
            vec_st(v_yuy2_0, (i << 1), dst);
            vec_st(v_yuy2_1, (i << 1) + 16, dst);
        }
        /* Advance chroma after every second luma line. */
        if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) {
            usrc += chromStride;
            vsrc += chromStride;
        }
        ysrc += lumStride;
        dst += dstStride;
    }

    return srcSliceH;
}

469 | |||

/**
 * Unscaled planar Y/U/V to packed conversion, output byte order U Y0 V Y1.
 *
 * One chroma line is shared by two consecutive luma lines. When the width
 * is not a multiple of 16 the work is handed to yv12touyvy().
 *
 * The vector path assumes:
 *   1) dst is 16-byte aligned
 *   2) dstStride is a multiple of 16
 *   3) width is a multiple of 16
 * Source pointers and strides may have any alignment: every row recomputes
 * its permute masks, and both the 32-pixel loop and the 16-pixel tail
 * realign their loads.
 *
 * @param c            scaler context; only c->srcW is read here
 * @param src          source planes: [0] = Y, [1] = U, [2] = V
 * @param srcStride    source strides: [0] = luma, [1] = chroma
 * @param srcSliceY    first destination line of this slice
 * @param srcSliceH    number of luma lines to convert
 * @param dstParam     destination planes; only [0] is written
 * @param dstStride_a  destination strides; only [0] is used
 * @return srcSliceH
 */
static inline int yv12touyvy_unscaled_altivec(SwsContext *c, const uint8_t* src[], int srcStride[], int srcSliceY,
                                              int srcSliceH, uint8_t* dstParam[], int dstStride_a[])
{
    uint8_t *dst = dstParam[0] + dstStride_a[0]*srcSliceY;
    const uint8_t *ysrc = src[0];
    const uint8_t *usrc = src[1];
    const uint8_t *vsrc = src[2];
    const int width = c->srcW;
    const int height = srcSliceH;
    const int lumStride = srcStride[0];
    const int chromStride = srcStride[1];
    const int dstStride = dstStride_a[0];
    const int vertLumPerChroma = 2;
    int y;

    if (width&15) {
        yv12touyvy(ysrc, usrc, vsrc, dst, c->srcW, srcSliceH, lumStride, chromStride, dstStride);
        return srcSliceH;
    }

    for (y = 0; y < height; y++) {
        /* Recomputed per row: the 16-byte phase of ysrc changes whenever
         * lumStride is not a multiple of 16. i is always a multiple of 16,
         * so one mask serves the whole row. */
        const vector unsigned char yperm = vec_lvsl(0, ysrc);
        int i;

        for (i = 0; i < width - 31; i += 32) {
            const unsigned int j = i >> 1;
            /* The last load of each group uses "+ size - 1" so that an
             * aligned row never touches the following 16-byte block. */
            vector unsigned char v_yA = vec_ld(i, ysrc);
            vector unsigned char v_yB = vec_ld(i + 16, ysrc);
            vector unsigned char v_yC = vec_ld(i + 31, ysrc);
            vector unsigned char v_y1 = vec_perm(v_yA, v_yB, yperm);
            vector unsigned char v_y2 = vec_perm(v_yB, v_yC, yperm);
            vector unsigned char v_uA = vec_ld(j, usrc);
            vector unsigned char v_uB = vec_ld(j + 15, usrc);
            vector unsigned char v_u = vec_perm(v_uA, v_uB, vec_lvsl(j, usrc));
            vector unsigned char v_vA = vec_ld(j, vsrc);
            vector unsigned char v_vB = vec_ld(j + 15, vsrc);
            vector unsigned char v_v = vec_perm(v_vA, v_vB, vec_lvsl(j, vsrc));
            /* U0 V0 U1 V1 ... */
            vector unsigned char v_uv_a = vec_mergeh(v_u, v_v);
            vector unsigned char v_uv_b = vec_mergel(v_u, v_v);
            /* U0 Y0 V0 Y1 ... */
            vector unsigned char v_uyvy_0 = vec_mergeh(v_uv_a, v_y1);
            vector unsigned char v_uyvy_1 = vec_mergel(v_uv_a, v_y1);
            vector unsigned char v_uyvy_2 = vec_mergeh(v_uv_b, v_y2);
            vector unsigned char v_uyvy_3 = vec_mergel(v_uv_b, v_y2);
            vec_st(v_uyvy_0, (i << 1), dst);
            vec_st(v_uyvy_1, (i << 1) + 16, dst);
            vec_st(v_uyvy_2, (i << 1) + 32, dst);
            vec_st(v_uyvy_3, (i << 1) + 48, dst);
        }
        if (i < width) {
            /* Remaining 16 pixels: 16 luma bytes and 8 bytes of each chroma
             * plane, realigned like the main loop. */
            const unsigned int j = i >> 1;
            vector unsigned char v_yA = vec_ld(i, ysrc);
            vector unsigned char v_yB = vec_ld(i + 15, ysrc);
            vector unsigned char v_y1 = vec_perm(v_yA, v_yB, yperm);
            vector unsigned char v_uA = vec_ld(j, usrc);
            vector unsigned char v_uB = vec_ld(j + 7, usrc);
            vector unsigned char v_u = vec_perm(v_uA, v_uB, vec_lvsl(j, usrc));
            vector unsigned char v_vA = vec_ld(j, vsrc);
            vector unsigned char v_vB = vec_ld(j + 7, vsrc);
            vector unsigned char v_v = vec_perm(v_vA, v_vB, vec_lvsl(j, vsrc));
            vector unsigned char v_uv_a = vec_mergeh(v_u, v_v);
            vector unsigned char v_uyvy_0 = vec_mergeh(v_uv_a, v_y1);
            vector unsigned char v_uyvy_1 = vec_mergel(v_uv_a, v_y1);
            vec_st(v_uyvy_0, (i << 1), dst);
            vec_st(v_uyvy_1, (i << 1) + 16, dst);
        }
        /* Advance chroma after every second luma line. */
        if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) {
            usrc += chromStride;
            vsrc += chromStride;
        }
        ysrc += lumStride;
        dst += dstStride;
    }

    return srcSliceH;
}