root/cebix/BasiliskII/src/uae_cpu/compiler/codegen_x86.cpp

Comparing BasiliskII/src/uae_cpu/compiler/codegen_x86.cpp (file contents):
Revision 1.6 by gbeauche, 2002-10-03T16:13:46Z vs.
Revision 1.41 by gbeauche, 2008-02-16T22:15:00Z

# Line 3 | Line 3
3   *
4   *  Original 68040 JIT compiler for UAE, copyright 2000-2002 Bernd Meyer
5   *
6 < *  Adaptation for Basilisk II and improvements, copyright 2000-2002
6 > *  Adaptation for Basilisk II and improvements, copyright 2000-2005
7   *    Gwenole Beauchesne
8   *
9 < *  Basilisk II (C) 1997-2002 Christian Bauer
9 > *  Basilisk II (C) 1997-2008 Christian Bauer
10 > *
11 > *  Portions related to CPU detection come from linux/arch/i386/kernel/setup.c
12   *  
13   *  This program is free software; you can redistribute it and/or modify
14   *  it under the terms of the GNU General Public License as published by
# Line 40 | Line 42
42   #define EBP_INDEX 5
43   #define ESI_INDEX 6
44   #define EDI_INDEX 7
45 + #if defined(__x86_64__)
46 + #define R8_INDEX  8
47 + #define R9_INDEX  9
48 + #define R10_INDEX 10
49 + #define R11_INDEX 11
50 + #define R12_INDEX 12
51 + #define R13_INDEX 13
52 + #define R14_INDEX 14
53 + #define R15_INDEX 15
54 + #endif
55 + /* XXX this has to match X86_Reg8H_Base + 4 */
56 + #define AH_INDEX (0x10+4+EAX_INDEX)
57 + #define CH_INDEX (0x10+4+ECX_INDEX)
58 + #define DH_INDEX (0x10+4+EDX_INDEX)
59 + #define BH_INDEX (0x10+4+EBX_INDEX)
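
These synthetic indices let one numbering scheme cover the x86 high-byte registers: masking with 0xf recovers the hardware encoding. A minimal sketch of the decoding (hypothetical helper for illustration; the emitters below do the same thing inline with expressions like s&0xf):

/* AH_INDEX (0x14) -> 4, CH_INDEX (0x15) -> 5, DH_INDEX (0x16) -> 6,
   BH_INDEX (0x17) -> 7, i.e. the ModRM numbers of %ah/%ch/%dh/%bh. */
static int hw_reg_encoding(int index)
{
	return index & 0xf;
}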
60  
61   /* The register in which subroutines return an integer return value */
62 < #define REG_RESULT 0
62 > #define REG_RESULT EAX_INDEX
63  
64   /* The registers subroutines take their first and second argument in */
65   #if defined( _MSC_VER ) && !defined( USE_NORMAL_CALLING_CONVENTION )
66   /* Handle the _fastcall parameters of ECX and EDX */
67 < #define REG_PAR1 1
68 < #define REG_PAR2 2
67 > #define REG_PAR1 ECX_INDEX
68 > #define REG_PAR2 EDX_INDEX
69 > #elif defined(__x86_64__)
70 > #define REG_PAR1 EDI_INDEX
71 > #define REG_PAR2 ESI_INDEX
72   #else
73 < #define REG_PAR1 0
74 < #define REG_PAR2 2
73 > #define REG_PAR1 EAX_INDEX
74 > #define REG_PAR2 EDX_INDEX
75   #endif
76  
77 < /* Three registers that are not used for any of the above */
58 < #define REG_NOPAR1 6
59 < #define REG_NOPAR2 5
60 < #define REG_NOPAR3 3
61 <
62 < #define REG_PC_PRE 0 /* The register we use for preloading regs.pc_p */
77 > #define REG_PC_PRE EAX_INDEX /* The register we use for preloading regs.pc_p */
78   #if defined( _MSC_VER ) && !defined( USE_NORMAL_CALLING_CONVENTION )
79 < #define REG_PC_TMP 0
79 > #define REG_PC_TMP EAX_INDEX
80   #else
81 < #define REG_PC_TMP 1 /* Another register that is not the above */
81 > #define REG_PC_TMP ECX_INDEX /* Another register that is not the above */
82   #endif
83  
84 < #define SHIFTCOUNT_NREG 1  /* Register that can be used for shiftcount.
84 > #define SHIFTCOUNT_NREG ECX_INDEX  /* Register that can be used for shiftcount.
85                                -1 if any reg will do */
86 < #define MUL_NREG1 0 /* %eax will hold the low 32 bits after a 32x32 mul */
87 < #define MUL_NREG2 2 /* %edx will hold the high 32 bits */
86 > #define MUL_NREG1 EAX_INDEX /* %eax will hold the low 32 bits after a 32x32 mul */
87 > #define MUL_NREG2 EDX_INDEX /* %edx will hold the high 32 bits */
88 >
89 > #define STACK_ALIGN             16
90 > #define STACK_OFFSET    sizeof(void *)
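
STACK_ALIGN and STACK_OFFSET describe the stack discipline: frames are kept 16-byte aligned, with the return address already occupying sizeof(void *) bytes at entry. A sketch of how such constants are typically combined (an assumption about intended use, not code from this diff):

/* Bytes to subtract from the stack pointer so that, counting the
   already-pushed return address, the frame stays STACK_ALIGN-aligned. */
static int stack_space(int nbytes)
{
	return ((nbytes + STACK_OFFSET + STACK_ALIGN - 1) & ~(STACK_ALIGN - 1)) - STACK_OFFSET;
}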
91  
92   uae_s8 always_used[]={4,-1};
93 + #if defined(__x86_64__)
94 + uae_s8 can_byte[]={0,1,2,3,5,6,7,8,9,10,11,12,13,14,15,-1};
95 + uae_s8 can_word[]={0,1,2,3,5,6,7,8,9,10,11,12,13,14,15,-1};
96 + #else
97   uae_s8 can_byte[]={0,1,2,3,-1};
98   uae_s8 can_word[]={0,1,2,3,5,6,7,-1};
99 + #endif
100  
101 + #if USE_OPTIMIZED_CALLS
102 + /* Make sure the interpretive core does not use cpuopti */
103 + uae_u8 call_saved[]={0,0,0,1,1,1,1,1};
104 + #error FIXME: code not ready
105 + #else
106   /* cpuopti mutates instruction handlers to assume registers are saved
107     by the caller */
108 < uae_u8 call_saved[]={0,0,0,0,1,0,0,0};
108 > uae_u8 call_saved[]={0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0};
109 > #endif
110  
111   /* This *should* be the same as call_saved. But:
112     - We might not really know which registers are saved, and which aren't,
# Line 86 | Line 115 | uae_u8 call_saved[]={0,0,0,0,1,0,0,0};
115     - Special registers (such as the stack pointer) should not be "preserved"
116       by pushing, even though they are "saved" across function calls
117   */
118 < uae_u8 need_to_preserve[]={1,1,1,1,0,1,1,1};
118 > #if defined(__x86_64__)
119 > /* callee-saved registers as defined by the Linux AMD64 ABI: rbx, rbp, rsp, r12 - r15 */
120 > /* preserve r11 because it's generally used to hold pointers to functions */
121 > static const uae_u8 need_to_preserve[]={0,0,0,1,0,1,0,0,0,0,0,1,1,1,1,1};
122 > #else
123 > /* callee-saved registers as defined by the System V IA-32 ABI: edi, esi, ebx, ebp */
124 > static const uae_u8 need_to_preserve[]={0,0,0,1,0,1,1,1};
125 > #endif
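
A prologue that honors this table just walks it and pushes every flagged register, popping in reverse order on exit. A minimal sketch (hypothetical helper; the generator performs the equivalent loop where it emits prologues):

static void push_preserved_regs(void)
{
	int i;
	for (i = 0; i < (int)sizeof(need_to_preserve); i++)
		if (need_to_preserve[i])
			raw_push_l_r(i);	/* matching raw_pop_l_r calls run in reverse */
}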
126  
127   /* Whether classes of instructions do or don't clobber the native flags */
128   #define CLOBBER_MOV
# Line 111 | Line 147 | uae_u8 need_to_preserve[]={1,1,1,1,0,1,1
147   #define CLOBBER_TEST clobber_flags()
148   #define CLOBBER_CL16
149   #define CLOBBER_CL8  
150 + #define CLOBBER_SE32
151   #define CLOBBER_SE16
152   #define CLOBBER_SE8
153 + #define CLOBBER_ZE32
154   #define CLOBBER_ZE16
155   #define CLOBBER_ZE8
156   #define CLOBBER_SW16 clobber_flags()
# Line 122 | Line 160 | uae_u8 need_to_preserve[]={1,1,1,1,0,1,1
160   #define CLOBBER_BT   clobber_flags()
161   #define CLOBBER_BSF  clobber_flags()
162  
163 + /* The older code generator is now deprecated.  */
164 + #define USE_NEW_RTASM 1
165 +
166 + #if USE_NEW_RTASM
167 +
168 + #if defined(__x86_64__)
169 + #define X86_TARGET_64BIT                1
170 + /* The address override prefix causes a 5-cycle penalty on Intel Core
171 +   processors. Another solution would be to decompose the load into an LEA,
172 +   MOV (to zero-extend), MOV (from memory): would that be better? */
173 + #define ADDR32                                  x86_emit_byte(0x67),
174 + #else
175 + #define ADDR32                                  /**/
176 + #endif
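
The alternative raised in the comment would trade the 0x67 prefix for an explicit zero-extension of the 32-bit guest address. A sketch of that decomposition (an assumption for illustration; REG_SCRATCH is hypothetical):

static void load_l_addr32(int d, int base32)
{
	MOVLrr(base32, REG_SCRATCH);             /* a 32-bit mov zero-extends on x86_64 */
	MOVLmr(0, REG_SCRATCH, X86_NOREG, 1, d); /* plain 64-bit addressing, no 0x67 */
}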
177 + #define X86_FLAT_REGISTERS              0
178 + #define X86_OPTIMIZE_ALU                1
179 + #define X86_OPTIMIZE_ROTSHI             1
180 + #include "codegen_x86.h"
181 +
182 + #define x86_emit_byte(B)                emit_byte(B)
183 + #define x86_emit_word(W)                emit_word(W)
184 + #define x86_emit_long(L)                emit_long(L)
185 + #define x86_emit_quad(Q)                emit_quad(Q)
186 + #define x86_get_target()                get_target()
187 + #define x86_emit_failure(MSG)   jit_fail(MSG, __FILE__, __LINE__, __FUNCTION__)
188 +
189 + static void jit_fail(const char *msg, const char *file, int line, const char *function)
190 + {
191 +        fprintf(stderr, "JIT failure in function %s from file %s at line %d: %s\n",
192 +                        function, file, line, msg);
193 +        abort();
194 + }
195 +
196 + LOWFUNC(NONE,WRITE,1,raw_push_l_r,(R4 r))
197 + {
198 + #if defined(__x86_64__)
199 +        PUSHQr(r);
200 + #else
201 +        PUSHLr(r);
202 + #endif
203 + }
204 + LENDFUNC(NONE,WRITE,1,raw_push_l_r,(R4 r))
205 +
206 + LOWFUNC(NONE,READ,1,raw_pop_l_r,(R4 r))
207 + {
208 + #if defined(__x86_64__)
209 +        POPQr(r);
210 + #else
211 +        POPLr(r);
212 + #endif
213 + }
214 + LENDFUNC(NONE,READ,1,raw_pop_l_r,(R4 r))
215 +
216 + LOWFUNC(NONE,READ,1,raw_pop_l_m,(MEMW d))
217 + {
218 + #if defined(__x86_64__)
219 +        POPQm(d, X86_NOREG, X86_NOREG, 1);
220 + #else
221 +        POPLm(d, X86_NOREG, X86_NOREG, 1);
222 + #endif
223 + }
224 + LENDFUNC(NONE,READ,1,raw_pop_l_m,(MEMW d))
225 +
226 + LOWFUNC(WRITE,NONE,2,raw_bt_l_ri,(R4 r, IMM i))
227 + {
228 +        BTLir(i, r);
229 + }
230 + LENDFUNC(WRITE,NONE,2,raw_bt_l_ri,(R4 r, IMM i))
231 +
232 + LOWFUNC(WRITE,NONE,2,raw_bt_l_rr,(R4 r, R4 b))
233 + {
234 +        BTLrr(b, r);
235 + }
236 + LENDFUNC(WRITE,NONE,2,raw_bt_l_rr,(R4 r, R4 b))
237 +
238 + LOWFUNC(WRITE,NONE,2,raw_btc_l_ri,(RW4 r, IMM i))
239 + {
240 +        BTCLir(i, r);
241 + }
242 + LENDFUNC(WRITE,NONE,2,raw_btc_l_ri,(RW4 r, IMM i))
243 +
244 + LOWFUNC(WRITE,NONE,2,raw_btc_l_rr,(RW4 r, R4 b))
245 + {
246 +        BTCLrr(b, r);
247 + }
248 + LENDFUNC(WRITE,NONE,2,raw_btc_l_rr,(RW4 r, R4 b))
249 +
250 + LOWFUNC(WRITE,NONE,2,raw_btr_l_ri,(RW4 r, IMM i))
251 + {
252 +        BTRLir(i, r);
253 + }
254 + LENDFUNC(WRITE,NONE,2,raw_btr_l_ri,(RW4 r, IMM i))
255 +
256 + LOWFUNC(WRITE,NONE,2,raw_btr_l_rr,(RW4 r, R4 b))
257 + {
258 +        BTRLrr(b, r);
259 + }
260 + LENDFUNC(WRITE,NONE,2,raw_btr_l_rr,(RW4 r, R4 b))
261 +
262 + LOWFUNC(WRITE,NONE,2,raw_bts_l_ri,(RW4 r, IMM i))
263 + {
264 +        BTSLir(i, r);
265 + }
266 + LENDFUNC(WRITE,NONE,2,raw_bts_l_ri,(RW4 r, IMM i))
267 +
268 + LOWFUNC(WRITE,NONE,2,raw_bts_l_rr,(RW4 r, R4 b))
269 + {
270 +        BTSLrr(b, r);
271 + }
272 + LENDFUNC(WRITE,NONE,2,raw_bts_l_rr,(RW4 r, R4 b))
273 +
274 + LOWFUNC(WRITE,NONE,2,raw_sub_w_ri,(RW2 d, IMM i))
275 + {
276 +        SUBWir(i, d);
277 + }
278 + LENDFUNC(WRITE,NONE,2,raw_sub_w_ri,(RW2 d, IMM i))
279 +
280 + LOWFUNC(NONE,READ,2,raw_mov_l_rm,(W4 d, MEMR s))
281 + {
282 +        MOVLmr(s, X86_NOREG, X86_NOREG, 1, d);
283 + }
284 + LENDFUNC(NONE,READ,2,raw_mov_l_rm,(W4 d, MEMR s))
285 +
286 + LOWFUNC(NONE,WRITE,2,raw_mov_l_mi,(MEMW d, IMM s))
287 + {
288 +        MOVLim(s, d, X86_NOREG, X86_NOREG, 1);
289 + }
290 + LENDFUNC(NONE,WRITE,2,raw_mov_l_mi,(MEMW d, IMM s))
291 +
292 + LOWFUNC(NONE,WRITE,2,raw_mov_w_mi,(MEMW d, IMM s))
293 + {
294 +        MOVWim(s, d, X86_NOREG, X86_NOREG, 1);
295 + }
296 + LENDFUNC(NONE,WRITE,2,raw_mov_w_mi,(MEMW d, IMM s))
297 +
298 + LOWFUNC(NONE,WRITE,2,raw_mov_b_mi,(MEMW d, IMM s))
299 + {
300 +        MOVBim(s, d, X86_NOREG, X86_NOREG, 1);
301 + }
302 + LENDFUNC(NONE,WRITE,2,raw_mov_b_mi,(MEMW d, IMM s))
303 +
304 + LOWFUNC(WRITE,RMW,2,raw_rol_b_mi,(MEMRW d, IMM i))
305 + {
306 +        ROLBim(i, d, X86_NOREG, X86_NOREG, 1);
307 + }
308 + LENDFUNC(WRITE,RMW,2,raw_rol_b_mi,(MEMRW d, IMM i))
309 +
310 + LOWFUNC(WRITE,NONE,2,raw_rol_b_ri,(RW1 r, IMM i))
311 + {
312 +        ROLBir(i, r);
313 + }
314 + LENDFUNC(WRITE,NONE,2,raw_rol_b_ri,(RW1 r, IMM i))
315 +
316 + LOWFUNC(WRITE,NONE,2,raw_rol_w_ri,(RW2 r, IMM i))
317 + {
318 +        ROLWir(i, r);
319 + }
320 + LENDFUNC(WRITE,NONE,2,raw_rol_w_ri,(RW2 r, IMM i))
321 +
322 + LOWFUNC(WRITE,NONE,2,raw_rol_l_ri,(RW4 r, IMM i))
323 + {
324 +        ROLLir(i, r);
325 + }
326 + LENDFUNC(WRITE,NONE,2,raw_rol_l_ri,(RW4 r, IMM i))
327 +
328 + LOWFUNC(WRITE,NONE,2,raw_rol_l_rr,(RW4 d, R1 r))
329 + {
330 +        ROLLrr(r, d);
331 + }
332 + LENDFUNC(WRITE,NONE,2,raw_rol_l_rr,(RW4 d, R1 r))
333 +
334 + LOWFUNC(WRITE,NONE,2,raw_rol_w_rr,(RW2 d, R1 r))
335 + {
336 +        ROLWrr(r, d);
337 + }
338 + LENDFUNC(WRITE,NONE,2,raw_rol_w_rr,(RW2 d, R1 r))
339 +
340 + LOWFUNC(WRITE,NONE,2,raw_rol_b_rr,(RW1 d, R1 r))
341 + {
342 +        ROLBrr(r, d);
343 + }
344 + LENDFUNC(WRITE,NONE,2,raw_rol_b_rr,(RW1 d, R1 r))
345 +
346 + LOWFUNC(WRITE,NONE,2,raw_shll_l_rr,(RW4 d, R1 r))
347 + {
348 +        SHLLrr(r, d);
349 + }
350 + LENDFUNC(WRITE,NONE,2,raw_shll_l_rr,(RW4 d, R1 r))
351 +
352 + LOWFUNC(WRITE,NONE,2,raw_shll_w_rr,(RW2 d, R1 r))
353 + {
354 +        SHLWrr(r, d);
355 + }
356 + LENDFUNC(WRITE,NONE,2,raw_shll_w_rr,(RW2 d, R1 r))
357 +
358 + LOWFUNC(WRITE,NONE,2,raw_shll_b_rr,(RW1 d, R1 r))
359 + {
360 +        SHLBrr(r, d);
361 + }
362 + LENDFUNC(WRITE,NONE,2,raw_shll_b_rr,(RW1 d, R1 r))
363 +
364 + LOWFUNC(WRITE,NONE,2,raw_ror_b_ri,(RW1 r, IMM i))
365 + {
366 +        RORBir(i, r);
367 + }
368 + LENDFUNC(WRITE,NONE,2,raw_ror_b_ri,(RW1 r, IMM i))
369 +
370 + LOWFUNC(WRITE,NONE,2,raw_ror_w_ri,(RW2 r, IMM i))
371 + {
372 +        RORWir(i, r);
373 + }
374 + LENDFUNC(WRITE,NONE,2,raw_ror_w_ri,(RW2 r, IMM i))
375 +
376 + LOWFUNC(WRITE,READ,2,raw_or_l_rm,(RW4 d, MEMR s))
377 + {
378 +        ORLmr(s, X86_NOREG, X86_NOREG, 1, d);
379 + }
380 + LENDFUNC(WRITE,READ,2,raw_or_l_rm,(RW4 d, MEMR s))
381 +
382 + LOWFUNC(WRITE,NONE,2,raw_ror_l_ri,(RW4 r, IMM i))
383 + {
384 +        RORLir(i, r);
385 + }
386 + LENDFUNC(WRITE,NONE,2,raw_ror_l_ri,(RW4 r, IMM i))
387 +
388 + LOWFUNC(WRITE,NONE,2,raw_ror_l_rr,(RW4 d, R1 r))
389 + {
390 +        RORLrr(r, d);
391 + }
392 + LENDFUNC(WRITE,NONE,2,raw_ror_l_rr,(RW4 d, R1 r))
393 +
394 + LOWFUNC(WRITE,NONE,2,raw_ror_w_rr,(RW2 d, R1 r))
395 + {
396 +        RORWrr(r, d);
397 + }
398 + LENDFUNC(WRITE,NONE,2,raw_ror_w_rr,(RW2 d, R1 r))
399 +
400 + LOWFUNC(WRITE,NONE,2,raw_ror_b_rr,(RW1 d, R1 r))
401 + {
402 +        RORBrr(r, d);
403 + }
404 + LENDFUNC(WRITE,NONE,2,raw_ror_b_rr,(RW1 d, R1 r))
405 +
406 + LOWFUNC(WRITE,NONE,2,raw_shrl_l_rr,(RW4 d, R1 r))
407 + {
408 +        SHRLrr(r, d);
409 + }
410 + LENDFUNC(WRITE,NONE,2,raw_shrl_l_rr,(RW4 d, R1 r))
411 +
412 + LOWFUNC(WRITE,NONE,2,raw_shrl_w_rr,(RW2 d, R1 r))
413 + {
414 +        SHRWrr(r, d);
415 + }
416 + LENDFUNC(WRITE,NONE,2,raw_shrl_w_rr,(RW2 d, R1 r))
417 +
418 + LOWFUNC(WRITE,NONE,2,raw_shrl_b_rr,(RW1 d, R1 r))
419 + {
420 +        SHRBrr(r, d);
421 + }
422 + LENDFUNC(WRITE,NONE,2,raw_shrl_b_rr,(RW1 d, R1 r))
423 +
424 + LOWFUNC(WRITE,NONE,2,raw_shra_l_rr,(RW4 d, R1 r))
425 + {
426 +        SARLrr(r, d);
427 + }
428 + LENDFUNC(WRITE,NONE,2,raw_shra_l_rr,(RW4 d, R1 r))
429 +
430 + LOWFUNC(WRITE,NONE,2,raw_shra_w_rr,(RW2 d, R1 r))
431 + {
432 +        SARWrr(r, d);
433 + }
434 + LENDFUNC(WRITE,NONE,2,raw_shra_w_rr,(RW2 d, R1 r))
435 +
436 + LOWFUNC(WRITE,NONE,2,raw_shra_b_rr,(RW1 d, R1 r))
437 + {
438 +        SARBrr(r, d);
439 + }
440 + LENDFUNC(WRITE,NONE,2,raw_shra_b_rr,(RW1 d, R1 r))
441 +
442 + LOWFUNC(WRITE,NONE,2,raw_shll_l_ri,(RW4 r, IMM i))
443 + {
444 +        SHLLir(i, r);
445 + }
446 + LENDFUNC(WRITE,NONE,2,raw_shll_l_ri,(RW4 r, IMM i))
447 +
448 + LOWFUNC(WRITE,NONE,2,raw_shll_w_ri,(RW2 r, IMM i))
449 + {
450 +        SHLWir(i, r);
451 + }
452 + LENDFUNC(WRITE,NONE,2,raw_shll_w_ri,(RW2 r, IMM i))
453 +
454 + LOWFUNC(WRITE,NONE,2,raw_shll_b_ri,(RW1 r, IMM i))
455 + {
456 +        SHLBir(i, r);
457 + }
458 + LENDFUNC(WRITE,NONE,2,raw_shll_b_ri,(RW1 r, IMM i))
459 +
460 + LOWFUNC(WRITE,NONE,2,raw_shrl_l_ri,(RW4 r, IMM i))
461 + {
462 +        SHRLir(i, r);
463 + }
464 + LENDFUNC(WRITE,NONE,2,raw_shrl_l_ri,(RW4 r, IMM i))
465 +
466 + LOWFUNC(WRITE,NONE,2,raw_shrl_w_ri,(RW2 r, IMM i))
467 + {
468 +        SHRWir(i, r);
469 + }
470 + LENDFUNC(WRITE,NONE,2,raw_shrl_w_ri,(RW2 r, IMM i))
471 +
472 + LOWFUNC(WRITE,NONE,2,raw_shrl_b_ri,(RW1 r, IMM i))
473 + {
474 +        SHRBir(i, r);
475 + }
476 + LENDFUNC(WRITE,NONE,2,raw_shrl_b_ri,(RW1 r, IMM i))
477 +
478 + LOWFUNC(WRITE,NONE,2,raw_shra_l_ri,(RW4 r, IMM i))
479 + {
480 +        SARLir(i, r);
481 + }
482 + LENDFUNC(WRITE,NONE,2,raw_shra_l_ri,(RW4 r, IMM i))
483 +
484 + LOWFUNC(WRITE,NONE,2,raw_shra_w_ri,(RW2 r, IMM i))
485 + {
486 +        SARWir(i, r);
487 + }
488 + LENDFUNC(WRITE,NONE,2,raw_shra_w_ri,(RW2 r, IMM i))
489 +
490 + LOWFUNC(WRITE,NONE,2,raw_shra_b_ri,(RW1 r, IMM i))
491 + {
492 +        SARBir(i, r);
493 + }
494 + LENDFUNC(WRITE,NONE,2,raw_shra_b_ri,(RW1 r, IMM i))
495 +
496 + LOWFUNC(WRITE,NONE,1,raw_sahf,(R2 dummy_ah))
497 + {
498 +        SAHF();
499 + }
500 + LENDFUNC(WRITE,NONE,1,raw_sahf,(R2 dummy_ah))
501 +
502 + LOWFUNC(NONE,NONE,1,raw_cpuid,(R4 dummy_eax))
503 + {
504 +        CPUID();
505 + }
506 + LENDFUNC(NONE,NONE,1,raw_cpuid,(R4 dummy_eax))
507 +
508 + LOWFUNC(READ,NONE,1,raw_lahf,(W2 dummy_ah))
509 + {
510 +        LAHF();
511 + }
512 + LENDFUNC(READ,NONE,1,raw_lahf,(W2 dummy_ah))
513 +
514 + LOWFUNC(READ,NONE,2,raw_setcc,(W1 d, IMM cc))
515 + {
516 +        SETCCir(cc, d);
517 + }
518 + LENDFUNC(READ,NONE,2,raw_setcc,(W1 d, IMM cc))
519 +
520 + LOWFUNC(READ,WRITE,2,raw_setcc_m,(MEMW d, IMM cc))
521 + {
522 +        SETCCim(cc, d, X86_NOREG, X86_NOREG, 1);
523 + }
524 + LENDFUNC(READ,WRITE,2,raw_setcc_m,(MEMW d, IMM cc))
525 +
526 + LOWFUNC(READ,NONE,3,raw_cmov_b_rr,(RW1 d, R1 s, IMM cc))
527 + {
528 +        /* replacement using branch and mov */
529 +        int8 *target_p = (int8 *)x86_get_target() + 1;
530 +        JCCSii(cc^1, 0);
531 +        MOVBrr(s, d);
532 +        *target_p = (uintptr)x86_get_target() - ((uintptr)target_p + 1);
533 + }
534 + LENDFUNC(READ,NONE,3,raw_cmov_b_rr,(RW1 d, R1 s, IMM cc))
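
The non-CMOV fallback above relies on a backpatched short branch: the Jcc is emitted with a zero displacement, target_p remembers where that displacement byte lives, and once the guarded MOV is out the byte is rewritten with the distance actually covered. The same idiom, spelled out as a sketch (hypothetical helper name):

static void emit_guarded(int cc)
{
	int8 *disp = (int8 *)x86_get_target() + 1; /* displacement byte of the Jcc */
	JCCSii(cc^1, 0);                           /* skip the payload unless cc holds */
	/* ... emit the instruction(s) to execute only when cc holds ... */
	*disp = (uintptr)x86_get_target() - ((uintptr)disp + 1);
}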
535 +
536 + LOWFUNC(READ,NONE,3,raw_cmov_w_rr,(RW2 d, R2 s, IMM cc))
537 + {
538 +        if (have_cmov)
539 +                CMOVWrr(cc, s, d);
540 +        else { /* replacement using branch and mov */
541 +                int8 *target_p = (int8 *)x86_get_target() + 1;
542 +                JCCSii(cc^1, 0);
543 +                MOVWrr(s, d);
544 +            *target_p = (uintptr)x86_get_target() - ((uintptr)target_p + 1);
545 +        }
546 + }
547 + LENDFUNC(READ,NONE,3,raw_cmov_w_rr,(RW2 d, R2 s, IMM cc))
548 +
549 + LOWFUNC(READ,NONE,3,raw_cmov_l_rr,(RW4 d, R4 s, IMM cc))
550 + {
551 +        if (have_cmov)
552 +                CMOVLrr(cc, s, d);
553 +        else { /* replacement using branch and mov */
554 +                int8 *target_p = (int8 *)x86_get_target() + 1;
555 +                JCCSii(cc^1, 0);
556 +                MOVLrr(s, d);
557 +            *target_p = (uintptr)x86_get_target() - ((uintptr)target_p + 1);
558 +        }
559 + }
560 + LENDFUNC(READ,NONE,3,raw_cmov_l_rr,(RW4 d, R4 s, IMM cc))
561 +
562 + LOWFUNC(WRITE,NONE,2,raw_bsf_l_rr,(W4 d, R4 s))
563 + {
564 +        BSFLrr(s, d);
565 + }
566 + LENDFUNC(WRITE,NONE,2,raw_bsf_l_rr,(W4 d, R4 s))
567 +
568 + LOWFUNC(NONE,NONE,2,raw_sign_extend_32_rr,(W4 d, R4 s))
569 + {
570 +        MOVSLQrr(s, d);
571 + }
572 + LENDFUNC(NONE,NONE,2,raw_sign_extend_32_rr,(W4 d, R4 s))
573 +
574 + LOWFUNC(NONE,NONE,2,raw_sign_extend_16_rr,(W4 d, R2 s))
575 + {
576 +        MOVSWLrr(s, d);
577 + }
578 + LENDFUNC(NONE,NONE,2,raw_sign_extend_16_rr,(W4 d, R2 s))
579 +
580 + LOWFUNC(NONE,NONE,2,raw_sign_extend_8_rr,(W4 d, R1 s))
581 + {
582 +        MOVSBLrr(s, d);
583 + }
584 + LENDFUNC(NONE,NONE,2,raw_sign_extend_8_rr,(W4 d, R1 s))
585 +
586 + LOWFUNC(NONE,NONE,2,raw_zero_extend_16_rr,(W4 d, R2 s))
587 + {
588 +        MOVZWLrr(s, d);
589 + }
590 + LENDFUNC(NONE,NONE,2,raw_zero_extend_16_rr,(W4 d, R2 s))
591 +
592 + LOWFUNC(NONE,NONE,2,raw_zero_extend_8_rr,(W4 d, R1 s))
593 + {
594 +        MOVZBLrr(s, d);
595 + }
596 + LENDFUNC(NONE,NONE,2,raw_zero_extend_8_rr,(W4 d, R1 s))
597 +
598 + LOWFUNC(NONE,NONE,2,raw_imul_32_32,(RW4 d, R4 s))
599 + {
600 +        IMULLrr(s, d);
601 + }
602 + LENDFUNC(NONE,NONE,2,raw_imul_32_32,(RW4 d, R4 s))
603 +
604 + LOWFUNC(NONE,NONE,2,raw_imul_64_32,(RW4 d, RW4 s))
605 + {
606 +        if (d!=MUL_NREG1 || s!=MUL_NREG2) {
607 +                write_log("Bad register in IMUL: d=%d, s=%d\n",d,s);
608 +                abort();
609 +        }
610 +        IMULLr(s);
611 + }
612 + LENDFUNC(NONE,NONE,2,raw_imul_64_32,(RW4 d, RW4 s))
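
The register check exists because the one-operand IMUL has hard-wired operands: it multiplies %eax by the named register and spreads the 64-bit product over %edx:%eax, so the allocator must already have pinned d and s to MUL_NREG1/MUL_NREG2. A C model of what IMULLr(s) leaves behind (illustrative only):

static void imul_64_32_model(uae_u32 *eax, uae_u32 *edx, uae_u32 s)
{
	uae_s64 prod = (uae_s64)(uae_s32)*eax * (uae_s32)s;
	*eax = (uae_u32)prod;           /* low 32 bits  (MUL_NREG1) */
	*edx = (uae_u32)(prod >> 32);   /* high 32 bits (MUL_NREG2) */
}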
613 +
614 + LOWFUNC(NONE,NONE,2,raw_mul_64_32,(RW4 d, RW4 s))
615 + {
616 +        if (d!=MUL_NREG1 || s!=MUL_NREG2) {
617 +                write_log("Bad register in MUL: d=%d, s=%d\n",d,s);
618 +                abort();
619 +        }
620 +        MULLr(s);
621 + }
622 + LENDFUNC(NONE,NONE,2,raw_mul_64_32,(RW4 d, RW4 s))
623 +
624 + LOWFUNC(NONE,NONE,2,raw_mul_32_32,(RW4 d, R4 s))
625 + {
626 +        abort(); /* %^$&%^$%#^ x86! */
627 + }
628 + LENDFUNC(NONE,NONE,2,raw_mul_32_32,(RW4 d, R4 s))
629 +
630 + LOWFUNC(NONE,NONE,2,raw_mov_b_rr,(W1 d, R1 s))
631 + {
632 +        MOVBrr(s, d);
633 + }
634 + LENDFUNC(NONE,NONE,2,raw_mov_b_rr,(W1 d, R1 s))
635 +
636 + LOWFUNC(NONE,NONE,2,raw_mov_w_rr,(W2 d, R2 s))
637 + {
638 +        MOVWrr(s, d);
639 + }
640 + LENDFUNC(NONE,NONE,2,raw_mov_w_rr,(W2 d, R2 s))
641 +
642 + LOWFUNC(NONE,READ,4,raw_mov_l_rrm_indexed,(W4 d,R4 baser, R4 index, IMM factor))
643 + {
644 +        ADDR32 MOVLmr(0, baser, index, factor, d);
645 + }
646 + LENDFUNC(NONE,READ,4,raw_mov_l_rrm_indexed,(W4 d,R4 baser, R4 index, IMM factor))
647 +
648 + LOWFUNC(NONE,READ,4,raw_mov_w_rrm_indexed,(W2 d, R4 baser, R4 index, IMM factor))
649 + {
650 +        ADDR32 MOVWmr(0, baser, index, factor, d);
651 + }
652 + LENDFUNC(NONE,READ,4,raw_mov_w_rrm_indexed,(W2 d, R4 baser, R4 index, IMM factor))
653 +
654 + LOWFUNC(NONE,READ,4,raw_mov_b_rrm_indexed,(W1 d, R4 baser, R4 index, IMM factor))
655 + {
656 +        ADDR32 MOVBmr(0, baser, index, factor, d);
657 + }
658 + LENDFUNC(NONE,READ,4,raw_mov_b_rrm_indexed,(W1 d, R4 baser, R4 index, IMM factor))
659 +
660 + LOWFUNC(NONE,WRITE,4,raw_mov_l_mrr_indexed,(R4 baser, R4 index, IMM factor, R4 s))
661 + {
662 +        ADDR32 MOVLrm(s, 0, baser, index, factor);
663 + }
664 + LENDFUNC(NONE,WRITE,4,raw_mov_l_mrr_indexed,(R4 baser, R4 index, IMM factor, R4 s))
665 +
666 + LOWFUNC(NONE,WRITE,4,raw_mov_w_mrr_indexed,(R4 baser, R4 index, IMM factor, R2 s))
667 + {
668 +        ADDR32 MOVWrm(s, 0, baser, index, factor);
669 + }
670 + LENDFUNC(NONE,WRITE,4,raw_mov_w_mrr_indexed,(R4 baser, R4 index, IMM factor, R2 s))
671 +
672 + LOWFUNC(NONE,WRITE,4,raw_mov_b_mrr_indexed,(R4 baser, R4 index, IMM factor, R1 s))
673 + {
674 +        ADDR32 MOVBrm(s, 0, baser, index, factor);
675 + }
676 + LENDFUNC(NONE,WRITE,4,raw_mov_b_mrr_indexed,(R4 baser, R4 index, IMM factor, R1 s))
677 +
678 + LOWFUNC(NONE,WRITE,5,raw_mov_l_bmrr_indexed,(IMM base, R4 baser, R4 index, IMM factor, R4 s))
679 + {
680 +        ADDR32 MOVLrm(s, base, baser, index, factor);
681 + }
682 + LENDFUNC(NONE,WRITE,5,raw_mov_l_bmrr_indexed,(IMM base, R4 baser, R4 index, IMM factor, R4 s))
683 +
684 + LOWFUNC(NONE,WRITE,5,raw_mov_w_bmrr_indexed,(IMM base, R4 baser, R4 index, IMM factor, R2 s))
685 + {
686 +        ADDR32 MOVWrm(s, base, baser, index, factor);
687 + }
688 + LENDFUNC(NONE,WRITE,5,raw_mov_w_bmrr_indexed,(IMM base, R4 baser, R4 index, IMM factor, R2 s))
689 +
690 + LOWFUNC(NONE,WRITE,5,raw_mov_b_bmrr_indexed,(IMM base, R4 baser, R4 index, IMM factor, R1 s))
691 + {
692 +        ADDR32 MOVBrm(s, base, baser, index, factor);
693 + }
694 + LENDFUNC(NONE,WRITE,5,raw_mov_b_bmrr_indexed,(IMM base, R4 baser, R4 index, IMM factor, R1 s))
695 +
696 + LOWFUNC(NONE,READ,5,raw_mov_l_brrm_indexed,(W4 d, IMM base, R4 baser, R4 index, IMM factor))
697 + {
698 +        ADDR32 MOVLmr(base, baser, index, factor, d);
699 + }
700 + LENDFUNC(NONE,READ,5,raw_mov_l_brrm_indexed,(W4 d, IMM base, R4 baser, R4 index, IMM factor))
701 +
702 + LOWFUNC(NONE,READ,5,raw_mov_w_brrm_indexed,(W2 d, IMM base, R4 baser, R4 index, IMM factor))
703 + {
704 +        ADDR32 MOVWmr(base, baser, index, factor, d);
705 + }
706 + LENDFUNC(NONE,READ,5,raw_mov_w_brrm_indexed,(W2 d, IMM base, R4 baser, R4 index, IMM factor))
707 +
708 + LOWFUNC(NONE,READ,5,raw_mov_b_brrm_indexed,(W1 d, IMM base, R4 baser, R4 index, IMM factor))
709 + {
710 +        ADDR32 MOVBmr(base, baser, index, factor, d);
711 + }
712 + LENDFUNC(NONE,READ,5,raw_mov_b_brrm_indexed,(W1 d, IMM base, R4 baser, R4 index, IMM factor))
713 +
714 + LOWFUNC(NONE,READ,4,raw_mov_l_rm_indexed,(W4 d, IMM base, R4 index, IMM factor))
715 + {
716 +        ADDR32 MOVLmr(base, X86_NOREG, index, factor, d);
717 + }
718 + LENDFUNC(NONE,READ,4,raw_mov_l_rm_indexed,(W4 d, IMM base, R4 index, IMM factor))
719 +
720 + LOWFUNC(NONE,READ,5,raw_cmov_l_rm_indexed,(W4 d, IMM base, R4 index, IMM factor, IMM cond))
721 + {
722 +        if (have_cmov)
723 +                ADDR32 CMOVLmr(cond, base, X86_NOREG, index, factor, d);
724 +        else { /* replacement using branch and mov */
725 +                int8 *target_p = (int8 *)x86_get_target() + 1;
726 +                JCCSii(cond^1, 0);
727 +                ADDR32 MOVLmr(base, X86_NOREG, index, factor, d);
728 +            *target_p = (uintptr)x86_get_target() - ((uintptr)target_p + 1);
729 +        }
730 + }
731 + LENDFUNC(NONE,READ,5,raw_cmov_l_rm_indexed,(W4 d, IMM base, R4 index, IMM factor, IMM cond))
732 +
733 + LOWFUNC(NONE,READ,3,raw_cmov_l_rm,(W4 d, IMM mem, IMM cond))
734 + {
735 +        if (have_cmov)
736 +                CMOVLmr(cond, mem, X86_NOREG, X86_NOREG, 1, d);
737 +        else { /* replacement using branch and mov */
738 +                int8 *target_p = (int8 *)x86_get_target() + 1;
739 +                JCCSii(cond^1, 0);
740 +                MOVLmr(mem, X86_NOREG, X86_NOREG, 1, d);
741 +            *target_p = (uintptr)x86_get_target() - ((uintptr)target_p + 1);
742 +        }
743 + }
744 + LENDFUNC(NONE,READ,3,raw_cmov_l_rm,(W4 d, IMM mem, IMM cond))
745 +
746 + LOWFUNC(NONE,READ,3,raw_mov_l_rR,(W4 d, R4 s, IMM offset))
747 + {
748 +        ADDR32 MOVLmr(offset, s, X86_NOREG, 1, d);
749 + }
750 + LENDFUNC(NONE,READ,3,raw_mov_l_rR,(W4 d, R4 s, IMM offset))
751 +
752 + LOWFUNC(NONE,READ,3,raw_mov_w_rR,(W2 d, R4 s, IMM offset))
753 + {
754 +        ADDR32 MOVWmr(offset, s, X86_NOREG, 1, d);
755 + }
756 + LENDFUNC(NONE,READ,3,raw_mov_w_rR,(W2 d, R4 s, IMM offset))
757 +
758 + LOWFUNC(NONE,READ,3,raw_mov_b_rR,(W1 d, R4 s, IMM offset))
759 + {
760 +        ADDR32 MOVBmr(offset, s, X86_NOREG, 1, d);
761 + }
762 + LENDFUNC(NONE,READ,3,raw_mov_b_rR,(W1 d, R4 s, IMM offset))
763 +
764 + LOWFUNC(NONE,READ,3,raw_mov_l_brR,(W4 d, R4 s, IMM offset))
765 + {
766 +        ADDR32 MOVLmr(offset, s, X86_NOREG, 1, d);
767 + }
768 + LENDFUNC(NONE,READ,3,raw_mov_l_brR,(W4 d, R4 s, IMM offset))
769 +
770 + LOWFUNC(NONE,READ,3,raw_mov_w_brR,(W2 d, R4 s, IMM offset))
771 + {
772 +        ADDR32 MOVWmr(offset, s, X86_NOREG, 1, d);
773 + }
774 + LENDFUNC(NONE,READ,3,raw_mov_w_brR,(W2 d, R4 s, IMM offset))
775 +
776 + LOWFUNC(NONE,READ,3,raw_mov_b_brR,(W1 d, R4 s, IMM offset))
777 + {
778 +        ADDR32 MOVBmr(offset, s, X86_NOREG, 1, d);
779 + }
780 + LENDFUNC(NONE,READ,3,raw_mov_b_brR,(W1 d, R4 s, IMM offset))
781 +
782 + LOWFUNC(NONE,WRITE,3,raw_mov_l_Ri,(R4 d, IMM i, IMM offset))
783 + {
784 +        ADDR32 MOVLim(i, offset, d, X86_NOREG, 1);
785 + }
786 + LENDFUNC(NONE,WRITE,3,raw_mov_l_Ri,(R4 d, IMM i, IMM offset))
787 +
788 + LOWFUNC(NONE,WRITE,3,raw_mov_w_Ri,(R4 d, IMM i, IMM offset))
789 + {
790 +        ADDR32 MOVWim(i, offset, d, X86_NOREG, 1);
791 + }
792 + LENDFUNC(NONE,WRITE,3,raw_mov_w_Ri,(R4 d, IMM i, IMM offset))
793 +
794 + LOWFUNC(NONE,WRITE,3,raw_mov_b_Ri,(R4 d, IMM i, IMM offset))
795 + {
796 +        ADDR32 MOVBim(i, offset, d, X86_NOREG, 1);
797 + }
798 + LENDFUNC(NONE,WRITE,3,raw_mov_b_Ri,(R4 d, IMM i, IMM offset))
799 +
800 + LOWFUNC(NONE,WRITE,3,raw_mov_l_Rr,(R4 d, R4 s, IMM offset))
801 + {
802 +        ADDR32 MOVLrm(s, offset, d, X86_NOREG, 1);
803 + }
804 + LENDFUNC(NONE,WRITE,3,raw_mov_l_Rr,(R4 d, R4 s, IMM offset))
805 +
806 + LOWFUNC(NONE,WRITE,3,raw_mov_w_Rr,(R4 d, R2 s, IMM offset))
807 + {
808 +        ADDR32 MOVWrm(s, offset, d, X86_NOREG, 1);
809 + }
810 + LENDFUNC(NONE,WRITE,3,raw_mov_w_Rr,(R4 d, R2 s, IMM offset))
811 +
812 + LOWFUNC(NONE,WRITE,3,raw_mov_b_Rr,(R4 d, R1 s, IMM offset))
813 + {
814 +        ADDR32 MOVBrm(s, offset, d, X86_NOREG, 1);
815 + }
816 + LENDFUNC(NONE,WRITE,3,raw_mov_b_Rr,(R4 d, R1 s, IMM offset))
817 +
818 + LOWFUNC(NONE,NONE,3,raw_lea_l_brr,(W4 d, R4 s, IMM offset))
819 + {
820 +        LEALmr(offset, s, X86_NOREG, 1, d);
821 + }
822 + LENDFUNC(NONE,NONE,3,raw_lea_l_brr,(W4 d, R4 s, IMM offset))
823 +
824 + LOWFUNC(NONE,NONE,5,raw_lea_l_brr_indexed,(W4 d, R4 s, R4 index, IMM factor, IMM offset))
825 + {
826 +        LEALmr(offset, s, index, factor, d);
827 + }
828 + LENDFUNC(NONE,NONE,5,raw_lea_l_brr_indexed,(W4 d, R4 s, R4 index, IMM factor, IMM offset))
829 +
830 + LOWFUNC(NONE,NONE,4,raw_lea_l_rr_indexed,(W4 d, R4 s, R4 index, IMM factor))
831 + {
832 +        LEALmr(0, s, index, factor, d);
833 + }
834 + LENDFUNC(NONE,NONE,4,raw_lea_l_rr_indexed,(W4 d, R4 s, R4 index, IMM factor))
835 +
836 + LOWFUNC(NONE,NONE,4,raw_lea_l_r_scaled,(W4 d, R4 index, IMM factor))
837 + {
838 +        LEALmr(0, X86_NOREG, index, factor, d);
839 + }
840 + LENDFUNC(NONE,NONE,4,raw_lea_l_r_scaled,(W4 d, R4 index, IMM factor))
841 +
842 + LOWFUNC(NONE,WRITE,3,raw_mov_l_bRr,(R4 d, R4 s, IMM offset))
843 + {
844 +        ADDR32 MOVLrm(s, offset, d, X86_NOREG, 1);
845 + }
846 + LENDFUNC(NONE,WRITE,3,raw_mov_l_bRr,(R4 d, R4 s, IMM offset))
847 +
848 + LOWFUNC(NONE,WRITE,3,raw_mov_w_bRr,(R4 d, R2 s, IMM offset))
849 + {
850 +        ADDR32 MOVWrm(s, offset, d, X86_NOREG, 1);
851 + }
852 + LENDFUNC(NONE,WRITE,3,raw_mov_w_bRr,(R4 d, R2 s, IMM offset))
853 +
854 + LOWFUNC(NONE,WRITE,3,raw_mov_b_bRr,(R4 d, R1 s, IMM offset))
855 + {
856 +        ADDR32 MOVBrm(s, offset, d, X86_NOREG, 1);
857 + }
858 + LENDFUNC(NONE,WRITE,3,raw_mov_b_bRr,(R4 d, R1 s, IMM offset))
859 +
860 + LOWFUNC(NONE,NONE,1,raw_bswap_32,(RW4 r))
861 + {
862 +        BSWAPLr(r);
863 + }
864 + LENDFUNC(NONE,NONE,1,raw_bswap_32,(RW4 r))
865 +
866 + LOWFUNC(WRITE,NONE,1,raw_bswap_16,(RW2 r))
867 + {
868 +        ROLWir(8, r);
869 + }
870 + LENDFUNC(WRITE,NONE,1,raw_bswap_16,(RW2 r))
871 +
872 + LOWFUNC(NONE,NONE,2,raw_mov_l_rr,(W4 d, R4 s))
873 + {
874 +        MOVLrr(s, d);
875 + }
876 + LENDFUNC(NONE,NONE,2,raw_mov_l_rr,(W4 d, R4 s))
877 +
878 + LOWFUNC(NONE,WRITE,2,raw_mov_l_mr,(IMM d, R4 s))
879 + {
880 +        MOVLrm(s, d, X86_NOREG, X86_NOREG, 1);
881 + }
882 + LENDFUNC(NONE,WRITE,2,raw_mov_l_mr,(IMM d, R4 s))
883 +
884 + LOWFUNC(NONE,WRITE,2,raw_mov_w_mr,(IMM d, R2 s))
885 + {
886 +        MOVWrm(s, d, X86_NOREG, X86_NOREG, 1);
887 + }
888 + LENDFUNC(NONE,WRITE,2,raw_mov_w_mr,(IMM d, R2 s))
889 +
890 + LOWFUNC(NONE,READ,2,raw_mov_w_rm,(W2 d, IMM s))
891 + {
892 +        MOVWmr(s, X86_NOREG, X86_NOREG, 1, d);
893 + }
894 + LENDFUNC(NONE,READ,2,raw_mov_w_rm,(W2 d, IMM s))
895 +
896 + LOWFUNC(NONE,WRITE,2,raw_mov_b_mr,(IMM d, R1 s))
897 + {
898 +        MOVBrm(s, d, X86_NOREG, X86_NOREG, 1);
899 + }
900 + LENDFUNC(NONE,WRITE,2,raw_mov_b_mr,(IMM d, R1 s))
901 +
902 + LOWFUNC(NONE,READ,2,raw_mov_b_rm,(W1 d, IMM s))
903 + {
904 +        MOVBmr(s, X86_NOREG, X86_NOREG, 1, d);
905 + }
906 + LENDFUNC(NONE,READ,2,raw_mov_b_rm,(W1 d, IMM s))
907 +
908 + LOWFUNC(NONE,NONE,2,raw_mov_l_ri,(W4 d, IMM s))
909 + {
910 +        MOVLir(s, d);
911 + }
912 + LENDFUNC(NONE,NONE,2,raw_mov_l_ri,(W4 d, IMM s))
913 +
914 + LOWFUNC(NONE,NONE,2,raw_mov_w_ri,(W2 d, IMM s))
915 + {
916 +        MOVWir(s, d);
917 + }
918 + LENDFUNC(NONE,NONE,2,raw_mov_w_ri,(W2 d, IMM s))
919 +
920 + LOWFUNC(NONE,NONE,2,raw_mov_b_ri,(W1 d, IMM s))
921 + {
922 +        MOVBir(s, d);
923 + }
924 + LENDFUNC(NONE,NONE,2,raw_mov_b_ri,(W1 d, IMM s))
925 +
926 + LOWFUNC(RMW,RMW,2,raw_adc_l_mi,(MEMRW d, IMM s))
927 + {
928 +        ADCLim(s, d, X86_NOREG, X86_NOREG, 1);
929 + }
930 + LENDFUNC(RMW,RMW,2,raw_adc_l_mi,(MEMRW d, IMM s))
931 +
932 + LOWFUNC(WRITE,RMW,2,raw_add_l_mi,(IMM d, IMM s))
933 + {
934 +        ADDLim(s, d, X86_NOREG, X86_NOREG, 1);
935 + }
936 + LENDFUNC(WRITE,RMW,2,raw_add_l_mi,(IMM d, IMM s))
937 +
938 + LOWFUNC(WRITE,RMW,2,raw_add_w_mi,(IMM d, IMM s))
939 + {
940 +        ADDWim(s, d, X86_NOREG, X86_NOREG, 1);
941 + }
942 + LENDFUNC(WRITE,RMW,2,raw_add_w_mi,(IMM d, IMM s))
943 +
944 + LOWFUNC(WRITE,RMW,2,raw_add_b_mi,(IMM d, IMM s))
945 + {
946 +        ADDBim(s, d, X86_NOREG, X86_NOREG, 1);
947 + }
948 + LENDFUNC(WRITE,RMW,2,raw_add_b_mi,(IMM d, IMM s))
949 +
950 + LOWFUNC(WRITE,NONE,2,raw_test_l_ri,(R4 d, IMM i))
951 + {
952 +        TESTLir(i, d);
953 + }
954 + LENDFUNC(WRITE,NONE,2,raw_test_l_ri,(R4 d, IMM i))
955 +
956 + LOWFUNC(WRITE,NONE,2,raw_test_l_rr,(R4 d, R4 s))
957 + {
958 +        TESTLrr(s, d);
959 + }
960 + LENDFUNC(WRITE,NONE,2,raw_test_l_rr,(R4 d, R4 s))
961 +
962 + LOWFUNC(WRITE,NONE,2,raw_test_w_rr,(R2 d, R2 s))
963 + {
964 +        TESTWrr(s, d);
965 + }
966 + LENDFUNC(WRITE,NONE,2,raw_test_w_rr,(R2 d, R2 s))
967 +
968 + LOWFUNC(WRITE,NONE,2,raw_test_b_rr,(R1 d, R1 s))
969 + {
970 +        TESTBrr(s, d);
971 + }
972 + LENDFUNC(WRITE,NONE,2,raw_test_b_rr,(R1 d, R1 s))
973 +
974 + LOWFUNC(WRITE,NONE,2,raw_xor_l_ri,(RW4 d, IMM i))
975 + {
976 +        XORLir(i, d);
977 + }
978 + LENDFUNC(WRITE,NONE,2,raw_xor_l_ri,(RW4 d, IMM i))
979 +
980 + LOWFUNC(WRITE,NONE,2,raw_and_l_ri,(RW4 d, IMM i))
981 + {
982 +        ANDLir(i, d);
983 + }
984 + LENDFUNC(WRITE,NONE,2,raw_and_l_ri,(RW4 d, IMM i))
985 +
986 + LOWFUNC(WRITE,NONE,2,raw_and_w_ri,(RW2 d, IMM i))
987 + {
988 +        ANDWir(i, d);
989 + }
990 + LENDFUNC(WRITE,NONE,2,raw_and_w_ri,(RW2 d, IMM i))
991 +
992 + LOWFUNC(WRITE,NONE,2,raw_and_l,(RW4 d, R4 s))
993 + {
994 +        ANDLrr(s, d);
995 + }
996 + LENDFUNC(WRITE,NONE,2,raw_and_l,(RW4 d, R4 s))
997 +
998 + LOWFUNC(WRITE,NONE,2,raw_and_w,(RW2 d, R2 s))
999 + {
1000 +        ANDWrr(s, d);
1001 + }
1002 + LENDFUNC(WRITE,NONE,2,raw_and_w,(RW2 d, R2 s))
1003 +
1004 + LOWFUNC(WRITE,NONE,2,raw_and_b,(RW1 d, R1 s))
1005 + {
1006 +        ANDBrr(s, d);
1007 + }
1008 + LENDFUNC(WRITE,NONE,2,raw_and_b,(RW1 d, R1 s))
1009 +
1010 + LOWFUNC(WRITE,NONE,2,raw_or_l_ri,(RW4 d, IMM i))
1011 + {
1012 +        ORLir(i, d);
1013 + }
1014 + LENDFUNC(WRITE,NONE,2,raw_or_l_ri,(RW4 d, IMM i))
1015 +
1016 + LOWFUNC(WRITE,NONE,2,raw_or_l,(RW4 d, R4 s))
1017 + {
1018 +        ORLrr(s, d);
1019 + }
1020 + LENDFUNC(WRITE,NONE,2,raw_or_l,(RW4 d, R4 s))
1021 +
1022 + LOWFUNC(WRITE,NONE,2,raw_or_w,(RW2 d, R2 s))
1023 + {
1024 +        ORWrr(s, d);
1025 + }
1026 + LENDFUNC(WRITE,NONE,2,raw_or_w,(RW2 d, R2 s))
1027 +
1028 + LOWFUNC(WRITE,NONE,2,raw_or_b,(RW1 d, R1 s))
1029 + {
1030 +        ORBrr(s, d);
1031 + }
1032 + LENDFUNC(WRITE,NONE,2,raw_or_b,(RW1 d, R1 s))
1033 +
1034 + LOWFUNC(RMW,NONE,2,raw_adc_l,(RW4 d, R4 s))
1035 + {
1036 +        ADCLrr(s, d);
1037 + }
1038 + LENDFUNC(RMW,NONE,2,raw_adc_l,(RW4 d, R4 s))
1039 +
1040 + LOWFUNC(RMW,NONE,2,raw_adc_w,(RW2 d, R2 s))
1041 + {
1042 +        ADCWrr(s, d);
1043 + }
1044 + LENDFUNC(RMW,NONE,2,raw_adc_w,(RW2 d, R2 s))
1045 +
1046 + LOWFUNC(RMW,NONE,2,raw_adc_b,(RW1 d, R1 s))
1047 + {
1048 +        ADCBrr(s, d);
1049 + }
1050 + LENDFUNC(RMW,NONE,2,raw_adc_b,(RW1 d, R1 s))
1051 +
1052 + LOWFUNC(WRITE,NONE,2,raw_add_l,(RW4 d, R4 s))
1053 + {
1054 +        ADDLrr(s, d);
1055 + }
1056 + LENDFUNC(WRITE,NONE,2,raw_add_l,(RW4 d, R4 s))
1057 +
1058 + LOWFUNC(WRITE,NONE,2,raw_add_w,(RW2 d, R2 s))
1059 + {
1060 +        ADDWrr(s, d);
1061 + }
1062 + LENDFUNC(WRITE,NONE,2,raw_add_w,(RW2 d, R2 s))
1063 +
1064 + LOWFUNC(WRITE,NONE,2,raw_add_b,(RW1 d, R1 s))
1065 + {
1066 +        ADDBrr(s, d);
1067 + }
1068 + LENDFUNC(WRITE,NONE,2,raw_add_b,(RW1 d, R1 s))
1069 +
1070 + LOWFUNC(WRITE,NONE,2,raw_sub_l_ri,(RW4 d, IMM i))
1071 + {
1072 +        SUBLir(i, d);
1073 + }
1074 + LENDFUNC(WRITE,NONE,2,raw_sub_l_ri,(RW4 d, IMM i))
1075 +
1076 + LOWFUNC(WRITE,NONE,2,raw_sub_b_ri,(RW1 d, IMM i))
1077 + {
1078 +        SUBBir(i, d);
1079 + }
1080 + LENDFUNC(WRITE,NONE,2,raw_sub_b_ri,(RW1 d, IMM i))
1081 +
1082 + LOWFUNC(WRITE,NONE,2,raw_add_l_ri,(RW4 d, IMM i))
1083 + {
1084 +        ADDLir(i, d);
1085 + }
1086 + LENDFUNC(WRITE,NONE,2,raw_add_l_ri,(RW4 d, IMM i))
1087 +
1088 + LOWFUNC(WRITE,NONE,2,raw_add_w_ri,(RW2 d, IMM i))
1089 + {
1090 +        ADDWir(i, d);
1091 + }
1092 + LENDFUNC(WRITE,NONE,2,raw_add_w_ri,(RW2 d, IMM i))
1093 +
1094 + LOWFUNC(WRITE,NONE,2,raw_add_b_ri,(RW1 d, IMM i))
1095 + {
1096 +        ADDBir(i, d);
1097 + }
1098 + LENDFUNC(WRITE,NONE,2,raw_add_b_ri,(RW1 d, IMM i))
1099 +
1100 + LOWFUNC(RMW,NONE,2,raw_sbb_l,(RW4 d, R4 s))
1101 + {
1102 +        SBBLrr(s, d);
1103 + }
1104 + LENDFUNC(RMW,NONE,2,raw_sbb_l,(RW4 d, R4 s))
1105 +
1106 + LOWFUNC(RMW,NONE,2,raw_sbb_w,(RW2 d, R2 s))
1107 + {
1108 +        SBBWrr(s, d);
1109 + }
1110 + LENDFUNC(RMW,NONE,2,raw_sbb_w,(RW2 d, R2 s))
1111 +
1112 + LOWFUNC(RMW,NONE,2,raw_sbb_b,(RW1 d, R1 s))
1113 + {
1114 +        SBBBrr(s, d);
1115 + }
1116 + LENDFUNC(RMW,NONE,2,raw_sbb_b,(RW1 d, R1 s))
1117 +
1118 + LOWFUNC(WRITE,NONE,2,raw_sub_l,(RW4 d, R4 s))
1119 + {
1120 +        SUBLrr(s, d);
1121 + }
1122 + LENDFUNC(WRITE,NONE,2,raw_sub_l,(RW4 d, R4 s))
1123 +
1124 + LOWFUNC(WRITE,NONE,2,raw_sub_w,(RW2 d, R2 s))
1125 + {
1126 +        SUBWrr(s, d);
1127 + }
1128 + LENDFUNC(WRITE,NONE,2,raw_sub_w,(RW2 d, R2 s))
1129 +
1130 + LOWFUNC(WRITE,NONE,2,raw_sub_b,(RW1 d, R1 s))
1131 + {
1132 +        SUBBrr(s, d);
1133 + }
1134 + LENDFUNC(WRITE,NONE,2,raw_sub_b,(RW1 d, R1 s))
1135 +
1136 + LOWFUNC(WRITE,NONE,2,raw_cmp_l,(R4 d, R4 s))
1137 + {
1138 +        CMPLrr(s, d);
1139 + }
1140 + LENDFUNC(WRITE,NONE,2,raw_cmp_l,(R4 d, R4 s))
1141 +
1142 + LOWFUNC(WRITE,NONE,2,raw_cmp_l_ri,(R4 r, IMM i))
1143 + {
1144 +        CMPLir(i, r);
1145 + }
1146 + LENDFUNC(WRITE,NONE,2,raw_cmp_l_ri,(R4 r, IMM i))
1147 +
1148 + LOWFUNC(WRITE,NONE,2,raw_cmp_w,(R2 d, R2 s))
1149 + {
1150 +        CMPWrr(s, d);
1151 + }
1152 + LENDFUNC(WRITE,NONE,2,raw_cmp_w,(R2 d, R2 s))
1153 +
1154 + LOWFUNC(WRITE,READ,2,raw_cmp_b_mi,(MEMR d, IMM s))
1155 + {
1156 +        CMPBim(s, d, X86_NOREG, X86_NOREG, 1);
1157 + }
1158 + LENDFUNC(WRITE,READ,2,raw_cmp_b_mi,(MEMR d, IMM s))
1159 +
1160 + LOWFUNC(WRITE,NONE,2,raw_cmp_b_ri,(R1 d, IMM i))
1161 + {
1162 +        CMPBir(i, d);
1163 + }
1164 + LENDFUNC(WRITE,NONE,2,raw_cmp_b_ri,(R1 d, IMM i))
1165 +
1166 + LOWFUNC(WRITE,NONE,2,raw_cmp_b,(R1 d, R1 s))
1167 + {
1168 +        CMPBrr(s, d);
1169 + }
1170 + LENDFUNC(WRITE,NONE,2,raw_cmp_b,(R1 d, R1 s))
1171 +
1172 + LOWFUNC(WRITE,READ,4,raw_cmp_l_rm_indexed,(R4 d, IMM offset, R4 index, IMM factor))
1173 + {
1174 +        ADDR32 CMPLmr(offset, X86_NOREG, index, factor, d);
1175 + }
1176 + LENDFUNC(WRITE,READ,4,raw_cmp_l_rm_indexed,(R4 d, IMM offset, R4 index, IMM factor))
1177 +
1178 + LOWFUNC(WRITE,NONE,2,raw_xor_l,(RW4 d, R4 s))
1179 + {
1180 +        XORLrr(s, d);
1181 + }
1182 + LENDFUNC(WRITE,NONE,2,raw_xor_l,(RW4 d, R4 s))
1183 +
1184 + LOWFUNC(WRITE,NONE,2,raw_xor_w,(RW2 d, R2 s))
1185 + {
1186 +        XORWrr(s, d);
1187 + }
1188 + LENDFUNC(WRITE,NONE,2,raw_xor_w,(RW2 d, R2 s))
1189 +
1190 + LOWFUNC(WRITE,NONE,2,raw_xor_b,(RW1 d, R1 s))
1191 + {
1192 +        XORBrr(s, d);
1193 + }
1194 + LENDFUNC(WRITE,NONE,2,raw_xor_b,(RW1 d, R1 s))
1195 +
1196 + LOWFUNC(WRITE,RMW,2,raw_sub_l_mi,(MEMRW d, IMM s))
1197 + {
1198 +        SUBLim(s, d, X86_NOREG, X86_NOREG, 1);
1199 + }
1200 + LENDFUNC(WRITE,RMW,2,raw_sub_l_mi,(MEMRW d, IMM s))
1201 +
1202 + LOWFUNC(WRITE,READ,2,raw_cmp_l_mi,(MEMR d, IMM s))
1203 + {
1204 +        CMPLim(s, d, X86_NOREG, X86_NOREG, 1);
1205 + }
1206 + LENDFUNC(WRITE,READ,2,raw_cmp_l_mi,(MEMR d, IMM s))
1207 +
1208 + LOWFUNC(NONE,NONE,2,raw_xchg_l_rr,(RW4 r1, RW4 r2))
1209 + {
1210 +        XCHGLrr(r2, r1);
1211 + }
1212 + LENDFUNC(NONE,NONE,2,raw_xchg_l_rr,(RW4 r1, RW4 r2))
1213 +
1214 + LOWFUNC(NONE,NONE,2,raw_xchg_b_rr,(RW4 r1, RW4 r2))
1215 + {
1216 +        XCHGBrr(r2, r1);
1217 + }
1218 + LENDFUNC(NONE,NONE,2,raw_xchg_b_rr,(RW4 r1, RW4 r2))
1219 +
1220 + LOWFUNC(READ,WRITE,0,raw_pushfl,(void))
1221 + {
1222 +        PUSHF();
1223 + }
1224 + LENDFUNC(READ,WRITE,0,raw_pushfl,(void))
1225 +
1226 + LOWFUNC(WRITE,READ,0,raw_popfl,(void))
1227 + {
1228 +        POPF();
1229 + }
1230 + LENDFUNC(WRITE,READ,0,raw_popfl,(void))
1231 +
1232 + /* Generate floating-point instructions */
1233 + static inline void x86_fadd_m(MEMR s)
1234 + {
1235 +        FADDDm(s,X86_NOREG,X86_NOREG,1);
1236 + }
1237 +
1238 + #else
1239 +
1240   const bool optimize_accum               = true;
1241   const bool optimize_imm8                = true;
1242   const bool optimize_shift_once  = true;
# Line 157 | Line 1272 | LOWFUNC(NONE,READ,1,raw_pop_l_r,(R4 r))
1272   }
1273   LENDFUNC(NONE,READ,1,raw_pop_l_r,(R4 r))
1274  
1275 + LOWFUNC(NONE,READ,1,raw_pop_l_m,(MEMW d))
1276 + {
1277 +        emit_byte(0x8f);
1278 +        emit_byte(0x05);
1279 +        emit_long(d);
1280 + }
1281 + LENDFUNC(NONE,READ,1,raw_pop_l_m,(MEMW d))
1282 +
1283   LOWFUNC(WRITE,NONE,2,raw_bt_l_ri,(R4 r, IMM i))
1284   {
1285          emit_byte(0x0f);
# Line 639 | Line 1762 | LOWFUNC(READ,WRITE,2,raw_setcc_m,(MEMW d
1762   }
1763   LENDFUNC(READ,WRITE,2,raw_setcc_m,(MEMW d, IMM cc))
1764  
1765 + LOWFUNC(READ,NONE,3,raw_cmov_b_rr,(RW1 d, R1 s, IMM cc))
1766 + {
1767 +        /* replacement using branch and mov */
1768 +        int uncc=(cc^1);
1769 +        emit_byte(0x70+uncc);
1770 +        emit_byte(2);  /* skip next 2 bytes if not cc=true */
1771 +        emit_byte(0x88);
1772 +        emit_byte(0xc0+8*s+d);
1773 + }
1774 + LENDFUNC(READ,NONE,3,raw_cmov_b_rr,(RW1 d, R1 s, IMM cc))
1775 +
1776 + LOWFUNC(READ,NONE,3,raw_cmov_w_rr,(RW2 d, R2 s, IMM cc))
1777 + {
1778 +    if (have_cmov) {
1779 +        emit_byte(0x66);
1780 +        emit_byte(0x0f);
1781 +        emit_byte(0x40+cc);
1782 +        emit_byte(0xc0+8*d+s);
1783 +    }
1784 +    else { /* replacement using branch and mov */
1785 +        int uncc=(cc^1);
1786 +        emit_byte(0x70+uncc);
1787 +        emit_byte(3);  /* skip next 3 bytes if not cc=true */
1788 +        emit_byte(0x66);
1789 +        emit_byte(0x89);
1790 +        emit_byte(0xc0+8*s+d);
1791 +    }
1792 + }
1793 + LENDFUNC(READ,NONE,3,raw_cmov_w_rr,(RW2 d, R2 s, IMM cc))
1794 +
1795   LOWFUNC(READ,NONE,3,raw_cmov_l_rr,(RW4 d, R4 s, IMM cc))
1796   {
1797      if (have_cmov) {
# Line 1071 | Line 2224 | LENDFUNC(NONE,READ,3,raw_cmov_l_rm,(W4 d
2224  
2225   LOWFUNC(NONE,READ,3,raw_mov_l_rR,(W4 d, R4 s, IMM offset))
2226   {
2227 +        Dif(!isbyte(offset)) abort();
2228      emit_byte(0x8b);
2229      emit_byte(0x40+8*d+s);
2230      emit_byte(offset);
# Line 1079 | Line 2233 | LENDFUNC(NONE,READ,3,raw_mov_l_rR,(W4 d,
2233  
2234   LOWFUNC(NONE,READ,3,raw_mov_w_rR,(W2 d, R4 s, IMM offset))
2235   {
2236 +        Dif(!isbyte(offset)) abort();
2237      emit_byte(0x66);
2238      emit_byte(0x8b);
2239      emit_byte(0x40+8*d+s);
# Line 1088 | Line 2243 | LENDFUNC(NONE,READ,3,raw_mov_w_rR,(W2 d,
2243  
2244   LOWFUNC(NONE,READ,3,raw_mov_b_rR,(W1 d, R4 s, IMM offset))
2245   {
2246 +        Dif(!isbyte(offset)) abort();
2247      emit_byte(0x8a);
2248      emit_byte(0x40+8*d+s);
2249      emit_byte(offset);
# Line 1121 | Line 2277 | LENDFUNC(NONE,READ,3,raw_mov_b_brR,(W1 d
2277  
2278   LOWFUNC(NONE,WRITE,3,raw_mov_l_Ri,(R4 d, IMM i, IMM offset))
2279   {
2280 +        Dif(!isbyte(offset)) abort();
2281      emit_byte(0xc7);
2282      emit_byte(0x40+d);
2283      emit_byte(offset);
# Line 1130 | Line 2287 | LENDFUNC(NONE,WRITE,3,raw_mov_l_Ri,(R4 d
2287  
2288   LOWFUNC(NONE,WRITE,3,raw_mov_w_Ri,(R4 d, IMM i, IMM offset))
2289   {
2290 +        Dif(!isbyte(offset)) abort();
2291      emit_byte(0x66);
2292      emit_byte(0xc7);
2293      emit_byte(0x40+d);
# Line 1140 | Line 2298 | LENDFUNC(NONE,WRITE,3,raw_mov_w_Ri,(R4 d
2298  
2299   LOWFUNC(NONE,WRITE,3,raw_mov_b_Ri,(R4 d, IMM i, IMM offset))
2300   {
2301 +        Dif(!isbyte(offset)) abort();
2302      emit_byte(0xc6);
2303      emit_byte(0x40+d);
2304      emit_byte(offset);
# Line 1149 | Line 2308 | LENDFUNC(NONE,WRITE,3,raw_mov_b_Ri,(R4 d
2308  
2309   LOWFUNC(NONE,WRITE,3,raw_mov_l_Rr,(R4 d, R4 s, IMM offset))
2310   {
2311 +        Dif(!isbyte(offset)) abort();
2312      emit_byte(0x89);
2313      emit_byte(0x40+8*s+d);
2314      emit_byte(offset);
# Line 1157 | Line 2317 | LENDFUNC(NONE,WRITE,3,raw_mov_l_Rr,(R4 d
2317  
2318   LOWFUNC(NONE,WRITE,3,raw_mov_w_Rr,(R4 d, R2 s, IMM offset))
2319   {
2320 +        Dif(!isbyte(offset)) abort();
2321      emit_byte(0x66);
2322      emit_byte(0x89);
2323      emit_byte(0x40+8*s+d);
# Line 1166 | Line 2327 | LENDFUNC(NONE,WRITE,3,raw_mov_w_Rr,(R4 d
2327  
2328   LOWFUNC(NONE,WRITE,3,raw_mov_b_Rr,(R4 d, R1 s, IMM offset))
2329   {
2330 +        Dif(!isbyte(offset)) abort();
2331      emit_byte(0x88);
2332      emit_byte(0x40+8*s+d);
2333      emit_byte(offset);
# Line 1326 | Line 2488 | LENDFUNC(NONE,READ,2,raw_mov_w_rm,(W2 d,
2488   LOWFUNC(NONE,WRITE,2,raw_mov_b_mr,(IMM d, R1 s))
2489   {
2490      emit_byte(0x88);
2491 <    emit_byte(0x05+8*s);
2491 >    emit_byte(0x05+8*(s&0xf)); /* XXX this handles the %ah case (defined as 0x10+4) and the other high-byte registers */
2492      emit_long(d);
2493   }
2494   LENDFUNC(NONE,WRITE,2,raw_mov_b_mr,(IMM d, R1 s))
# Line 1440 | Line 2602 | LOWFUNC(WRITE,NONE,2,raw_test_b_rr,(R1 d
2602   }
2603   LENDFUNC(WRITE,NONE,2,raw_test_b_rr,(R1 d, R1 s))
2604  
2605 + LOWFUNC(WRITE,NONE,2,raw_xor_l_ri,(RW4 d, IMM i))
2606 + {
2607 +    emit_byte(0x81);
2608 +    emit_byte(0xf0+d);
2609 +    emit_long(i);
2610 + }
2611 + LENDFUNC(WRITE,NONE,2,raw_xor_l_ri,(RW4 d, IMM i))
2612 +
2613   LOWFUNC(WRITE,NONE,2,raw_and_l_ri,(RW4 d, IMM i))
2614   {
2615          if (optimize_imm8 && isbyte(i)) {
# Line 1855 | Line 3025 | LOWFUNC(NONE,NONE,2,raw_xchg_l_rr,(RW4 r
3025   }
3026   LENDFUNC(NONE,NONE,2,raw_xchg_l_rr,(RW4 r1, RW4 r2))
3027  
3028 < /*************************************************************************
1859 < * FIXME: string-related instructions                                    *
1860 < *************************************************************************/
1861 <
1862 < LOWFUNC(WRITE,NONE,0,raw_cld,(void))
1863 < {
1864 <        emit_byte(0xfc);
1865 < }
1866 < LENDFUNC(WRITE,NONE,0,raw_cld,(void))
1867 <
1868 < LOWFUNC(WRITE,NONE,0,raw_std,(void))
1869 < {
1870 <        emit_byte(0xfd);
1871 < }
1872 < LENDFUNC(WRITE,NONE,0,raw_std,(void))
1873 <
1874 < LOWFUNC(NONE,RMW,0,raw_movs_b,(void))
1875 < {
1876 <        emit_byte(0xa4);
1877 < }
1878 < LENDFUNC(NONE,RMW,0,raw_movs_b,(void))
1879 <
1880 < LOWFUNC(NONE,RMW,0,raw_movs_l,(void))
1881 < {
1882 <        emit_byte(0xa5);
1883 < }
1884 < LENDFUNC(NONE,RMW,0,raw_movs_l,(void))
1885 <
1886 < LOWFUNC(NONE,RMW,0,raw_rep,(void))
1887 < {
1888 <        emit_byte(0xf3);
1889 < }
1890 < LENDFUNC(NONE,RMW,0,raw_rep,(void))
1891 <
1892 < LOWFUNC(NONE,RMW,0,raw_rep_movsb,(void))
1893 < {
1894 <        raw_rep();
1895 <        raw_movs_b();
1896 < }
1897 < LENDFUNC(NONE,RMW,0,raw_rep_movsb,(void))
1898 <
1899 < LOWFUNC(NONE,RMW,0,raw_rep_movsl,(void))
3028 > LOWFUNC(NONE,NONE,2,raw_xchg_b_rr,(RW4 r1, RW4 r2))
3029   {
3030 <        raw_rep();
3031 <        raw_movs_l();
3030 >  emit_byte(0x86);
3031 >  emit_byte(0xc0+8*(r1&0xf)+(r2&0xf)); /* XXX this handles the high-byte registers (e.g. %ah, defined as 0x10+4) */
3032   }
3033 < LENDFUNC(NONE,RMW,0,raw_rep_movsl,(void))
3033 > LENDFUNC(NONE,NONE,2,raw_xchg_b_rr,(RW4 r1, RW4 r2))
3034  
3035   /*************************************************************************
3036   * FIXME: mem access modes probably wrong                                *
# Line 1919 | Line 3048 | LOWFUNC(WRITE,READ,0,raw_popfl,(void))
3048   }
3049   LENDFUNC(WRITE,READ,0,raw_popfl,(void))
3050  
3051 + /* Generate floating-point instructions */
3052 + static inline void x86_fadd_m(MEMR s)
3053 + {
3054 +        emit_byte(0xdc);
3055 +        emit_byte(0x05);
3056 +        emit_long(s);
3057 + }
3058 +
3059 + #endif
3060 +
3061   /*************************************************************************
3062   * Unoptimizable stuff --- jump                                          *
3063   *************************************************************************/
3064  
3065   static __inline__ void raw_call_r(R4 r)
3066   {
3067 + #if USE_NEW_RTASM
3068 +    CALLsr(r);
3069 + #else
3070      emit_byte(0xff);
3071      emit_byte(0xd0+r);
3072 + #endif
3073   }
3074  
3075   static __inline__ void raw_call_m_indexed(uae_u32 base, uae_u32 r, uae_u32 m)
3076   {
3077 + #if USE_NEW_RTASM
3078 +    CALLsm(base, X86_NOREG, r, m);
3079 + #else
3080      int mu;
3081      switch(m) {
3082       case 1: mu=0; break;
# Line 1943 | Line 3089 | static __inline__ void raw_call_m_indexe
3089      emit_byte(0x14);
3090      emit_byte(0x05+8*r+0x40*mu);
3091      emit_long(base);
3092 + #endif
3093   }
3094  
3095   static __inline__ void raw_jmp_r(R4 r)
3096   {
3097 + #if USE_NEW_RTASM
3098 +    JMPsr(r);
3099 + #else
3100      emit_byte(0xff);
3101      emit_byte(0xe0+r);
3102 + #endif
3103   }
3104  
3105   static __inline__ void raw_jmp_m_indexed(uae_u32 base, uae_u32 r, uae_u32 m)
3106   {
3107 + #if USE_NEW_RTASM
3108 +    JMPsm(base, X86_NOREG, r, m);
3109 + #else
3110      int mu;
3111      switch(m) {
3112       case 1: mu=0; break;
# Line 1965 | Line 3119 | static __inline__ void raw_jmp_m_indexed
3119      emit_byte(0x24);
3120      emit_byte(0x05+8*r+0x40*mu);
3121      emit_long(base);
3122 + #endif
3123   }
3124  
3125   static __inline__ void raw_jmp_m(uae_u32 base)
# Line 1977 | Line 3132 | static __inline__ void raw_jmp_m(uae_u32
3132  
3133   static __inline__ void raw_call(uae_u32 t)
3134   {
3135 + #if USE_NEW_RTASM
3136 +    CALLm(t);
3137 + #else
3138      emit_byte(0xe8);
3139      emit_long(t-(uae_u32)target-4);
3140 + #endif
3141   }
3142  
3143   static __inline__ void raw_jmp(uae_u32 t)
3144   {
3145 + #if USE_NEW_RTASM
3146 +    JMPm(t);
3147 + #else
3148      emit_byte(0xe9);
3149      emit_long(t-(uae_u32)target-4);
3150 + #endif
3151   }
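
In the legacy emitter the rel32 operand of CALL/JMP is computed by hand: the displacement is measured from the end of the 5-byte instruction. After emit_byte(0xe8), target points one byte past the opcode, so t-(uae_u32)target-4 equals t minus the address of the next instruction. A worked check (illustrative):

/* Opcode emitted at p: the CPU resumes at p+5, so the encoded
   displacement for destination t must be t - (p + 5).  emit_byte
   leaves target == p + 1, hence t - target - 4 == t - (p + 5). */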
3152  
3153   static __inline__ void raw_jl(uae_u32 t)
3154   {
3155      emit_byte(0x0f);
3156      emit_byte(0x8c);
3157 <    emit_long(t-(uae_u32)target-4);
3157 >    emit_long(t-(uintptr)target-4);
3158   }
3159  
3160   static __inline__ void raw_jz(uae_u32 t)
3161   {
3162      emit_byte(0x0f);
3163      emit_byte(0x84);
3164 <    emit_long(t-(uae_u32)target-4);
3164 >    emit_long(t-(uintptr)target-4);
3165   }
3166  
3167   static __inline__ void raw_jnz(uae_u32 t)
3168   {
3169      emit_byte(0x0f);
3170      emit_byte(0x85);
3171 <    emit_long(t-(uae_u32)target-4);
3171 >    emit_long(t-(uintptr)target-4);
3172   }
3173  
3174   static __inline__ void raw_jnz_l_oponly(void)
# Line 2055 | Line 3218 | static __inline__ void raw_nop(void)
3218      emit_byte(0x90);
3219   }
3220  
3221 + static __inline__ void raw_emit_nop_filler(int nbytes)
3222 + {
3223 +  /* Source: GNU Binutils 2.12.90.0.15 */
3224 +  /* Various efficient no-op patterns for aligning code labels.
3225 +     Note: Don't try to assemble the instructions in the comments.
3226 +     0L and 0w are not legal.  */
3227 +  static const uae_u8 f32_1[] =
3228 +    {0x90};                                                                     /* nop                                  */
3229 +  static const uae_u8 f32_2[] =
3230 +    {0x89,0xf6};                                                        /* movl %esi,%esi               */
3231 +  static const uae_u8 f32_3[] =
3232 +    {0x8d,0x76,0x00};                                           /* leal 0(%esi),%esi    */
3233 +  static const uae_u8 f32_4[] =
3234 +    {0x8d,0x74,0x26,0x00};                                      /* leal 0(%esi,1),%esi  */
3235 +  static const uae_u8 f32_5[] =
3236 +    {0x90,                                                                      /* nop                                  */
3237 +     0x8d,0x74,0x26,0x00};                                      /* leal 0(%esi,1),%esi  */
3238 +  static const uae_u8 f32_6[] =
3239 +    {0x8d,0xb6,0x00,0x00,0x00,0x00};            /* leal 0L(%esi),%esi   */
3240 +  static const uae_u8 f32_7[] =
3241 +    {0x8d,0xb4,0x26,0x00,0x00,0x00,0x00};       /* leal 0L(%esi,1),%esi */
3242 +  static const uae_u8 f32_8[] =
3243 +    {0x90,                                                                      /* nop                                  */
3244 +     0x8d,0xb4,0x26,0x00,0x00,0x00,0x00};       /* leal 0L(%esi,1),%esi */
3245 +  static const uae_u8 f32_9[] =
3246 +    {0x89,0xf6,                                                         /* movl %esi,%esi               */
3247 +     0x8d,0xbc,0x27,0x00,0x00,0x00,0x00};       /* leal 0L(%edi,1),%edi */
3248 +  static const uae_u8 f32_10[] =
3249 +    {0x8d,0x76,0x00,                                            /* leal 0(%esi),%esi    */
3250 +     0x8d,0xbc,0x27,0x00,0x00,0x00,0x00};       /* leal 0L(%edi,1),%edi */
3251 +  static const uae_u8 f32_11[] =
3252 +    {0x8d,0x74,0x26,0x00,                                       /* leal 0(%esi,1),%esi  */
3253 +     0x8d,0xbc,0x27,0x00,0x00,0x00,0x00};       /* leal 0L(%edi,1),%edi */
3254 +  static const uae_u8 f32_12[] =
3255 +    {0x8d,0xb6,0x00,0x00,0x00,0x00,                     /* leal 0L(%esi),%esi   */
3256 +     0x8d,0xbf,0x00,0x00,0x00,0x00};            /* leal 0L(%edi),%edi   */
3257 +  static const uae_u8 f32_13[] =
3258 +    {0x8d,0xb6,0x00,0x00,0x00,0x00,                     /* leal 0L(%esi),%esi   */
3259 +     0x8d,0xbc,0x27,0x00,0x00,0x00,0x00};       /* leal 0L(%edi,1),%edi */
3260 +  static const uae_u8 f32_14[] =
3261 +    {0x8d,0xb4,0x26,0x00,0x00,0x00,0x00,        /* leal 0L(%esi,1),%esi */
3262 +     0x8d,0xbc,0x27,0x00,0x00,0x00,0x00};       /* leal 0L(%edi,1),%edi */
3263 +  static const uae_u8 f32_15[] =
3264 +    {0xeb,0x0d,0x90,0x90,0x90,0x90,0x90,        /* jmp .+15; lotsa nops */
3265 +     0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90};
3266 +  static const uae_u8 f32_16[] =
3267 +    {0xeb,0x0d,0x90,0x90,0x90,0x90,0x90,        /* jmp .+15; lotsa nops */
3268 +     0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90};
3269 +  static const uae_u8 *const f32_patt[] = {
3270 +    f32_1, f32_2, f32_3, f32_4, f32_5, f32_6, f32_7, f32_8,
3271 +    f32_9, f32_10, f32_11, f32_12, f32_13, f32_14, f32_15
3272 +  };
3273 +  static const uae_u8 prefixes[4] = { 0x66, 0x66, 0x66, 0x66 };
3274 +
3275 + #if defined(__x86_64__)
3276 +  /* The recommended way to pad 64-bit code is to use NOPs preceded by
3277 +     at most four 0x66 prefixes.  Balance the sizes of the nops.  */
3278 +  if (nbytes == 0)
3279 +          return;
3280 +
3281 +  int i;
3282 +  int nnops = (nbytes + 3) / 4;
3283 +  int len = nbytes / nnops;
3284 +  int remains = nbytes - nnops * len;
3285 +
3286 +  for (i = 0; i < remains; i++) {
3287 +          emit_block(prefixes, len);
3288 +          raw_nop();
3289 +  }
3290 +  for (; i < nnops; i++) {
3291 +          emit_block(prefixes, len - 1);
3292 +          raw_nop();
3293 +  }
3294 + #else
3295 +  int nloops = nbytes / 16;
3296 +  while (nloops-- > 0)
3297 +        emit_block(f32_16, sizeof(f32_16));
3298 +
3299 +  nbytes %= 16;
3300 +  if (nbytes)
3301 +        emit_block(f32_patt[nbytes - 1], nbytes);
3302 + #endif
3303 + }
3304 +
3305  
3306   /*************************************************************************
3307   * Flag handling, to and fro UAE flag register                           *
3308   *************************************************************************/
3309  
3310 < #ifdef SAHF_SETO_PROFITABLE
2064 <
2065 < #define FLAG_NREG1 0  /* Set to -1 if any register will do */
2066 <
2067 < static __inline__ void raw_flags_to_reg(int r)
3310 > static __inline__ void raw_flags_evicted(int r)
3311   {
2069  raw_lahf(0);  /* Most flags in AH */
2070  //raw_setcc(r,0); /* V flag in AL */
2071  raw_setcc_m((uae_u32)live.state[FLAGTMP].mem,0);
2072  
2073 #if 1   /* Let's avoid those nasty partial register stalls */
2074  //raw_mov_b_mr((uae_u32)live.state[FLAGTMP].mem,r);
2075  raw_mov_b_mr(((uae_u32)live.state[FLAGTMP].mem)+1,r+4);
3312    //live.state[FLAGTMP].status=CLEAN;
3313    live.state[FLAGTMP].status=INMEM;
3314    live.state[FLAGTMP].realreg=-1;
# Line 2082 | Line 3318 | static __inline__ void raw_flags_to_reg(
3318        abort();
3319    }
3320    live.nat[r].nholds=0;
3321 + }
3322 +
3323 + #define FLAG_NREG1_FLAGREG 0  /* Set to -1 if any register will do */
3324 + static __inline__ void raw_flags_to_reg_FLAGREG(int r)
3325 + {
3326 +  raw_lahf(0);  /* Most flags in AH */
3327 +  //raw_setcc(r,0); /* V flag in AL */
3328 +  raw_setcc_m((uintptr)live.state[FLAGTMP].mem,0);
3329 +  
3330 + #if 1   /* Let's avoid those nasty partial register stalls */
3331 +  //raw_mov_b_mr((uintptr)live.state[FLAGTMP].mem,r);
3332 +  raw_mov_b_mr(((uintptr)live.state[FLAGTMP].mem)+1,AH_INDEX);
3333 +  raw_flags_evicted(r);
3334   #endif
3335   }
3336  
3337 < #define FLAG_NREG2 0  /* Set to -1 if any register will do */
3338 < static __inline__ void raw_reg_to_flags(int r)
3337 > #define FLAG_NREG2_FLAGREG 0  /* Set to -1 if any register will do */
3338 > static __inline__ void raw_reg_to_flags_FLAGREG(int r)
3339   {
3340    raw_cmp_b_ri(r,-127); /* set V */
3341    raw_sahf(0);
3342   }
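
The CMP against -127 is how V is regenerated before SAHF restores the rest: %al holds the saved V flag as 0 or 1, and in 8-bit signed arithmetic 1-(-127) = 128 overflows while 0-(-127) = 127 does not, so OF ends up equal to the saved bit; SAHF then loads S/Z/A/P/C from %ah without touching OF. (The FLAGGEN variant further down compares against -120 because there V is stored as 0 or 8, and 8-(-120) = 128 overflows likewise.)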
3343  
3344 < #else
3344 > #define FLAG_NREG3_FLAGREG 0  /* Set to -1 if any register will do */
3345 > static __inline__ void raw_flags_set_zero_FLAGREG(int s, int tmp)
3346 > {
3347 >    raw_mov_l_rr(tmp,s);
3348 >    raw_lahf(s); /* flags into ah */
3349 >    raw_and_l_ri(s,0xffffbfff);
3350 >    raw_and_l_ri(tmp,0x00004000);
3351 >    raw_xor_l_ri(tmp,0x00004000);
3352 >    raw_or_l(s,tmp);
3353 >    raw_sahf(s);
3354 > }
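
Bit 14 is where ZF lands in the LAHF/SAHF view (EFLAGS bit 6 shifted into the high byte of %ax). The and/and/xor/or sequence clears the old Z, isolates the caller's source bit, inverts its sense, and splices it back before SAHF. As a C model (illustrative):

static uae_u32 splice_zero_flag(uae_u32 s, uae_u32 tmp)
{
	s   &= 0xffffbfff;      /* clear the current Z bit */
	tmp &= 0x00004000;      /* isolate the source bit  */
	tmp ^= 0x00004000;      /* invert its sense        */
	return s | tmp;         /* value handed to SAHF    */
}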
3355  
3356 < #define FLAG_NREG1 -1  /* Set to -1 if any register will do */
3357 < static __inline__ void raw_flags_to_reg(int r)
3356 > static __inline__ void raw_flags_init_FLAGREG(void) { }
3357 >
3358 > #define FLAG_NREG1_FLAGSTK -1  /* Set to -1 if any register will do */
3359 > static __inline__ void raw_flags_to_reg_FLAGSTK(int r)
3360   {
3361          raw_pushfl();
3362          raw_pop_l_r(r);
3363 <        raw_mov_l_mr((uae_u32)live.state[FLAGTMP].mem,r);
3364 < //      live.state[FLAGTMP].status=CLEAN;
2104 <        live.state[FLAGTMP].status=INMEM;
2105 <        live.state[FLAGTMP].realreg=-1;
2106 <        /* We just "evicted" FLAGTMP. */
2107 <        if (live.nat[r].nholds!=1) {
2108 <          /* Huh? */
2109 <          abort();
2110 <        }
2111 <        live.nat[r].nholds=0;
3363 >        raw_mov_l_mr((uintptr)live.state[FLAGTMP].mem,r);
3364 >        raw_flags_evicted(r);
3365   }
3366  
3367 < #define FLAG_NREG2 -1  /* Set to -1 if any register will do */
3368 < static __inline__ void raw_reg_to_flags(int r)
3367 > #define FLAG_NREG2_FLAGSTK -1  /* Set to -1 if any register will do */
3368 > static __inline__ void raw_reg_to_flags_FLAGSTK(int r)
3369   {
3370          raw_push_l_r(r);
3371          raw_popfl();
3372   }
3373  
3374 + #define FLAG_NREG3_FLAGSTK -1  /* Set to -1 if any register will do */
3375 + static __inline__ void raw_flags_set_zero_FLAGSTK(int s, int tmp)
3376 + {
3377 +    raw_mov_l_rr(tmp,s);
3378 +    raw_pushfl();
3379 +    raw_pop_l_r(s);
3380 +    raw_and_l_ri(s,0xffffffbf);
3381 +    raw_and_l_ri(tmp,0x00000040);
3382 +    raw_xor_l_ri(tmp,0x00000040);
3383 +    raw_or_l(s,tmp);
3384 +    raw_push_l_r(s);
3385 +    raw_popfl();
3386 + }
3387 +
3388 + static __inline__ void raw_flags_init_FLAGSTK(void) { }
3389 +
3390 + #if defined(__x86_64__)
3391 + /* Try to use the LAHF/SETO method on x86_64 since it is faster.
3392 +   This can't be the default because some older CPUs don't support
3393 +   LAHF/SAHF in long mode.  */
3394 + static int FLAG_NREG1_FLAGGEN = 0;
3395 + static __inline__ void raw_flags_to_reg_FLAGGEN(int r)
3396 + {
3397 +        if (have_lahf_lm) {
3398 +                // NOTE: the interpreter uses the normal EFLAGS layout
3399 +                //   pushf/popf CF(0) ZF( 6) SF( 7) OF(11)
3400 +                //   sahf/lahf  CF(8) ZF(14) SF(15) OF( 0)
3401 +                assert(r == 0);
3402 +                raw_setcc(r,0);                                 /* V flag in AL */
3403 +                raw_lea_l_r_scaled(0,0,8);              /* move it to its EFLAGS location */
3404 +                raw_mov_b_mr(((uintptr)live.state[FLAGTMP].mem)+1,0);
3405 +                raw_lahf(0);                                    /* most flags in AH */
3406 +                raw_mov_b_mr((uintptr)live.state[FLAGTMP].mem,AH_INDEX);
3407 +                raw_flags_evicted(r);
3408 +        }
3409 +        else
3410 +                raw_flags_to_reg_FLAGSTK(r);
3411 + }
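Reading the SETO/LEA pair (and assuming raw_lea_l_r_scaled(0,0,8) encodes lea eax,[eax*8]): SETO leaves 0 or 1 in AL, and the scale-by-8 shifts that bit into position 3, which is where OF sits within the high byte of a pushf/popf image (EFLAGS bit 11 = 8 + 3). Storing AL at mem+1 and LAHF's AH at mem+0 therefore rebuilds a normal EFLAGS layout in memory:

   /* e.g. OF = 1:  seto              -> AL = 0x01                    */
   /*               lea eax,[eax*8]   -> AL = 0x08 (bit 11 of image)  */
   /*               mov [FLAGTMP+1],al                                */
   /*               lahf              -> AH = CF(0) ZF(6) SF(7)       */
   /*               mov [FLAGTMP],ah                                  */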
3412 +
3413 + static int FLAG_NREG2_FLAGGEN = 0;
3414 + static __inline__ void raw_reg_to_flags_FLAGGEN(int r)
3415 + {
3416 +        if (have_lahf_lm) {
3417 +                raw_xchg_b_rr(0,AH_INDEX);
3418 +                raw_cmp_b_ri(r,-120); /* set V */
3419 +                raw_sahf(0);
3420 +        }
3421 +        else
3422 +                raw_reg_to_flags_FLAGSTK(r);
3423 + }
3424 +
3425 + static int FLAG_NREG3_FLAGGEN = 0;
3426 + static __inline__ void raw_flags_set_zero_FLAGGEN(int s, int tmp)
3427 + {
3428 +        if (have_lahf_lm)
3429 +                raw_flags_set_zero_FLAGREG(s, tmp);
3430 +        else
3431 +                raw_flags_set_zero_FLAGSTK(s, tmp);
3432 + }
3433 +
3434 + static __inline__ void raw_flags_init_FLAGGEN(void)
3435 + {
3436 +        if (have_lahf_lm) {
3437 +                FLAG_NREG1_FLAGGEN = FLAG_NREG1_FLAGREG;
3438 +                FLAG_NREG2_FLAGGEN = FLAG_NREG2_FLAGREG;
3439 +                FLAG_NREG3_FLAGGEN = FLAG_NREG3_FLAGREG;
3440 +        }
3441 +        else {
3442 +                FLAG_NREG1_FLAGGEN = FLAG_NREG1_FLAGSTK;
3443 +                FLAG_NREG2_FLAGGEN = FLAG_NREG2_FLAGSTK;
3444 +                FLAG_NREG3_FLAGGEN = FLAG_NREG3_FLAGSTK;
3445 +        }
3446 + }
3447 + #endif
3448 +
3449 + #ifdef SAHF_SETO_PROFITABLE
3450 + #define FLAG_SUFFIX FLAGREG
3451 + #elif defined __x86_64__
3452 + #define FLAG_SUFFIX FLAGGEN
3453 + #else
3454 + #define FLAG_SUFFIX FLAGSTK
3455   #endif
3456  
3457 + #define FLAG_GLUE_2(x, y)               x ## _ ## y
3458 + #define FLAG_GLUE_1(x, y)               FLAG_GLUE_2(x, y)
3459 + #define FLAG_GLUE(x)                    FLAG_GLUE_1(x, FLAG_SUFFIX)
3460 +
3461 + #define raw_flags_init                  FLAG_GLUE(raw_flags_init)
3462 + #define FLAG_NREG1                              FLAG_GLUE(FLAG_NREG1)
3463 + #define raw_flags_to_reg                FLAG_GLUE(raw_flags_to_reg)
3464 + #define FLAG_NREG2                              FLAG_GLUE(FLAG_NREG2)
3465 + #define raw_reg_to_flags                FLAG_GLUE(raw_reg_to_flags)
3466 + #define FLAG_NREG3                              FLAG_GLUE(FLAG_NREG3)
3467 + #define raw_flags_set_zero              FLAG_GLUE(raw_flags_set_zero)
3468 +
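The two-level FLAG_GLUE_1/FLAG_GLUE_2 indirection is the standard preprocessor trick to force FLAG_SUFFIX to expand before token pasting. With SAHF_SETO_PROFITABLE defined, for example:

   /* raw_flags_to_reg(r)                                  */
   /*   -> FLAG_GLUE(raw_flags_to_reg)(r)                  */
   /*   -> FLAG_GLUE_1(raw_flags_to_reg, FLAGREG)(r)       */
   /*   -> FLAG_GLUE_2(raw_flags_to_reg, FLAGREG)(r)       */
   /*   -> raw_flags_to_reg_FLAGREG(r)                     */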
3469   /* Apparently, there are enough instructions between flag store and
3470     flag reload to avoid the partial memory stall */
3471   static __inline__ void raw_load_flagreg(uae_u32 target, uae_u32 r)
3472   {
3473   #if 1
3474 <    raw_mov_l_rm(target,(uae_u32)live.state[r].mem);
3474 >    raw_mov_l_rm(target,(uintptr)live.state[r].mem);
3475   #else
3476 <    raw_mov_b_rm(target,(uae_u32)live.state[r].mem);
3477 <    raw_mov_b_rm(target+4,((uae_u32)live.state[r].mem)+1);
3476 >    raw_mov_b_rm(target,(uintptr)live.state[r].mem);
3477 >    raw_mov_b_rm(target+4,((uintptr)live.state[r].mem)+1);
3478   #endif
3479   }
3480  
# Line 2136 | Line 3482 | static __inline__ void raw_load_flagreg(
3482   static __inline__ void raw_load_flagx(uae_u32 target, uae_u32 r)
3483   {
3484      if (live.nat[target].canbyte)
3485 <        raw_mov_b_rm(target,(uae_u32)live.state[r].mem);
3485 >        raw_mov_b_rm(target,(uintptr)live.state[r].mem);
3486      else if (live.nat[target].canword)
3487 <        raw_mov_w_rm(target,(uae_u32)live.state[r].mem);
3487 >        raw_mov_w_rm(target,(uintptr)live.state[r].mem);
3488      else
3489 <        raw_mov_l_rm(target,(uae_u32)live.state[r].mem);
3489 >        raw_mov_l_rm(target,(uintptr)live.state[r].mem);
3490   }
3491  
3492 + static __inline__ void raw_dec_sp(int off)
3493 + {
3494 +    if (off) raw_sub_l_ri(ESP_INDEX,off);
3495 + }
3496  
3497   static __inline__ void raw_inc_sp(int off)
3498   {
3499 <    raw_add_l_ri(ESP_INDEX,off);
3499 >    if (off) raw_add_l_ri(ESP_INDEX,off);
3500   }
3501  
3502   /*************************************************************************
# Line 2305 | Line 3655 | static void vec(int x, struct sigcontext
3655                  for (i=0;i<5;i++)
3656                      vecbuf[i]=target[i];
3657                  emit_byte(0xe9);
3658 <                emit_long((uae_u32)veccode-(uae_u32)target-4);
3658 >                emit_long((uintptr)veccode-(uintptr)target-4);
3659                  write_log("Create jump to %p\n",veccode);
3660              
3661                  write_log("Handled one access!\n");
# Line 2332 | Line 3682 | static void vec(int x, struct sigcontext
3682                  }
3683                  for (i=0;i<5;i++)
3684                      raw_mov_b_mi(sc.eip+i,vecbuf[i]);
3685 <                raw_mov_l_mi((uae_u32)&in_handler,0);
3685 >                raw_mov_l_mi((uintptr)&in_handler,0);
3686                  emit_byte(0xe9);
3687 <                emit_long(sc.eip+len-(uae_u32)target-4);
3687 >                emit_long(sc.eip+len-(uintptr)target-4);
3688                  in_handler=1;
3689                  target=tmp;
3690              }
# Line 2429 | Line 3779 | enum {
3779    X86_PROCESSOR_K6,
3780    X86_PROCESSOR_ATHLON,
3781    X86_PROCESSOR_PENTIUM4,
3782 +  X86_PROCESSOR_X86_64,
3783    X86_PROCESSOR_max
3784   };
3785  
# Line 2439 | Line 3790 | static const char * x86_processor_string
3790    "PentiumPro",
3791    "K6",
3792    "Athlon",
3793 <  "Pentium4"
3793 >  "Pentium4",
3794 >  "x86-64"
3795   };
3796  
3797   static struct ptt {
# Line 2456 | Line 3808 | x86_alignments[X86_PROCESSOR_max] = {
3808    { 16, 15, 16,  7, 16 },
3809    { 32,  7, 32,  7, 32 },
3810    { 16,  7, 16,  7, 16 },
3811 <  {  0,  0,  0,  0,  0 }
3811 >  {  0,  0,  0,  0,  0 },
3812 >  { 16,  7, 16,  7, 16 }
3813   };
3814  
3815   static void
# Line 2490 | Line 3843 | x86_get_cpu_vendor(struct cpuinfo_x86 *c
3843   static void
3844   cpuid(uae_u32 op, uae_u32 *eax, uae_u32 *ebx, uae_u32 *ecx, uae_u32 *edx)
3845   {
3846 <  static uae_u8 cpuid_space[256];  
3846 >  const int CPUID_SPACE = 4096;
3847 >  uae_u8* cpuid_space = (uae_u8 *)vm_acquire(CPUID_SPACE);
3848 >  if (cpuid_space == VM_MAP_FAILED)
3849 >    abort();
3850 >  vm_protect(cpuid_space, CPUID_SPACE, VM_PAGE_READ | VM_PAGE_WRITE | VM_PAGE_EXECUTE);
3851 >
3852 >  static uae_u32 s_op, s_eax, s_ebx, s_ecx, s_edx;
3853    uae_u8* tmp=get_target();
3854  
3855 +  s_op = op;
3856    set_target(cpuid_space);
3857    raw_push_l_r(0); /* eax */
3858    raw_push_l_r(1); /* ecx */
3859    raw_push_l_r(2); /* edx */
3860    raw_push_l_r(3); /* ebx */
3861 <  raw_mov_l_rm(0,(uae_u32)&op);
3861 >  raw_mov_l_rm(0,(uintptr)&s_op);
3862    raw_cpuid(0);
3863 <  if (eax != NULL) raw_mov_l_mr((uae_u32)eax,0);
3864 <  if (ebx != NULL) raw_mov_l_mr((uae_u32)ebx,3);
3865 <  if (ecx != NULL) raw_mov_l_mr((uae_u32)ecx,1);
3866 <  if (edx != NULL) raw_mov_l_mr((uae_u32)edx,2);
3863 >  raw_mov_l_mr((uintptr)&s_eax,0);
3864 >  raw_mov_l_mr((uintptr)&s_ebx,3);
3865 >  raw_mov_l_mr((uintptr)&s_ecx,1);
3866 >  raw_mov_l_mr((uintptr)&s_edx,2);
3867    raw_pop_l_r(3);
3868    raw_pop_l_r(2);
3869    raw_pop_l_r(1);
# Line 2512 | Line 3872 | cpuid(uae_u32 op, uae_u32 *eax, uae_u32
3872    set_target(tmp);
3873  
3874    ((cpuop_func*)cpuid_space)(0);
3875 +  if (eax != NULL) *eax = s_eax;
3876 +  if (ebx != NULL) *ebx = s_ebx;
3877 +  if (ecx != NULL) *ecx = s_ecx;
3878 +  if (edx != NULL) *edx = s_edx;
3879 +
3880 +  vm_release(cpuid_space, CPUID_SPACE);
3881   }
3882  
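Note that the NULL checks have moved from the generated stub to plain C: the stub now unconditionally stores all four registers into the static s_* slots, and only the copy-out honours NULL, so the emitted code is identical for every call. Usage is unchanged, as in raw_init_cpu() below:

   uae_u32 tfms, brand_id;
   cpuid(0x00000001, &tfms, &brand_id, NULL, &c->x86_hwcap);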
3883   static void
# Line 2520 | Line 3886 | raw_init_cpu(void)
3886    struct cpuinfo_x86 *c = &cpuinfo;
3887  
3888    /* Defaults */
3889 +  c->x86_processor = X86_PROCESSOR_max;
3890    c->x86_vendor = X86_VENDOR_UNKNOWN;
3891    c->cpuid_level = -1;                          /* CPUID not detected */
3892    c->x86_model = c->x86_mask = 0;       /* So far unknown... */
# Line 2541 | Line 3908 | raw_init_cpu(void)
3908          uae_u32 tfms, brand_id;
3909          cpuid(0x00000001, &tfms, &brand_id, NULL, &c->x86_hwcap);
3910          c->x86 = (tfms >> 8) & 15;
3911 +        if (c->x86 == 0xf)
3912 +                c->x86 += (tfms >> 20) & 0xff; /* extended family */
3913          c->x86_model = (tfms >> 4) & 15;
3914 +        if (c->x86_model == 0xf)
3915 +                c->x86_model |= (tfms >> 12) & 0xf0; /* extended model */
3916          c->x86_brand_id = brand_id & 0xff;
2546        if ( (c->x86_vendor == X86_VENDOR_AMD) &&
2547                 (c->x86 == 0xf)) {
2548          /* AMD Extended Family and Model Values */
2549          c->x86 += (tfms >> 20) & 0xff;
2550          c->x86_model += (tfms >> 12) & 0xf0;
2551        }
3917          c->x86_mask = tfms & 15;
3918    } else {
3919          /* Have CPUID level 0 only - unheard of */
3920          c->x86 = 4;
3921    }
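An illustrative decode of the family/model fields above (the value is an example, not taken from the source), e.g. tfms = 0x00000F29, a Northwood-era Pentium 4:

   /* family   = (tfms >> 8) & 15            = 0xF                    */
   /*          + ((tfms >> 20) & 0xff)       = 15 + 0 -> 15           */
   /* model    = (tfms >> 4) & 15            = 2  (!= 0xF, no ext.)   */
   /* stepping = tfms & 15                   = 9                      */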
3922  
3923 +  /* AMD-defined flags: level 0x80000001 */
3924 +  uae_u32 xlvl;
3925 +  cpuid(0x80000000, &xlvl, NULL, NULL, NULL);
3926 +  if ( (xlvl & 0xffff0000) == 0x80000000 ) {
3927 +        if ( xlvl >= 0x80000001 ) {
3928 +          uae_u32 features, extra_features;
3929 +          cpuid(0x80000001, NULL, NULL, &extra_features, &features);
3930 +          if (features & (1 << 29)) {
3931 +                /* Assume x86-64 if long mode is supported */
3932 +                c->x86_processor = X86_PROCESSOR_X86_64;
3933 +          }
3934 +          if (extra_features & (1 << 0))
3935 +                  have_lahf_lm = true;
3936 +        }
3937 +  }
3938 +          
3939    /* Canonicalize processor ID */
2559  c->x86_processor = X86_PROCESSOR_max;
3940    switch (c->x86) {
3941    case 3:
3942          c->x86_processor = X86_PROCESSOR_I386;
# Line 2577 | Line 3957 | raw_init_cpu(void)
3957            c->x86_processor = X86_PROCESSOR_PENTIUMPRO;
3958          break;
3959    case 15:
3960 <        if (c->x86_vendor == X86_VENDOR_INTEL) {
3961 <          /* Assume any BrandID >= 8 and family == 15 yields a Pentium 4 */
3962 <          if (c->x86_brand_id >= 8)
3963 <                c->x86_processor = X86_PROCESSOR_PENTIUM4;
3964 <        }
3965 <        break;
3960 >          if (c->x86_processor == X86_PROCESSOR_max) {
3961 >                  switch (c->x86_vendor) {
3962 >                  case X86_VENDOR_INTEL:
3963 >                          c->x86_processor = X86_PROCESSOR_PENTIUM4;
3964 >                          break;
3965 >                  case X86_VENDOR_AMD:
3966 >                          /* Assume a 32-bit Athlon processor if not in long mode */
3967 >                          c->x86_processor = X86_PROCESSOR_ATHLON;
3968 >                          break;
3969 >                  }
3970 >          }
3971 >          break;
3972    }
3973    if (c->x86_processor == X86_PROCESSOR_max) {
3974 <        fprintf(stderr, "Error: unknown processor type\n");
3974 >        c->x86_processor = X86_PROCESSOR_I386;
3975 >        fprintf(stderr, "Error: unknown processor type, assuming i386\n");
3976          fprintf(stderr, "  Family  : %d\n", c->x86);
3977          fprintf(stderr, "  Model   : %d\n", c->x86_model);
3978          fprintf(stderr, "  Mask    : %d\n", c->x86_mask);
3979 +        fprintf(stderr, "  Vendor  : %s [%d]\n", c->x86_vendor_id, c->x86_vendor);
3980          if (c->x86_brand_id)
3981            fprintf(stderr, "  BrandID : %02x\n", c->x86_brand_id);
2594        abort();
3982    }
3983  
3984    /* Have CMOV support? */
3985 <  have_cmov = (c->x86_hwcap & (1 << 15)) && true;
3985 >  have_cmov = c->x86_hwcap & (1 << 15);
3986 > #if defined(__x86_64__)
3987 >  if (!have_cmov) {
3988 >          write_log("x86-64 implementations are bound to have CMOV!\n");
3989 >          abort();
3990 >  }
3991 > #endif
3992  
3993    /* Can the host CPU suffer from partial register stalls? */
3994    have_rat_stall = (c->x86_vendor == X86_VENDOR_INTEL);
# Line 2616 | Line 4009 | raw_init_cpu(void)
4009    write_log("Max CPUID level=%d Processor is %s [%s]\n",
4010                          c->cpuid_level, c->x86_vendor_id,
4011                          x86_processor_string_table[c->x86_processor]);
4012 +
4013 +  raw_flags_init();
4014 + }
4015 +
4016 + static bool target_check_bsf(void)
4017 + {
4018 +        bool mismatch = false;
4019 +        for (int g_ZF = 0; g_ZF <= 1; g_ZF++) {
4020 +        for (int g_CF = 0; g_CF <= 1; g_CF++) {
4021 +        for (int g_OF = 0; g_OF <= 1; g_OF++) {
4022 +        for (int g_SF = 0; g_SF <= 1; g_SF++) {
4023 +                for (int value = -1; value <= 1; value++) {
4024 +                        unsigned long flags = (g_SF << 7) | (g_OF << 11) | (g_ZF << 6) | g_CF;
4025 +                        unsigned long tmp = value;
4026 +                        __asm__ __volatile__ ("push %0; popf; bsf %1,%1; pushf; pop %0"
4027 +                                                                  : "+r" (flags), "+r" (tmp) : : "cc");
4028 +                        int OF = (flags >> 11) & 1;
4029 +                        int SF = (flags >>  7) & 1;
4030 +                        int ZF = (flags >>  6) & 1;
4031 +                        int CF = flags & 1;
4032 +                        tmp = (value == 0);
4033 +                        if (ZF != tmp || SF != g_SF || OF != g_OF || CF != g_CF)
4034 +                                mismatch = true;
4035 +                }
4036 +        }}}}
4037 +        if (mismatch)
4038 +                write_log("Target CPU defines all flags on BSF instruction\n");
4039 +        return !mismatch;
4040   }
4041  
4042  
# Line 2740 | Line 4161 | static __inline__ void tos_make(int r)
4161      emit_byte(0xd8+(live.tos+1)-live.spos[r]);  /* store top of stack in reg,
4162                                           and pop it*/
4163   }
4164 <    
4165 <        
4164 >
4165 > /* FP helper functions */
4166 > #if USE_NEW_RTASM
4167 > #define DEFINE_OP(NAME, GEN)                    \
4168 > static inline void raw_##NAME(uint32 m)         \
4169 > {                                               \
4170 >    GEN(m, X86_NOREG, X86_NOREG, 1);            \
4171 > }
4172 > DEFINE_OP(fstl,  FSTDm);
4173 > DEFINE_OP(fstpl, FSTPDm);
4174 > DEFINE_OP(fldl,  FLDDm);
4175 > DEFINE_OP(fildl, FILDLm);
4176 > DEFINE_OP(fistl, FISTLm);
4177 > DEFINE_OP(flds,  FLDSm);
4178 > DEFINE_OP(fsts,  FSTSm);
4179 > DEFINE_OP(fstpt, FSTPTm);
4180 > DEFINE_OP(fldt,  FLDTm);
4181 > #else
4182 > #define DEFINE_OP(NAME, OP1, OP2)               \
4183 > static inline void raw_##NAME(uint32 m)         \
4184 > {                                               \
4185 >    emit_byte(OP1);                             \
4186 >    emit_byte(OP2);                             \
4187 >    emit_long(m);                               \
4188 > }
4189 > DEFINE_OP(fstl,  0xdd, 0x15);
4190 > DEFINE_OP(fstpl, 0xdd, 0x1d);
4191 > DEFINE_OP(fldl,  0xdd, 0x05);
4192 > DEFINE_OP(fildl, 0xdb, 0x05);
4193 > DEFINE_OP(fistl, 0xdb, 0x15);
4194 > DEFINE_OP(flds,  0xd9, 0x05);
4195 > DEFINE_OP(fsts,  0xd9, 0x15);
4196 > DEFINE_OP(fstpt, 0xdb, 0x3d);
4197 > DEFINE_OP(fldt,  0xdb, 0x2d);
4198 > #endif
4199 > #undef DEFINE_OP
4200 >
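In the non-USE_NEW_RTASM flavour, each DEFINE_OP line expands to a fixed two-opcode-byte, disp32 emitter. DEFINE_OP(fstl, 0xdd, 0x15), for instance, becomes the following (DD /2, with ModRM 0x15 = mod 00, reg /2, rm 101 disp32, i.e. fst QWORD PTR [m]):

   static inline void raw_fstl(uint32 m)
   {
       emit_byte(0xdd);   /* FST m64 opcode                          */
       emit_byte(0x15);   /* ModRM: mod=00, reg=/2, rm=101 (disp32)  */
       emit_long(m);      /* absolute 32-bit address                 */
   }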
4201   LOWFUNC(NONE,WRITE,2,raw_fmov_mr,(MEMW m, FR r))
4202   {
4203      make_tos(r);
4204 <    emit_byte(0xdd);
2749 <    emit_byte(0x15);
2750 <    emit_long(m);
4204 >    raw_fstl(m);
4205   }
4206   LENDFUNC(NONE,WRITE,2,raw_fmov_mr,(MEMW m, FR r))
4207  
4208   LOWFUNC(NONE,WRITE,2,raw_fmov_mr_drop,(MEMW m, FR r))
4209   {
4210      make_tos(r);
4211 <    emit_byte(0xdd);
2758 <    emit_byte(0x1d);
2759 <    emit_long(m);
4211 >    raw_fstpl(m);
4212      live.onstack[live.tos]=-1;
4213      live.tos--;
4214      live.spos[r]=-2;
# Line 2765 | Line 4217 | LENDFUNC(NONE,WRITE,2,raw_fmov_mr,(MEMW
4217  
4218   LOWFUNC(NONE,READ,2,raw_fmov_rm,(FW r, MEMR m))
4219   {
4220 <    emit_byte(0xdd);
2769 <    emit_byte(0x05);
2770 <    emit_long(m);
4220 >    raw_fldl(m);
4221      tos_make(r);
4222   }
4223   LENDFUNC(NONE,READ,2,raw_fmov_rm,(FW r, MEMR m))
4224  
4225   LOWFUNC(NONE,READ,2,raw_fmovi_rm,(FW r, MEMR m))
4226   {
4227 <    emit_byte(0xdb);
2778 <    emit_byte(0x05);
2779 <    emit_long(m);
4227 >    raw_fildl(m);
4228      tos_make(r);
4229   }
4230   LENDFUNC(NONE,READ,2,raw_fmovi_rm,(FW r, MEMR m))
# Line 2784 | Line 4232 | LENDFUNC(NONE,READ,2,raw_fmovi_rm,(FW r,
4232   LOWFUNC(NONE,WRITE,2,raw_fmovi_mr,(MEMW m, FR r))
4233   {
4234      make_tos(r);
4235 <    emit_byte(0xdb);
2788 <    emit_byte(0x15);
2789 <    emit_long(m);
4235 >    raw_fistl(m);
4236   }
4237   LENDFUNC(NONE,WRITE,2,raw_fmovi_mr,(MEMW m, FR r))
4238  
4239   LOWFUNC(NONE,READ,2,raw_fmovs_rm,(FW r, MEMR m))
4240   {
4241 <    emit_byte(0xd9);
2796 <    emit_byte(0x05);
2797 <    emit_long(m);
4241 >    raw_flds(m);
4242      tos_make(r);
4243   }
4244   LENDFUNC(NONE,READ,2,raw_fmovs_rm,(FW r, MEMR m))
# Line 2802 | Line 4246 | LENDFUNC(NONE,READ,2,raw_fmovs_rm,(FW r,
4246   LOWFUNC(NONE,WRITE,2,raw_fmovs_mr,(MEMW m, FR r))
4247   {
4248      make_tos(r);
4249 <    emit_byte(0xd9);
2806 <    emit_byte(0x15);
2807 <    emit_long(m);
4249 >    raw_fsts(m);
4250   }
4251   LENDFUNC(NONE,WRITE,2,raw_fmovs_mr,(MEMW m, FR r))
4252  
# Line 2819 | Line 4261 | LOWFUNC(NONE,WRITE,2,raw_fmov_ext_mr,(ME
4261      emit_byte(0xd9);     /* Get a copy to the top of stack */
4262      emit_byte(0xc0+rs);
4263  
4264 <    emit_byte(0xdb);  /* store and pop it */
2823 <    emit_byte(0x3d);
2824 <    emit_long(m);
4264 >    raw_fstpt(m);       /* store and pop it */
4265   }
4266   LENDFUNC(NONE,WRITE,2,raw_fmov_ext_mr,(MEMW m, FR r))
4267  
# Line 2830 | Line 4270 | LOWFUNC(NONE,WRITE,2,raw_fmov_ext_mr_dro
4270      int rs;
4271  
4272      make_tos(r);
4273 <    emit_byte(0xdb);  /* store and pop it */
2834 <    emit_byte(0x3d);
2835 <    emit_long(m);
4273 >    raw_fstpt(m);       /* store and pop it */
4274      live.onstack[live.tos]=-1;
4275      live.tos--;
4276      live.spos[r]=-2;
# Line 2841 | Line 4279 | LENDFUNC(NONE,WRITE,2,raw_fmov_ext_mr,(M
4279  
4280   LOWFUNC(NONE,READ,2,raw_fmov_ext_rm,(FW r, MEMR m))
4281   {
4282 <    emit_byte(0xdb);
2845 <    emit_byte(0x2d);
2846 <    emit_long(m);
4282 >    raw_fldt(m);
4283      tos_make(r);
4284   }
4285   LENDFUNC(NONE,READ,2,raw_fmov_ext_rm,(FW r, MEMR m))
# Line 3030 | Line 4466 | LOWFUNC(NONE,NONE,2,raw_fsin_rr,(FW d, F
4466   }
4467   LENDFUNC(NONE,NONE,2,raw_fsin_rr,(FW d, FR s))
4468  
4469 < double one=1;
4469 > static const double one=1;
4470   LOWFUNC(NONE,NONE,2,raw_ftwotox_rr,(FW d, FR s))
4471   {
4472      int ds;
# Line 3050 | Line 4486 | LOWFUNC(NONE,NONE,2,raw_ftwotox_rr,(FW d
4486      emit_byte(0xe1);  /* subtract rounded from original */
4487      emit_byte(0xd9);
4488      emit_byte(0xf0);  /* f2xm1 */
4489 <    emit_byte(0xdc);
3054 <    emit_byte(0x05);
3055 <    emit_long((uae_u32)&one);  /* Add '1' without using extra stack space */
4489 >    x86_fadd_m((uintptr)&one);  /* Add '1' without using extra stack space */
4490      emit_byte(0xd9);
4491      emit_byte(0xfd);  /* and scale it */
4492      emit_byte(0xdd);
# Line 3084 | Line 4518 | LOWFUNC(NONE,NONE,2,raw_fetox_rr,(FW d,
4518      emit_byte(0xe1);  /* subtract rounded from original */
4519      emit_byte(0xd9);
4520      emit_byte(0xf0);  /* f2xm1 */
4521 <    emit_byte(0xdc);
3088 <    emit_byte(0x05);
3089 <    emit_long((uae_u32)&one);  /* Add '1' without using extra stack space */
4521 >    x86_fadd_m((uintptr)&one);  /* Add '1' without using extra stack space */
4522      emit_byte(0xd9);
4523      emit_byte(0xfd);  /* and scale it */
4524      emit_byte(0xdd);
