Commit 8ed3d6a4 authored by Catalin Marinas

Thumb-2: Implement the unified arch/arm/lib functions

This patch adds ARM/Thumb-2 unified assembly support for the arch/arm/lib/* files.
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
parent 441b91ac
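
Two idioms recur throughout the hunks below. First, the ARM() and THUMB() macros from the newly included <asm/unified.h> each emit their argument only when the kernel is built for that instruction set, so an ARM-only addressing mode and its Thumb-2 replacement can sit side by side in one source file. Second, Thumb-2 has no per-instruction condition field; a conditional instruction must be preceded by an IT (If-Then) instruction, where each 't' or 'e' after the leading 'i' predicates one following instruction on the tested condition or its inverse (so 'itte eq' covers two 'eq' instructions and one 'ne'). A minimal illustrative sketch of both patterns, not a hunk from this patch:

	@ Thumb-2 needs an IT before a conditional instruction; in ARM
	@ builds the assembler accepts the it and emits no code
	cmp	r2, #0
	it	ne
	movne	r0, #1

	@ post-indexed user-mode accesses (e.g. ldrbt rX, [rY], #1) do not
	@ exist in Thumb-2, so the access and the pointer update are split
	ARM(	ldrbt	r3, [r0], #1	)
	THUMB(	ldrbt	r3, [r0]	)
	THUMB(	add	r0, #1		)
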
@@ -25,6 +25,7 @@ along with this program; see the file COPYING. If not, write to
 the Free Software Foundation, 51 Franklin Street, Fifth Floor,
 Boston, MA 02110-1301, USA. */
+#include <asm/unified.h>
 #include <linux/linkage.h>
@@ -36,14 +37,19 @@ Boston, MA 02110-1301, USA. */
 #define ah r1
 #endif
+ .type __ashldi3, %function
 ENTRY(__ashldi3)
+ .type __aeabi_llsl, %function
 ENTRY(__aeabi_llsl)
 subs r3, r2, #32
 rsb ip, r2, #32
+ itett mi
 movmi ah, ah, lsl r2
 movpl ah, al, lsl r3
- orrmi ah, ah, al, lsr ip
+ ARM( orrmi ah, ah, al, lsr ip )
+ THUMB( lsrmi r3, al, ip )
+ THUMB( orrmi ah, ah, r3 )
 mov al, al, lsl r2
 mov pc, lr

@@ -25,6 +25,7 @@ along with this program; see the file COPYING. If not, write to
 the Free Software Foundation, 51 Franklin Street, Fifth Floor,
 Boston, MA 02110-1301, USA. */
+#include <asm/unified.h>
 #include <linux/linkage.h>
@@ -36,14 +37,19 @@ Boston, MA 02110-1301, USA. */
 #define ah r1
 #endif
+ .type __ashrdi3, %function
 ENTRY(__ashrdi3)
+ .type __aeabi_lasr, %function
 ENTRY(__aeabi_lasr)
 subs r3, r2, #32
 rsb ip, r2, #32
+ itett mi
 movmi al, al, lsr r2
 movpl al, ah, asr r3
- orrmi al, al, ah, lsl ip
+ ARM( orrmi al, al, ah, lsl ip )
+ THUMB( lslmi r3, ah, ip )
+ THUMB( orrmi al, al, r3 )
 mov ah, ah, asr r2
 mov pc, lr

@@ -10,6 +10,8 @@
 * 27/03/03 Ian Molton Clean up CONFIG_CPU
 *
 */
+#include <asm/unified.h>
+
 #include <linux/linkage.h>
 #include <asm/assembler.h>
 .text
@@ -22,10 +24,12 @@
 #define mask r7
 #define offset r8
+ .type __backtrace, %function
 ENTRY(__backtrace)
 mov r1, #0x10
 mov r0, fp
+ .type c_backtrace, %function
 ENTRY(c_backtrace)
 #if !defined(CONFIG_FRAME_POINTER) || !defined(CONFIG_PRINTK)
@@ -34,11 +38,16 @@ ENTRY(c_backtrace)
 stmfd sp!, {r4 - r8, lr} @ Save an extra register so we have a location...
 tst r1, #0x10 @ 26 or 32-bit?
- moveq mask, #0xfc000003
+ itte eq
+ ARM( moveq mask, #0xfc000003 )
+ THUMB( moveq mask, #0xfc000000 )
+ THUMB( orreq mask, #0x03 )
 movne mask, #0
 tst mask, r0
+ it ne
 movne r0, #0
 movs frame, r0
+ itt eq
 1: moveq r0, #-2
 ldmeqfd sp!, {r4 - r8, pc}
@@ -59,6 +68,7 @@ ENTRY(c_backtrace)
 mov r1, r1, lsr #10
 ldr r3, .Ldsi+4
 teq r1, r3
+ it eq
 subeq save, save, #4
 mov r0, save
 bic r1, r2, mask
@@ -70,6 +80,7 @@ ENTRY(c_backtrace)
 mov r3, r1, lsr #10
 ldr r2, .Ldsi+4
 teq r3, r2 @ Check for stmia sp!, {args}
+ itt eq
 addeq save, save, #4 @ next instruction
 bleq .Ldumpstm
@@ -78,12 +89,14 @@ ENTRY(c_backtrace)
 mov r3, r1, lsr #10
 ldr r2, .Ldsi
 teq r3, r2
+ it eq
 bleq .Ldumpstm
 /*
 * A zero next framepointer means we're done.
 */
 teq next, #0
+ it eq
 ldmeqfd sp!, {r4 - r8, pc}
 /*
@@ -124,10 +137,13 @@ ENTRY(c_backtrace)
 mov reg, #9
 mov r7, #0
 1: mov r3, #1
- tst instr, r3, lsl reg
+ ARM( tst instr, r3, lsl reg )
+ THUMB( lsl r3, reg )
+ THUMB( tst instr, r3 )
 beq 2f
 add r7, r7, #1
 teq r7, #4
+ itte eq
 moveq r7, #0
 moveq r3, #'\n'
 movne r3, #' '
@@ -138,6 +154,7 @@ ENTRY(c_backtrace)
 2: subs reg, reg, #1
 bpl 1b
 teq r7, #0
+ itt ne
 adrne r0, .Lcr
 blne printk
 mov r0, stack
......
@@ -20,7 +20,7 @@
 mov pc, lr
 .endm
- .macro testop, instr, store
+ .macro testop, instr, store, cond=al
 and r3, r0, #7 @ Get bit offset
 mov r2, #1
 add r1, r1, r0, lsr #3 @ Get byte offset
@@ -34,11 +34,15 @@
 #endif
 1: ldrexb r2, [r1]
 ands r0, r2, r3 @ save old value of bit
+ .ifnc \cond,al
+ it \cond
+ .endif
 \instr r2, r2, r3 @ toggle bit
 strexb ip, r2, [r1]
 cmp ip, #0
 bne 1b
 cmp r0, #0
+ it ne
 movne r0, #1
 2: mov pc, lr
 .endm
@@ -63,7 +67,7 @@
 * Note: we can trivially conditionalise the store instruction
 * to avoid dirtying the data cache.
 */
- .macro testop, instr, store
+ .macro testop, instr, store, cond=al
 add r1, r1, r0, lsr #3
 and r3, r0, #7
 mov r0, #1
......
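
The cond=al default that the hunks above add to testop lets one macro serve both unconditional and conditional instantiations on Thumb-2: .ifnc compares the incoming condition string against "al" and emits the IT that Thumb-2 requires only when a real condition was passed. A minimal sketch of the idiom in isolation (the macro name is illustrative, not from the patch):

	.macro	set_one reg, cond=al
	.ifnc	\cond,al		@ no IT needed for the default "al"
	it	\cond
	.endif
	mov\cond	\reg, #1
	.endm

	set_one	r0			@ expands to: mov r0, #1
	set_one	r0, ne			@ expands to: it ne; movne r0, #1
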
@@ -7,6 +7,8 @@
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */
+#include <asm/unified.h>
+
 #include <linux/linkage.h>
 #include <asm/assembler.h>
 #include "bitops.h"
@@ -15,7 +17,9 @@
 /* Purpose : Function to change a bit
 * Prototype: int change_bit(int bit, void *addr)
 */
+ .type _change_bit_be, %function
 ENTRY(_change_bit_be)
 eor r0, r0, #0x18 @ big endian byte ordering
+ .type _change_bit_le, %function
 ENTRY(_change_bit_le)
 bitop eor

@@ -7,6 +7,8 @@
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */
+#include <asm/unified.h>
+
 #include <linux/linkage.h>
 #include <asm/assembler.h>
@@ -18,6 +20,7 @@
 * : sz - number of bytes to clear
 * Returns : number of bytes NOT cleared
 */
+ .type __clear_user, %function
 ENTRY(__clear_user)
 stmfd sp!, {r1, lr}
 mov r2, #0
@@ -26,22 +29,42 @@ ENTRY(__clear_user)
 ands ip, r0, #3
 beq 1f
 cmp ip, #2
-USER( strbt r2, [r0], #1)
-USER( strlebt r2, [r0], #1)
-USER( strltbt r2, [r0], #1)
+ ARM(USER( strbt r2, [r0], #1 ))
+ THUMB(USER( strbt r2, [r0] ))
+ THUMB( add r0, #1 )
+ ARM(USER( strlebt r2, [r0], #1 ))
+ itt le
+ THUMB(USER( strlebt r2, [r0] ))
+ THUMB( addle r0, #1 )
+ ARM(USER( strltbt r2, [r0], #1 ))
+ itt lt
+ THUMB(USER( strltbt r2, [r0] ))
+ THUMB( addlt r0, #1 )
 rsb ip, ip, #4
 sub r1, r1, ip @ 7 6 5 4 3 2 1
 1: subs r1, r1, #8 @ -1 -2 -3 -4 -5 -6 -7
-USER( strplt r2, [r0], #4)
-USER( strplt r2, [r0], #4)
+ ARM(USER( strplt r2, [r0], #4 ))
+ ARM(USER( strplt r2, [r0], #4 ))
+ itttt pl
+ THUMB(USER( strplt r2, [r0] ))
+ THUMB(USER( strplt r2, [r0, #4] ))
+ THUMB( addpl r0, #8 )
 bpl 1b
 adds r1, r1, #4 @ 3 2 1 0 -1 -2 -3
-USER( strplt r2, [r0], #4)
+ ARM(USER( strplt r2, [r0], #4 ))
+ itt pl
+ THUMB(USER( strplt r2, [r0] ))
+ THUMB( addpl r0, #4 )
 2: tst r1, #2 @ 1x 1x 0x 0x 1x 1x 0x
-USER( strnebt r2, [r0], #1)
-USER( strnebt r2, [r0], #1)
+ ARM(USER( strnebt r2, [r0], #1 ))
+ ARM(USER( strnebt r2, [r0], #1 ))
+ ittt ne
+ THUMB(USER( strnebt r2, [r0] ))
+ THUMB(USER( strnebt r2, [r0, #1] ))
+ THUMB( addne r0, #2 )
 tst r1, #1 @ x1 x0 x1 x0 x1 x0 x1
-USER( strnebt r2, [r0], #1)
+ it ne
+USER( strnebt r2, [r0] )
 mov r0, #0
 ldmfd sp!, {r1, pc}
......
@@ -7,6 +7,8 @@
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */
+#include <asm/unified.h>
+
 #include <linux/linkage.h>
 #include <asm/assembler.h>
 #include "bitops.h"
@@ -16,7 +18,9 @@
 * Purpose : Function to clear a bit
 * Prototype: int clear_bit(int bit, void *addr)
 */
+ .type _clear_bit_be, %function
 ENTRY(_clear_bit_be)
 eor r0, r0, #0x18 @ big endian byte ordering
+ .type _clear_bit_le, %function
 ENTRY(_clear_bit_le)
 bitop bic

@@ -9,6 +9,7 @@
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */
+#include <asm/unified.h>
 #include <linux/linkage.h>
 #include <asm/assembler.h>
@@ -33,8 +34,18 @@
 * Number of bytes NOT copied.
 */
+#ifndef CONFIG_THUMB2_KERNEL
+#define LDR1W_SHIFT 0
+#else
+#define LDR1W_SHIFT 1
+#endif
+#define STR1W_SHIFT 0
+
 .macro ldr1w ptr reg abort
-100: ldrt \reg, [\ptr], #4
+100:
+ ARM( ldrt \reg, [\ptr], #4 )
+ THUMB( ldrt \reg, [\ptr] )
+ THUMB( add.w \ptr, \ptr, #4 )
 .section __ex_table, "a"
 .long 100b, \abort
 .previous
@@ -53,14 +64,20 @@
 .endm
 .macro ldr1b ptr reg cond=al abort
-100: ldr\cond\()bt \reg, [\ptr], #1
+ .ifnc \cond,al
+ itt \cond
+ .endif
+100:
+ ARM( ldr\cond\()bt \reg, [\ptr], #1 )
+ THUMB( ldr\cond\()bt \reg, [\ptr] )
+ THUMB( add\cond \ptr, \ptr, #1 )
 .section __ex_table, "a"
 .long 100b, \abort
 .previous
 .endm
 .macro str1w ptr reg abort
- str \reg, [\ptr], #4
+ W(str) \reg, [\ptr], #4
 .endm
 .macro str8w ptr reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort
@@ -68,6 +85,9 @@
 .endm
 .macro str1b ptr reg cond=al abort
+ .ifnc \cond,al
+ it \cond
+ .endif
 str\cond\()b \reg, [\ptr], #1
 .endm
@@ -83,6 +103,7 @@
 .text
+ .type __copy_from_user, %function
 ENTRY(__copy_from_user)
 #include "copy_template.S"
......
@@ -9,6 +9,8 @@
 *
 * ASM optimised string functions
 */
+#include <asm/unified.h>
+
 #include <linux/linkage.h>
 #include <asm/assembler.h>
 #include <asm/asm-offsets.h>
@@ -23,6 +25,7 @@
 * Note that we probably achieve closer to the 100MB/s target with
 * the core clock switching.
 */
+ .type copy_page, %function
 ENTRY(copy_page)
 stmfd sp!, {r4, lr} @ 2
 PLD( pld [r1, #0] )
@@ -39,8 +42,10 @@ ENTRY(copy_page)
 ldmia r1!, {r3, r4, ip, lr} @ 4
 subs r2, r2, #1 @ 1
 stmia r0!, {r3, r4, ip, lr} @ 4
+ itt gt
 ldmgtia r1!, {r3, r4, ip, lr} @ 4
 bgt 1b @ 1
+ PLD( itt eq )
 PLD( ldmeqia r1!, {r3, r4, ip, lr} )
 PLD( beq 2b )
 ldmfd sp!, {r4, pc} @ 3

@@ -65,6 +65,13 @@
 *
 * Restore registers with the values previously saved with the
 * 'preserv' macro. Called upon code termination.
+ *
+ * LDR1W_SHIFT
+ * STR1W_SHIFT
+ *
+ * Correction to be applied to the "ip" register when branching into
+ * the ldr1w or str1w instructions (some of these macros may expand to
+ * more than one 32bit instruction in Thumb-2)
 */
@@ -107,9 +114,13 @@
 5: ands ip, r2, #28
 rsb ip, ip, #32
- addne pc, pc, ip @ C is always clear here
+ it ne
+ addne pc, pc, ip, lsl #LDR1W_SHIFT @ C is always clear here
 b 7f
 6: nop
+ .rept (1 << LDR1W_SHIFT) - 1
+ W(nop)
+ .endr
 ldr1w r1, r3, abort=20f
 ldr1w r1, r4, abort=20f
 ldr1w r1, r5, abort=20f
@@ -118,9 +129,12 @@
 ldr1w r1, r8, abort=20f
 ldr1w r1, lr, abort=20f
- add pc, pc, ip
+ add pc, pc, ip, lsl #STR1W_SHIFT
 nop
 nop
+ .rept (1 << STR1W_SHIFT) - 1
+ W(nop)
+ .endr
 str1w r0, r3, abort=20f
 str1w r0, r4, abort=20f
 str1w r0, r5, abort=20f
......
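
Why the LDR1W_SHIFT/STR1W_SHIFT plumbing in the copy_template.S hunks above: the template handles the trailing 1-7 words with a computed branch into a run of ldr1w/str1w expansions. In ARM state every expansion is a single 4-byte instruction, so 'add pc, pc, ip' lands on the right slot directly. In the Thumb-2 __copy_from_user, ldr1w expands to two 32-bit instructions (ldrt plus add.w), so every slot doubles in size and the byte offset must be scaled to match; hence the shift, and the .rept padding that keeps the landing pad as wide as a real slot. A sketch of the arithmetic, ignoring the pc read-ahead that the template already accounts for: to advance by three slots, ip holds 12 in ARM state (3 x 4 bytes); with 8-byte Thumb-2 slots the required offset is 24, which is exactly 12 << LDR1W_SHIFT for LDR1W_SHIFT = 1:

	add	pc, pc, ip, lsl #LDR1W_SHIFT	@ scale offset to slot width
	nop					@ branch shadow
	.rept	(1 << LDR1W_SHIFT) - 1
	W(nop)					@ pad shadow to one full slot
	.endr
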
@@ -9,6 +9,7 @@
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */
+#include <asm/unified.h>
 #include <linux/linkage.h>
 #include <asm/assembler.h>
@@ -33,8 +34,15 @@
 * Number of bytes NOT copied.
 */
+#define LDR1W_SHIFT 0
+#ifndef CONFIG_THUMB2_KERNEL
+#define STR1W_SHIFT 0
+#else
+#define STR1W_SHIFT 1
+#endif
+
 .macro ldr1w ptr reg abort
- ldr \reg, [\ptr], #4
+ W(ldr) \reg, [\ptr], #4
 .endm
 .macro ldr4w ptr reg1 reg2 reg3 reg4 abort
@@ -46,11 +54,17 @@
 .endm
 .macro ldr1b ptr reg cond=al abort
+ .ifnc \cond,al
+ it \cond
+ .endif
 ldr\cond\()b \reg, [\ptr], #1
 .endm
 .macro str1w ptr reg abort
-100: strt \reg, [\ptr], #4
+100:
+ ARM( strt \reg, [\ptr], #4 )
+ THUMB( strt \reg, [\ptr] )
+ THUMB( add.w \ptr, \ptr, #4 )
 .section __ex_table, "a"
 .long 100b, \abort
 .previous
@@ -68,7 +82,13 @@
 .endm
 .macro str1b ptr reg cond=al abort
-100: str\cond\()bt \reg, [\ptr], #1
+ .ifnc \cond,al
+ itt \cond
+ .endif
+100:
+ ARM( str\cond\()bt \reg, [\ptr], #1 )
+ THUMB( str\cond\()bt \reg, [\ptr] )
+ THUMB( add\cond \ptr, \ptr, #1 )
 .section __ex_table, "a"
 .long 100b, \abort
 .previous
@@ -86,6 +106,7 @@
 .text
+ .type __copy_to_user, %function
 ENTRY(__copy_to_user)
 #include "copy_template.S"
......
@@ -7,11 +7,14 @@
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */
+#include <asm/unified.h>
+
 #include <linux/linkage.h>
 #include <asm/assembler.h>
 .text
+ .type __csum_ipv6_magic, %function
 ENTRY(__csum_ipv6_magic)
 str lr, [sp, #-4]!
 adds ip, r2, r3
......
@@ -7,6 +7,8 @@
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */
+#include <asm/unified.h>
+
 #include <linux/linkage.h>
 #include <asm/assembler.h>
@@ -39,6 +41,7 @@ td3 .req lr
 /* we must have at least one byte. */
 tst buf, #1 @ odd address?
+ itttt ne
 movne sum, sum, ror #8
 ldrneb td0, [buf], #1
 subne len, len, #1
@@ -68,25 +71,30 @@ td3 .req lr
 bne .Lless8_wordlp
 .Lless8_byte: tst len, #1 @ odd number of bytes
+ itt ne
 ldrneb td0, [buf], #1 @ include last byte
 adcnes sum, sum, td0, put_byte_0 @ update checksum
 .Ldone: adc r0, sum, #0 @ collect up the last carry
 ldr td0, [sp], #4
 tst td0, #1 @ check buffer alignment
+ it ne
 movne r0, r0, ror #8 @ rotate checksum by 8 bits
 ldr pc, [sp], #4 @ return
 .Lnot_aligned: tst buf, #1 @ odd address
+ ittt ne
 ldrneb td0, [buf], #1 @ make even
 subne len, len, #1
 adcnes sum, sum, td0, put_byte_1 @ update checksum
 tst buf, #2 @ 32-bit aligned?
 #if __LINUX_ARM_ARCH__ >= 4
+ itt ne
 ldrneh td0, [buf], #2 @ make 32-bit aligned
 subne len, len, #2
 #else
+ itttt ne
 ldrneb td0, [buf], #1
 ldrneb ip, [buf], #1
 subne len, len, #2
@@ -96,19 +104,23 @@ td3 .req lr
 orrne td0, ip, td0, lsl #8
 #endif
 #endif
+ it ne
 adcnes sum, sum, td0 @ update checksum
 mov pc, lr
+ .type csum_partial, %function
 ENTRY(csum_partial)
 stmfd sp!, {buf, lr}
 cmp len, #8 @ Ensure that we have at least
 blo .Lless8 @ 8 bytes to copy.
 tst buf, #1
+ it ne
 movne sum, sum, ror #8
 adds sum, sum, #0 @ C = 0
 tst buf, #3 @ Test destination alignment
+ it ne
 blne .Lnot_aligned @ align destination, return here
 1: bics ip, len, #31
......
@@ -7,6 +7,8 @@
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */
+#include <asm/unified.h>
+
 #include <linux/linkage.h>
 #include <asm/assembler.h>
@@ -18,13 +20,15 @@
 */
 .macro save_regs
- mov ip, sp
- stmfd sp!, {r1, r4 - r8, fp, ip, lr, pc}
- sub fp, ip, #4
+ ARM( mov ip, sp )
+ ARM( stmfd sp!, {r1, r4 - r8, fp, ip, lr, pc} )
+ ARM( sub fp, ip, #4 )
+ THUMB( stmfd sp!, {r1, r4 - r8, lr} )
 .endm
 .macro load_regs
- ldmfd sp, {r1, r4 - r8, fp, sp, pc}
+ ARM( ldmfd sp, {r1, r4 - r8, fp, sp, pc} )
+ THUMB( ldmfd sp!, {r1, r4 - r8, pc} )
 .endm
 .macro load1b, reg1
......
@@ -16,6 +16,8 @@
 *
 * Note that 'tst' and 'teq' preserve the carry flag.
 */
+#include <asm/unified.h>
+
 src .req r0
 dst .req r1
@@ -40,6 +42,7 @@ sum .req r3
 adcs sum, sum, ip, put_byte_1 @ update checksum
 strb ip, [dst], #1
 tst dst, #2
+ it eq
 moveq pc, lr @ dst is now 32bit aligned
 .Ldst_16bit: load2b r8, ip
@@ -94,6 +97,7 @@ FN_ENTRY
 adds sum, sum, #0 @ C = 0
 tst dst, #3 @ Test destination alignment
+ it ne
 blne .Ldst_unaligned @ align destination, return here
 /*
@@ -147,6 +151,7 @@ FN_ENTRY
 strb r5, [dst], #1
 mov r5, r4, get_byte_2
 .Lexit: tst len, #1
+ ittt ne
 strneb r5, [dst], #1
 andne r5, r5, #255
 adcnes sum, sum, r5, put_byte_0
@@ -160,6 +165,7 @@ FN_ENTRY
 .Ldone: adc r0, sum, #0
 ldr sum, [sp, #0] @ dst
 tst sum, #1
+ it ne
 movne r0, r0, ror #8
 load_regs
......
@@ -10,6 +10,8 @@
 * 27/03/03 Ian Molton Clean up CONFIG_CPU
 *
 */
+#include <asm/unified.h>
+
 #include <linux/linkage.h>
 #include <asm/assembler.h>
 #include <asm/errno.h>
@@ -18,17 +20,22 @@
 .text
 .macro save_regs
- mov ip, sp
- stmfd sp!, {r1 - r2, r4 - r8, fp, ip, lr, pc}
- sub fp, ip, #4
+ ARM( mov ip, sp )
+ ARM( stmfd sp!, {r1 - r2, r4 - r8, fp, ip, lr, pc} )
+ ARM( sub fp, ip, #4 )
+ THUMB( stmfd sp!, {r1, r2, r4 - r8, lr} )
 .endm
 .macro load_regs
- ldmfd sp, {r1, r2, r4-r8, fp, sp, pc}
+ ARM( ldmfd sp, {r1, r2, r4-r8, fp, sp, pc} )
+ THUMB( ldmfd sp!, {r1, r2, r4 - r8, pc} )
 .endm
 .macro load1b, reg1
-9999: ldrbt \reg1, [r0], $1
+9999:
+ ARM( ldrbt \reg1, [r0], $1 )
+ THUMB( ldrbt \reg1, [r0] )
+ THUMB( add \reg1, $1 )
 .section __ex_table, "a"
 .align 3
 .long 9999b, 6001f
@@ -36,8 +43,14 @@
 .endm
 .macro load2b, reg1, reg2
-9999: ldrbt \reg1, [r0], $1
-9998: ldrbt \reg2, [r0], $1
+9999:
+ ARM( ldrbt \reg1, [r0], $1 )
+ THUMB( ldrbt \reg1, [r0] )
+ THUMB( add \reg1, $1 )
+9998:
+ ARM( ldrbt \reg2, [r0], $1 )
+ THUMB( ldrbt \reg2, [r0] )
+ THUMB( add \reg2, $1 )
 .section __ex_table, "a"
 .long 9999b, 6001f
 .long 9998b, 6001f
@@ -45,7 +58,10 @@
 .endm
 .macro load1l, reg1
-9999: ldrt \reg1, [r0], $4
+9999:
+ ARM( ldrt \reg1, [r0], $4 )
+ THUMB( ldrt \reg1, [r0] )
+ THUMB( add \reg1, $4 )
 .section __ex_table, "a"
 .align 3
 .long 9999b, 6001f
@@ -53,8 +69,14 @@
 .endm
 .macro load2l, reg1, reg2
-9999: ldrt \reg1, [r0], $4
-9998: ldrt \reg2, [r0], $4
+9999:
+ ARM( ldrt \reg1, [r0], $4 )
+ THUMB( ldrt \reg1, [r0] )
+ THUMB( add \reg1, $4 )
+9998:
+ ARM( ldrt \reg2, [r0], $4 )
+ THUMB( ldrt \reg2, [r0] )
+ THUMB( add \reg2, $4 )
 .section __ex_table, "a"
 .long 9999b, 6001f
 .long 9998b, 6001f
@@ -62,10 +84,22 @@
 .endm
 .macro load4l, reg1, reg2, reg3, reg4
-9999: ldrt \reg1, [r0], $4
-9998: ldrt \reg2, [r0], $4
-9997: ldrt \reg3, [r0], $4
-9996: ldrt \reg4, [r0], $4
+9999:
+ ARM( ldrt \reg1, [r0], $4 )
+ THUMB( ldrt \reg1, [r0] )
+ THUMB( add \reg1, $4 )
+9998:
+ ARM( ldrt \reg2, [r0], $4 )
+ THUMB( ldrt \reg2, [r0] )
+ THUMB( add \reg2, $4 )
+9997:
+ ARM( ldrt \reg3, [r0], $4 )
+ THUMB( ldrt \reg3, [r0] )
+ THUMB( add \reg3, $4 )
+9996:
+ ARM( ldrt \reg4, [r0], $4 )
+ THUMB( ldrt \reg4, [r0] )
+ THUMB( add \reg4, $4 )
 .section __ex_table, "a"
 .long 9999b, 6001f
 .long 9998b, 6001f
@@ -100,6 +134,7 @@
 add r2, r2, r1
 mov r0, #0 @ zero the buffer
 6002: teq r2, r1
+ it ne
 strneb r0, [r1], #1
 bne 6002b
 load_regs
......
@@ -7,6 +7,8 @@
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */
+#include <asm/unified.h>
+
 #include <linux/linkage.h>
 #include <asm/assembler.h>
 #include <asm/param.h>
@@ -21,9 +23,11 @@
 * HZ <= 1000
 */
+ .type __udelay, %function
 ENTRY(__udelay)
 ldr r2, .LC1
 mul r0, r2, r0
+ .type __const_udelay, %function
 ENTRY(__const_udelay) @ 0 <= r0 <= 0x7fffff06
 ldr r2, .LC0
 ldr r2, [r2] @ max = 0x01ffffff
@@ -31,6 +35,7 @@ ENTRY(__const_udelay) @ 0 <= r0 <= 0x7fffff06
 mov r2, r2, lsr #10 @ max = 0x00007fff
 mul r0, r2, r0 @ max = 2^32-1
 movs r0, r0, lsr #6
+ it eq
 moveq pc, lr
 /*
@@ -40,6 +45,7 @@ ENTRY(__const_udelay) @ 0 <= r0 <= 0x7fffff06
 */
 @ Delay routine
+ .type __delay, %function
 ENTRY(__delay)
 subs r0, r0, #1
 #if 0
@@ -58,5 +64,6 @@ ENTRY(__delay)
 movls pc, lr
 subs r0, r0, #1
 #endif
+ it hi
 bhi __delay
 mov pc, lr

@@ -11,6 +11,7 @@
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */
+#include <asm/unified.h>
 #include <linux/linkage.h>
@@ -43,6 +44,7 @@
 * Clobbered regs: xl, ip
 */
+ .type __do_div64, %function
 ENTRY(__do_div64)
 @ Test for easy paths first.
@@ -84,8 +86,10 @@ ENTRY(__do_div64)
 @ The division loop for needed upper bit positions.
 @ Break out early if dividend reaches 0.
 2: cmp xh, yl
+ itt cs
 orrcs yh, yh, ip
 subcss xh, xh, yl
+ it ne
 movnes ip, ip, lsr #1
 mov yl, yl, lsr #1
 bne 2b
@@ -93,7 +97,9 @@ ENTRY(__do_div64)
 @ See if we need to handle lower 32-bit result.
 3: cmp xh, #0
 mov yl, #0
+ it eq
 cmpeq xl, r4
+ itt lo
 movlo xh, xl
 movlo pc, lr
@@ -104,7 +110,9 @@ ENTRY(__do_div64)
 4: movs xl, xl, lsl #1
 adcs xh, xh, xh
 beq 6f
+ it cc
 cmpcc xh, r4
+ itt cs
 5: orrcs yl, yl, ip
 subcs xh, xh, r4
 movs ip, ip, lsr #1
@@ -116,6 +124,7 @@ ENTRY(__do_div64)
 @ Otherwise, if lower part is also null then we are done.
 6: bcs 5b
 cmp xl, #0
+ it eq
 moveq pc, lr
 @ We still have remainder bits in the low part. Bring them up.
@@ -177,13 +186,16 @@ ENTRY(__do_div64)
 mov yh, xh, lsr ip
 mov yl, xl, lsr ip
 rsb ip, ip, #32
- orr yl, yl, xh, lsl ip
+ ARM( orr yl, yl, xh, lsl ip )
+ THUMB( lsl xh, xh, ip )
+ THUMB( orr yl, yl, xh )
 mov xh, xl, lsl ip
 mov xh, xh, lsr ip
 mov pc, lr
 @ eq -> division by 1: obvious enough...
-9: moveq yl, xl
+9: itttt eq
+ moveq yl, xl
 moveq yh, xh
 moveq xh, #0
 moveq pc, lr
......
@@ -13,6 +13,8 @@
 * also call with zero size.
 * Reworked by rmk.
 */
+#include <asm/unified.h>
+
 #include <linux/linkage.h>
 #include <asm/assembler.h>
 .text
@@ -21,11 +23,15 @@
 * Purpose : Find a 'zero' bit
 * Prototype: int find_first_zero_bit(void *addr, unsigned int maxbit);
 */
+ .type _find_first_zero_bit_le, %function
 ENTRY(_find_first_zero_bit_le)
 teq r1, #0
 beq 3f
 mov r2, #0
-1: ldrb r3, [r0, r2, lsr #3]
+1:
+ ARM( ldrb r3, [r0, r2, lsr #3] )
+ THUMB( lsr r3, r2, #3 )
+ THUMB( ldrb r3, [r0, r3] )
 eors r3, r3, #0xff @ invert bits
 bne .L_found @ any now set - found zero bit
 add r2, r2, #8 @ next bit pointer
@@ -38,12 +44,15 @@ ENTRY(_find_first_zero_bit_le)
 * Purpose : Find next 'zero' bit
 * Prototype: int find_next_zero_bit(void *addr, unsigned int maxbit, int offset)
 */
+ .type _find_next_zero_bit_le, %function
 ENTRY(_find_next_zero_bit_le)
 teq r1, #0
 beq 3b
 ands ip, r2, #7
 beq 1b @ If new byte, goto old routine
- ldrb r3, [r0, r2, lsr #3]
+ ARM( ldrb r3, [r0, r2, lsr #3] )
+ THUMB( lsr r3, r2, #3 )
+ THUMB( ldrb r3, [r0, r3] )
 eor r3, r3, #0xff @ now looking for a 1 bit
 movs r3, r3, lsr ip @ shift off unused bits
 bne .L_found
@@ -55,11 +64,15 @@ ENTRY(_find_next_zero_bit_le)
 * Purpose : Find a 'one' bit
 * Prototype: int find_first_bit(const unsigned long *addr, unsigned int maxbit);
 */
+ .type _find_first_bit_le, %function
 ENTRY(_find_first_bit_le)
 teq r1, #0
 beq 3f
 mov r2, #0
-1: ldrb r3, [r0, r2, lsr #3]
+1:
+ ARM( ldrb r3, [r0, r2, lsr #3] )
+ THUMB( lsr r3, r2, #3 )
+ THUMB( ldrb r3, [r0, r3] )
 movs r3, r3
 bne .L_found @ any now set - found zero bit
 add r2, r2, #8 @ next bit pointer
@@ -72,12 +85,15 @@ ENTRY(_find_first_bit_le)
 * Purpose : Find next 'one' bit
 * Prototype: int find_next_bit(void *addr, unsigned int maxbit, int offset)
 */
+ .type _find_next_bit_le, %function
 ENTRY(_find_next_bit_le)
 teq r1, #0
 beq 3b
 ands ip, r2, #7
 beq 1b @ If new byte, goto old routine
- ldrb r3, [r0, r2, lsr #3]
+ ARM( ldrb r3, [r0, r2, lsr #3] )
+ THUMB( lsr r3, r2, #3 )
+ THUMB( ldrb r3, [r0, r3] )
 movs r3, r3, lsr ip @ shift off unused bits
 bne .L_found
 orr r2, r2, #7 @ if zero, then no bits here
@@ -86,12 +102,15 @@ ENTRY(_find_next_bit_le)
 #ifdef __ARMEB__
+ .type _find_first_zero_bit_be, %function
 ENTRY(_find_first_zero_bit_be)
 teq r1, #0
 beq 3f
 mov r2, #0
 1: eor r3, r2, #0x18 @ big endian byte ordering
- ldrb r3, [r0, r3, lsr #3]
+ ARM( ldrb r3, [r0, r3, lsr #3] )
+ THUMB( lsr r3, #3 )
+ THUMB( ldrb r3, [r0, r3] )
 eors r3, r3, #0xff @ invert bits
 bne .L_found @ any now set - found zero bit
 add r2, r2, #8 @ next bit pointer
@@ -100,13 +119,16 @@ ENTRY(_find_first_zero_bit_be)
 3: mov r0, r1 @ no free bits
 mov pc, lr
+ .type _find_next_zero_bit_be, %function
 ENTRY(_find_next_zero_bit_be)
 teq r1, #0
 beq 3b
 ands ip, r2, #7
 beq 1b @ If new byte, goto old routine
 eor r3, r2, #0x18 @ big endian byte ordering
- ldrb r3, [r0, r3, lsr #3]
+ ARM( ldrb r3, [r0, r3, lsr #3] )
+ THUMB( lsr r3, #3 )
+ THUMB( ldrb r3, [r0, r3] )
 eor r3, r3, #0xff @ now looking for a 1 bit
 movs r3, r3, lsr ip @ shift off unused bits
 bne .L_found
@@ -114,12 +136,15 @@ ENTRY(_find_next_zero_bit_be)
 add r2, r2, #1 @ align bit pointer
 b 2b @ loop for next bit
+ .type _find_first_bit_be, %function
 ENTRY(_find_first_bit_be)
 teq r1, #0
 beq 3f
 mov r2, #0
 1: eor r3, r2, #0x18 @ big endian byte ordering
- ldrb r3, [r0, r3, lsr #3]
+ ARM( ldrb r3, [r0, r3, lsr #3] )
+ THUMB( lsr r3, #3 )
+ THUMB( ldrb r3, [r0, r3] )
 movs r3, r3
 bne .L_found @ any now set - found zero bit
 add r2, r2, #8 @ next bit pointer
@@ -128,13 +153,16 @@ ENTRY(_find_first_bit_be)
 3: mov r0, r1 @ no free bits
 mov pc, lr
+ .type _find_next_bit_be, %function
 ENTRY(_find_next_bit_be)
 teq r1, #0
 beq 3b
 ands ip, r2, #7
 beq 1b @ If new byte, goto old routine
 eor r3, r2, #0x18 @ big endian byte ordering
- ldrb r3, [r0, r3, lsr #3]
+ ARM( ldrb r3, [r0, r3, lsr #3] )
+ THUMB( lsr r3, #3 )
+ THUMB( ldrb r3, [r0, r3] )
 movs r3, r3, lsr ip @ shift off unused bits
 bne .L_found
 orr r2, r2, #7 @ if zero, then no bits here
......
@@ -26,19 +26,26 @@
 * Note that ADDR_LIMIT is either 0 or 0xc0000000.
 * Note also that it is intended that __get_user_bad is not global.
 */
+#include <asm/unified.h>
+
 #include <asm/asm-offsets.h>
 #include <asm/thread_info.h>
 #include <asm/errno.h>
 .global __get_user_1
+ .type __get_user_1, %function
 __get_user_1:
 1: ldrbt r2, [r0]
 mov r0, #0
 mov pc, lr
 .global __get_user_2
+ .type __get_user_2, %function
 __get_user_2:
-2: ldrbt r2, [r0], #1
+2:
+ ARM( ldrbt r2, [r0], #1 )
+ THUMB( ldrbt r2, [r0] )
+ THUMB( add r0, #1 )
 3: ldrbt r3, [r0]
 #ifndef __ARMEB__
 orr r2, r2, r3, lsl #8
@@ -49,11 +56,13 @@ __get_user_2:
 mov pc, lr
 .global __get_user_4
+ .type __get_user_4, %function
 __get_user_4:
 4: ldrt r2, [r0]
 mov r0, #0
 mov pc, lr
+ .type __get_user_bad, %function
 __get_user_bad:
 mov r2, #0
 mov r0, #-EFAULT
......
@@ -7,24 +7,31 @@
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */
+#include <asm/unified.h>
+
 #include <linux/linkage.h>
 #include <asm/assembler.h>
 .Linsb_align: rsb ip, ip, #4
 cmp ip, r2
+ it gt
 movgt ip, r2
 cmp ip, #2
 ldrb r3, [r0]
 strb r3, [r1], #1
+ itt ge
 ldrgeb r3, [r0]
 strgeb r3, [r1], #1
+ itt gt
 ldrgtb r3, [r0]
 strgtb r3, [r1], #1
 subs r2, r2, ip
 bne .Linsb_aligned
+ .type __raw_readsb, %function
 ENTRY(__raw_readsb)
 teq r2, #0 @ do we have to check for the zero len?
+ it eq
 moveq pc, lr
 ands ip, r1, #3
 bne .Linsb_align
@@ -72,6 +79,7 @@ ENTRY(__raw_readsb)
 bpl .Linsb_16_lp
 tst r2, #15
+ it eq
 ldmeqfd sp!, {r4 - r6, pc}
 .Linsb_no_16: tst r2, #8
@@ -109,13 +117,16 @@ ENTRY(__raw_readsb)
 str r3, [r1], #4
 .Linsb_no_4: ands r2, r2, #3
+ it eq
 ldmeqfd sp!, {r4 - r6, pc}
 cmp r2, #2
 ldrb r3, [r0]
 strb r3, [r1], #1
+ itt ge
 ldrgeb r3, [r0]
 strgeb r3, [r1], #1
+ itt gt
 ldrgtb r3, [r0]
 strgtb r3, [r1]
......
@@ -7,11 +7,15 @@
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */
+#include <asm/unified.h>
+
 #include <linux/linkage.h>
 #include <asm/assembler.h>
+ .type __raw_readsl, %function
 ENTRY(__raw_readsl)
 teq r2, #0 @ do we have to check for the zero len?
+ it eq
 moveq pc, lr
 ands ip, r1, #3
 bne 3f
@@ -28,9 +32,11 @@ ENTRY(__raw_readsl)
 bpl 1b
 ldmfd sp!, {r4, lr}
 2: movs r2, r2, lsl #31
+ ittt cs
 ldrcs r3, [r0, #0]
 ldrcs ip, [r0, #0]
 stmcsia r1!, {r3, ip}
+ itt ne
 ldrne r3, [r0, #0]
 strne r3, [r1, #0]
 mov pc, lr
@@ -48,6 +54,7 @@ ENTRY(__raw_readsl)
 4: subs r2, r2, #1
 mov ip, r3, pull #24
+ itttt ne
 ldrne r3, [r0]
 orrne ip, ip, r3, push #8
 strne ip, [r1], #4
@@ -56,6 +63,7 @@ ENTRY(__raw_readsl)
 5: subs r2, r2, #1
 mov ip, r3, pull #16
+ itttt ne
 ldrne r3, [r0]
 orrne ip, ip, r3, push #16
 strne ip, [r1], #4
@@ -64,6 +72,7 @@ ENTRY(__raw_readsl)
 6: subs r2, r2, #1
 mov ip, r3, pull #8
+ itttt ne
 ldrne r3, [r0]
 orrne ip, ip, r3, push #24
 strne ip, [r1], #4
......
@@ -7,6 +7,8 @@
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */
+#include <asm/unified.h>
+
 #include <linux/linkage.h>
 #include <asm/assembler.h>
@@ -24,8 +26,10 @@
 sub r2, r2, #1
 strh ip, [r1], #2
+ .type __raw_readsw, %function
 ENTRY(__raw_readsw)
 teq r2, #0
+ it eq
 moveq pc, lr
 tst r1, #3
 bne .Linsw_align
@@ -76,7 +80,8 @@ ENTRY(__raw_readsw)
 pack r3, r3, ip
 str r3, [r1], #4
-.Lno_insw_2: ldrneh r3, [r0]
+.Lno_insw_2: itt ne
+ ldrneh r3, [r0]
 strneh r3, [r1]
 ldmfd sp!, {r4, r5, pc}
@@ -94,6 +99,7 @@ ENTRY(__raw_readsw)
 #endif
 .Linsw_noalign: stmfd sp!, {r4, lr}
+ it cc
 ldrccb ip, [r1, #-1]!
 bcc 1f
@@ -121,6 +127,7 @@ ENTRY(__raw_readsw)
 3: tst r2, #1
 strb ip, [r1], #1
+ itttt ne
 ldrneh ip, [r0]
 _BE_ONLY_( movne ip, ip, ror #8 )
 strneb ip, [r1], #1
......
@@ -7,6 +7,8 @@
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */
+#include <asm/unified.h>
+
 #include <linux/linkage.h>
 #include <asm/assembler.h>
@@ -32,19 +34,24 @@
 .Loutsb_align: rsb ip, ip, #4
 cmp ip, r2
+ it gt
 movgt ip, r2
 cmp ip, #2
 ldrb r3, [r1], #1
 strb r3, [r0]
+ itt ge
 ldrgeb r3, [r1], #1
 strgeb r3, [r0]
+ itt gt
 ldrgtb r3, [r1], #1
 strgtb r3, [r0]
 subs r2, r2, ip
 bne .Loutsb_aligned
+ .type __raw_writesb, %function
 ENTRY(__raw_writesb)
 teq r2, #0 @ do we have to check for the zero len?
+ it eq
 moveq pc, lr
 ands ip, r1, #3
 bne .Loutsb_align
@@ -64,6 +71,7 @@ ENTRY(__raw_writesb)
 bpl .Loutsb_16_lp
 tst r2, #15
+ it eq
 ldmeqfd sp!, {r4, r5, pc}
 .Loutsb_no_16: tst r2, #8
@@ -80,13 +88,16 @@ ENTRY(__raw_writesb)
 outword r3
 .Loutsb_no_4: ands r2, r2, #3
+ it eq
 ldmeqfd sp!, {r4, r5, pc}
 cmp r2, #2
 ldrb r3, [r1], #1
 strb r3, [r0]
+ itt ge
 ldrgeb r3, [r1], #1
 strgeb r3, [r0]
+ itt gt
 ldrgtb r3, [r1]
 strgtb r3, [r0]
......
@@ -7,11 +7,15 @@
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */
+#include <asm/unified.h>
+
 #include <linux/linkage.h>
 #include <asm/assembler.h>
+ .type __raw_writesl, %function
 ENTRY(__raw_writesl)
 teq r2, #0 @ do we have to check for the zero len?
+ it eq
 moveq pc, lr
 ands ip, r1, #3
 bne 3f
@@ -28,10 +32,14 @@ ENTRY(__raw_writesl)
 bpl 1b
 ldmfd sp!, {r4, lr}
 2: movs r2, r2, lsl #31
+ itt cs
 ldmcsia r1!, {r3, ip}
 strcs r3, [r0, #0]
+ it ne
 ldrne r3, [r1, #0]
+ it cs
 strcs ip, [r0, #0]
+ it ne
 strne r3, [r0, #0]
 mov pc, lr
......
@@ -7,6 +7,8 @@
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */
+#include <asm/unified.h>
+
 #include <linux/linkage.h>
 #include <asm/assembler.h>
@@ -29,8 +31,10 @@
 sub r2, r2, #1
 strh r3, [r0]
+ .type __raw_writesw, %function
 ENTRY(__raw_writesw)
 teq r2, #0
+ it eq
 moveq pc, lr
 ands r3, r1, #3
 bne .Loutsw_align
@@ -61,7 +65,8 @@ ENTRY(__raw_writesw)
 ldr r3, [r1], #4
 outword r3
-.Lno_outsw_2: ldrneh r3, [r1]
+.Lno_outsw_2: itt ne
+ ldrneh r3, [r1]
 strneh r3, [r0]
 ldmfd sp!, {r4, r5, pc}
@@ -75,7 +80,11 @@ ENTRY(__raw_writesw)
 #endif
 .Loutsw_noalign:
- ldr r3, [r1, -r3]!
+ ARM( ldr r3, [r1, -r3]! )
+ THUMB( rsb r3, r3, #0 )
+ THUMB( ldr r3, [r1, r3] )
+ THUMB( sub r1, r3 )
+ it cs
 subcs r2, r2, #1
 bcs 2f
 subs r2, r2, #2
@@ -91,6 +100,7 @@ ENTRY(__raw_writesw)
 bpl 1b
 tst r2, #1
-3: movne ip, r3, lsr #8
+3: itt ne
+ movne ip, r3, lsr #8
 strneh ip, [r0]
 mov pc, lr

@@ -31,6 +31,7 @@ You should have received a copy of the GNU General Public License
 along with this program; see the file COPYING. If not, write to
 the Free Software Foundation, 59 Temple Place - Suite 330,
 Boston, MA 02111-1307, USA. */
+#include <asm/unified.h>
 #include <linux/linkage.h>
@@ -56,6 +57,7 @@ Boston, MA 02111-1307, USA. */
 @ at the left end of each 4 bit nibbles in the division loop
 @ to save one loop in most cases.
 tst \divisor, #0xe0000000
+ itte eq
 moveq \divisor, \divisor, lsl #3
 moveq \curbit, #8
 movne \curbit, #1
@@ -65,6 +67,7 @@ Boston, MA 02111-1307, USA. */
 @ division loop. Continue shifting until the divisor is
 @ larger than the dividend.
 1: cmp \divisor, #0x10000000
+ ittt lo
 cmplo \divisor, \dividend
 movlo \divisor, \divisor, lsl #4
 movlo \curbit, \curbit, lsl #4
@@ -73,6 +76,7 @@ Boston, MA 02111-1307, USA. */
 @ For very big divisors, we must shift it a bit at a time, or
 @ we will be in danger of overflowing.
 1: cmp \divisor, #0x80000000
+ ittt lo
 cmplo \divisor, \dividend
 movlo \divisor, \divisor, lsl #1
 movlo \curbit, \curbit, lsl #1
@@ -84,19 +88,25 @@ Boston, MA 02111-1307, USA. */
 @ Division loop
 1: cmp \dividend, \divisor
+ itt hs
 subhs \dividend, \dividend, \divisor
 orrhs \result, \result, \curbit
 cmp \dividend, \divisor, lsr #1
+ itt hs
 subhs \dividend, \dividend, \divisor, lsr #1
 orrhs \result, \result, \curbit, lsr #1
 cmp \dividend, \divisor, lsr #2
+ itt hs
 subhs \dividend, \dividend, \divisor, lsr #2
 orrhs \result, \result, \curbit, lsr #2
 cmp \dividend, \divisor, lsr #3
+ itt hs
 subhs \dividend, \dividend, \divisor, lsr #3
 orrhs \result, \result, \curbit, lsr #3
 cmp \dividend, #0 @ Early termination?
+ it ne
 movnes \curbit, \curbit, lsr #4 @ No, any more bits to do?
+ it ne
 movne \divisor, \divisor, lsr #4
 bne 1b
@@ -113,19 +123,24 @@ Boston, MA 02111-1307, USA. */
 #else
 cmp \divisor, #(1 << 16)
+ itt hs
 movhs \divisor, \divisor, lsr #16
 movhs \order, #16
+ it lo
 movlo \order, #0
 cmp \divisor, #(1 << 8)
+ itt hs
 movhs \divisor, \divisor, lsr #8
 addhs \order, \order, #8
 cmp \divisor, #(1 << 4)
+ itt hs
 movhs \divisor, \divisor, lsr #4
 addhs \order, \order, #4
 cmp \divisor, #(1 << 2)
+ ite hi
 addhi \order, \order, #3
 addls \order, \order, \divisor, lsr #1
@@ -152,6 +167,7 @@ Boston, MA 02111-1307, USA. */
 @ division loop. Continue shifting until the divisor is
 @ larger than the dividend.
 1: cmp \divisor, #0x10000000
+ ittt lo
 cmplo \divisor, \dividend
 movlo \divisor, \divisor, lsl #4
 addlo \order, \order, #4
@@ -160,6 +176,7 @@ Boston, MA 02111-1307, USA. */
 @ For very big divisors, we must shift it a bit at a time, or
 @ we will be in danger of overflowing.
 1: cmp \divisor, #0x80000000
+ ittt lo
 cmplo \divisor, \dividend
 movlo \divisor, \divisor, lsl #1
 addlo \order, \order, #1
@@ -173,19 +190,25 @@ Boston, MA 02111-1307, USA. */
 blt 2f
 1: cmp \dividend, \divisor
+ it hs
 subhs \dividend, \dividend, \divisor
 cmp \dividend, \divisor, lsr #1
+ it hs
 subhs \dividend, \dividend, \divisor, lsr #1
 cmp \dividend, \divisor, lsr #2
+ it hs
 subhs \dividend, \dividend, \divisor, lsr #2
 cmp \dividend, \divisor, lsr #3
+ it hs
 subhs \dividend, \dividend, \divisor, lsr #3
 cmp \dividend, #1
 mov \divisor, \divisor, lsr #4
+ it ge
 subges \order, \order, #4
 bge 1b
 tst \order, #3
+ it ne
 teqne \dividend, #0
 beq 5f
@@ -194,21 +217,27 @@ Boston, MA 02111-1307, USA. */
 blt 4f
 beq 3f
 cmp \dividend, \divisor
+ it hs
 subhs \dividend, \dividend, \divisor
 mov \divisor, \divisor, lsr #1
 3: cmp \dividend, \divisor
+ it hs
 subhs \dividend, \dividend, \divisor
 mov \divisor, \divisor, lsr #1
 4: cmp \dividend, \divisor
+ it hs
 subhs \dividend, \dividend, \divisor
 5:
 .endm
+ .type __udivsi3, %function
 ENTRY(__udivsi3)
+ .type __aeabi_uidiv, %function
 ENTRY(__aeabi_uidiv)
 subs r2, r1, #1
+ it eq
 moveq pc, lr
 bcc Ldiv0
 cmp r0, r1
@@ -221,7 +250,8 @@ ENTRY(__aeabi_uidiv)
 mov r0, r2
 mov pc, lr
-11: moveq r0, #1
+11: ite eq
+ moveq r0, #1
 movne r0, #0
 mov pc, lr
@@ -231,14 +261,19 @@ ENTRY(__aeabi_uidiv)
 mov pc, lr
+ .type __umodsi3, %function
 ENTRY(__umodsi3)
 subs r2, r1, #1 @ compare divisor with 1
 bcc Ldiv0
+ ite ne
 cmpne r0, r1 @ compare dividend with divisor
 moveq r0, #0
+ it hi
 tsthi r1, r2 @ see if divisor is power of 2
+ it eq
 andeq r0, r0, r2
+ it ls
 movls pc, lr
 ARM_MOD_BODY r0, r1, r2, r3
@@ -246,16 +281,20 @@ ENTRY(__umodsi3)
 mov pc, lr
+ .type __divsi3, %function
 ENTRY(__divsi3)
+ .type __aeabi_idiv, %function
 ENTRY(__aeabi_idiv)
 cmp r1, #0
 eor ip, r0, r1 @ save the sign of the result.
 beq Ldiv0
+ it mi
 rsbmi r1, r1, #0 @ loops below use unsigned.
 subs r2, r1, #1 @ division by 1 or -1 ?
 beq 10f
 movs r3, r0
+ it mi
 rsbmi r3, r0, #0 @ positive dividend value
 cmp r3, r1
 bls 11f
@@ -265,14 +304,18 @@ ENTRY(__aeabi_idiv)
 ARM_DIV_BODY r3, r1, r0, r2
 cmp ip, #0
+ it mi
 rsbmi r0, r0, #0
 mov pc, lr
 10: teq ip, r0 @ same sign ?
+ it mi
 rsbmi r0, r0, #0
 mov pc, lr
-11: movlo r0, #0
+11: it lo
+ movlo r0, #0
+ itt eq
 moveq r0, ip, asr #31
 orreq r0, r0, #1
 mov pc, lr
@@ -281,32 +324,41 @@ ENTRY(__aeabi_idiv)
 cmp ip, #0
 mov r0, r3, lsr r2
+ it mi
 rsbmi r0, r0, #0
 mov pc, lr
+ .type __modsi3, %function
 ENTRY(__modsi3)
 cmp r1, #0
 beq Ldiv0
+ it mi
 rsbmi r1, r1, #0 @ loops below use unsigned.
 movs ip, r0 @ preserve sign of dividend
+ it mi
 rsbmi r0, r0, #0 @ if negative make positive
 subs r2, r1, #1 @ compare divisor with 1
+ ite ne
 cmpne r0, r1 @ compare dividend with divisor
 moveq r0, #0
+ it hi
 tsthi r1, r2 @ see if divisor is power of 2
+ it eq
 andeq r0, r0, r2
 bls 10f
 ARM_MOD_BODY r0, r1, r2, r3
 10: cmp ip, #0
+ it mi
 rsbmi r0, r0, #0
 mov pc, lr
 #ifdef CONFIG_AEABI
+ .type __aeabi_uidivmod, %function
 ENTRY(__aeabi_uidivmod)
 stmfd sp!, {r0, r1, ip, lr}
@@ -316,6 +368,7 @@ ENTRY(__aeabi_uidivmod)
 sub r1, r1, r3
 mov pc, lr
+ .type __aeabi_idivmod, %function
 ENTRY(__aeabi_idivmod)
 stmfd sp!, {r0, r1, ip, lr}
......
...@@ -24,6 +24,7 @@ You should have received a copy of the GNU General Public License ...@@ -24,6 +24,7 @@ You should have received a copy of the GNU General Public License
along with this program; see the file COPYING. If not, write to along with this program; see the file COPYING. If not, write to
the Free Software Foundation, 51 Franklin Street, Fifth Floor, the Free Software Foundation, 51 Franklin Street, Fifth Floor,
Boston, MA 02110-1301, USA. */ Boston, MA 02110-1301, USA. */
#include <asm/unified.h>
#include <linux/linkage.h> #include <linux/linkage.h>
...@@ -36,14 +37,19 @@ Boston, MA 02110-1301, USA. */ ...@@ -36,14 +37,19 @@ Boston, MA 02110-1301, USA. */
#define ah r1 #define ah r1
#endif #endif
.type __lshrdi3, %function
ENTRY(__lshrdi3) ENTRY(__lshrdi3)
.type __aeabi_llsr, %function
ENTRY(__aeabi_llsr) ENTRY(__aeabi_llsr)
subs r3, r2, #32 subs r3, r2, #32
rsb ip, r2, #32 rsb ip, r2, #32
itett mi
movmi al, al, lsr r2 movmi al, al, lsr r2
movpl al, ah, lsr r3 movpl al, ah, lsr r3
orrmi al, al, ah, lsl ip ARM( orrmi al, al, ah, lsl ip )
THUMB( lslmi r3, ah, ip )
THUMB( orrmi al, al, r3 )
mov ah, ah, lsr r2 mov ah, ah, lsr r2
mov pc, lr mov pc, lr
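The ARM()/THUMB() wrappers come from asm/unified.h and keep a line only in the build for that instruction set. They are needed in these shift helpers because a Thumb-2 data-processing operand may only be shifted by an immediate, never by a register: the single ARM instruction "orrmi al, al, ah, lsl ip" has no Thumb-2 encoding, so the Thumb path computes the shift into a scratch register first. The split in isolation (unconditional, label illustrative):

	.syntax	unified
	.thumb
	@ ARM does this in one instruction: orr r0, r0, r1, lsl r2
orr_shifted:
	lsl	r3, r1, r2	@ scratch = r1 << r2
	orr	r0, r0, r3	@ then merge it into the result
	bx	lr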
...@@ -9,11 +9,14 @@ ...@@ -9,11 +9,14 @@
* *
* ASM optimised string functions * ASM optimised string functions
*/ */
#include <asm/unified.h>
#include <linux/linkage.h> #include <linux/linkage.h>
#include <asm/assembler.h> #include <asm/assembler.h>
.text .text
.align 5 .align 5
.type memchr, %function
ENTRY(memchr) ENTRY(memchr)
1: subs r2, r2, #1 1: subs r2, r2, #1
bmi 2f bmi 2f
...@@ -21,5 +24,6 @@ ENTRY(memchr) ...@@ -21,5 +24,6 @@ ENTRY(memchr)
teq r3, r1 teq r3, r1
bne 1b bne 1b
sub r0, r0, #1 sub r0, r0, #1
2: movne r0, #0 2: it ne
movne r0, #0
mov pc, lr mov pc, lr
...@@ -9,12 +9,16 @@ ...@@ -9,12 +9,16 @@
* it under the terms of the GNU General Public License version 2 as * it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation. * published by the Free Software Foundation.
*/ */
#include <asm/unified.h>
#include <linux/linkage.h> #include <linux/linkage.h>
#include <asm/assembler.h> #include <asm/assembler.h>
#define LDR1W_SHIFT 0
#define STR1W_SHIFT 0
.macro ldr1w ptr reg abort .macro ldr1w ptr reg abort
ldr \reg, [\ptr], #4 W(ldr) \reg, [\ptr], #4
.endm .endm
.macro ldr4w ptr reg1 reg2 reg3 reg4 abort .macro ldr4w ptr reg1 reg2 reg3 reg4 abort
...@@ -26,11 +30,16 @@ ...@@ -26,11 +30,16 @@
.endm .endm
.macro ldr1b ptr reg cond=al abort .macro ldr1b ptr reg cond=al abort
.ifnc \cond,al
it \cond
ldr\cond\()b \reg, [\ptr], #1 ldr\cond\()b \reg, [\ptr], #1
.else
ldrb \reg, [\ptr], #1
.endif
.endm .endm
.macro str1w ptr reg abort .macro str1w ptr reg abort
str \reg, [\ptr], #4 W(str) \reg, [\ptr], #4
.endm .endm
.macro str8w ptr reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort .macro str8w ptr reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort
...@@ -38,7 +47,12 @@ ...@@ -38,7 +47,12 @@
.endm .endm
.macro str1b ptr reg cond=al abort .macro str1b ptr reg cond=al abort
.ifnc \cond,al
it \cond
str\cond\()b \reg, [\ptr], #1 str\cond\()b \reg, [\ptr], #1
.else
strb \reg, [\ptr], #1
.endif
.endm .endm
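ldr1b and str1b take an optional condition, so the macros cannot hard-code an IT instruction. The .ifnc directive (assemble the block if the two strings are not the same) emits "it \cond" only when a caller passes a real condition; instantiations left at the default cond=al keep the bare byte access. Roughly, for the Thumb-2 build:

	@ str1b r0, r3, ne, abort=...   expands to:
	@	it	ne
	@	strneb	r3, [r0], #1
	@ str1b r0, r3, abort=...       (cond defaults to al) expands to:
	@	strb	r3, [r0], #1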
.macro enter reg1 reg2 .macro enter reg1 reg2
...@@ -53,6 +67,7 @@ ...@@ -53,6 +67,7 @@
/* Prototype: void *memcpy(void *dest, const void *src, size_t n); */ /* Prototype: void *memcpy(void *dest, const void *src, size_t n); */
.type memcpy, %function
ENTRY(memcpy) ENTRY(memcpy)
#include "copy_template.S" #include "copy_template.S"
......
...@@ -9,6 +9,7 @@ ...@@ -9,6 +9,7 @@
* it under the terms of the GNU General Public License version 2 as * it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation. * published by the Free Software Foundation.
*/ */
#include <asm/unified.h>
#include <linux/linkage.h> #include <linux/linkage.h>
#include <asm/assembler.h> #include <asm/assembler.h>
...@@ -34,10 +35,13 @@ ...@@ -34,10 +35,13 @@
* occurring in the opposite direction. * occurring in the opposite direction.
*/ */
.type memmove, %function
ENTRY(memmove) ENTRY(memmove)
subs ip, r0, r1 subs ip, r0, r1
it hi
cmphi r2, ip cmphi r2, ip
it ls
bls memcpy bls memcpy
stmfd sp!, {r0, r4, lr} stmfd sp!, {r0, r4, lr}
...@@ -79,46 +83,55 @@ ENTRY(memmove) ...@@ -79,46 +83,55 @@ ENTRY(memmove)
5: ands ip, r2, #28 5: ands ip, r2, #28
rsb ip, ip, #32 rsb ip, ip, #32
it ne
addne pc, pc, ip @ C is always clear here addne pc, pc, ip @ C is always clear here
b 7f b 7f
6: nop 6: nop
ldr r3, [r1, #-4]! W(ldr) r3, [r1, #-4]!
ldr r4, [r1, #-4]! W(ldr) r4, [r1, #-4]!
ldr r5, [r1, #-4]! W(ldr) r5, [r1, #-4]!
ldr r6, [r1, #-4]! W(ldr) r6, [r1, #-4]!
ldr r7, [r1, #-4]! W(ldr) r7, [r1, #-4]!
ldr r8, [r1, #-4]! W(ldr) r8, [r1, #-4]!
ldr lr, [r1, #-4]! W(ldr) lr, [r1, #-4]!
add pc, pc, ip add pc, pc, ip
nop nop
nop nop
str r3, [r0, #-4]! W(str) r3, [r0, #-4]!
str r4, [r0, #-4]! W(str) r4, [r0, #-4]!
str r5, [r0, #-4]! W(str) r5, [r0, #-4]!
str r6, [r0, #-4]! W(str) r6, [r0, #-4]!
str r7, [r0, #-4]! W(str) r7, [r0, #-4]!
str r8, [r0, #-4]! W(str) r8, [r0, #-4]!
str lr, [r0, #-4]! W(str) lr, [r0, #-4]!
CALGN( bcs 2b ) CALGN( bcs 2b )
7: ldmfd sp!, {r5 - r8} 7: ldmfd sp!, {r5 - r8}
8: movs r2, r2, lsl #31 8: movs r2, r2, lsl #31
it ne
ldrneb r3, [r1, #-1]! ldrneb r3, [r1, #-1]!
itt cs
ldrcsb r4, [r1, #-1]! ldrcsb r4, [r1, #-1]!
ldrcsb ip, [r1, #-1] ldrcsb ip, [r1, #-1]
it ne
strneb r3, [r0, #-1]! strneb r3, [r0, #-1]!
itt cs
strcsb r4, [r0, #-1]! strcsb r4, [r0, #-1]!
strcsb ip, [r0, #-1] strcsb ip, [r0, #-1]
ldmfd sp!, {r0, r4, pc} ldmfd sp!, {r0, r4, pc}
9: cmp ip, #2 9: cmp ip, #2
it gt
ldrgtb r3, [r1, #-1]! ldrgtb r3, [r1, #-1]!
it ge
ldrgeb r4, [r1, #-1]! ldrgeb r4, [r1, #-1]!
ldrb lr, [r1, #-1]! ldrb lr, [r1, #-1]!
it gt
strgtb r3, [r0, #-1]! strgtb r3, [r0, #-1]!
it ge
strgeb r4, [r0, #-1]! strgeb r4, [r0, #-1]!
subs r2, r2, ip subs r2, r2, ip
strb lr, [r0, #-1]! strb lr, [r0, #-1]!
......
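memmove dispatches into those ldr/str ladders with "add pc, pc, ip", which is only safe if every slot in the ladder has the same width. ARM instructions are always 4 bytes, but the Thumb-2 assembler may choose 2-byte encodings, so each slot is wrapped in W(), which under Thumb-2 appends the .w width specifier and under ARM expands to the bare mnemonic (per asm/unified.h). The effect in isolation:

	.syntax	unified
	.thumb
	ldr	r0, [r1]	@ assembler is free to pick the 16-bit encoding
	ldr.w	r0, [r1]	@ what W(ldr) yields: the 32-bit encoding is
				@ forced, keeping computed-branch offsets valid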
...@@ -9,6 +9,8 @@ ...@@ -9,6 +9,8 @@
* *
* ASM optimised string functions * ASM optimised string functions
*/ */
#include <asm/unified.h>
#include <linux/linkage.h> #include <linux/linkage.h>
#include <asm/assembler.h> #include <asm/assembler.h>
...@@ -19,7 +21,9 @@ ...@@ -19,7 +21,9 @@
1: subs r2, r2, #4 @ 1 do we have enough 1: subs r2, r2, #4 @ 1 do we have enough
blt 5f @ 1 bytes to align with? blt 5f @ 1 bytes to align with?
cmp r3, #2 @ 1 cmp r3, #2 @ 1
it lt
strltb r1, [r0], #1 @ 1 strltb r1, [r0], #1 @ 1
it le
strleb r1, [r0], #1 @ 1 strleb r1, [r0], #1 @ 1
strb r1, [r0], #1 @ 1 strb r1, [r0], #1 @ 1
add r2, r2, r3 @ 1 (r2 = r2 - (4 - r3)) add r2, r2, r3 @ 1 (r2 = r2 - (4 - r3))
...@@ -28,6 +32,7 @@ ...@@ -28,6 +32,7 @@
* memzero again. * memzero again.
*/ */
.type memset, %function
ENTRY(memset) ENTRY(memset)
ands r3, r0, #3 @ 1 unaligned? ands r3, r0, #3 @ 1 unaligned?
bne 1b @ 1 bne 1b @ 1
...@@ -48,33 +53,41 @@ ENTRY(memset) ...@@ -48,33 +53,41 @@ ENTRY(memset)
mov lr, r1 mov lr, r1
2: subs r2, r2, #64 2: subs r2, r2, #64
itttt ge
stmgeia r0!, {r1, r3, ip, lr} @ 64 bytes at a time. stmgeia r0!, {r1, r3, ip, lr} @ 64 bytes at a time.
stmgeia r0!, {r1, r3, ip, lr} stmgeia r0!, {r1, r3, ip, lr}
stmgeia r0!, {r1, r3, ip, lr} stmgeia r0!, {r1, r3, ip, lr}
stmgeia r0!, {r1, r3, ip, lr} stmgeia r0!, {r1, r3, ip, lr}
bgt 2b bgt 2b
it eq
ldmeqfd sp!, {pc} @ Now <64 bytes to go. ldmeqfd sp!, {pc} @ Now <64 bytes to go.
/* /*
* No need to correct the count; we're only testing bits from now on * No need to correct the count; we're only testing bits from now on
*/ */
tst r2, #32 tst r2, #32
itt ne
stmneia r0!, {r1, r3, ip, lr} stmneia r0!, {r1, r3, ip, lr}
stmneia r0!, {r1, r3, ip, lr} stmneia r0!, {r1, r3, ip, lr}
tst r2, #16 tst r2, #16
it ne
stmneia r0!, {r1, r3, ip, lr} stmneia r0!, {r1, r3, ip, lr}
ldr lr, [sp], #4 ldr lr, [sp], #4
4: tst r2, #8 4: tst r2, #8
it ne
stmneia r0!, {r1, r3} stmneia r0!, {r1, r3}
tst r2, #4 tst r2, #4
it ne
strne r1, [r0], #4 strne r1, [r0], #4
/* /*
* When we get here, we've got less than 4 bytes to zero. We * When we get here, we've got less than 4 bytes to zero. We
* may have an unaligned pointer as well. * may have an unaligned pointer as well.
*/ */
5: tst r2, #2 5: tst r2, #2
itt ne
strneb r1, [r0], #1 strneb r1, [r0], #1
strneb r1, [r0], #1 strneb r1, [r0], #1
tst r2, #1 tst r2, #1
it ne
strneb r1, [r0], #1 strneb r1, [r0], #1
mov pc, lr mov pc, lr
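An IT instruction encodes at most four condition slots, which is why the 64-byte loop above gets exactly "itttt ge" for its four stmgeia stores while the shorter conditional runs later in the function use itt/it. A run of five or more conditional instructions has to be split across IT blocks:

	.syntax	unified
	.thumb
	cmp	r2, #0
	itttt	ge		@ four "then" slots: the maximum
	addge	r0, r0, #1
	addge	r0, r0, #1
	addge	r0, r0, #1
	addge	r0, r0, #1
	it	ge		@ a fifth conditional instruction needs
	addge	r0, r0, #1	@ a fresh IT block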
...@@ -7,6 +7,8 @@ ...@@ -7,6 +7,8 @@
* it under the terms of the GNU General Public License version 2 as * it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation. * published by the Free Software Foundation.
*/ */
#include <asm/unified.h>
#include <linux/linkage.h> #include <linux/linkage.h>
#include <asm/assembler.h> #include <asm/assembler.h>
...@@ -21,7 +23,9 @@ ...@@ -21,7 +23,9 @@
1: subs r1, r1, #4 @ 1 do we have enough 1: subs r1, r1, #4 @ 1 do we have enough
blt 5f @ 1 bytes to align with? blt 5f @ 1 bytes to align with?
cmp r3, #2 @ 1 cmp r3, #2 @ 1
it lt
strltb r2, [r0], #1 @ 1 strltb r2, [r0], #1 @ 1
it le
strleb r2, [r0], #1 @ 1 strleb r2, [r0], #1 @ 1
strb r2, [r0], #1 @ 1 strb r2, [r0], #1 @ 1
add r1, r1, r3 @ 1 (r1 = r1 - (4 - r3)) add r1, r1, r3 @ 1 (r1 = r1 - (4 - r3))
...@@ -30,6 +34,7 @@ ...@@ -30,6 +34,7 @@
* memzero again. * memzero again.
*/ */
.type __memzero, %function
ENTRY(__memzero) ENTRY(__memzero)
mov r2, #0 @ 1 mov r2, #0 @ 1
ands r3, r0, #3 @ 1 unaligned? ands r3, r0, #3 @ 1 unaligned?
...@@ -48,33 +53,41 @@ ENTRY(__memzero) ...@@ -48,33 +53,41 @@ ENTRY(__memzero)
mov lr, r2 @ 1 mov lr, r2 @ 1
3: subs r1, r1, #64 @ 1 write 32 bytes out per loop 3: subs r1, r1, #64 @ 1 write 32 bytes out per loop
itttt ge
stmgeia r0!, {r2, r3, ip, lr} @ 4 stmgeia r0!, {r2, r3, ip, lr} @ 4
stmgeia r0!, {r2, r3, ip, lr} @ 4 stmgeia r0!, {r2, r3, ip, lr} @ 4
stmgeia r0!, {r2, r3, ip, lr} @ 4 stmgeia r0!, {r2, r3, ip, lr} @ 4
stmgeia r0!, {r2, r3, ip, lr} @ 4 stmgeia r0!, {r2, r3, ip, lr} @ 4
bgt 3b @ 1 bgt 3b @ 1
it eq
ldmeqfd sp!, {pc} @ 1/2 quick exit ldmeqfd sp!, {pc} @ 1/2 quick exit
/* /*
* No need to correct the count; we're only testing bits from now on * No need to correct the count; we're only testing bits from now on
*/ */
tst r1, #32 @ 1 tst r1, #32 @ 1
itt ne
stmneia r0!, {r2, r3, ip, lr} @ 4 stmneia r0!, {r2, r3, ip, lr} @ 4
stmneia r0!, {r2, r3, ip, lr} @ 4 stmneia r0!, {r2, r3, ip, lr} @ 4
tst r1, #16 @ 1 16 bytes or more? tst r1, #16 @ 1 16 bytes or more?
it ne
stmneia r0!, {r2, r3, ip, lr} @ 4 stmneia r0!, {r2, r3, ip, lr} @ 4
ldr lr, [sp], #4 @ 1 ldr lr, [sp], #4 @ 1
4: tst r1, #8 @ 1 8 bytes or more? 4: tst r1, #8 @ 1 8 bytes or more?
it ne
stmneia r0!, {r2, r3} @ 2 stmneia r0!, {r2, r3} @ 2
tst r1, #4 @ 1 4 bytes or more? tst r1, #4 @ 1 4 bytes or more?
it ne
strne r2, [r0], #4 @ 1 strne r2, [r0], #4 @ 1
/* /*
* When we get here, we've got less than 4 bytes to zero. We * When we get here, we've got less than 4 bytes to zero. We
* may have an unaligned pointer as well. * may have an unaligned pointer as well.
*/ */
5: tst r1, #2 @ 1 2 bytes or more? 5: tst r1, #2 @ 1 2 bytes or more?
itt ne
strneb r2, [r0], #1 @ 1 strneb r2, [r0], #1 @ 1
strneb r2, [r0], #1 @ 1 strneb r2, [r0], #1 @ 1
tst r1, #1 @ 1 a byte left over tst r1, #1 @ 1 a byte left over
it ne
strneb r2, [r0], #1 @ 1 strneb r2, [r0], #1 @ 1
mov pc, lr @ 1 mov pc, lr @ 1
...@@ -9,6 +9,7 @@ ...@@ -9,6 +9,7 @@
* it under the terms of the GNU General Public License version 2 as * it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation. * published by the Free Software Foundation.
*/ */
#include <asm/unified.h>
#include <linux/linkage.h> #include <linux/linkage.h>
...@@ -24,7 +25,9 @@ ...@@ -24,7 +25,9 @@
#define yh r3 #define yh r3
#endif #endif
.type __muldi3, %function
ENTRY(__muldi3) ENTRY(__muldi3)
.type __aeabi_lmul, %function
ENTRY(__aeabi_lmul) ENTRY(__aeabi_lmul)
mul xh, yl, xh mul xh, yl, xh
......
...@@ -26,42 +26,58 @@ ...@@ -26,42 +26,58 @@
* Note that ADDR_LIMIT is either 0 or 0xc0000000 * Note that ADDR_LIMIT is either 0 or 0xc0000000
* Note also that it is intended that __put_user_bad is not global. * Note also that it is intended that __put_user_bad is not global.
*/ */
#include <asm/unified.h>
#include <asm/asm-offsets.h> #include <asm/asm-offsets.h>
#include <asm/thread_info.h> #include <asm/thread_info.h>
#include <asm/errno.h> #include <asm/errno.h>
.global __put_user_1 .global __put_user_1
.type __put_user_1, %function
__put_user_1: __put_user_1:
1: strbt r2, [r0] 1: strbt r2, [r0]
mov r0, #0 mov r0, #0
mov pc, lr mov pc, lr
.global __put_user_2 .global __put_user_2
.type __put_user_2, %function
__put_user_2: __put_user_2:
mov ip, r2, lsr #8 mov ip, r2, lsr #8
#ifndef __ARMEB__ #ifndef __ARMEB__
2: strbt r2, [r0], #1 2:
ARM( strbt r2, [r0], #1 )
THUMB( strbt r2, [r0] )
THUMB( add r0, #1 )
3: strbt ip, [r0] 3: strbt ip, [r0]
#else #else
2: strbt ip, [r0], #1 2:
ARM( strbt ip, [r0], #1 )
THUMB( strbt ip, [r0] )
THUMB( add r0, #1 )
3: strbt r2, [r0] 3: strbt r2, [r0]
#endif #endif
mov r0, #0 mov r0, #0
mov pc, lr mov pc, lr
.global __put_user_4 .global __put_user_4
.type __put_user_4, %function
__put_user_4: __put_user_4:
4: strt r2, [r0] 4: strt r2, [r0]
mov r0, #0 mov r0, #0
mov pc, lr mov pc, lr
.global __put_user_8 .global __put_user_8
.type __put_user_8, %function
__put_user_8: __put_user_8:
5: strt r2, [r0], #4 5:
ARM( strt r2, [r0], #4 )
THUMB( strt r2, [r0] )
THUMB( add r0, #4 )
6: strt r3, [r0] 6: strt r3, [r0]
mov r0, #0 mov r0, #0
mov pc, lr mov pc, lr
.type __put_user_bad, %function
__put_user_bad: __put_user_bad:
mov r0, #-EFAULT mov r0, #-EFAULT
mov pc, lr mov pc, lr
......
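The unprivileged ("t"-suffixed) loads and stores only support post-indexed addressing in their ARM encodings; the Thumb-2 strbt/strt/ldrbt/ldrt take a plain immediate offset with no writeback. That is why each post-indexed ARM access in __put_user_2 and __put_user_8 is split into a store plus an explicit pointer increment on the THUMB() path:

	.syntax	unified
	.thumb
	@ ARM form: strbt r2, [r0], #1   (store, then post-increment)
	@ Thumb-2 equivalent, as used above:
	strbt	r2, [r0]	@ unprivileged store; no writeback exists
	add	r0, #1		@ so advance the pointer by hand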
...@@ -7,6 +7,8 @@ ...@@ -7,6 +7,8 @@
* it under the terms of the GNU General Public License version 2 as * it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation. * published by the Free Software Foundation.
*/ */
#include <asm/unified.h>
#include <linux/linkage.h> #include <linux/linkage.h>
#include <asm/assembler.h> #include <asm/assembler.h>
#include "bitops.h" #include "bitops.h"
...@@ -16,7 +18,9 @@ ...@@ -16,7 +18,9 @@
* Purpose : Function to set a bit * Purpose : Function to set a bit
* Prototype: int set_bit(int bit, void *addr) * Prototype: int set_bit(int bit, void *addr)
*/ */
.type _set_bit_be, %function
ENTRY(_set_bit_be) ENTRY(_set_bit_be)
eor r0, r0, #0x18 @ big endian byte ordering eor r0, r0, #0x18 @ big endian byte ordering
.type _set_bit_le, %function
ENTRY(_set_bit_le) ENTRY(_set_bit_le)
bitop orr bitop orr
...@@ -12,6 +12,7 @@ ...@@ -12,6 +12,7 @@
* *
* The reference implementation for this code is linux/lib/sha1.c * The reference implementation for this code is linux/lib/sha1.c
*/ */
#include <asm/unified.h>
#include <linux/linkage.h> #include <linux/linkage.h>
...@@ -24,6 +25,7 @@ ...@@ -24,6 +25,7 @@
* Note: the "in" ptr may be unaligned. * Note: the "in" ptr may be unaligned.
*/ */
.type sha_transform, %function
ENTRY(sha_transform) ENTRY(sha_transform)
stmfd sp!, {r4 - r8, lr} stmfd sp!, {r4 - r8, lr}
...@@ -185,6 +187,7 @@ ENTRY(sha_transform) ...@@ -185,6 +187,7 @@ ENTRY(sha_transform)
ldmfd sp!, {r4 - r8, pc} ldmfd sp!, {r4 - r8, pc}
.align 2
.L_sha_K: .L_sha_K:
.word 0x5a827999, 0x6ed9eba1, 0x8f1bbcdc, 0xca62c1d6 .word 0x5a827999, 0x6ed9eba1, 0x8f1bbcdc, 0xca62c1d6
...@@ -193,9 +196,11 @@ ENTRY(sha_transform) ...@@ -193,9 +196,11 @@ ENTRY(sha_transform)
* void sha_init(__u32 *buf) * void sha_init(__u32 *buf)
*/ */
.align 2
.L_sha_initial_digest: .L_sha_initial_digest:
.word 0x67452301, 0xefcdab89, 0x98badcfe, 0x10325476, 0xc3d2e1f0 .word 0x67452301, 0xefcdab89, 0x98badcfe, 0x10325476, 0xc3d2e1f0
.type sha_init, %function
ENTRY(sha_init) ENTRY(sha_init)
str lr, [sp, #-4]! str lr, [sp, #-4]!
......
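The new .align 2 directives in sha1.S matter only for Thumb-2. Thumb code is 2-byte aligned, so a .word table placed straight after a function can land on a half-word boundary, and the pc-relative ldr used to fetch such constants needs a 32-bit-aligned literal. A minimal sketch (names illustrative):

	.syntax	unified
	.thumb
get_k:
	ldr	r0, .L_k	@ pc-relative literal load: target must be
	bx	lr		@ 4-byte aligned
	.align	2		@ re-align after 2-byte-aligned Thumb code
.L_k:
	.word	0x5a827999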
...@@ -9,18 +9,23 @@ ...@@ -9,18 +9,23 @@
* *
* ASM optimised string functions * ASM optimised string functions
*/ */
#include <asm/unified.h>
#include <linux/linkage.h> #include <linux/linkage.h>
#include <asm/assembler.h> #include <asm/assembler.h>
.text .text
.align 5 .align 5
.type strchr, %function
ENTRY(strchr) ENTRY(strchr)
and r1, r1, #0xff and r1, r1, #0xff
1: ldrb r2, [r0], #1 1: ldrb r2, [r0], #1
teq r2, r1 teq r2, r1
it ne
teqne r2, #0 teqne r2, #0
bne 1b bne 1b
teq r2, r1 teq r2, r1
ite ne
movne r0, #0 movne r0, #0
subeq r0, r0, #1 subeq r0, r0, #1
mov pc, lr mov pc, lr
...@@ -7,6 +7,8 @@ ...@@ -7,6 +7,8 @@
* it under the terms of the GNU General Public License version 2 as * it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation. * published by the Free Software Foundation.
*/ */
#include <asm/unified.h>
#include <linux/linkage.h> #include <linux/linkage.h>
#include <asm/assembler.h> #include <asm/assembler.h>
#include <asm/errno.h> #include <asm/errno.h>
...@@ -20,10 +22,14 @@ ...@@ -20,10 +22,14 @@
* returns the number of characters copied (strlen of copied string), * returns the number of characters copied (strlen of copied string),
* -EFAULT on exception, or "len" if we fill the whole buffer * -EFAULT on exception, or "len" if we fill the whole buffer
*/ */
.type __strncpy_from_user, %function
ENTRY(__strncpy_from_user) ENTRY(__strncpy_from_user)
mov ip, r1 mov ip, r1
1: subs r2, r2, #1 1: subs r2, r2, #1
USER( ldrplbt r3, [r1], #1) ARM(USER( ldrplbt r3, [r1], #1 ))
itt pl
THUMB(USER( ldrplbt r3, [r1] ))
THUMB( addpl r1, #1 )
bmi 2f bmi 2f
strb r3, [r0], #1 strb r3, [r0], #1
teq r3, #0 teq r3, #0
......
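Only the access itself stays inside USER() here. That macro (defined in asm/assembler.h, not shown in this diff) records the instruction's address in the exception table so a fault on the user pointer can be fixed up; the pointer bump that replaces the post-index on the Thumb-2 path cannot fault, so it sits outside the annotation. As a sketch of the pattern used above and in __strnlen_user below:

	THUMB(USER(	ldrplbt	r3, [r1]	))	@ may fault: gets a fixup entry
	THUMB(		addpl	r1, #1		)	@ cannot fault: needs none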
...@@ -7,6 +7,8 @@ ...@@ -7,6 +7,8 @@
* it under the terms of the GNU General Public License version 2 as * it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation. * published by the Free Software Foundation.
*/ */
#include <asm/unified.h>
#include <linux/linkage.h> #include <linux/linkage.h>
#include <asm/assembler.h> #include <asm/assembler.h>
#include <asm/errno.h> #include <asm/errno.h>
...@@ -20,10 +22,13 @@ ...@@ -20,10 +22,13 @@
* Returns : length of string *including terminator* * Returns : length of string *including terminator*
* or zero on exception, or n + 1 if too long * or zero on exception, or n + 1 if too long
*/ */
.type __strnlen_user, %function
ENTRY(__strnlen_user) ENTRY(__strnlen_user)
mov r2, r0 mov r2, r0
1: 1:
USER( ldrbt r3, [r0], #1) ARM(USER( ldrbt r3, [r0], #1 ))
THUMB(USER( ldrbt r3, [r0] ))
THUMB( add r0, #1 )
teq r3, #0 teq r3, #0
beq 2f beq 2f
subs r1, r1, #1 subs r1, r1, #1
......
...@@ -9,15 +9,19 @@ ...@@ -9,15 +9,19 @@
* *
* ASM optimised string functions * ASM optimised string functions
*/ */
#include <asm/unified.h>
#include <linux/linkage.h> #include <linux/linkage.h>
#include <asm/assembler.h> #include <asm/assembler.h>
.text .text
.align 5 .align 5
.type strrchr, %function
ENTRY(strrchr) ENTRY(strrchr)
mov r3, #0 mov r3, #0
1: ldrb r2, [r0], #1 1: ldrb r2, [r0], #1
teq r2, r1 teq r2, r1
it eq
subeq r3, r0, #1 subeq r3, r0, #1
teq r2, #0 teq r2, #0
bne 1b bne 1b
......
...@@ -7,12 +7,16 @@ ...@@ -7,12 +7,16 @@
* it under the terms of the GNU General Public License version 2 as * it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation. * published by the Free Software Foundation.
*/ */
#include <asm/unified.h>
#include <linux/linkage.h> #include <linux/linkage.h>
#include <asm/assembler.h> #include <asm/assembler.h>
#include "bitops.h" #include "bitops.h"
.text .text
.type _test_and_change_bit_be, %function
ENTRY(_test_and_change_bit_be) ENTRY(_test_and_change_bit_be)
eor r0, r0, #0x18 @ big endian byte ordering eor r0, r0, #0x18 @ big endian byte ordering
.type _test_and_change_bit_le, %function
ENTRY(_test_and_change_bit_le) ENTRY(_test_and_change_bit_le)
testop eor, strb testop eor, strb
...@@ -7,12 +7,16 @@ ...@@ -7,12 +7,16 @@
* it under the terms of the GNU General Public License version 2 as * it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation. * published by the Free Software Foundation.
*/ */
#include <asm/unified.h>
#include <linux/linkage.h> #include <linux/linkage.h>
#include <asm/assembler.h> #include <asm/assembler.h>
#include "bitops.h" #include "bitops.h"
.text .text
.type _test_and_clear_bit_be, %function
ENTRY(_test_and_clear_bit_be) ENTRY(_test_and_clear_bit_be)
eor r0, r0, #0x18 @ big endian byte ordering eor r0, r0, #0x18 @ big endian byte ordering
.type _test_and_clear_bit_le, %function
ENTRY(_test_and_clear_bit_le) ENTRY(_test_and_clear_bit_le)
testop bicne, strneb testop bicne, strneb, ne
...@@ -7,12 +7,16 @@ ...@@ -7,12 +7,16 @@
* it under the terms of the GNU General Public License version 2 as * it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation. * published by the Free Software Foundation.
*/ */
#include <asm/unified.h>
#include <linux/linkage.h> #include <linux/linkage.h>
#include <asm/assembler.h> #include <asm/assembler.h>
#include "bitops.h" #include "bitops.h"
.text .text
.type _test_and_set_bit_be, %function
ENTRY(_test_and_set_bit_be) ENTRY(_test_and_set_bit_be)
eor r0, r0, #0x18 @ big endian byte ordering eor r0, r0, #0x18 @ big endian byte ordering
.type _test_and_set_bit_le, %function
ENTRY(_test_and_set_bit_le) ENTRY(_test_and_set_bit_le)
testop orreq, streqb testop orreq, streqb, eq
...@@ -39,6 +39,7 @@ USER( strgtbt r3, [r0], #1) @ May fault ...@@ -39,6 +39,7 @@ USER( strgtbt r3, [r0], #1) @ May fault
sub r2, r2, ip sub r2, r2, ip
b .Lc2u_dest_aligned b .Lc2u_dest_aligned
.type __copy_to_user, %function
ENTRY(__copy_to_user) ENTRY(__copy_to_user)
stmfd sp!, {r2, r4 - r7, lr} stmfd sp!, {r2, r4 - r7, lr}
cmp r2, #4 cmp r2, #4
...@@ -302,6 +303,7 @@ USER( ldrgtbt r3, [r1], #1) @ May fault ...@@ -302,6 +303,7 @@ USER( ldrgtbt r3, [r1], #1) @ May fault
sub r2, r2, ip sub r2, r2, ip
b .Lcfu_dest_aligned b .Lcfu_dest_aligned
.type __copy_from_user, %function
ENTRY(__copy_from_user) ENTRY(__copy_from_user)
stmfd sp!, {r0, r2, r4 - r7, lr} stmfd sp!, {r0, r2, r4 - r7, lr}
cmp r2, #4 cmp r2, #4
......
...@@ -9,6 +9,7 @@ ...@@ -9,6 +9,7 @@
* it under the terms of the GNU General Public License version 2 as * it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation. * published by the Free Software Foundation.
*/ */
#include <asm/unified.h>
#include <linux/linkage.h> #include <linux/linkage.h>
...@@ -24,23 +25,33 @@ ...@@ -24,23 +25,33 @@
#define yh r3 #define yh r3
#endif #endif
.type __ucmpdi2, %function
ENTRY(__ucmpdi2) ENTRY(__ucmpdi2)
cmp xh, yh cmp xh, yh
it eq
cmpeq xl, yl cmpeq xl, yl
it lo
movlo r0, #0 movlo r0, #0
it eq
moveq r0, #1 moveq r0, #1
it hi
movhi r0, #2 movhi r0, #2
mov pc, lr mov pc, lr
#ifdef CONFIG_AEABI #ifdef CONFIG_AEABI
.type __aeabi_ulcmp, %function
ENTRY(__aeabi_ulcmp) ENTRY(__aeabi_ulcmp)
cmp xh, yh cmp xh, yh
it eq
cmpeq xl, yl cmpeq xl, yl
it lo
movlo r0, #-1 movlo r0, #-1
it eq
moveq r0, #0 moveq r0, #0
it hi
movhi r0, #1 movhi r0, #1
mov pc, lr mov pc, lr
......
...@@ -73,6 +73,7 @@ ip_fast_csum(const void *iph, unsigned int ihl) ...@@ -73,6 +73,7 @@ ip_fast_csum(const void *iph, unsigned int ihl)
1: adcs %0, %0, %3 \n\ 1: adcs %0, %0, %3 \n\
ldr %3, [%1], #4 \n\ ldr %3, [%1], #4 \n\
tst %2, #15 @ do this carefully \n\ tst %2, #15 @ do this carefully \n\
it ne \n\
subne %2, %2, #1 @ without destroying \n\ subne %2, %2, #1 @ without destroying \n\
bne 1b @ the carry flag \n\ bne 1b @ the carry flag \n\
adcs %0, %0, %3 \n\ adcs %0, %0, %3 \n\
......
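Conditional instructions buried in C inline assembly need the same treatment, since the assembler only sees the string: the checksum loop above and the uaccess range checks below each gain an it before their subne/movlo/movcc. Under Thumb-2 the patched __addr_ok() test assembles to roughly this (registers are compiler-chosen; r1/r0 stand in for the operands):

	cmp	r1, r0		@ addr against addr_limit
	it	lo
	movlo	r0, #0		@ in range: clear the flag value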
...@@ -12,6 +12,7 @@ ...@@ -12,6 +12,7 @@
* User space memory access functions * User space memory access functions
*/ */
#include <linux/sched.h> #include <linux/sched.h>
#include <asm/unified.h>
#include <asm/errno.h> #include <asm/errno.h>
#include <asm/memory.h> #include <asm/memory.h>
#include <asm/domain.h> #include <asm/domain.h>
...@@ -68,7 +69,7 @@ static inline void set_fs(mm_segment_t fs) ...@@ -68,7 +69,7 @@ static inline void set_fs(mm_segment_t fs)
#define __addr_ok(addr) ({ \ #define __addr_ok(addr) ({ \
unsigned long flag; \ unsigned long flag; \
__asm__("cmp %2, %0; movlo %0, #0" \ __asm__("cmp %2, %0; it lo; movlo %0, #0" \
: "=&r" (flag) \ : "=&r" (flag) \
: "0" (current_thread_info()->addr_limit), "r" (addr) \ : "0" (current_thread_info()->addr_limit), "r" (addr) \
: "cc"); \ : "cc"); \
...@@ -78,7 +79,7 @@ static inline void set_fs(mm_segment_t fs) ...@@ -78,7 +79,7 @@ static inline void set_fs(mm_segment_t fs)
#define __range_ok(addr,size) ({ \ #define __range_ok(addr,size) ({ \
unsigned long flag, roksum; \ unsigned long flag, roksum; \
__chk_user_ptr(addr); \ __chk_user_ptr(addr); \
__asm__("adds %1, %2, %3; sbcccs %1, %1, %0; movcc %0, #0" \ __asm__("adds %1, %2, %3; it cc; sbcccs %1, %1, %0; it cc; movcc %0, #0" \
: "=&r" (flag), "=&r" (roksum) \ : "=&r" (flag), "=&r" (roksum) \
: "r" (addr), "Ir" (size), "0" (current_thread_info()->addr_limit) \ : "r" (addr), "Ir" (size), "0" (current_thread_info()->addr_limit) \
: "cc"); \ : "cc"); \
...@@ -225,7 +226,7 @@ do { \ ...@@ -225,7 +226,7 @@ do { \
#define __get_user_asm_byte(x,addr,err) \ #define __get_user_asm_byte(x,addr,err) \
__asm__ __volatile__( \ __asm__ __volatile__( \
"1: ldrbt %1,[%2],#0\n" \ "1: ldrbt %1,[%2]\n" \
"2:\n" \ "2:\n" \
" .section .fixup,\"ax\"\n" \ " .section .fixup,\"ax\"\n" \
" .align 2\n" \ " .align 2\n" \
...@@ -261,7 +262,7 @@ do { \ ...@@ -261,7 +262,7 @@ do { \
#define __get_user_asm_word(x,addr,err) \ #define __get_user_asm_word(x,addr,err) \
__asm__ __volatile__( \ __asm__ __volatile__( \
"1: ldrt %1,[%2],#0\n" \ "1: ldrt %1,[%2]\n" \
"2:\n" \ "2:\n" \
" .section .fixup,\"ax\"\n" \ " .section .fixup,\"ax\"\n" \
" .align 2\n" \ " .align 2\n" \
...@@ -306,7 +307,7 @@ do { \ ...@@ -306,7 +307,7 @@ do { \
#define __put_user_asm_byte(x,__pu_addr,err) \ #define __put_user_asm_byte(x,__pu_addr,err) \
__asm__ __volatile__( \ __asm__ __volatile__( \
"1: strbt %1,[%2],#0\n" \ "1: strbt %1,[%2]\n" \
"2:\n" \ "2:\n" \
" .section .fixup,\"ax\"\n" \ " .section .fixup,\"ax\"\n" \
" .align 2\n" \ " .align 2\n" \
...@@ -339,7 +340,7 @@ do { \ ...@@ -339,7 +340,7 @@ do { \
#define __put_user_asm_word(x,__pu_addr,err) \ #define __put_user_asm_word(x,__pu_addr,err) \
__asm__ __volatile__( \ __asm__ __volatile__( \
"1: strt %1,[%2],#0\n" \ "1: strt %1,[%2]\n" \
"2:\n" \ "2:\n" \
" .section .fixup,\"ax\"\n" \ " .section .fixup,\"ax\"\n" \
" .align 2\n" \ " .align 2\n" \
...@@ -364,8 +365,10 @@ do { \ ...@@ -364,8 +365,10 @@ do { \
#define __put_user_asm_dword(x,__pu_addr,err) \ #define __put_user_asm_dword(x,__pu_addr,err) \
__asm__ __volatile__( \ __asm__ __volatile__( \
"1: strt " __reg_oper1 ", [%1], #4\n" \ ARM( "1: strt " __reg_oper1 ", [%1], #4\n" ) \
"2: strt " __reg_oper0 ", [%1], #0\n" \ THUMB( "1: strt " __reg_oper1 ", [%1]\n" ) \
THUMB( " add %1, %1, #4\n" ) \
"2: strt " __reg_oper0 ", [%1]\n" \
"3:\n" \ "3:\n" \
" .section .fixup,\"ax\"\n" \ " .section .fixup,\"ax\"\n" \
" .align 2\n" \ " .align 2\n" \
......