Skip to content

Commit f6c48fb

Browse files
committed
Reimplement aarch64 vld1* instructions to not cause individual loads under certain circumstances.
1 parent 2edc74d commit f6c48fb

File tree

1 file changed

+24
-159
lines changed
  • crates/core_arch/src/aarch64/neon

1 file changed

+24
-159
lines changed

crates/core_arch/src/aarch64/neon/mod.rs

+24-159
Original file line numberDiff line numberDiff line change
@@ -464,326 +464,191 @@ pub unsafe fn vcopy_laneq_f64<const LANE1: i32, const LANE2: i32>(
464464
#[target_feature(enable = "neon")]
465465
#[cfg_attr(test, assert_instr(ldr))]
466466
pub unsafe fn vld1_s8(ptr: *const i8) -> int8x8_t {
467-
transmute(i8x8::new(
468-
*ptr,
469-
*ptr.offset(1),
470-
*ptr.offset(2),
471-
*ptr.offset(3),
472-
*ptr.offset(4),
473-
*ptr.offset(5),
474-
*ptr.offset(6),
475-
*ptr.offset(7),
476-
))
467+
core::ptr::read_unaligned(ptr as *const int8x8_t)
477468
}
478469

479470
/// Load multiple single-element structures to one, two, three, or four registers.
480471
#[inline]
481472
#[target_feature(enable = "neon")]
482473
#[cfg_attr(test, assert_instr(ldr))]
483474
pub unsafe fn vld1q_s8(ptr: *const i8) -> int8x16_t {
484-
transmute(i8x16::new(
485-
*ptr,
486-
*ptr.offset(1),
487-
*ptr.offset(2),
488-
*ptr.offset(3),
489-
*ptr.offset(4),
490-
*ptr.offset(5),
491-
*ptr.offset(6),
492-
*ptr.offset(7),
493-
*ptr.offset(8),
494-
*ptr.offset(9),
495-
*ptr.offset(10),
496-
*ptr.offset(11),
497-
*ptr.offset(12),
498-
*ptr.offset(13),
499-
*ptr.offset(14),
500-
*ptr.offset(15),
501-
))
475+
core::ptr::read_unaligned(ptr as *const int8x16_t)
502476
}
503477

504478
/// Load multiple single-element structures to one, two, three, or four registers.
505479
#[inline]
506480
#[target_feature(enable = "neon")]
507481
#[cfg_attr(test, assert_instr(ldr))]
508482
pub unsafe fn vld1_s16(ptr: *const i16) -> int16x4_t {
509-
transmute(i16x4::new(
510-
*ptr,
511-
*ptr.offset(1),
512-
*ptr.offset(2),
513-
*ptr.offset(3),
514-
))
483+
core::ptr::read_unaligned(ptr as *const int16x4_t)
515484
}
516485

517486
/// Load multiple single-element structures to one, two, three, or four registers.
518487
#[inline]
519488
#[target_feature(enable = "neon")]
520489
#[cfg_attr(test, assert_instr(ldr))]
521490
pub unsafe fn vld1q_s16(ptr: *const i16) -> int16x8_t {
522-
transmute(i16x8::new(
523-
*ptr,
524-
*ptr.offset(1),
525-
*ptr.offset(2),
526-
*ptr.offset(3),
527-
*ptr.offset(4),
528-
*ptr.offset(5),
529-
*ptr.offset(6),
530-
*ptr.offset(7),
531-
))
491+
core::ptr::read_unaligned(ptr as *const int16x8_t)
532492
}
533493

534494
/// Load multiple single-element structures to one, two, three, or four registers.
535495
#[inline]
536496
#[target_feature(enable = "neon")]
537497
#[cfg_attr(test, assert_instr(ldr))]
538498
pub unsafe fn vld1_s32(ptr: *const i32) -> int32x2_t {
539-
transmute(i32x2::new(*ptr, *ptr.offset(1)))
499+
core::ptr::read_unaligned(ptr as *const int32x2_t)
540500
}
541501

542502
/// Load multiple single-element structures to one, two, three, or four registers.
543503
#[inline]
544504
#[target_feature(enable = "neon")]
545505
#[cfg_attr(test, assert_instr(ldr))]
546506
pub unsafe fn vld1q_s32(ptr: *const i32) -> int32x4_t {
547-
transmute(i32x4::new(
548-
*ptr,
549-
*ptr.offset(1),
550-
*ptr.offset(2),
551-
*ptr.offset(3),
552-
))
507+
core::ptr::read_unaligned(ptr as *const int32x4_t)
553508
}
554509

555510
/// Load multiple single-element structures to one, two, three, or four registers.
556511
#[inline]
557512
#[target_feature(enable = "neon")]
558513
#[cfg_attr(test, assert_instr(ldr))]
559514
pub unsafe fn vld1_s64(ptr: *const i64) -> int64x1_t {
560-
transmute(i64x1::new(*ptr))
515+
core::ptr::read_unaligned(ptr as *const int64x1_t)
561516
}
562517

563518
/// Load multiple single-element structures to one, two, three, or four registers.
564519
#[inline]
565520
#[target_feature(enable = "neon")]
566521
#[cfg_attr(test, assert_instr(ldr))]
567522
pub unsafe fn vld1q_s64(ptr: *const i64) -> int64x2_t {
568-
transmute(i64x2::new(*ptr, *ptr.offset(1)))
523+
core::ptr::read_unaligned(ptr as *const int64x2_t)
569524
}
570525

571526
/// Load multiple single-element structures to one, two, three, or four registers.
572527
#[inline]
573528
#[target_feature(enable = "neon")]
574529
#[cfg_attr(test, assert_instr(ldr))]
575530
pub unsafe fn vld1_u8(ptr: *const u8) -> uint8x8_t {
576-
transmute(u8x8::new(
577-
*ptr,
578-
*ptr.offset(1),
579-
*ptr.offset(2),
580-
*ptr.offset(3),
581-
*ptr.offset(4),
582-
*ptr.offset(5),
583-
*ptr.offset(6),
584-
*ptr.offset(7),
585-
))
531+
core::ptr::read_unaligned(ptr as *const uint8x8_t)
586532
}
587533

588534
/// Load multiple single-element structures to one, two, three, or four registers.
589535
#[inline]
590536
#[target_feature(enable = "neon")]
591537
#[cfg_attr(test, assert_instr(ldr))]
592538
pub unsafe fn vld1q_u8(ptr: *const u8) -> uint8x16_t {
593-
transmute(u8x16::new(
594-
*ptr,
595-
*ptr.offset(1),
596-
*ptr.offset(2),
597-
*ptr.offset(3),
598-
*ptr.offset(4),
599-
*ptr.offset(5),
600-
*ptr.offset(6),
601-
*ptr.offset(7),
602-
*ptr.offset(8),
603-
*ptr.offset(9),
604-
*ptr.offset(10),
605-
*ptr.offset(11),
606-
*ptr.offset(12),
607-
*ptr.offset(13),
608-
*ptr.offset(14),
609-
*ptr.offset(15),
610-
))
539+
core::ptr::read_unaligned(ptr as *const uint8x16_t)
611540
}
612541

613542
/// Load multiple single-element structures to one, two, three, or four registers.
614543
#[inline]
615544
#[target_feature(enable = "neon")]
616545
#[cfg_attr(test, assert_instr(ldr))]
617546
pub unsafe fn vld1_u16(ptr: *const u16) -> uint16x4_t {
618-
transmute(u16x4::new(
619-
*ptr,
620-
*ptr.offset(1),
621-
*ptr.offset(2),
622-
*ptr.offset(3),
623-
))
547+
core::ptr::read_unaligned(ptr as *const uint16x4_t)
624548
}
625549

626550
/// Load multiple single-element structures to one, two, three, or four registers.
627551
#[inline]
628552
#[target_feature(enable = "neon")]
629553
#[cfg_attr(test, assert_instr(ldr))]
630554
pub unsafe fn vld1q_u16(ptr: *const u16) -> uint16x8_t {
631-
transmute(u16x8::new(
632-
*ptr,
633-
*ptr.offset(1),
634-
*ptr.offset(2),
635-
*ptr.offset(3),
636-
*ptr.offset(4),
637-
*ptr.offset(5),
638-
*ptr.offset(6),
639-
*ptr.offset(7),
640-
))
555+
core::ptr::read_unaligned(ptr as *const uint16x8_t)
641556
}
642557

643558
/// Load multiple single-element structures to one, two, three, or four registers.
644559
#[inline]
645560
#[target_feature(enable = "neon")]
646561
#[cfg_attr(test, assert_instr(ldr))]
647562
pub unsafe fn vld1_u32(ptr: *const u32) -> uint32x2_t {
648-
transmute(u32x2::new(*ptr, *ptr.offset(1)))
563+
core::ptr::read_unaligned(ptr as *const uint32x2_t)
649564
}
650565

651566
/// Load multiple single-element structures to one, two, three, or four registers.
652567
#[inline]
653568
#[target_feature(enable = "neon")]
654569
#[cfg_attr(test, assert_instr(ldr))]
655570
pub unsafe fn vld1q_u32(ptr: *const u32) -> uint32x4_t {
656-
transmute(u32x4::new(
657-
*ptr,
658-
*ptr.offset(1),
659-
*ptr.offset(2),
660-
*ptr.offset(3),
661-
))
571+
core::ptr::read_unaligned(ptr as *const uint32x4_t)
662572
}
663573

664574
/// Load multiple single-element structures to one, two, three, or four registers.
665575
#[inline]
666576
#[target_feature(enable = "neon")]
667577
#[cfg_attr(test, assert_instr(ldr))]
668578
pub unsafe fn vld1_u64(ptr: *const u64) -> uint64x1_t {
669-
transmute(u64x1::new(*ptr))
579+
core::ptr::read_unaligned(ptr as *const uint64x1_t)
670580
}
671581

672582
/// Load multiple single-element structures to one, two, three, or four registers.
673583
#[inline]
674584
#[target_feature(enable = "neon")]
675585
#[cfg_attr(test, assert_instr(ldr))]
676586
pub unsafe fn vld1q_u64(ptr: *const u64) -> uint64x2_t {
677-
transmute(u64x2::new(*ptr, *ptr.offset(1)))
587+
core::ptr::read_unaligned(ptr as *const uint64x2_t)
678588
}
679589

680590
/// Load multiple single-element structures to one, two, three, or four registers.
681591
#[inline]
682592
#[target_feature(enable = "neon")]
683593
#[cfg_attr(test, assert_instr(ldr))]
684594
pub unsafe fn vld1_p8(ptr: *const p8) -> poly8x8_t {
685-
transmute(u8x8::new(
686-
*ptr,
687-
*ptr.offset(1),
688-
*ptr.offset(2),
689-
*ptr.offset(3),
690-
*ptr.offset(4),
691-
*ptr.offset(5),
692-
*ptr.offset(6),
693-
*ptr.offset(7),
694-
))
595+
core::ptr::read_unaligned(ptr as *const poly8x8_t)
695596
}
696597

697598
/// Load multiple single-element structures to one, two, three, or four registers.
698599
#[inline]
699600
#[target_feature(enable = "neon")]
700601
#[cfg_attr(test, assert_instr(ldr))]
701602
pub unsafe fn vld1q_p8(ptr: *const p8) -> poly8x16_t {
702-
transmute(u8x16::new(
703-
*ptr,
704-
*ptr.offset(1),
705-
*ptr.offset(2),
706-
*ptr.offset(3),
707-
*ptr.offset(4),
708-
*ptr.offset(5),
709-
*ptr.offset(6),
710-
*ptr.offset(7),
711-
*ptr.offset(8),
712-
*ptr.offset(9),
713-
*ptr.offset(10),
714-
*ptr.offset(11),
715-
*ptr.offset(12),
716-
*ptr.offset(13),
717-
*ptr.offset(14),
718-
*ptr.offset(15),
719-
))
603+
core::ptr::read_unaligned(ptr as *const poly8x16_t)
720604
}
721605

722606
/// Load multiple single-element structures to one, two, three, or four registers.
723607
#[inline]
724608
#[target_feature(enable = "neon")]
725609
#[cfg_attr(test, assert_instr(ldr))]
726610
pub unsafe fn vld1_p16(ptr: *const p16) -> poly16x4_t {
727-
transmute(u16x4::new(
728-
*ptr,
729-
*ptr.offset(1),
730-
*ptr.offset(2),
731-
*ptr.offset(3),
732-
))
611+
core::ptr::read_unaligned(ptr as *const poly16x4_t)
733612
}
734613

735614
/// Load multiple single-element structures to one, two, three, or four registers.
736615
#[inline]
737616
#[target_feature(enable = "neon")]
738617
#[cfg_attr(test, assert_instr(ldr))]
739618
pub unsafe fn vld1q_p16(ptr: *const p16) -> poly16x8_t {
740-
transmute(u16x8::new(
741-
*ptr,
742-
*ptr.offset(1),
743-
*ptr.offset(2),
744-
*ptr.offset(3),
745-
*ptr.offset(4),
746-
*ptr.offset(5),
747-
*ptr.offset(6),
748-
*ptr.offset(7),
749-
))
619+
core::ptr::read_unaligned(ptr as *const poly16x8_t)
750620
}
751621

752622
/// Load multiple single-element structures to one, two, three, or four registers.
753623
#[inline]
754624
#[target_feature(enable = "neon")]
755625
#[cfg_attr(test, assert_instr(ldr))]
756626
pub unsafe fn vld1_f32(ptr: *const f32) -> float32x2_t {
757-
transmute(f32x2::new(*ptr, *ptr.offset(1)))
627+
core::ptr::read_unaligned(ptr as *const float32x2_t)
758628
}
759629

760630
/// Load multiple single-element structures to one, two, three, or four registers.
761631
#[inline]
762632
#[target_feature(enable = "neon")]
763633
#[cfg_attr(test, assert_instr(ldr))]
764634
pub unsafe fn vld1q_f32(ptr: *const f32) -> float32x4_t {
765-
transmute(f32x4::new(
766-
*ptr,
767-
*ptr.offset(1),
768-
*ptr.offset(2),
769-
*ptr.offset(3),
770-
))
635+
core::ptr::read_unaligned(ptr as *const float32x4_t)
771636
}
772637

773638
/// Load multiple single-element structures to one, two, three, or four registers.
774639
#[inline]
775640
#[target_feature(enable = "neon")]
776641
#[cfg_attr(test, assert_instr(ldr))]
777642
pub unsafe fn vld1_f64(ptr: *const f64) -> float64x1_t {
778-
transmute(f64x1::new(*ptr))
643+
core::ptr::read_unaligned(ptr as *const float64x1_t)
779644
}
780645

781646
/// Load multiple single-element structures to one, two, three, or four registers.
782647
#[inline]
783648
#[target_feature(enable = "neon")]
784649
#[cfg_attr(test, assert_instr(ldr))]
785650
pub unsafe fn vld1q_f64(ptr: *const f64) -> float64x2_t {
786-
transmute(f64x2::new(*ptr, *ptr.offset(1)))
651+
core::ptr::read_unaligned(ptr as *const float64x2_t)
787652
}
788653

789654
/// Store multiple single-element structures from one, two, three, or four registers.

0 commit comments

Comments
 (0)