diff mbox series

unicode: update the width tables to Unicode 16

Message ID 20240912204047.1020213-1-dev+git@drbeat.li (mailing list archive)
State Accepted
Commit 44dc6511321c95027267d05e761cd9a43ed0425f
Headers show
Series unicode: update the width tables to Unicode 16 | expand

Commit Message

Beat Bolli Sept. 12, 2024, 8:40 p.m. UTC
Unicode 16 was announced on 2024-09-10 [0], so update the character
width tables to the new version.

[0] https://blog.unicode.org/2024/09/announcing-unicode-standard-version-160.html

Signed-off-by: Beat Bolli <dev+git@drbeat.li>
---
 unicode-width.h | 37 +++++++++++++++++++++++++------------
 1 file changed, 25 insertions(+), 12 deletions(-)

Comments

Johannes Schindelin Sept. 17, 2024, 12:37 p.m. UTC | #1
Hi Beat,

On Thu, 12 Sep 2024, Beat Bolli wrote:

> Unicode 16 has been announced on 2024-09-10 [0], so update the character
> width tables to the new version.
>
> [0] https://blog.unicode.org/2024/09/announcing-unicode-standard-version-160.html

I can confirm that the output is identical to the result of running
./contrib/update-unicode/update_unicode.sh.

Maybe we should add an automated, scheduled workflow for these updates?

Ciao,
Johannes

>
> Signed-off-by: Beat Bolli <dev+git@drbeat.li>
> ---
>  unicode-width.h | 37 +++++++++++++++++++++++++------------
>  1 file changed, 25 insertions(+), 12 deletions(-)
>
> diff --git a/unicode-width.h b/unicode-width.h
> index be5bf8c4f2..3ffee123a0 100644
> --- a/unicode-width.h
> +++ b/unicode-width.h
> @@ -27,7 +27,7 @@ static const struct interval zero_width[] = {
>  { 0x0829, 0x082D },
>  { 0x0859, 0x085B },
>  { 0x0890, 0x0891 },
> -{ 0x0898, 0x089F },
> +{ 0x0897, 0x089F },
>  { 0x08CA, 0x0902 },
>  { 0x093A, 0x093A },
>  { 0x093C, 0x093C },
> @@ -227,8 +227,9 @@ static const struct interval zero_width[] = {
>  { 0x10A3F, 0x10A3F },
>  { 0x10AE5, 0x10AE6 },
>  { 0x10D24, 0x10D27 },
> +{ 0x10D69, 0x10D6D },
>  { 0x10EAB, 0x10EAC },
> -{ 0x10EFD, 0x10EFF },
> +{ 0x10EFC, 0x10EFF },
>  { 0x10F46, 0x10F50 },
>  { 0x10F82, 0x10F85 },
>  { 0x11001, 0x11001 },
> @@ -261,6 +262,11 @@ static const struct interval zero_width[] = {
>  { 0x11340, 0x11340 },
>  { 0x11366, 0x1136C },
>  { 0x11370, 0x11374 },
> +{ 0x113BB, 0x113C0 },
> +{ 0x113CE, 0x113CE },
> +{ 0x113D0, 0x113D0 },
> +{ 0x113D2, 0x113D2 },
> +{ 0x113E1, 0x113E2 },
>  { 0x11438, 0x1143F },
>  { 0x11442, 0x11444 },
>  { 0x11446, 0x11446 },
> @@ -280,7 +286,8 @@ static const struct interval zero_width[] = {
>  { 0x116AD, 0x116AD },
>  { 0x116B0, 0x116B5 },
>  { 0x116B7, 0x116B7 },
> -{ 0x1171D, 0x1171F },
> +{ 0x1171D, 0x1171D },
> +{ 0x1171F, 0x1171F },
>  { 0x11722, 0x11725 },
>  { 0x11727, 0x1172B },
>  { 0x1182F, 0x11837 },
> @@ -319,8 +326,11 @@ static const struct interval zero_width[] = {
>  { 0x11F36, 0x11F3A },
>  { 0x11F40, 0x11F40 },
>  { 0x11F42, 0x11F42 },
> +{ 0x11F5A, 0x11F5A },
>  { 0x13430, 0x13440 },
>  { 0x13447, 0x13455 },
> +{ 0x1611E, 0x16129 },
> +{ 0x1612D, 0x1612F },
>  { 0x16AF0, 0x16AF4 },
>  { 0x16B30, 0x16B36 },
>  { 0x16F4F, 0x16F4F },
> @@ -351,6 +361,7 @@ static const struct interval zero_width[] = {
>  { 0x1E2AE, 0x1E2AE },
>  { 0x1E2EC, 0x1E2EF },
>  { 0x1E4EC, 0x1E4EF },
> +{ 0x1E5EE, 0x1E5EF },
>  { 0x1E8D0, 0x1E8D6 },
>  { 0x1E944, 0x1E94A },
>  { 0xE0001, 0xE0001 },
> @@ -366,8 +377,10 @@ static const struct interval double_width[] = {
>  { 0x23F3, 0x23F3 },
>  { 0x25FD, 0x25FE },
>  { 0x2614, 0x2615 },
> +{ 0x2630, 0x2637 },
>  { 0x2648, 0x2653 },
>  { 0x267F, 0x267F },
> +{ 0x268A, 0x268F },
>  { 0x2693, 0x2693 },
>  { 0x26A1, 0x26A1 },
>  { 0x26AA, 0x26AB },
> @@ -401,11 +414,10 @@ static const struct interval double_width[] = {
>  { 0x3099, 0x30FF },
>  { 0x3105, 0x312F },
>  { 0x3131, 0x318E },
> -{ 0x3190, 0x31E3 },
> +{ 0x3190, 0x31E5 },
>  { 0x31EF, 0x321E },
>  { 0x3220, 0x3247 },
> -{ 0x3250, 0x4DBF },
> -{ 0x4E00, 0xA48C },
> +{ 0x3250, 0xA48C },
>  { 0xA490, 0xA4C6 },
>  { 0xA960, 0xA97C },
>  { 0xAC00, 0xD7A3 },
> @@ -420,7 +432,7 @@ static const struct interval double_width[] = {
>  { 0x16FF0, 0x16FF1 },
>  { 0x17000, 0x187F7 },
>  { 0x18800, 0x18CD5 },
> -{ 0x18D00, 0x18D08 },
> +{ 0x18CFF, 0x18D08 },
>  { 0x1AFF0, 0x1AFF3 },
>  { 0x1AFF5, 0x1AFFB },
>  { 0x1AFFD, 0x1AFFE },
> @@ -430,6 +442,8 @@ static const struct interval double_width[] = {
>  { 0x1B155, 0x1B155 },
>  { 0x1B164, 0x1B167 },
>  { 0x1B170, 0x1B2FB },
> +{ 0x1D300, 0x1D356 },
> +{ 0x1D360, 0x1D376 },
>  { 0x1F004, 0x1F004 },
>  { 0x1F0CF, 0x1F0CF },
>  { 0x1F18E, 0x1F18E },
> @@ -470,11 +484,10 @@ static const struct interval double_width[] = {
>  { 0x1F93C, 0x1F945 },
>  { 0x1F947, 0x1F9FF },
>  { 0x1FA70, 0x1FA7C },
> -{ 0x1FA80, 0x1FA88 },
> -{ 0x1FA90, 0x1FABD },
> -{ 0x1FABF, 0x1FAC5 },
> -{ 0x1FACE, 0x1FADB },
> -{ 0x1FAE0, 0x1FAE8 },
> +{ 0x1FA80, 0x1FA89 },
> +{ 0x1FA8F, 0x1FAC6 },
> +{ 0x1FACE, 0x1FADC },
> +{ 0x1FADF, 0x1FAE9 },
>  { 0x1FAF0, 0x1FAF8 },
>  { 0x20000, 0x2FFFD },
>  { 0x30000, 0x3FFFD }
> --
> 2.45.2
>
>
>
Junio C Hamano Sept. 17, 2024, 9:54 p.m. UTC | #2
Johannes Schindelin <Johannes.Schindelin@gmx.de> writes:

> Hi Beat,
>
> On Thu, 12 Sep 2024, Beat Bolli wrote:
>
>> Unicode 16 has been announced on 2024-09-10 [0], so update the character
>> width tables to the new version.
>>
>> [0] https://blog.unicode.org/2024/09/announcing-unicode-standard-version-160.html
>
> I can confirm that the output is identical to the result of running
> ./contrib/update-unicode/update_unicode.sh.

Thanks for double checking.  I did the same when I queued the patch
and it indeed looked good.

> Maybe we should add an automated, scheduled workflow for these updates?

We could, but the consortium aims to issue major updates once a year
in September, and says that minor versions and updates "will be
avoided", so we may need to devise automation that makes better use
of resources than to scrape
http://www.unicode.org/Public/UCD/latest/ucd/ daily.

44dc651132 2024-09-12T22:40:47+02:00 unicode: update the width tables to Unicode 16
872976c37e 2023-09-25T21:07:04+02:00 unicode: update the width tables to Unicode 15.1
b10cbdac4c 2023-03-30T21:15:17+02:00 unicode: update the width tables to Unicode 15
187fc8b8b6 2021-09-17T12:19:20-07:00 unicode: update the width tables to Unicode 14
65588b0b2e 2020-03-17T16:36:05+01:00 unicode: update the width tables to Unicode 13.0
5817f9caa3 2019-05-29T22:50:45+02:00 unicode: update the width tables to Unicode 12.1
584b62c37b 2019-03-21T22:06:17+01:00 unicode: update the width tables to Unicode 12
570951eea2 2018-07-09T21:44:52+02:00 unicode: update the width tables to Unicode 11
e233bef43e 2018-04-10T14:26:17-07:00 unicode_width.h: rename to use dash in file name
Johannes Schindelin Sept. 29, 2024, 6:58 p.m. UTC | #3
Hi Junio,

On Tue, 17 Sep 2024, Junio C Hamano wrote:

> Johannes Schindelin <Johannes.Schindelin@gmx.de> writes:
>
> > On Thu, 12 Sep 2024, Beat Bolli wrote:
> >
> >> Unicode 16 has been announced on 2024-09-10 [0], so update the character
> >> width tables to the new version.
> >>
> >> [0] https://blog.unicode.org/2024/09/announcing-unicode-standard-version-160.html
> >
> > I can confirm that the output is identical to the result of running
> > ./contrib/update-unicode/update_unicode.sh.
>
> Thanks for double checking.  I did the same when I queued the patch
> and it indeed looked good.
>
> > Maybe we should add an automated, scheduled workflow for these updates?
>
> We could, but the consortium aims to issue major updates once a year
> in September, with minor versions and updates "will be avoided", so
> we may need to devise automation that makes better use of resources
> than to scrape http://www.unicode.org/Public/UCD/latest/ucd/ daily.

Oh, but I obviously was not suggesting as crude a thing as to scrape it
unconditionally, and certainly not daily. No, I was thinking about
something checking the `Last-Modified:` header and only acting upon
updated Unicode definitions, and checking for updates only on a weekly
basis. Something along these lines:

	```yml
	name: update Unicode definitions

	on:
	  schedule:
	    - cron: '1 15 * * 4' # 3:01pm UTC on Thursdays
	  workflow_dispatch:

	jobs:
	  update-repo-variable:
	    if: vars.UNICODE_LAST_MODIFIED != ''
	    runs-on: ubuntu-latest
	    steps:
	      - id: check
		run: |
		  set -x
		  latest_update="$(curl -I https://www.unicode.org/Public/UCD/latest/ucd/UCD.zip |
		    sed -n 's/^Last-Modified: //p')" &&
		  if test '${{ vars.UNICODE_LAST_MODIFIED }}' = "$latest_update"
		  then
		    echo "result=skip" >>$GITHUB_OUTPUT
		    exit 0
		  fi
		  echo "result=$latest_update" >>$GITHUB_OUTPUT
	      - if: steps.check.outputs.result != 'skip'
		run: echo ::notice::_Now_ we scrape and do stuff
	      - if: steps.check.outputs.result != 'skip'
		env:
		  GH_TOKEN: ${{ secrets.UNICODE_LAST_MODIFIED_PAT }}
		run: |
		  gh api -X PATCH \
		    repos/$GITHUB_REPOSITORY/actions/variables/UNICODE_LAST_MODIFIED \
		    -f value='${{ steps.check.outputs.result }}'
	```

This would use the repository variable `UNICODE_LAST_MODIFIED` to store
the `Last-Modified:` value that was last seen (and implicitly act as the
knob to prevent running in forks: if the variable is not yet set, the job
will be skipped).

Sadly, to update the repository variable, we cannot use `permissions:`
because the workflow syntax does not offer the `variables` scope.
Therefore a Personal Access Token would need to be stored as a repository
secret. I used a fine-grained token in my tests whose scope was
Repository > Variables: read-write.

Ciao,
Johannes

>
> 44dc651132 2024-09-12T22:40:47+02:00 unicode: update the width tables to Unicode 16
> 872976c37e 2023-09-25T21:07:04+02:00 unicode: update the width tables to Unicode 15.1
> b10cbdac4c 2023-03-30T21:15:17+02:00 unicode: update the width tables to Unicode 15
> 187fc8b8b6 2021-09-17T12:19:20-07:00 unicode: update the width tables to Unicode 14
> 65588b0b2e 2020-03-17T16:36:05+01:00 unicode: update the width tables to Unicode 13.0
> 5817f9caa3 2019-05-29T22:50:45+02:00 unicode: update the width tables to Unicode 12.1
> 584b62c37b 2019-03-21T22:06:17+01:00 unicode: update the width tables to Unicode 12
> 570951eea2 2018-07-09T21:44:52+02:00 unicode: update the width tables to Unicode 11
> e233bef43e 2018-04-10T14:26:17-07:00 unicode_width.h: rename to use dash in file name
>
>
Junio C Hamano Sept. 30, 2024, 6:12 p.m. UTC | #4
Johannes Schindelin <Johannes.Schindelin@gmx.de> writes:

> Oh, but I obviously was not suggesting as crude a thing as to scrape it
> unconditionally, and certainly not daily. No, I was thinking about
> something checking the `Last-Modified:` header and only acting upon
> updated Unicode definitions, and checking for updates only on a weekly
> basis. Something along these lines:
> ...
> Sadly, to update the repository variable, we cannot use `permissions:`
> because the workflow syntax does not offer the `variables` scope.
> Therefore a Personal Access Token would need to be stored as a repository
> secret. I used a fine-grained token in my tests whose scope was
> Repository > Variables: read-write.

And it can make a patch and send it to the list and get reviewed the
usual way.  It is a bit curious whose Sob should be on such a patch,
but we can work out the details, if we were seriously to automate
it.  It all seems workable.

You may have already noticed, but I am lazy and tolerate manual
tasks if they do not come more often than once per quarter ;-)

Thanks.
diff mbox series

Patch

diff --git a/unicode-width.h b/unicode-width.h
index be5bf8c4f2..3ffee123a0 100644
--- a/unicode-width.h
+++ b/unicode-width.h
@@ -27,7 +27,7 @@  static const struct interval zero_width[] = {
 { 0x0829, 0x082D },
 { 0x0859, 0x085B },
 { 0x0890, 0x0891 },
-{ 0x0898, 0x089F },
+{ 0x0897, 0x089F },
 { 0x08CA, 0x0902 },
 { 0x093A, 0x093A },
 { 0x093C, 0x093C },
@@ -227,8 +227,9 @@  static const struct interval zero_width[] = {
 { 0x10A3F, 0x10A3F },
 { 0x10AE5, 0x10AE6 },
 { 0x10D24, 0x10D27 },
+{ 0x10D69, 0x10D6D },
 { 0x10EAB, 0x10EAC },
-{ 0x10EFD, 0x10EFF },
+{ 0x10EFC, 0x10EFF },
 { 0x10F46, 0x10F50 },
 { 0x10F82, 0x10F85 },
 { 0x11001, 0x11001 },
@@ -261,6 +262,11 @@  static const struct interval zero_width[] = {
 { 0x11340, 0x11340 },
 { 0x11366, 0x1136C },
 { 0x11370, 0x11374 },
+{ 0x113BB, 0x113C0 },
+{ 0x113CE, 0x113CE },
+{ 0x113D0, 0x113D0 },
+{ 0x113D2, 0x113D2 },
+{ 0x113E1, 0x113E2 },
 { 0x11438, 0x1143F },
 { 0x11442, 0x11444 },
 { 0x11446, 0x11446 },
@@ -280,7 +286,8 @@  static const struct interval zero_width[] = {
 { 0x116AD, 0x116AD },
 { 0x116B0, 0x116B5 },
 { 0x116B7, 0x116B7 },
-{ 0x1171D, 0x1171F },
+{ 0x1171D, 0x1171D },
+{ 0x1171F, 0x1171F },
 { 0x11722, 0x11725 },
 { 0x11727, 0x1172B },
 { 0x1182F, 0x11837 },
@@ -319,8 +326,11 @@  static const struct interval zero_width[] = {
 { 0x11F36, 0x11F3A },
 { 0x11F40, 0x11F40 },
 { 0x11F42, 0x11F42 },
+{ 0x11F5A, 0x11F5A },
 { 0x13430, 0x13440 },
 { 0x13447, 0x13455 },
+{ 0x1611E, 0x16129 },
+{ 0x1612D, 0x1612F },
 { 0x16AF0, 0x16AF4 },
 { 0x16B30, 0x16B36 },
 { 0x16F4F, 0x16F4F },
@@ -351,6 +361,7 @@  static const struct interval zero_width[] = {
 { 0x1E2AE, 0x1E2AE },
 { 0x1E2EC, 0x1E2EF },
 { 0x1E4EC, 0x1E4EF },
+{ 0x1E5EE, 0x1E5EF },
 { 0x1E8D0, 0x1E8D6 },
 { 0x1E944, 0x1E94A },
 { 0xE0001, 0xE0001 },
@@ -366,8 +377,10 @@  static const struct interval double_width[] = {
 { 0x23F3, 0x23F3 },
 { 0x25FD, 0x25FE },
 { 0x2614, 0x2615 },
+{ 0x2630, 0x2637 },
 { 0x2648, 0x2653 },
 { 0x267F, 0x267F },
+{ 0x268A, 0x268F },
 { 0x2693, 0x2693 },
 { 0x26A1, 0x26A1 },
 { 0x26AA, 0x26AB },
@@ -401,11 +414,10 @@  static const struct interval double_width[] = {
 { 0x3099, 0x30FF },
 { 0x3105, 0x312F },
 { 0x3131, 0x318E },
-{ 0x3190, 0x31E3 },
+{ 0x3190, 0x31E5 },
 { 0x31EF, 0x321E },
 { 0x3220, 0x3247 },
-{ 0x3250, 0x4DBF },
-{ 0x4E00, 0xA48C },
+{ 0x3250, 0xA48C },
 { 0xA490, 0xA4C6 },
 { 0xA960, 0xA97C },
 { 0xAC00, 0xD7A3 },
@@ -420,7 +432,7 @@  static const struct interval double_width[] = {
 { 0x16FF0, 0x16FF1 },
 { 0x17000, 0x187F7 },
 { 0x18800, 0x18CD5 },
-{ 0x18D00, 0x18D08 },
+{ 0x18CFF, 0x18D08 },
 { 0x1AFF0, 0x1AFF3 },
 { 0x1AFF5, 0x1AFFB },
 { 0x1AFFD, 0x1AFFE },
@@ -430,6 +442,8 @@  static const struct interval double_width[] = {
 { 0x1B155, 0x1B155 },
 { 0x1B164, 0x1B167 },
 { 0x1B170, 0x1B2FB },
+{ 0x1D300, 0x1D356 },
+{ 0x1D360, 0x1D376 },
 { 0x1F004, 0x1F004 },
 { 0x1F0CF, 0x1F0CF },
 { 0x1F18E, 0x1F18E },
@@ -470,11 +484,10 @@  static const struct interval double_width[] = {
 { 0x1F93C, 0x1F945 },
 { 0x1F947, 0x1F9FF },
 { 0x1FA70, 0x1FA7C },
-{ 0x1FA80, 0x1FA88 },
-{ 0x1FA90, 0x1FABD },
-{ 0x1FABF, 0x1FAC5 },
-{ 0x1FACE, 0x1FADB },
-{ 0x1FAE0, 0x1FAE8 },
+{ 0x1FA80, 0x1FA89 },
+{ 0x1FA8F, 0x1FAC6 },
+{ 0x1FACE, 0x1FADC },
+{ 0x1FADF, 0x1FAE9 },
 { 0x1FAF0, 0x1FAF8 },
 { 0x20000, 0x2FFFD },
 { 0x30000, 0x3FFFD }