Message ID | 20240912204047.1020213-1-dev+git@drbeat.li (mailing list archive) |
---|---|
State | Accepted |
Commit | 44dc6511321c95027267d05e761cd9a43ed0425f |
Headers | show |
Series | unicode: update the width tables to Unicode 16 | expand |
Hi Beat, On Thu, 12 Sep 2024, Beat Bolli wrote: > Unicode 16 has been announced on 2024-09-10 [0], so update the character > width tables to the new version. > > [0] https://blog.unicode.org/2024/09/announcing-unicode-standard-version-160.html I can confirm that the output is identical to the result of running ./contrib/update-unicode/update_unicode.sh. Maybe we should add an automated, scheduled workflow for these updates? Ciao, Johannes > > Signed-off-by: Beat Bolli <dev+git@drbeat.li> > --- > unicode-width.h | 37 +++++++++++++++++++++++++------------ > 1 file changed, 25 insertions(+), 12 deletions(-) > > diff --git a/unicode-width.h b/unicode-width.h > index be5bf8c4f2..3ffee123a0 100644 > --- a/unicode-width.h > +++ b/unicode-width.h > @@ -27,7 +27,7 @@ static const struct interval zero_width[] = { > { 0x0829, 0x082D }, > { 0x0859, 0x085B }, > { 0x0890, 0x0891 }, > -{ 0x0898, 0x089F }, > +{ 0x0897, 0x089F }, > { 0x08CA, 0x0902 }, > { 0x093A, 0x093A }, > { 0x093C, 0x093C }, > @@ -227,8 +227,9 @@ static const struct interval zero_width[] = { > { 0x10A3F, 0x10A3F }, > { 0x10AE5, 0x10AE6 }, > { 0x10D24, 0x10D27 }, > +{ 0x10D69, 0x10D6D }, > { 0x10EAB, 0x10EAC }, > -{ 0x10EFD, 0x10EFF }, > +{ 0x10EFC, 0x10EFF }, > { 0x10F46, 0x10F50 }, > { 0x10F82, 0x10F85 }, > { 0x11001, 0x11001 }, > @@ -261,6 +262,11 @@ static const struct interval zero_width[] = { > { 0x11340, 0x11340 }, > { 0x11366, 0x1136C }, > { 0x11370, 0x11374 }, > +{ 0x113BB, 0x113C0 }, > +{ 0x113CE, 0x113CE }, > +{ 0x113D0, 0x113D0 }, > +{ 0x113D2, 0x113D2 }, > +{ 0x113E1, 0x113E2 }, > { 0x11438, 0x1143F }, > { 0x11442, 0x11444 }, > { 0x11446, 0x11446 }, > @@ -280,7 +286,8 @@ static const struct interval zero_width[] = { > { 0x116AD, 0x116AD }, > { 0x116B0, 0x116B5 }, > { 0x116B7, 0x116B7 }, > -{ 0x1171D, 0x1171F }, > +{ 0x1171D, 0x1171D }, > +{ 0x1171F, 0x1171F }, > { 0x11722, 0x11725 }, > { 0x11727, 0x1172B }, > { 0x1182F, 0x11837 }, > @@ -319,8 +326,11 @@ static const struct interval zero_width[] = { > 
{ 0x11F36, 0x11F3A }, > { 0x11F40, 0x11F40 }, > { 0x11F42, 0x11F42 }, > +{ 0x11F5A, 0x11F5A }, > { 0x13430, 0x13440 }, > { 0x13447, 0x13455 }, > +{ 0x1611E, 0x16129 }, > +{ 0x1612D, 0x1612F }, > { 0x16AF0, 0x16AF4 }, > { 0x16B30, 0x16B36 }, > { 0x16F4F, 0x16F4F }, > @@ -351,6 +361,7 @@ static const struct interval zero_width[] = { > { 0x1E2AE, 0x1E2AE }, > { 0x1E2EC, 0x1E2EF }, > { 0x1E4EC, 0x1E4EF }, > +{ 0x1E5EE, 0x1E5EF }, > { 0x1E8D0, 0x1E8D6 }, > { 0x1E944, 0x1E94A }, > { 0xE0001, 0xE0001 }, > @@ -366,8 +377,10 @@ static const struct interval double_width[] = { > { 0x23F3, 0x23F3 }, > { 0x25FD, 0x25FE }, > { 0x2614, 0x2615 }, > +{ 0x2630, 0x2637 }, > { 0x2648, 0x2653 }, > { 0x267F, 0x267F }, > +{ 0x268A, 0x268F }, > { 0x2693, 0x2693 }, > { 0x26A1, 0x26A1 }, > { 0x26AA, 0x26AB }, > @@ -401,11 +414,10 @@ static const struct interval double_width[] = { > { 0x3099, 0x30FF }, > { 0x3105, 0x312F }, > { 0x3131, 0x318E }, > -{ 0x3190, 0x31E3 }, > +{ 0x3190, 0x31E5 }, > { 0x31EF, 0x321E }, > { 0x3220, 0x3247 }, > -{ 0x3250, 0x4DBF }, > -{ 0x4E00, 0xA48C }, > +{ 0x3250, 0xA48C }, > { 0xA490, 0xA4C6 }, > { 0xA960, 0xA97C }, > { 0xAC00, 0xD7A3 }, > @@ -420,7 +432,7 @@ static const struct interval double_width[] = { > { 0x16FF0, 0x16FF1 }, > { 0x17000, 0x187F7 }, > { 0x18800, 0x18CD5 }, > -{ 0x18D00, 0x18D08 }, > +{ 0x18CFF, 0x18D08 }, > { 0x1AFF0, 0x1AFF3 }, > { 0x1AFF5, 0x1AFFB }, > { 0x1AFFD, 0x1AFFE }, > @@ -430,6 +442,8 @@ static const struct interval double_width[] = { > { 0x1B155, 0x1B155 }, > { 0x1B164, 0x1B167 }, > { 0x1B170, 0x1B2FB }, > +{ 0x1D300, 0x1D356 }, > +{ 0x1D360, 0x1D376 }, > { 0x1F004, 0x1F004 }, > { 0x1F0CF, 0x1F0CF }, > { 0x1F18E, 0x1F18E }, > @@ -470,11 +484,10 @@ static const struct interval double_width[] = { > { 0x1F93C, 0x1F945 }, > { 0x1F947, 0x1F9FF }, > { 0x1FA70, 0x1FA7C }, > -{ 0x1FA80, 0x1FA88 }, > -{ 0x1FA90, 0x1FABD }, > -{ 0x1FABF, 0x1FAC5 }, > -{ 0x1FACE, 0x1FADB }, > -{ 0x1FAE0, 0x1FAE8 }, > +{ 0x1FA80, 0x1FA89 }, > +{ 0x1FA8F, 
0x1FAC6 }, > +{ 0x1FACE, 0x1FADC }, > +{ 0x1FADF, 0x1FAE9 }, > { 0x1FAF0, 0x1FAF8 }, > { 0x20000, 0x2FFFD }, > { 0x30000, 0x3FFFD } > -- > 2.45.2 > > >
Johannes Schindelin <Johannes.Schindelin@gmx.de> writes: > Hi Beat, > > On Thu, 12 Sep 2024, Beat Bolli wrote: > >> Unicode 16 has been announced on 2024-09-10 [0], so update the character >> width tables to the new version. >> >> [0] https://blog.unicode.org/2024/09/announcing-unicode-standard-version-160.html > > I can confirm that the output is identical to the result of running > ./contrib/update-unicode/update_unicode.sh. Thanks for double checking. I did the same when I queued the patch and it indeed looked good. > Maybe we should add an automated, scheduled workflow for these updates? We could, but the consortium aims to issue major updates once a year in September, with minor versions and updates "will be avoided", so we may need to devise automation that makes better use of resources than to scrape http://www.unicode.org/Public/UCD/latest/ucd/ daily. 44dc651132 2024-09-12T22:40:47+02:00 unicode: update the width tables to Unicode 16 872976c37e 2023-09-25T21:07:04+02:00 unicode: update the width tables to Unicode 15.1 b10cbdac4c 2023-03-30T21:15:17+02:00 unicode: update the width tables to Unicode 15 187fc8b8b6 2021-09-17T12:19:20-07:00 unicode: update the width tables to Unicode 14 65588b0b2e 2020-03-17T16:36:05+01:00 unicode: update the width tables to Unicode 13.0 5817f9caa3 2019-05-29T22:50:45+02:00 unicode: update the width tables to Unicode 12.1 584b62c37b 2019-03-21T22:06:17+01:00 unicode: update the width tables to Unicode 12 570951eea2 2018-07-09T21:44:52+02:00 unicode: update the width tables to Unicode 11 e233bef43e 2018-04-10T14:26:17-07:00 unicode_width.h: rename to use dash in file name
Hi Junio, On Tue, 17 Sep 2024, Junio C Hamano wrote: > Johannes Schindelin <Johannes.Schindelin@gmx.de> writes: > > > On Thu, 12 Sep 2024, Beat Bolli wrote: > > > >> Unicode 16 has been announced on 2024-09-10 [0], so update the character > >> width tables to the new version. > >> > >> [0] https://blog.unicode.org/2024/09/announcing-unicode-standard-version-160.html > > > > I can confirm that the output is identical to the result of running > > ./contrib/update-unicode/update_unicode.sh. > > Thanks for double checking. I did the same when I queued the patch > and it indeed looked good. > > > Maybe we should add an automated, scheduled workflow for these updates? > > We could, but the consortium aims to issue major updates once a year > in September, with minor versions and updates "will be avoided", so > we may need to devise automation that makes better use of resources > than to scrape http://www.unicode.org/Public/UCD/latest/ucd/ daily. Oh, but I obviously was not suggesting as crude a thing as to scrape it unconditionally, and certainly not daily. No, I was thinking about something checking the `Last-Modified:` header and only acting upon updated Unicode definitions, and checking for updates only on a weekly basis. 
Something along these lines: ```yml name: update Unicode definitions on: schedule: - cron: '1 15 * * 4' # 3:01pm on Thursdays workflow_dispatch: jobs: update-repo-variable: if: vars.UNICODE_LAST_MODIFIED != '' runs-on: ubuntu-latest steps: - id: check run: | set -x latest_update="$(curl -I https://www.unicode.org/Public/UCD/latest/ucd/UCD.zip | sed -n 's/^Last-Modified: //p')" && if test '${{ vars.UNICODE_LAST_MODIFIED }}' = "$latest_update" then echo "result=skip" >>$GITHUB_OUTPUT exit 0 fi echo "result=$latest_update" >>$GITHUB_OUTPUT - if: steps.check.outputs.result != 'skip' run: echo ::notice::_Now_ we scrape and do stuff - if: steps.check.outputs.result != 'skip' env: GH_TOKEN: ${{ secrets.UNICODE_LAST_MODIFIED_PAT }} run: | gh api -X PATCH \ repos/$GITHUB_REPOSITORY/actions/variables/UNICODE_LAST_MODIFIED \ -f value='${{ steps.check.outputs.result }}' ``` This would use the repository variable `UNICODE_LAST_MODIFIED` to store the `Last-Modified:` value that was last seen (and implicitly act as the knob to prevent running in forks: if the variable is not yet set, the job will be skipped). Sadly, to update the repository variable, we cannot use `permissions:` because the workflow syntax does not offer the `variables` scope. Therefore a Personal Access Token would need to be stored as a repository secret. I used a fine-grained token in my tests whose scope was Repository > Variables: read-write. 
Ciao, Johannes > > 44dc651132 2024-09-12T22:40:47+02:00 unicode: update the width tables to Unicode 16 > 872976c37e 2023-09-25T21:07:04+02:00 unicode: update the width tables to Unicode 15.1 > b10cbdac4c 2023-03-30T21:15:17+02:00 unicode: update the width tables to Unicode 15 > 187fc8b8b6 2021-09-17T12:19:20-07:00 unicode: update the width tables to Unicode 14 > 65588b0b2e 2020-03-17T16:36:05+01:00 unicode: update the width tables to Unicode 13.0 > 5817f9caa3 2019-05-29T22:50:45+02:00 unicode: update the width tables to Unicode 12.1 > 584b62c37b 2019-03-21T22:06:17+01:00 unicode: update the width tables to Unicode 12 > 570951eea2 2018-07-09T21:44:52+02:00 unicode: update the width tables to Unicode 11 > e233bef43e 2018-04-10T14:26:17-07:00 unicode_width.h: rename to use dash in file name > >
Johannes Schindelin <Johannes.Schindelin@gmx.de> writes: > Oh, but I obviously was not suggesting as crude a thing as to scrape it > unconditionally, and certainly not daily. No, I was thinking about > something checking the `Last-Modified:` header and only acting upon > updated Unicode definitions, and checking for updates only on a weekly > basis. Something along these lines: > ... > Sadly, to update the repository variable, we cannot use `permissions:` > because the workflow syntax does not offer the `variables` scope. > Therefore a Personal Access Token would need to be stored as a repository > secret. I used a fine-grained token in my tests whose scope was > Repository > Variables: read-write. And it can make a patch and send it to the list and get reviewed the usual way. It is a bit curious whose Sob should be on such a patch, but we can work out the details, if we were seriously to automate it. It all seems workable. You may have already noticed, but I am lazy and tolerate manual tasks if they do not come more often than once per quarter ;-) Thanks.
diff --git a/unicode-width.h b/unicode-width.h index be5bf8c4f2..3ffee123a0 100644 --- a/unicode-width.h +++ b/unicode-width.h @@ -27,7 +27,7 @@ static const struct interval zero_width[] = { { 0x0829, 0x082D }, { 0x0859, 0x085B }, { 0x0890, 0x0891 }, -{ 0x0898, 0x089F }, +{ 0x0897, 0x089F }, { 0x08CA, 0x0902 }, { 0x093A, 0x093A }, { 0x093C, 0x093C }, @@ -227,8 +227,9 @@ static const struct interval zero_width[] = { { 0x10A3F, 0x10A3F }, { 0x10AE5, 0x10AE6 }, { 0x10D24, 0x10D27 }, +{ 0x10D69, 0x10D6D }, { 0x10EAB, 0x10EAC }, -{ 0x10EFD, 0x10EFF }, +{ 0x10EFC, 0x10EFF }, { 0x10F46, 0x10F50 }, { 0x10F82, 0x10F85 }, { 0x11001, 0x11001 }, @@ -261,6 +262,11 @@ static const struct interval zero_width[] = { { 0x11340, 0x11340 }, { 0x11366, 0x1136C }, { 0x11370, 0x11374 }, +{ 0x113BB, 0x113C0 }, +{ 0x113CE, 0x113CE }, +{ 0x113D0, 0x113D0 }, +{ 0x113D2, 0x113D2 }, +{ 0x113E1, 0x113E2 }, { 0x11438, 0x1143F }, { 0x11442, 0x11444 }, { 0x11446, 0x11446 }, @@ -280,7 +286,8 @@ static const struct interval zero_width[] = { { 0x116AD, 0x116AD }, { 0x116B0, 0x116B5 }, { 0x116B7, 0x116B7 }, -{ 0x1171D, 0x1171F }, +{ 0x1171D, 0x1171D }, +{ 0x1171F, 0x1171F }, { 0x11722, 0x11725 }, { 0x11727, 0x1172B }, { 0x1182F, 0x11837 }, @@ -319,8 +326,11 @@ static const struct interval zero_width[] = { { 0x11F36, 0x11F3A }, { 0x11F40, 0x11F40 }, { 0x11F42, 0x11F42 }, +{ 0x11F5A, 0x11F5A }, { 0x13430, 0x13440 }, { 0x13447, 0x13455 }, +{ 0x1611E, 0x16129 }, +{ 0x1612D, 0x1612F }, { 0x16AF0, 0x16AF4 }, { 0x16B30, 0x16B36 }, { 0x16F4F, 0x16F4F }, @@ -351,6 +361,7 @@ static const struct interval zero_width[] = { { 0x1E2AE, 0x1E2AE }, { 0x1E2EC, 0x1E2EF }, { 0x1E4EC, 0x1E4EF }, +{ 0x1E5EE, 0x1E5EF }, { 0x1E8D0, 0x1E8D6 }, { 0x1E944, 0x1E94A }, { 0xE0001, 0xE0001 }, @@ -366,8 +377,10 @@ static const struct interval double_width[] = { { 0x23F3, 0x23F3 }, { 0x25FD, 0x25FE }, { 0x2614, 0x2615 }, +{ 0x2630, 0x2637 }, { 0x2648, 0x2653 }, { 0x267F, 0x267F }, +{ 0x268A, 0x268F }, { 0x2693, 0x2693 }, { 0x26A1, 
0x26A1 }, { 0x26AA, 0x26AB }, @@ -401,11 +414,10 @@ static const struct interval double_width[] = { { 0x3099, 0x30FF }, { 0x3105, 0x312F }, { 0x3131, 0x318E }, -{ 0x3190, 0x31E3 }, +{ 0x3190, 0x31E5 }, { 0x31EF, 0x321E }, { 0x3220, 0x3247 }, -{ 0x3250, 0x4DBF }, -{ 0x4E00, 0xA48C }, +{ 0x3250, 0xA48C }, { 0xA490, 0xA4C6 }, { 0xA960, 0xA97C }, { 0xAC00, 0xD7A3 }, @@ -420,7 +432,7 @@ static const struct interval double_width[] = { { 0x16FF0, 0x16FF1 }, { 0x17000, 0x187F7 }, { 0x18800, 0x18CD5 }, -{ 0x18D00, 0x18D08 }, +{ 0x18CFF, 0x18D08 }, { 0x1AFF0, 0x1AFF3 }, { 0x1AFF5, 0x1AFFB }, { 0x1AFFD, 0x1AFFE }, @@ -430,6 +442,8 @@ static const struct interval double_width[] = { { 0x1B155, 0x1B155 }, { 0x1B164, 0x1B167 }, { 0x1B170, 0x1B2FB }, +{ 0x1D300, 0x1D356 }, +{ 0x1D360, 0x1D376 }, { 0x1F004, 0x1F004 }, { 0x1F0CF, 0x1F0CF }, { 0x1F18E, 0x1F18E }, @@ -470,11 +484,10 @@ static const struct interval double_width[] = { { 0x1F93C, 0x1F945 }, { 0x1F947, 0x1F9FF }, { 0x1FA70, 0x1FA7C }, -{ 0x1FA80, 0x1FA88 }, -{ 0x1FA90, 0x1FABD }, -{ 0x1FABF, 0x1FAC5 }, -{ 0x1FACE, 0x1FADB }, -{ 0x1FAE0, 0x1FAE8 }, +{ 0x1FA80, 0x1FA89 }, +{ 0x1FA8F, 0x1FAC6 }, +{ 0x1FACE, 0x1FADC }, +{ 0x1FADF, 0x1FAE9 }, { 0x1FAF0, 0x1FAF8 }, { 0x20000, 0x2FFFD }, { 0x30000, 0x3FFFD }
Unicode 16 has been announced on 2024-09-10 [0], so update the character width tables to the new version. [0] https://blog.unicode.org/2024/09/announcing-unicode-standard-version-160.html Signed-off-by: Beat Bolli <dev+git@drbeat.li> --- unicode-width.h | 37 +++++++++++++++++++++++++------------ 1 file changed, 25 insertions(+), 12 deletions(-)