Ticket #26: ghc-events-unicode.dpatch

File ghc-events-unicode.dpatch, 18.0 kB (added by duncan, 2 years ago)
Line 
11 patch for repository http://code.haskell.org/ghc-events/:
2
3Mon Oct 29 20:00:10 GMT 2012  Duncan Coutts <duncan@well-typed.com>
4  * First go at correct unicode handling of strings in eventlogs
5  Needs more testing. Should perhaps just switch from String to either
6  ByteString or Text, on a case-by-case basis.
7
8New patches:
9
10[First go at correct unicode handling of strings in eventlogs
11Duncan Coutts <duncan@well-typed.com>**20121029200010
12 Ignore-this: 8980289664f2773476d913bc0f3fc618
13 Needs more testing. Should perhaps just switch from String to either
14 ByteString or Text, on a case-by-case basis.
15] hunk ./GHC/RTS/EventParserUtils.hs 12
16 
17         getE,
18         getH,
19-        getString,
20+
21+        getAsciiString,
22+        getUnicodeString,
23+        putAsciiString,
24+        putBytes,
25+        encodeUTF8,
26+        decodeUTF8,
27+
28         mkEventTypeParsers,
29         simpleEvent,
30         skip,
31hunk ./GHC/RTS/EventParserUtils.hs 34
32 import qualified Data.Binary.Get as G
33 import Data.Binary.Put
34 import Data.Char
35+import Data.Bits
36 import Data.Function
37 import Data.IntMap (IntMap)
38 import qualified Data.IntMap as M
39hunk ./GHC/RTS/EventParserUtils.hs 58
40 getE :: Binary a => GetEvents a
41 getE = lift $ lift get
42 
43-nBytes :: Integral a => a -> GetEvents [Word8]
44-nBytes n = replicateM (fromIntegral n) getE
45+getNBytes :: Integral a => a -> GetEvents [Word8]
46+getNBytes n = replicateM (fromIntegral n) getE
47 
48hunk ./GHC/RTS/EventParserUtils.hs 61
49-getString :: Integral a => a -> GetEvents String
50-getString len = do
51-    bytes <- nBytes len
52+getAsciiString :: Integral a => a -> GetEvents String
53+getAsciiString len = do
54+    bytes <- getNBytes len
55     return $ map (chr . fromIntegral) bytes
56 
57hunk ./GHC/RTS/EventParserUtils.hs 66
58+getUnicodeString :: Integral a => a -> GetEvents String
59+getUnicodeString len = do
60+    bytes <- getNBytes len
61+    return (decodeUTF8 bytes)
62+
63+putAsciiString :: String -> Put
64+putAsciiString str =
65+    mapM_ (put . c2w) str
66+  where
67+    c2w :: Char -> Word8
68+    c2w = fromIntegral . ord
69+
70+putBytes :: [Word8] -> Put
71+putBytes = mapM_ put
72+
73 skip :: Integral a => a -> GetEvents ()
74 skip n = lift $ lift $ G.skip (fromIntegral n)
75 
76hunk ./GHC/RTS/EventParserUtils.hs 222
77              Nothing -> getE :: GetEvents Word16
78   skip bytes
79   return UnknownEvent{ ref = fromIntegral num }
80+
81+
82+-- UTF handling:
83+-- Copyright (c) Eric Mertens 2007
84+
85+encodeUTF8 :: String -> [Word8]
86+encodeUTF8 = concatMap (map fromIntegral . go . ord)
87+ where
88+  go oc
89+   | oc <= 0x7f       = [oc]
90+
91+   | oc <= 0x7ff      = [ 0xc0 + (oc `shiftR` 6)
92+                        , 0x80 + oc .&. 0x3f
93+                        ]
94+
95+   | oc <= 0xffff     = [ 0xe0 + (oc `shiftR` 12)
96+                        , 0x80 + ((oc `shiftR` 6) .&. 0x3f)
97+                        , 0x80 + oc .&. 0x3f
98+                        ]
99+   | otherwise        = [ 0xf0 + (oc `shiftR` 18)
100+                        , 0x80 + ((oc `shiftR` 12) .&. 0x3f)
101+                        , 0x80 + ((oc `shiftR` 6) .&. 0x3f)
102+                        , 0x80 + oc .&. 0x3f
103+                        ]
104+
105+decodeUTF8 :: [Word8] -> String
106+decodeUTF8 [] = ""
107+decodeUTF8 (c:cs)
108+  | c < 0x80  = chr (fromEnum c) : decodeUTF8 cs
109+  | c < 0xc0  = replacement_character : decodeUTF8 cs
110+  | c < 0xe0  = multi1
111+  | c < 0xf0  = multi_byte 2 0xf  0x800
112+  | c < 0xf8  = multi_byte 3 0x7  0x10000
113+  | c < 0xfc  = multi_byte 4 0x3  0x200000
114+  | c < 0xfe  = multi_byte 5 0x1  0x4000000
115+  | otherwise = replacement_character : decodeUTF8 cs
116+  where
117+    multi1 = case cs of
118+      c1 : ds | c1 .&. 0xc0 == 0x80 ->
119+        let d = ((fromEnum c .&. 0x1f) `shiftL` 6) .|.  fromEnum (c1 .&. 0x3f)
120+        in if d >= 0x000080 then toEnum d : decodeUTF8 ds
121+                            else replacement_character : decodeUTF8 ds
122+      _ -> replacement_character : decodeUTF8 cs
123+
124+    multi_byte :: Int -> Word8 -> Int -> [Char]
125+    multi_byte i mask overlong = aux i cs (fromEnum (c .&. mask))
126+      where
127+        aux 0 rs acc
128+          | overlong <= acc && acc <= 0x10ffff &&
129+            (acc < 0xd800 || 0xdfff < acc)     &&
130+            (acc < 0xfffe || 0xffff < acc)      = chr acc : decodeUTF8 rs
131+          | otherwise = replacement_character : decodeUTF8 rs
132+
133+        aux n (r:rs) acc
134+          | r .&. 0xc0 == 0x80 = aux (n-1) rs
135+                               $ shiftL acc 6 .|. fromEnum (r .&. 0x3f)
136+
137+        aux _ rs     _ = replacement_character : decodeUTF8 rs
138+
139+    replacement_character = '\xfffd'
140hunk ./GHC/RTS/Events.hs 271
141   )),
142 
143  (VariableSizeParser EVENT_LOG_MSG (do -- (msg)
144-      num <- getE :: GetEvents Word16
145-      string <- getString num
146+      size   <- getE :: GetEvents Word16
147+      string <- getUnicodeString size
148       return Message{ msg = string }
149    )),
150  (VariableSizeParser EVENT_USER_MSG (do -- (msg)
151hunk ./GHC/RTS/Events.hs 276
152-      num <- getE :: GetEvents Word16
153-      string <- getString num
154+      size   <- getE :: GetEvents Word16
155+      string <- getUnicodeString size
156       return UserMessage{ msg = string }
157    )),
158     (VariableSizeParser EVENT_USER_MARKER (do -- (markername)
159hunk ./GHC/RTS/Events.hs 281
160-      num <- getE :: GetEvents Word16
161-      string <- getString num
162+      size   <- getE :: GetEvents Word16
163+      string <- getUnicodeString size
164       return UserMarker{ markername = string }
165    )),
166  (VariableSizeParser EVENT_PROGRAM_ARGS (do -- (capset, [arg])
167hunk ./GHC/RTS/Events.hs 286
168-      num <- getE :: GetEvents Word16
169-      cs <- getE
170-      string <- getString (num - sz_capset)
171+      size <- getE :: GetEvents Word16
172+      cs   <- getE
173+      string <- getAsciiString (size - sz_capset)
174       return ProgramArgs{ capset = cs
175                         , args = splitNull string }
176    )),
177hunk ./GHC/RTS/Events.hs 293
178  (VariableSizeParser EVENT_PROGRAM_ENV (do -- (capset, [arg])
179-      num <- getE :: GetEvents Word16
180-      cs <- getE
181-      string <- getString (num - sz_capset)
182+      size <- getE :: GetEvents Word16
183+      cs   <- getE
184+      string <- getAsciiString (size - sz_capset)
185       return ProgramEnv{ capset = cs
186                        , env = splitNull string }
187    )),
188hunk ./GHC/RTS/Events.hs 300
189  (VariableSizeParser EVENT_RTS_IDENTIFIER (do -- (capset, str)
190-      num <- getE :: GetEvents Word16
191-      cs <- getE
192-      string <- getString (num - sz_capset)
193+      size <- getE :: GetEvents Word16
194+      cs   <- getE
195+      string <- getAsciiString (size - sz_capset)
196       return RtsIdentifier{ capset = cs
197                           , rtsident = string }
198    )),
199hunk ./GHC/RTS/Events.hs 308
200 
201  (VariableSizeParser EVENT_INTERN_STRING (do -- (str, id)
202-      num <- getE :: GetEvents Word16
203-      string <- getString (num - sz_string_id)
204+      size   <- getE :: GetEvents Word16
205+      string <- getAsciiString (size - sz_string_id)
206       sId <- getE :: GetEvents StringId
207       return (InternString string sId)
208     )),
209hunk ./GHC/RTS/Events.hs 315
210 
211  (VariableSizeParser EVENT_THREAD_LABEL (do -- (thread, str)
212-      num <- getE :: GetEvents Word16
213-      tid <- getE
214-      str <- getString (num - sz_tid)
215+      size <- getE :: GetEvents Word16
216+      tid  <- getE
217+      str  <- getAsciiString (size - sz_tid)
218       return ThreadLabel{ thread      = tid
219                         , threadlabel = str }
220     ))
221hunk ./GHC/RTS/Events.hs 576
222 
223 perfParsers = [
224  (VariableSizeParser EVENT_PERF_NAME (do -- (perf_num, name)
225-      num     <- getE :: GetEvents Word16
226+      size    <- getE :: GetEvents Word16
227       perfNum <- getE
228hunk ./GHC/RTS/Events.hs 578
229-      name    <- getString (num - sz_perf_num)
230+      name    <- getAsciiString (size - sz_perf_num)
231       return PerfName{perfNum, name}
232    )),
233 
234hunk ./GHC/RTS/Events.hs 940
235 putMarker :: Word32 -> PutEvents ()
236 putMarker = putE
237 
238-putEStr :: String -> PutEvents ()
239-putEStr = mapM_ putE
240-
241 putEventLog :: EventLog -> PutEvents ()
242 putEventLog (EventLog hdr es) = do
243     putHeader hdr
244hunk ./GHC/RTS/Events.hs 1144
245     putE t
246     putCap c
247 
248-putEventSpec (ThreadLabel t l) = do
249-    putE (fromIntegral (length l) + sz_tid :: Word16)
250-    putE t
251-    putEStr l
252+putEventSpec (ThreadLabel tid label) = do
253+    putE size
254+    putE tid
255+    putBytes bytes
256+  where
257+    size :: Word16
258+    size  = fromIntegral (length bytes) + sz_tid
259+    bytes = encodeUTF8 label
260 
261 putEventSpec Shutdown = do
262     return ()
263hunk ./GHC/RTS/Events.hs 1257
264 putEventSpec (RtsIdentifier cs rts) = do
265     putE (fromIntegral (length rts) + sz_capset :: Word16)
266     putE cs
267-    putEStr rts
268+    putAsciiString rts
269 
270 putEventSpec (ProgramArgs cs as) = do
271     let as' = unsep as
272hunk ./GHC/RTS/Events.hs 1263
273     putE (fromIntegral (length as') + sz_capset :: Word16)
274     putE cs
275-    mapM_ putE as'
276+    putAsciiString as'
277 
278 putEventSpec (ProgramEnv cs es) = do
279     let es' = unsep es
280hunk ./GHC/RTS/Events.hs 1269
281     putE (fromIntegral (length es') + sz_capset :: Word16)
282     putE cs
283-    mapM_ putE es'
284+    putAsciiString es'
285 
286 putEventSpec (OsProcessPid cs pid) = do
287     putE cs
288hunk ./GHC/RTS/Events.hs 1284
289     putE sec
290     putE nsec
291 
292-putEventSpec (Message s) = do
293-    putE (fromIntegral (length s) :: Word16)
294-    mapM_ putE s
295+putEventSpec (Message str) = do
296+    putE size
297+    putBytes bytes
298+  where
299+    size  :: Word16
300+    size   = fromIntegral (length bytes)
301+    bytes  = encodeUTF8 str
302 
303hunk ./GHC/RTS/Events.hs 1292
304-putEventSpec (UserMessage s) = do
305-    putE (fromIntegral (length s) :: Word16)
306-    mapM_ putE s
307+putEventSpec (UserMessage str) = do
308+    putE size
309+    putBytes bytes
310+  where
311+    size  :: Word16
312+    size   = fromIntegral (length bytes)
313+    bytes  = encodeUTF8 str
314 
315hunk ./GHC/RTS/Events.hs 1300
316-putEventSpec (UserMarker s) = do
317-    putE (fromIntegral (length s) :: Word16)
318-    mapM_ putE s
319+putEventSpec (UserMarker str) = do
320+    putE size
321+    putBytes bytes
322+  where
323+    size  :: Word16
324+    size   = fromIntegral (length bytes)
325+    bytes  = encodeUTF8 str
326 
327 putEventSpec (UnknownEvent {}) = error "putEventSpec UnknownEvent"
328 
329hunk ./GHC/RTS/Events.hs 1311
330 putEventSpec (InternString str id) = do
331-    putE len
332-    mapM_ putE str
333+    putE size
334+    putAsciiString str
335     putE id
336hunk ./GHC/RTS/Events.hs 1314
337-  where len = (fromIntegral (length str) :: Word16) + sz_string_id
338+  where
339+    size :: Word16
340+    size = fromIntegral (length str) + sz_string_id
341 
342 putEventSpec (MerStartParConjunction dyn_id static_id) = do
343     putE dyn_id
344hunk ./GHC/RTS/Events.hs 1358
345 putEventSpec PerfName{..} = do
346     putE (fromIntegral (length name) + sz_perf_num :: Word16)
347     putE perfNum
348-    mapM_ putE name
349+    putAsciiString name
350 
351 putEventSpec PerfCounter{..} = do
352     putE perfNum
353
354Context:
355
356[Add support for the new EVENT_USER_MARKER
357Duncan Coutts <duncan@well-typed.com>**20121015205209
358 Ignore-this: 52d05bf9497ef50b746cd2617f6138a1
359]
360[Relax dependency on mtl
361Mikolaj Konarski <mikolaj@well-typed.com>**20120808002136
362 Ignore-this: 469f655b2c5f8f3db59754840d908b2d
363]
364[Fix merging for the case of no caps or capsets or threads
365Mikolaj Konarski <mikolaj@well-typed.com>**20121018114837
366 Ignore-this: eea70b9c6f1deb6055d515e729e3f522
367 
368 Up to now it worked correctly only if the eventlog with no caps or capsets
369 or threads was given second on the commandline.
370]
371[Bump mtl upper bound for GHC 7.6
372bgamari.foss@gmail.com**20120925215415
373 Ignore-this: 4a51845e2eff3a2c2c159f0e29c762df
374]
375[Update authors and copyright dates
376Mikolaj Konarski <mikolaj@well-typed.com>**20121012125627
377 Ignore-this: 9d4a60ac4bc1cc2a0483fc6b30259dd
378]
379[Fix writing GC stat events to a file
380Mikolaj Konarski <mikolaj@well-typed.com>**20121012071034
381 Ignore-this: 4f8e27c07d87d758e6c058ee0cb5c0eb
382]
383[Relax dependencies
384Mikolaj Konarski <mikolaj@well-typed.com>**20121008215952
385 Ignore-this: f992ced3edf32376cfc54b0c05910981
386]
387[Bump version to 0.4.2.0
388Mikolaj Konarski <mikolaj@well-typed.com>**20121008091007
389 Ignore-this: 35b6a7049454a18b9ce2d559593c8723
390]
391[Make the tests compile (and pass) again
392Mikolaj Konarski <mikolaj@well-typed.com>**20121008090808
393 Ignore-this: c2cdfc3cdbbe6e4720d5d9437d0c332
394]
395[Fill in missing events in merge alpha-conversion
396Mikolaj Konarski <mikolaj@well-typed.com>**20121008080900
397 Ignore-this: 1018e11290f2c11198ae53f4d6a40abd
398]
399[Handle merges of merged eventlogs
400Mikolaj Konarski <mikolaj@well-typed.com>**20121005215657
401 Ignore-this: 552a5f88c7e81dd73394e65f44d7d909
402]
403[add "many merges" and "merge by wall clock" TODO items from edsko and dcoutts
404Mikolaj Konarski <mikolaj@well-typed.com>**20121003203843
405 Ignore-this: e834a3dfabf0f1409c3a489748a0e150
406]
407[Print task ids in hex; the same as in GHC debug prints
408Mikolaj Konarski <mikolaj@well-typed.com>**20120725223106
409 Ignore-this: b98357838e9dcda79d61bf7e6eb8cd4
410]
411[Merge eventlogs with different but consistent headers
412Mikolaj Konarski <mikolaj@well-typed.com>**20120709175733
413 Ignore-this: b0cdbb1ec50aaa03bfb3bdac9f83051e
414 
415 This is needed for merging perf eventlogs with main eventlogs
416 for Haskell programs.
417]
418[Expose perf event constants to sync with haskell-linux-perf
419Mikolaj Konarski <mikolaj@well-typed.com>**20120709143753
420 Ignore-this: 9d0524be8b6a569fa3bcffca12bbcfd4
421]
422[Rename the types of task Id and kernel thread Id to match GHC
423Duncan Coutts <duncan@well-typed.com>**20120703115736
424 Ignore-this: 79b9be8edea12c6f40ee5b5b7b0cf3a8
425 
426 Additionally, strengthen the simpler task events verification machine a bit.
427]
428[Modify the PerfCounter event
429Mikolaj Konarski <mikolaj@well-typed.com>**20120630215420
430 Ignore-this: 1e38aa3f3fdcbd0289e8438ebb52852f
431 
432 It now records the number of samples since last PerfCounter events,
433 instead of the running sum of samples and it includes the OS thread ID
434 so that we can assign it to cap (HEC).
435]
436[Change "elem . keys" to "member" for conciseness and speed
437Mikolaj Konarski <mikolaj@well-typed.com>**20120627100102
438 Ignore-this: cf77dca54a3bc622e2bf130177f4d74e
439]
440[Add machines to verify tasks and track their caps and OS threads
441Mikolaj Konarski <mikolaj@well-typed.com>**20120627094218
442 Ignore-this: 5863e4c56b22e974a6af1b6368132aaa
443]
444[Distinguish Haskell thread ids and OS thread ids via a newtype
445Mikolaj Konarski <mikolaj@well-typed.com>**20120625034423
446 Ignore-this: 9d0a369db7dca5e11c9967d34e48511a
447]
448[Add the 3 Task events, as emitted by GHC
449Mikolaj Konarski <mikolaj@well-typed.com>**20120613004929
450 Ignore-this: 57cd2769f9b14a83debe0f1ea8454ef4
451]
452[Bump version to 0.4.1.0
453Mikolaj Konarski <mikolaj@well-typed.com>**20120625034335
454 Ignore-this: 7352ebd67cff335779c4b642046674b2
455]
456[Add perf events
457Mikolaj Konarski <mikolaj@well-typed.com>**20120625034321
458 Ignore-this: c96d4993c28a3675cdeb5910caf1b229
459]
460[Relax dependencies to compile with current GHC HEAD
461Mikolaj <mikolaj.konarski@gmail.com>**20120330160649
462 Ignore-this: be79c105a0676961721b2d5a8562751e
463]
464[Add support for EVENT_GC_GLOBAL_SYNC
465Mikolaj <mikolaj.konarski@gmail.com>**20120330160646
466 Ignore-this: 1b68f74909ea30f5e9fdb9b7322c3df
467]
468[Add support for 2 more GHC commits about new GC and heap events
469Mikolaj <mikolaj.konarski@gmail.com>**20120330160643
470 Ignore-this: ac437c6b2997341808cabc00d6b2a460
471 
472 The new commits are the following.
473 Change the presentation of parallel GC work balance in +RTS -s
474 Emit final heap alloc events and rearrange code to calculate alloc totals
475]
476[Basic support for the new events from 2 pending GHC commits
477Mikolaj Konarski <mikolaj@well-typed.com>**20120209130731
478 Ignore-this: 3a46c6cc1945bc20f9c37f18b8499599
479 
480 The pending commits are:
481 Add eventlog/trace stuff for capabilities: create/delete/enable/disable
482 Add new eventlog events for various heap and GC statistics
483]
484[Merge EventLogFormat.h changes from 2 pending GHC events commits
485Mikolaj Konarski <mikolaj@well-typed.com>**20120208104744
486 Ignore-this: 2e5512f7e18aa73cb8ca50b55db95190
487 
488 The pending commits are:
489 Add eventlog/trace stuff for capabilities: create/delete/enable/disable
490 Add new eventlog events for various heap and GC statistics
491]
492[Revert "Simplify code by eliminating the gcd/fiz swap"
493Mikolaj Konarski <mikolaj@well-typed.com>**20120207102509
494 Ignore-this: 539f6a65078e24791b0bd851f7df1363
495 
496 This reverts commit ec4dd1311502f52edce721ea1cefa1becc66706e.
497 Here's the justification:
498 
499 <mikolaj> dcoutts: should I go ahead with breaking the API?
500           http://hackage.haskell.org/trac/ghc/ticket/5818#comment:3
501 <mikolaj> dcoutts: I can leave the ghc-events and GHC orders distinct
502 <mikolaj> dcoutts: or make them the same, at the cost of some code changes and breaking the API
503 <dcoutts> oh you mean the order of the fields in the Haskell data constructor
504 <mikolaj> yes, there's many enough fields that they are refered to by position most of the time
505 <dcoutts> I don't think it's vital to make them the same
506 <mikolaj> dcoutts: me neither, but it's error-prone
507 <mikolaj> dcoutts: against the principle of least surpise
508 <dcoutts> mikolaj: but only error prone for the person writing the binary parser
509 <dcoutts> and we already made that mistake :-)
510 <dcoutts> I don't think there's other places where it should matter
511 <mikolaj> dcoutts: when somebody looks at the GHC code, perhaps copy-pastes and then tweaks something
512           using ghc (e.g. TS)
513 <mikolaj> like I did when simulating RTS -s in TS
514 <mikolaj> I had to swap the order in TS, after a copy-paste from GHC
515 <mikolaj> dcoutts: anyway, your call
516 <dcoutts> mikolaj: the danger is if someone else is already using the event
517 <mikolaj> yes
518 <dcoutts> then we silently break their stuff
519 <mikolaj> yes, we break the API
520 <dcoutts> but not in a way that is noticeable (no compile error)
521 <dcoutts> mikolaj: so I think we should leave it
522 <mikolaj> dcoutts: ok, I will revert the commit and paste our discussion as a justification
523 <dcoutts> ok
524]
525[Simplify code by eliminating the gcd/fiz swap
526Mikolaj Konarski <mikolaj@well-typed.com>**20120207081310
527 Ignore-this: 500e3a358475b4b37ce99d136ee276fb
528 
529 This change requires fixes in TS (and in any other code using SparkCounters).
530]
531[Fix #5818 from GHC Trac, with minimal changes
532Mikolaj Konarski <mikolaj@well-typed.com>**20120207080416
533 Ignore-this: 4db4ea26a2a2220f79da238adadf9ce9
534 
535 This change does not break any other code (e.g. TS) that uses
536 the SparkCounters event.
537]
538[TAG 0.4.0.0
539Duncan Coutts <duncan@well-typed.com>**20120114021408
540 Ignore-this: c5dd879927d04e7c5cd7c392c4cf3841
541]
542Patch bundle hash:
5436dc3ce9aecd2fed4150a45636d7ebaa38eb72a1a