Browse code

Add implementation

Robert Cranston authored on 12/10/2020 00:29:21
Showing 3 changed files

... ...
@@ -2,9 +2,101 @@
2 2
 
3 3
 [Scrape][] [intranet.bygglogistik.se][].
4 4
 
5
+`bygglogistik-utils` is a [Python][] program that logs in to
6
+[intranet.bygglogistik.se][] and reads and processes the data there.
7
+
5 8
 [`bygglogistik-utils`]: https://git.rcrnstn.net/rcrnstn/bygglogistik-utils
6 9
 [scrape]: https://en.wikipedia.org/wiki/Web_scraping
7 10
 [intranet.bygglogistik.se]: http://intranet.bygglogistik.se
11
+[Python]: https://www.python.org
12
+
13
+## Usage
14
+
15
+`bygglogistik-utils --help`:
16
+
17
+```
18
+bygglogistik-utils 1.0.0
19
+
20
+Scrape intranet.bygglogistik.se.
21
+
22
+Usage:
23
+  bygglogistik-utils pay [--tax=<tax>]
24
+  bygglogistik-utils personnel
25
+  bygglogistik-utils report [-f <date_from>] [-t <date_to>] [-r <region>] [-p <project>]
26
+  bygglogistik-utils path
27
+  bygglogistik-utils -h|--help
28
+  bygglogistik-utils --version
29
+
30
+Arguments:
31
+  <date_from>
32
+    The date (in an ISO 8601 format) from which to gather reports (inclusive).
33
+    Defaults to yesterday.
34
+  <date_to>
35
+    The date (in an ISO 8601 format) to which to gather reports (inclusive).
36
+    Defaults to <date_from>.
37
+  <region>
38
+    The region of the report.
39
+    Defaults to the region of the logged in user.
40
+  <project>
41
+    The project of the report.
42
+    Defaults to all projects.
43
+
44
+Options:
45
+  --tax=<tax>
46
+    The tax rate, between 0.0 and 1.0.
47
+    Defaults to 0.30.
48
+```
49
+
50
+Data is stored in a path retrievable by running `bygglogistik-utils path`. This
51
+path can be overridden with the [environment variable][] `BYGGLOGISTIK_PATH`.
52
+
53
+Login uses the credentials in the file `$BYGGLOGISTIK_PATH/login` which should
54
+be in [JSON][] format consisting of an object with the keys `email` and
55
+`password`:
56
+
57
+```json
58
+{
59
+    "email": "name@example.com",
60
+    "password": "mysupersecretpassword"
61
+}
62
+```
63
+
64
+Login credentials can be overridden by the [environment variable][]s
65
+`BYGGLOGISTIK_EMAIL` and `BYGGLOGISTIK_PASSWORD`.
66
+
67
+[environment variable]: https://en.wikipedia.org/wiki/Environment_variable
68
+[JSON]: https://en.wikipedia.org/wiki/JSON
69
+
70
+### `pay` command
71
+
72
+Stores and summarizes personal payment data by parsing the table "Genomförda
73
+intransporter" on the page "Mina sidor".
74
+
75
+Since only the most recent data is shown in the table it then stores it locally
76
+in `$BYGGLOGISTIK_PATH/pay` to enable accumulation over time (e.g. from a
77
+[cron][] job). The format is [tab-separated values][], human and machine
78
+readable and suitable for use with spreadsheet software.
79
+
80
+Lastly, the expected pay after taxes (the tax rate can be given with `--tax`
81
+and defaults to `0.3`) is printed for each month where data is available, in
82
+reverse chronological order.
83
+
84
+[cron]: https://en.wikipedia.org/wiki/Cron
85
+[tab-separated values]: https://en.wikipedia.org/wiki/Tab-separated_values
86
+
87
+### `personnel` command
88
+
89
+Prints the name, telephone number and position in the organization separated by
90
+tab, one person per row, by parsing the table "Användare" on the page
91
+"Användare".
92
+
93
+Only tested on accounts that have the "Leveransplanerare" access permissions!
94
+
95
+### `report` command
96
+
97
+Prints statistics of a given report.
98
+
99
+Only tested on accounts that have the "Leveransplanerare" access permissions!
8 100
 
9 101
 ## Install
10 102
 
11 103
new file mode 100755
... ...
@@ -0,0 +1,438 @@
1
+#!/usr/bin/env python3
2
+
3
+
4
+"""
5
+bygglogistik-utils 1.0.0
6
+
7
+Scrape intranet.bygglogistik.se.
8
+
9
+Usage:
10
+  bygglogistik-utils pay [--tax=<tax>]
11
+  bygglogistik-utils personnel
12
+  bygglogistik-utils report [-f <date_from>] [-t <date_to>] [-r <region>] [-p <project>]
13
+  bygglogistik-utils path
14
+  bygglogistik-utils -h|--help
15
+  bygglogistik-utils --version
16
+
17
+Options:
18
+  --tax=<tax>
19
+    The tax rate, between 0.0 and 1.0.
20
+    Defaults to 0.30.
21
+  -f <date_from>, --date-from=<date_from>
22
+    The date (in a ISO 8601 format) from which to gather reports (inclusive).
23
+    Defaults to yesterday.
24
+  -t <date_to>, --date-to=<date_to>
25
+    The date (in a ISO 8601 format) to which to gather reports (inclusive).
26
+    Defaults to <date_from>.
27
+  -r <region>, --region=<region>
28
+    The region of the report.
29
+    Defaults to the region of the logged in user.
30
+  -p <project>, --project=<project>
31
+    The project of the report.
32
+    Default to all projects.
33
+"""
34
+
35
+
36
+## Imports
37
+import sys
38
+import os
39
+import collections
40
+import itertools
41
+import functools
42
+import json
43
+import csv
44
+import datetime
45
+import time
46
+import random
47
+import warnings
48
+import docopt
49
+import requests
50
+import bs4
51
+
52
+
53
## Constants

### Connection
# Root of the intranet; the URLs given to `get_page` are appended to this.
URL_BASE = 'https://intranet.bygglogistik.se'

### Parsing
# Name of the parser backend handed to `bs4.BeautifulSoup`.
PARSER = 'lxml'

### Storage
# Tab-separated dialect used for the stored pay file and for joined output.
CSV_DIALECT = csv.excel_tab

### Pay
# TODO: Make this data time-dependent.
# Rates applied by `pay` per unit of reported time (presumably SEK per hour --
# confirm): the base rate, the supplement added when the "Arbetsledare" column
# is set, and the supplement per unit of inconvenient ("ob") time.
PAY              = 168.83 + 5.0
PAY_LEAD         = 45.13
PAY_INCONVENIENT = 35.32
# Fractional supplement: the summed pay is multiplied by (1 + PAY_HOLIDAY).
PAY_HOLIDAY      = 0.13

### Paths
# Data directory: BYGGLOGISTIK_PATH if set, otherwise "bygglogistik-utils"
# below APPDATA, XDG_CONFIG_HOME or ~/.config (first one that is set).
PATH = (
    os.environ.get('BYGGLOGISTIK_PATH') or
    os.path.join(
        (
            os.environ.get('APPDATA') or
            os.environ.get('XDG_CONFIG_HOME') or
            # `expanduser` does not raise when HOME is unset, unlike a direct
            # `os.environ['HOME']` lookup which would fail at import time.
            os.path.join(os.path.expanduser('~'), '.config')
        ),
        'bygglogistik-utils',
    )
)
PATH_LOGIN = os.path.join(PATH, 'login')
PATH_PAY   = os.path.join(PATH, 'pay')
PATH_CACHE = os.path.join(PATH, 'cache')

### Sleep
# Inclusive bounds, in seconds, of the random pause `get_page` takes after
# fetching a page that it caches.
SLEEP_MIN = 5
SLEEP_MAX = 10

### Output
# One level of indentation in printed output.
INDENT = ' ' * 2
93
+
94
+
95
## Warnings
# Suppress bs4's XMLParsedAsHTMLWarning for the whole program (presumably
# because some fetched pages look like XML to the HTML parser -- confirm).
warnings.filterwarnings(
    action='ignore',
    category=bs4.XMLParsedAsHTMLWarning,
)
97
+
98
+
99
+## Error
100
def error(message):
    """Print an error message to standard error and abort the program.

    Args:
        message: Description of what went wrong; printed after the prefix
            ``Error: ``.

    Raises:
        SystemExit: Always, with exit status 1.
    """
    print(f"Error: {message}", file=sys.stderr)
    # `sys.exit` is always available, whereas the bare `exit` builtin is
    # injected by the `site` module and is missing under e.g. `python -S`.
    sys.exit(1)
103
+
104
+
105
+## Log
106
def log(message):
    """Write `message`, followed by a newline, to standard error."""
    stream = sys.stderr
    print(message, file=stream)
108
+
109
+
110
+## Print category
111
def print_category(indent, category, items):
    """Print a category heading followed by its aligned key/value pairs.

    Args:
        indent: Number of `INDENT` units to put before the heading; the
            pairs are indented one unit further.
        category: Heading text.
        items: Mapping of keys to values. An empty mapping prints only the
            heading.
    """
    prefix = INDENT * indent
    print(f'{prefix}{category}')
    prefix += INDENT
    # Column width: the longest key plus one for the trailing colon.
    # `default=0` keeps an empty mapping from raising ValueError.
    width = 1 + max(map(len, items), default=0)
    for key, value in items.items():
        label = f'{key}:'
        print(f'{prefix}{label: <{width}} {value}')
119
+
120
+
121
+## Try read
122
def try_read(path):
    """Return the text content of `path`, or None if the file is missing.

    Any other OSError is reported through `error`, which terminates the
    program.
    """
    content = None
    try:
        with open(path) as stream:
            content = stream.read()
    except FileNotFoundError:
        # A missing file is an expected situation, not an error.
        pass
    except OSError as exc:
        error(f"Could not read '{path}': {exc.strerror}")
    return content
130
+
131
+
132
+## Try write
133
def try_write(path, func):
    """Create or truncate `path` and let `func` write its content.

    Missing parent directories are created with mode 0o700. Any OSError,
    including one raised while creating the directories, is reported
    through `error`, which terminates the program.

    Args:
        path: File to write.
        func: Callable that receives the open text file object.
    """
    try:
        directory = os.path.dirname(path)
        # A bare filename has no directory part; `os.makedirs('')` would fail.
        if directory:
            os.makedirs(directory, mode=0o700, exist_ok=True)
        # `newline=''` disables newline translation, as the csv module
        # requires for files handed to a csv writer.
        with open(path, 'w', newline='') as file:
            func(file)
    except OSError as e:
        error(f"Could not write '{path}': {e.strerror}")
140
+
141
+
142
+## Text
143
def text(element):
    """Return the visible text of `element` as a single string.

    The element's stripped strings are joined by single spaces, after which
    every "- " sequence is removed (presumably to re-join words hyphenated
    across markup boundaries -- confirm).
    """
    joined = " ".join(element.stripped_strings)
    return joined.replace("- ", "")
145
+
146
+
147
+## Texts
148
def texts(iterable):
    """Return a list with the `text` of every element in `iterable`."""
    return [text(element) for element in iterable]
150
+
151
+
152
+## Telephone
153
def telephone(s):
    """Format the digits in `s` as ``NNN-NNN NN NN...``.

    All non-digit characters are dropped before formatting. The grouping
    assumes a three-digit prefix (presumably a Swedish mobile number --
    confirm against the references below).

    Args:
        s: Raw telephone number text.

    Returns:
        The formatted number, or an empty string when `s` holds no digits.
    """
    # https://en.wikipedia.org/wiki/National_conventions_for_writing_telephone_numbers#Sweden
    # https://en.wikipedia.org/wiki/Telephone_numbers_in_Sweden#Area_codes
    n = ''.join(filter(str.isdigit, s))
    if not n:
        # Without this guard a missing number would be rendered as "-  ".
        return ''
    return f"{n[:3]}-{n[3:6]} {n[6:8]} {n[8:]}"
158
+
159
+
160
+## Join
161
def join(iterable):
    """Join the strings in `iterable` with the delimiter of `CSV_DIALECT`."""
    separator = CSV_DIALECT.delimiter
    return separator.join(iterable)
163
+
164
+
165
+## Get page
166
def get_page(session, url, data=None, cache=False):
    """Fetch a page and return it parsed by BeautifulSoup.

    Args:
        session: Object whose `request` method performs the HTTP request.
        url: Path appended to `URL_BASE`.
        data: Optional form data. When given, the request is a POST and the
            cache is bypassed.
        cache: Whether to read the page from, and store it in, the cache
            below `PATH_CACHE`.

    Returns:
        The parsed page. A response that is not OK is reported through
        `error`, which terminates the program.
    """
    # Requests carrying form data are never served from or stored in the cache.
    use_cache = cache and not data
    cache_file = f'{PATH_CACHE}/{url}'
    markup = try_read(cache_file) if use_cache else None
    if markup:
        return bs4.BeautifulSoup(markup, PARSER)
    verb = 'POST' if data else 'GET'
    response = session.request(verb, f'{URL_BASE}{url}', data=data)
    if not response.ok:
        error(f"Got status code {response.status_code}")
    response.encoding = response.apparent_encoding
    markup = response.text
    if use_cache:
        try_write(cache_file, lambda file: file.write(markup))
        # We use the fact that we're caching as a heuristic that we may be
        # putting the server under strain. Sleep for a random amount of
        # time to give it some breathing room.
        pause = random.randrange(SLEEP_MIN, SLEEP_MAX + 1)
        log(f"Info: Sleeping for {pause} seconds")
        time.sleep(pause)
    return bs4.BeautifulSoup(markup, PARSER)
191
+
192
+
193
+## Get table
194
def get_table(page, id):
    """Extract the table with the given element id from `page`.

    Rows whose own id is "comment" are skipped. The first remaining row
    supplies the field names.

    Returns:
        A tuple `(fieldnames, rows)` where `rows` is a list of dicts mapping
        each field name to the text of the matching cell.
    """
    selector = f'table#{id} tr:not([id="comment"])'
    parsed = []
    for row in page.select(selector):
        parsed.append(texts(row.select('th, td')))
    header = parsed[0]
    records = [dict(zip(header, cells)) for cells in parsed[1:]]
    return header, records
202
+
203
+
204
+## Get selected
205
def get_selected(page, id):
    """Return the text of the checked option of the `select` with id `id`."""
    chosen = page.select_one(f'select#{id} > option:checked')
    return text(chosen)
207
+
208
+
209
+## Get option
210
def get_option(page, option, id):
    """Translate an option label into its form value.

    The options of the `select` element with id `id` are matched against
    `option` without regard to case. An unknown label is reported through
    `error` together with the list of valid labels.

    Returns:
        The `value` attribute of the matching option.
    """
    wanted = option.casefold()
    values = {}
    for element in page.select(f'select#{id} > option'):
        values[text(element).casefold()] = element['value']
    if wanted not in values:
        lines = [f"No such {id}: {wanted}. Valid {id}s:"]
        lines.extend(f'  {label}' for label in values)
        error("\n".join(lines))
    return values[wanted]
223
+
224
+
225
+## Get status
226
def get_status(element):
    """Return the status name encoded in the CSS classes of `element`.

    The first class starting with "status" is used; its last character is
    the numeric status code.
    """
    names = {
        0: 'not sent',
        1: 'sent',
        2: 'attested',
    }
    marker = next(c for c in element['class'] if c.startswith('status'))
    code = int(marker[-1])
    return names[code]
233
+
234
+
235
+## Login
236
def login():
    """Log in to the intranet and return the authenticated session.

    The email and password are read from the JSON object in `PATH_LOGIN`
    (if that file exists) and may each be overridden by the environment
    variables BYGGLOGISTIK_EMAIL and BYGGLOGISTIK_PASSWORD.

    Returns:
        The `requests.Session` used for the login request.

    Missing credentials, an unparsable login file and a failed login are
    reported through `error`, which terminates the program.
    """
    ### Get login email and password
    email    = None
    password = None
    login_content = try_read(PATH_LOGIN)
    if login_content:
        # Report a malformed file as a regular error, not as a traceback.
        try:
            login_json = json.loads(login_content)
        except json.JSONDecodeError as e:
            error(f"Could not parse '{PATH_LOGIN}': {e}")
        if not isinstance(login_json, dict):
            error(f"Could not parse '{PATH_LOGIN}': expected a JSON object")
        email    = login_json.get('email')
        password = login_json.get('password')
    email    = os.environ.get('BYGGLOGISTIK_EMAIL')    or email
    password = os.environ.get('BYGGLOGISTIK_PASSWORD') or password
    if not (email and password):
        error("Could not get login email and password")
    ### Login
    session = requests.Session()
    page = get_page(session, '/login', {
        'data[User][email]':    email,
        'data[User][password]': password,
    })
    # The login button still being present means the login was rejected.
    if page.find('input', value='Logga in'):
        error("Could not log in")
    return session
258
+
259
+
260
+## Pay
261
def pay(session, tax):
    """Store the personal lift data and print the expected pay per month.

    The table "user_lifts_completed" on the page linked as "Mina sidor" is
    merged with the rows already stored in `PATH_PAY`, deduplicated on the
    columns "Datum" and "Tid (rast)", written back to `PATH_PAY` and
    summarized per month in reverse chronological order.

    Args:
        session: Logged in session, as returned by `login`.
        tax: Tax rate between 0.0 and 1.0, as a number or a string; None
            selects the default of 0.30.

    An invalid tax rate is reported through `error`, which terminates the
    program.
    """
    ### Arguments
    if tax is None:
        tax = 0.30
    try:
        tax = float(tax)
    except (TypeError, ValueError):
        error(f"Invalid tax rate: {tax}")
    # Written as a negated range check so that NaN is rejected as well.
    if not 0.0 <= tax <= 1.0:
        error(f"Tax rate must be between 0.0 and 1.0, got {tax}")
    ### Get data
    url  = get_page(session, '/').find('a', string='Mina sidor')['href']
    page = get_page(session, url)
    fieldnames, lifts = get_table(page, 'user_lifts_completed')
    ### Read and merge
    pay_content = try_read(PATH_PAY)
    if pay_content:
        try:
            dialect = csv.Sniffer().sniff(pay_content)
        except csv.Error:
            # The file is written with CSV_DIALECT, so fall back to it when
            # the format cannot be sniffed.
            dialect = CSV_DIALECT
        pay_lines = pay_content.splitlines()
        lifts += list(csv.DictReader(pay_lines, dialect=dialect))
    ### Deduplicate and sort lifts
    def key(lift):
        return [lift['Datum'], lift['Tid (rast)']]
    # `sorted` is stable and the freshly scraped rows come first in `lifts`,
    # so `next(group)` prefers a scraped row over its stored duplicate.
    lifts = [
        next(group)
        for _, group in
        itertools.groupby(sorted(lifts, key=key), key=key)
    ]
    lifts.sort(key=key, reverse=True)
    ### Write
    def pay_write(file):
        pay_writer = csv.DictWriter(file, fieldnames, dialect=CSV_DIALECT)
        pay_writer.writeheader()
        pay_writer.writerows(lifts)
    try_write(PATH_PAY, pay_write)
    ### Summarize
    summary = collections.defaultdict(float)
    for lift in lifts:
        year, month, _ = lift['Datum'].split('-')
        # The column holds two numbers, the second one in parentheses.
        hours, hours_inconvenient = [
            float(t.strip('()'))
            for t in
            lift['Rapporterad tid (ob)'].split()
        ]
        is_lead = bool(lift['Arbetsledare'])
        summary[f'{year}-{month}'] += (1 - tax) * (1 + PAY_HOLIDAY) * (
            hours * (PAY + is_lead * PAY_LEAD) +
            hours_inconvenient * PAY_INCONVENIENT
        )
    for period, amount in sorted(summary.items(), reverse=True):
        print(f'{period}: {amount: >5.0f}')
306
+
307
+
308
+## Personnel
309
def personnel(session):
    """Print name, telephone number and position of every user.

    The data comes from the table "users_table" on the page "/users". One
    person is printed per line, with the fields joined by `join`.
    """
    ### Get page
    page = get_page(session, '/users')
    ### Get personnel
    _, persons = get_table(page, 'users_table')
    for person in persons:
        fields = (
            person['Namn'],
            telephone(person['Mobil']),
            person['Position'],
        )
        print(join(fields))
320
+
321
+
322
+## Report
323
def report(session, date_from, date_to, region, project):
    """Print statistics for the reports matching the given filters.

    Args:
        session: Logged in session, as returned by `login`.
        date_from: First date (ISO format) to include; None means yesterday.
        date_to: Last date (ISO format) to include; None means `date_from`.
        region: Region label; None means the option selected on the page.
        project: Project label; None means the option selected on the page.

    For every row of the table "lift_list" the report page of the lift is
    fetched (cached when its status is "attested") and the worker and
    service figures are printed. Inconsistent worker times are logged as
    warnings.
    """
    ### Get page
    page = get_page(session, '/reports')
    ### Arguments
    if date_from is None:
        date_from = datetime.date.today() - datetime.timedelta(days=1)
    if date_to is None:
        date_to = date_from
    if region is None:
        region = get_selected(page, 'region')
    if project is None:
        project = get_selected(page, 'project')
    date_from = datetime.date.fromisoformat(str(date_from))
    date_to   = datetime.date.fromisoformat(str(date_to))
    region    = get_option(page, region,  'region')
    project   = get_option(page, project, 'project')

    def timediff(a, b):
        # Difference a - b in hours between two "HH:MM" strings.
        ah, am = map(float, a.split(':'))
        bh, bm = map(float, b.split(':'))
        return (ah - bh) + (am - bm) / 60

    ### Get reports
    listing = get_page(
        session,
        f'/reports/index/{region}/{date_from}/{date_to}/{project}',
    )
    ### Process reports
    for row in listing.select('table#lift_list tr'):
        #### Get page
        status = get_status(row)
        project_cell, when_cell, lift_cell, _ = row.select('td')
        project_name = text(project_cell)
        lift = text(lift_cell.find('a'))
        date = text(when_cell).split()[0]
        # Only attested reports are cached (presumably because they no
        # longer change -- confirm).
        detail = get_page(
            session,
            f'/reports/view/lift/{lift}',
            cache=(status == 'attested'),
        )
        print(f'{date} {project_name}')
        print(f'{INDENT}Status: {status}')
        #### Get workers
        _, workers = get_table(detail, 'tr_list')
        lead_label = next((w for w in workers if w['Arbetsledare']), None)
        if lead_label:
            parts = [lead_label['Namn']]
            if lead_label['Sjuk']:
                parts.append('(Sjuk)')
            lead_label = ' '.join(part for part in parts if part)
        time_regular      = 0
        time_inconvenient = 0
        for worker in workers:
            reported = float(worker['Timmar'])
            span = timediff(worker['Slut'], worker['Start'])
            rest = float(worker['Rast (minuter)']) / 60
            # Expected time: worked span minus the break, but at least 4.0.
            expected = max(4.0, (span - rest))
            is_sick = worker['Sjuk']
            if is_sick and reported != 4.0:
                log(
                    f"{INDENT}Warning: "
                    f"Time for sick worker {worker['Namn']} is incorrect: "
                    f"{reported} != 4.0"
                )
            elif not is_sick and reported != expected:
                log(
                    f"{INDENT}Warning: "
                    f"Time for worker {worker['Namn']} is inconsistent: "
                    f"{reported} != {expected}"
                )
            # Time after 18:00, less the break, counts as inconvenient.
            inconvenient = max(0, timediff(worker['Slut'], '18:00') - rest)
            time_regular      += reported - inconvenient
            time_inconvenient += inconvenient
        print_category(1, 'Workers', {
            'Number':            len(workers),
            'Lead':              lead_label,
            'Time regular':      time_regular,
            'Time inconvenient': time_inconvenient,
        })
        #### Get services
        services = collections.defaultdict(float)
        for entry in detail.select('table#kr_list tr.service'):
            label  = entry.find('td', class_='label').contents[0]
            amount = float(entry.find('input')['value'])
            if amount:
                services[label] += amount
        if services:
            print_category(1, 'Services', services)
410
+
411
+
412
+## Path
413
def path():
    """Print the directory in which the program stores its data."""
    location = PATH
    print(location)
415
+
416
+
417
+## Main
418
def main():
    """Parse the command line and run the selected command.

    The usage text is the module docstring; its second line is used as the
    version string. Every command except `path` logs in first.
    """
    usage = __doc__
    version = usage.splitlines()[1]
    args = docopt.docopt(usage, version=version)
    if args['pay']:
        session = login()
        pay(session, args['--tax'])
    if args['personnel']:
        session = login()
        personnel(session)
    if args['report']:
        session = login()
        report(
            session,
            args['--date-from'],
            args['--date-to'],
            args['--region'],
            args['--project'],
        )
    if args['path']:
        path()
435
+
436
+
437
+if __name__ == '__main__':
438
+    main()
... ...
@@ -20,6 +20,10 @@ setup(
20 20
     ],
21 21
     python_requires='>=3, <4',
22 22
     install_requires=[
23
+        'docopt',
24
+        'requests',
25
+        'beautifulsoup4',
26
+        'lxml',
23 27
     ],
24 28
     py_modules=['bygglogistik_utils'],
25 29
     entry_points={