Difference between revisions of "Workdocumentation 2022-08-16"
Jump to navigation
Jump to search
Tim Holzheim (talk | contribs) |
|||
| (26 intermediate revisions by 2 users not shown) | |||
| Line 7: | Line 7: | ||
= dblp Qlever = | = dblp Qlever = | ||
| + | * https://github.com/WolfgangFahl/pyCEURmake/issues/18 | ||
| + | |||
== on RWTH Aachen DBIS i5 server == | == on RWTH Aachen DBIS i5 server == | ||
| − | <source lang='bash' highlight='1,13'> | + | <source lang='bash' highlight='1,12-13'> |
wf@confident:/hd/torterra/dblp2022$ wget https://dblp.org/rdf/dblp.nt.gz | wf@confident:/hd/torterra/dblp2022$ wget https://dblp.org/rdf/dblp.nt.gz | ||
--2022-08-16 12:00:18-- https://dblp.org/rdf/dblp.nt.gz | --2022-08-16 12:00:18-- https://dblp.org/rdf/dblp.nt.gz | ||
| Line 21: | Line 23: | ||
2022-08-16 12:01:34 (35.1 MB/s) - ‘dblp.nt.gz’ saved [2789108619/2789108619] | 2022-08-16 12:01:34 (35.1 MB/s) - ‘dblp.nt.gz’ saved [2789108619/2789108619] | ||
gunzip dblp.nt.gz | gunzip dblp.nt.gz | ||
| + | ls -l | ||
| + | total 34405612 | ||
| + | -rw-rw-r-- 1 wf wf 35231339037 Aug 16 00:16 dblp.nt | ||
| + | </source> | ||
| + | <source lang='bash' highlight='1'> | ||
| + | wget https://dblp.org/rdf/dblp.ttl.gz | ||
| + | --2022-08-16 15:36:37-- https://dblp.org/rdf/dblp.ttl.gz | ||
| + | Resolving dblp.org (dblp.org)... 192.76.146.204 | ||
| + | Connecting to dblp.org (dblp.org)|192.76.146.204|:443... connected. | ||
| + | HTTP request sent, awaiting response... 200 OK | ||
| + | Length: 1065586620 (1016M) [application/x-gzip] | ||
| + | Saving to: ‘dblp.ttl.gz’ | ||
| + | dblp.ttl.gz 47%[===========> ] 483.91M 43.1MB/s eta 16s | ||
| + | dblp.ttl.gz 100%[=========================>] 1016M 32.7MB/s in 29s | ||
| + | 2022-08-16 15:37:06 (35.3 MB/s) - ‘dblp.ttl.gz’ saved [1065586620/1065586620] | ||
| + | gunzip dblp.ttl.gz | ||
| + | </source> | ||
| + | |||
| + | == QLever installation == | ||
| + | https://wiki.bitplan.com/index.php/WikiData_Import_2022-01-29 | ||
| + | === Environment === | ||
| + | <source lang='bash' highlight='1,7,9'> | ||
| + | lsb_release -a | ||
| + | No LSB modules are available. | ||
| + | Distributor ID: Ubuntu | ||
| + | Description: Ubuntu 20.04.4 LTS | ||
| + | Release: 20.04 | ||
| + | Codename: focal | ||
| + | wf@confident:/hd/torterra/qlever$ docker --version | ||
| + | Docker version 20.10.12, build 20.10.12-0ubuntu2~20.04.1 | ||
| + | wf@confident:/hd/torterra/qlever$ free -h | ||
| + | total used free shared buff/cache available | ||
| + | Mem: 15Gi 1.2Gi 12Gi 45Mi 1.6Gi 13Gi | ||
| + | Swap: 11Gi 0B 11Gi | ||
| + | </source> | ||
| + | ===== Disk space ===== | ||
| + | <source lang='bash'> | ||
| + | df | grep -v loop | grep -v tmp | grep -v udev | ||
| + | Filesystem 1K-blocks Used Available Use% Mounted on | ||
| + | /dev/sda3 114226348 55228964 53148860 51% / | ||
| + | /dev/sdb1 3844590624 3266511864 382711544 90% /hd/torterra | ||
| + | </source> | ||
| + | |||
| + | === QLever code clone === | ||
| + | <source lang='bash' highlight='1-2'> | ||
| + | export QLEVER_HOME=$(pwd) | ||
| + | date;git clone --recursive https://github.com/ad-freiburg/qlever qlever-code;date | ||
| + | Tue Aug 16 15:10:09 CEST 2022 | ||
| + | Cloning into 'qlever-code'... | ||
| + | remote: Enumerating objects: 14625, done. | ||
| + | remote: Counting objects: 100% (279/279), done. | ||
| + | remote: Compressing objects: 100% (227/227), done. | ||
| + | remote: Total 14625 (delta 133), reused 131 (delta 52), pack-reused 14346 | ||
| + | Receiving objects: 100% (14625/14625), 190.60 MiB | 6.48 MiB/s, done. | ||
| + | ... | ||
| + | Submodule path 'third_party/stxxl/extlib/foxxll/extlib/tlx': checked out 'ef81a598d9880cc7d242afc47de7328634f07f1d' | ||
| + | Tue Aug 16 15:10:56 CEST 2022 | ||
| + | </source> | ||
| + | ==== Build ==== | ||
| + | <source lang='bash' highlight='1'> | ||
| + | date;sudo docker build --file Dockerfiles/Dockerfile.Ubuntu20.04 -t qlever .;date | ||
| + | Tue Aug 16 15:13:02 CEST 2022 | ||
| + | Sending build context to Docker daemon 453MB | ||
| + | Step 1/43 : FROM ubuntu:20.04 as base | ||
| + | 20.04: Pulling from library/ubuntu | ||
| + | 3b65ec22a9e9: Pull complete | ||
| + | |||
| + | Removing intermediate container 1ccc2a50364e | ||
| + | ---> d0018440a4cd | ||
| + | Successfully built d0018440a4cd | ||
| + | Successfully tagged qlever:latest | ||
| + | Tue Aug 16 15:25:28 CEST 2022 | ||
| + | </source> | ||
| + | |||
| + | === qlever control === | ||
| + | <source lang='bash'> | ||
| + | git clone https://github.com/ad-freiburg/qlever-control | ||
| + | Cloning into 'qlever-control'... | ||
| + | remote: Enumerating objects: 368, done. | ||
| + | remote: Counting objects: 100% (208/208), done. | ||
| + | remote: Compressing objects: 100% (135/135), done. | ||
| + | remote: Total 368 (delta 75), reused 183 (delta 72), pack-reused 160 | ||
| + | Receiving objects: 100% (368/368), 117.76 KiB | 7.36 MiB/s, done. | ||
| + | Resolving deltas: 100% (130/130), done. | ||
| + | </source> | ||
| + | == dblp stardog == | ||
| + | <source lang='bash' highlight='1'> | ||
| + | docker pull stardog/stardog:latest | ||
| + | latest: Pulling from stardog/stardog | ||
| + | 2d473b07cdd5: Pull complete | ||
| + | b0eac9aee9aa: Pull complete | ||
| + | 8d5b89da19bc: Pull complete | ||
| + | 91c2bc930138: Pull complete | ||
| + | 265d7b96dd8f: Pull complete | ||
| + | Digest: sha256:7fc70e1bd3d17bdb1440f0cd810294b5318f1c53935425bb51526da4a949afc0 | ||
| + | Status: Downloaded newer image for stardog/stardog:latest | ||
| + | docker.io/stardog/stardog:latest | ||
| + | </source> | ||
| + | |||
| + | |||
| + | |||
| + | = DBLP versus CEUR-WS Queries = | ||
| + | == All Volumes known to dblp == | ||
| + | * expected 70% of 3185 volumes found 75% | ||
| + | * actual 75% | ||
| + | <source lang="sparql"> | ||
| + | SELECT (COUNT(?proceeding) as ?count) (MIN(xsd:integer(?volNumber)) as ?min) (MAX(xsd:integer(?volNumber)) as ?max) | ||
| + | WHERE { | ||
| + | ?proceeding dblp:publishedIn "CEUR Workshop Proceedings"; | ||
| + | dblp:publishedInSeriesVolume ?volNumber . | ||
| + | } | ||
| + | LIMIT 5000 | ||
| + | </source> | ||
| + | {| class="wikitable" | ||
| + | | count || min || max | ||
| + | |- | ||
| + | | 2375 || 1 || 3157 | ||
| + | |} | ||
| + | |||
| + | == All papers == | ||
| + | * expected 70% of ~50000 papers | ||
| + | <source lang="sparql"> | ||
| + | SELECT (COUNT(?paper) as ?count) | ||
| + | WHERE { | ||
| + | ?proceeding dblp:publishedIn "CEUR Workshop Proceedings". | ||
| + | ?paper dblp:publishedAsPartOf ?proceeding. | ||
| + | } | ||
| + | </source> | ||
| + | {| class="wikitable" | ||
| + | | count | ||
| + | |- | ||
| + | | 44275 | ||
| + | |} | ||
| + | |||
| + | == All authors and editors == | ||
| + | * authors: papers expected: <1:1 and >1:3 found 1.6 distinct authors in relation to distinct papers | ||
| + | * editors: volumes 3:1 found 4625 editors for 2377 volumes | ||
| + | <source lang="sparql"> | ||
| + | SELECT (COUNT(DISTINCT ?author) as ?numberOfAuthors) | ||
| + | (COUNT(DISTINCT ?paper) as ?numberOfPapers) | ||
| + | (COUNT(DISTINCT ?editor) as ?numberOfEditors) | ||
| + | (COUNT(DISTINCT ?proceeding) as ?numberOfVolumes) | ||
| + | WHERE { | ||
| + | ?proceeding dblp:publishedIn "CEUR Workshop Proceedings". | ||
| + | OPTIONAL{?proceeding dblp:editedBy ?editor} | ||
| + | OPTIONAL{ | ||
| + | ?paper dblp:publishedAsPartOf ?proceeding. | ||
| + | OPTIONAL{?paper dblp:authoredBy ?author} | ||
| + | } | ||
| + | |||
| + | } | ||
| + | |||
| + | </source> | ||
| + | {| class="wikitable" | ||
| + | | numberOfAuthors || numberOfPapers || numberOfEditors || numberOfVolumes | ||
| + | |- | ||
| + | | 69846 || 44275 || 4625 || 2377 | ||
| + | |} | ||
| + | Note: There are proceedings of ceurws without a volumeId | ||
| + | |||
| + | Namely: | ||
| + | * https://dblp.org/rec/conf/www/2017ldow | ||
| + | * https://dblp.org/rec/conf/semweb/2017hybridsemstats | ||
| + | |||
| + | == Cross-check against wikidata == | ||
| + | * volumes in dblp and wikidata | ||
| + | <source lang="sparql"> | ||
| + | SELECT DISTINCT ?proceeding ?wdProceedings ?urn | ||
| + | WHERE { | ||
| + | ?proceeding dblp:publishedIn "CEUR Workshop Proceedings"; | ||
| + | datacite:hasIdentifier [ | ||
| + | datacite:usesIdentifierScheme datacite:urn ; | ||
| + | litre:hasLiteralValue ?urn ; | ||
| + | a datacite:ResourceIdentifier | ||
| + | ] . | ||
| + | service <https://query.wikidata.org/sparql> { | ||
| + | ?wdProceedings wdt:P179 wd:Q27230297; | ||
| + | wdt:P4109 ?urn | ||
| + | } | ||
| + | } | ||
| + | </source> | ||
| + | |||
| + | * volumes in wikidata missing in dblp | ||
| + | <source lang="sparql"> | ||
| + | SELECT DISTINCT ?wdProceedings ?urn | ||
| + | WHERE { | ||
| + | service <https://query.wikidata.org/sparql> { | ||
| + | ?wdProceedings wdt:P179 wd:Q27230297; | ||
| + | wdt:P4109 ?urn | ||
| + | } | ||
| + | MINUS{ | ||
| + | ?proceeding dblp:publishedIn "CEUR Workshop Proceedings"; | ||
| + | datacite:hasIdentifier [ | ||
| + | datacite:usesIdentifierScheme datacite:urn ; | ||
| + | litre:hasLiteralValue ?urn ; | ||
| + | a datacite:ResourceIdentifier | ||
| + | ] . | ||
| + | } | ||
| + | } | ||
| + | </source> | ||
| + | * volumes in dblp missing in wikidata | ||
| + | <source lang="sparql"> | ||
| + | SELECT DISTINCT ?proceeding ?urn | ||
| + | WHERE { | ||
| + | ?proceeding dblp:publishedIn "CEUR Workshop Proceedings"; | ||
| + | datacite:hasIdentifier [ | ||
| + | datacite:usesIdentifierScheme datacite:urn ; | ||
| + | litre:hasLiteralValue ?urn ; | ||
| + | a datacite:ResourceIdentifier | ||
| + | ] . | ||
| + | |||
| + | MINUS{ | ||
| + | service <https://query.wikidata.org/sparql> { | ||
| + | ?wdProceedings wdt:P179 wd:Q27230297; | ||
| + | wdt:P4109 ?urn. | ||
| + | } | ||
| + | } | ||
| + | } | ||
</source> | </source> | ||
| + | == Further Queries == | ||
| + | * Cross-check against wikidata | ||
| + | ** 53000 dblp authors out of 2.3m | ||
| + | ** all editors MUST be in dblp according to the rules of ceur-ws | ||
| + | ** authors can be in dblp | ||
| + | * Disambiguation problem only on ceur-ws side | ||
| + | ** idea: calculate distance of potential candidate authors to authors in the same volume | ||
Latest revision as of 12:09, 17 August 2022
Participants
- Tim
- Wolfgang
Agenda
dblp Qlever
dblp Qlever
on RWTH Aachen DBIS i5 server
wf@confident:/hd/torterra/dblp2022$ wget https://dblp.org/rdf/dblp.nt.gz
--2022-08-16 12:00:18-- https://dblp.org/rdf/dblp.nt.gz
Resolving dblp.org (dblp.org)... 192.76.146.204
Connecting to dblp.org (dblp.org)|192.76.146.204|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2789108619 (2.6G) [application/x-gzip]
Saving to: ‘dblp.nt.gz’
dblp.nt.gz 36%[======> ] 980.73M 38.9MB/s eta 45s
dblp.nt.gz 100%[===================>] 2.60G 34.1MB/s in 76s
2022-08-16 12:01:34 (35.1 MB/s) - ‘dblp.nt.gz’ saved [2789108619/2789108619]
gunzip dblp.nt.gz
ls -l
total 34405612
-rw-rw-r-- 1 wf wf 35231339037 Aug 16 00:16 dblp.nt
wget https://dblp.org/rdf/dblp.ttl.gz
--2022-08-16 15:36:37-- https://dblp.org/rdf/dblp.ttl.gz
Resolving dblp.org (dblp.org)... 192.76.146.204
Connecting to dblp.org (dblp.org)|192.76.146.204|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1065586620 (1016M) [application/x-gzip]
Saving to: ‘dblp.ttl.gz’
dblp.ttl.gz 47%[===========> ] 483.91M 43.1MB/s eta 16s
dblp.ttl.gz 100%[=========================>] 1016M 32.7MB/s in 29s
2022-08-16 15:37:06 (35.3 MB/s) - ‘dblp.ttl.gz’ saved [1065586620/1065586620]
gunzip dblp.ttl.gz
QLever installation
https://wiki.bitplan.com/index.php/WikiData_Import_2022-01-29
Environment
lsb_release -a
No LSB modules are available.
Distributor ID: Ubuntu
Description: Ubuntu 20.04.4 LTS
Release: 20.04
Codename: focal
wf@confident:/hd/torterra/qlever$ docker --version
Docker version 20.10.12, build 20.10.12-0ubuntu2~20.04.1
wf@confident:/hd/torterra/qlever$ free -h
total used free shared buff/cache available
Mem: 15Gi 1.2Gi 12Gi 45Mi 1.6Gi 13Gi
Swap: 11Gi 0B 11Gi
Disk space
df | grep -v loop | grep -v tmp | grep -v udev
Filesystem 1K-blocks Used Available Use% Mounted on
/dev/sda3 114226348 55228964 53148860 51% /
/dev/sdb1 3844590624 3266511864 382711544 90% /hd/torterra
QLever code clone
export QLEVER_HOME=$(pwd)
date;git clone --recursive https://github.com/ad-freiburg/qlever qlever-code;date
Tue Aug 16 15:10:09 CEST 2022
Cloning into 'qlever-code'...
remote: Enumerating objects: 14625, done.
remote: Counting objects: 100% (279/279), done.
remote: Compressing objects: 100% (227/227), done.
remote: Total 14625 (delta 133), reused 131 (delta 52), pack-reused 14346
Receiving objects: 100% (14625/14625), 190.60 MiB | 6.48 MiB/s, done.
...
Submodule path 'third_party/stxxl/extlib/foxxll/extlib/tlx': checked out 'ef81a598d9880cc7d242afc47de7328634f07f1d'
Tue Aug 16 15:10:56 CEST 2022
Build
date;sudo docker build --file Dockerfiles/Dockerfile.Ubuntu20.04 -t qlever .;date
Tue Aug 16 15:13:02 CEST 2022
Sending build context to Docker daemon 453MB
Step 1/43 : FROM ubuntu:20.04 as base
20.04: Pulling from library/ubuntu
3b65ec22a9e9: Pull complete
Removing intermediate container 1ccc2a50364e
---> d0018440a4cd
Successfully built d0018440a4cd
Successfully tagged qlever:latest
Tue Aug 16 15:25:28 CEST 2022
qlever control
git clone https://github.com/ad-freiburg/qlever-control
Cloning into 'qlever-control'...
remote: Enumerating objects: 368, done.
remote: Counting objects: 100% (208/208), done.
remote: Compressing objects: 100% (135/135), done.
remote: Total 368 (delta 75), reused 183 (delta 72), pack-reused 160
Receiving objects: 100% (368/368), 117.76 KiB | 7.36 MiB/s, done.
Resolving deltas: 100% (130/130), done.
dblp stardog
docker pull stardog/stardog:latest
latest: Pulling from stardog/stardog
2d473b07cdd5: Pull complete
b0eac9aee9aa: Pull complete
8d5b89da19bc: Pull complete
91c2bc930138: Pull complete
265d7b96dd8f: Pull complete
Digest: sha256:7fc70e1bd3d17bdb1440f0cd810294b5318f1c53935425bb51526da4a949afc0
Status: Downloaded newer image for stardog/stardog:latest
docker.io/stardog/stardog:latest
DBLP versus CEUR-WS Queries
All Volumes known to dblp
- expected 70% of 3185 volumes found 75%
- actual 75%
SELECT (COUNT(?proceeding) as ?count) (MIN(xsd:integer(?volNumber)) as ?min) (MAX(xsd:integer(?volNumber)) as ?max)
WHERE {
?proceeding dblp:publishedIn "CEUR Workshop Proceedings";
dblp:publishedInSeriesVolume ?volNumber .
}
LIMIT 5000
| count | min | max |
| 2375 | 1 | 3157 |
All papers
- expected 70% of ~50000 papers
SELECT (COUNT(?paper) as ?count)
WHERE {
?proceeding dblp:publishedIn "CEUR Workshop Proceedings".
?paper dblp:publishedAsPartOf ?proceeding.
}
| count |
| 44275 |
All authors and editors
- authors: papers expected: <1:1 and >1:3 found 1.6 distinct authors in relation to distinct papers
- editors: volumes 3:1 found 4625 editors for 2377 volumes
SELECT (COUNT(DISTINCT ?author) as ?numberOfAuthors)
(COUNT(DISTINCT ?paper) as ?numberOfPapers)
(COUNT(DISTINCT ?editor) as ?numberOfEditors)
(COUNT(DISTINCT ?proceeding) as ?numberOfVolumes)
WHERE {
?proceeding dblp:publishedIn "CEUR Workshop Proceedings".
OPTIONAL{?proceeding dblp:editedBy ?editor}
OPTIONAL{
?paper dblp:publishedAsPartOf ?proceeding.
OPTIONAL{?paper dblp:authoredBy ?author}
}
}
| numberOfAuthors | numberOfPapers | numberOfEditors | numberOfVolumes |
| 69846 | 44275 | 4625 | 2377 |
Note: There are proceedings of ceurws without a volumeId
Namely:
- https://dblp.org/rec/conf/www/2017ldow
- https://dblp.org/rec/conf/semweb/2017hybridsemstats
Cross-check against wikidata
- volumes in dblp and wikidata
SELECT DISTINCT ?proceeding ?wdProceedings ?urn
WHERE {
?proceeding dblp:publishedIn "CEUR Workshop Proceedings";
datacite:hasIdentifier [
datacite:usesIdentifierScheme datacite:urn ;
litre:hasLiteralValue ?urn ;
a datacite:ResourceIdentifier
] .
service <https://query.wikidata.org/sparql> {
?wdProceedings wdt:P179 wd:Q27230297;
wdt:P4109 ?urn
}
}
- volumes in wikidata missing in dblp
SELECT DISTINCT ?wdProceedings ?urn
WHERE {
service <https://query.wikidata.org/sparql> {
?wdProceedings wdt:P179 wd:Q27230297;
wdt:P4109 ?urn
}
MINUS{
?proceeding dblp:publishedIn "CEUR Workshop Proceedings";
datacite:hasIdentifier [
datacite:usesIdentifierScheme datacite:urn ;
litre:hasLiteralValue ?urn ;
a datacite:ResourceIdentifier
] .
}
}
- volumes in dblp missing in wikidata
SELECT DISTINCT ?proceeding ?urn
WHERE {
?proceeding dblp:publishedIn "CEUR Workshop Proceedings";
datacite:hasIdentifier [
datacite:usesIdentifierScheme datacite:urn ;
litre:hasLiteralValue ?urn ;
a datacite:ResourceIdentifier
] .
MINUS{
service <https://query.wikidata.org/sparql> {
?wdProceedings wdt:P179 wd:Q27230297;
wdt:P4109 ?urn.
}
}
}
Further Queries
- Cross-check against wikidata
- 53000 dblp authors out of 2.3m
- all editors MUST be in dblp according to the rules of ceur-ws
- authors can be in dblp
- Disambiguation problem only on ceur-ws side
- idea: calculate distance of potential candidate authors to authors in the same volume